# BERT entrenado en WEBNLG


## Comprobación de especificaciones del sistema

In [None]:
# Number of GPUs and their model
!nvidia-smi -L

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
# CPU model
!lscpu |grep 'Model name'

Model name:          Intel(R) Xeon(R) CPU @ 2.20GHz


In [None]:
# Processor clock frequency
!lscpu | grep "MHz"

CPU MHz:             2200.156


In [None]:
# RAM size: second field of the `Mem:` row reported by `free`
!free -h --si | awk  '/Mem:/{print $2}'

13G


In [None]:
# Disk space on the root filesystem: prints the 4th column of `df -h /`
# (the "Avail" column, header line included)
!df -h / | awk '{print $4}'

Avail
40G


## Instalación de los componentes necesarios

In [None]:
# Install the Hugging Face libraries used by the rest of the notebook.
# NOTE(review): versions are not pinned; the recorded run resolved transformers 4.18.0.
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 7.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 55.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 51.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.5 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses:

## Preprocesado de datos

In [None]:
import pandas as pd
import torch
import warnings
warnings.filterwarnings('ignore')

In [None]:
import glob
import re
import urllib.request
import xml.etree.ElementTree as ET
import zipfile

# Download the English training split of the WebNLG v3.0 release as a zip archive.
url = 'https://gitlab.com/shimorina/webnlg-dataset/-/archive/master/webnlg-dataset-master.zip?path=release_v3.0/en/train'
urllib.request.urlretrieve(url, 'web.zip')

# Unpack into ./web, relative to the current working directory.
with zipfile.ZipFile('web.zip', 'r') as zip_ref:
    zip_ref.extractall('web')

# Collect every XML file of the split. The pattern is relative to the same
# directory the archive was extracted into, so the cell works from any working
# directory instead of only from /content.
files = glob.glob("web/webnlg-dataset-master-release_v3.0-en-train/release_v3.0/en/train/**/*.xml", recursive=True)

# Captures the single digit that precedes "triples" in a file path.
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
triple_re = re.compile(r'(\d)triples')


In [None]:
# Maps a ' && '-joined group of triple strings to the list of texts collected
# from the same XML entry.
data_dct = {}
for xml_path in files:
  xml_root = ET.parse(xml_path).getroot()
  # Group size comes from the digit before "triples" in the file path.
  triples_num = int(triple_re.findall(xml_path)[0])

  for section in xml_root:
    for xml_entry in section:
      triple_texts = []
      child_texts = []

      # Each child contributes its own text plus the text of its sub-elements.
      # Presumably the former are the reference sentences and the latter the
      # triples; confirm against the WebNLG XML schema.
      for child in xml_entry:
        child_texts.append(child.text)
        triple_texts.extend([sub.text for sub in child])

      # Drop children whose own text is only whitespace/newlines.
      unstructured = [txt for txt in child_texts if txt.replace('\n', '').strip() != '']

      # Cut the flat triple list into consecutive groups of `triples_num` and
      # register every group with the same list of texts.
      for start in range(0, len(triple_texts), triples_num):
        group_key = ' && '.join(triple_texts[start:start + triples_num])
        data_dct[group_key] = unstructured



In [None]:
len(data_dct)

27100

In [None]:
# Flatten data_dct into one row per (triple group, text) pair.
input_texts = []
target_texts = []
for triples_key, sentences in data_dct.items():
    input_texts.extend([triples_key] * len(sentences))
    target_texts.extend(sentences)

mdata_dct = {
    "prefix": ['webNLG'] * len(target_texts),  # constant task tag on every row
    "input_text": input_texts,
    "target_text": target_texts,
}

# Persist the pairs so later cells can reload them from disk.
df = pd.DataFrame(mdata_dct)
df.to_csv('webNLG2020_train.csv')

In [None]:
# Reload the training pairs from the CSV; the first column holds the saved row index.
train_df=pd.read_csv('webNLG2020_train.csv', index_col=[0])

In [None]:
len(train_df)

73104

In [None]:
# Shuffle all rows (frac=1) with a fixed seed so the order is reproducible.
train_df=train_df.sample(frac = 1, random_state = 13)
train_df

Unnamed: 0,prefix,input_text,target_text
43118,webNLG,Athens_International_Airport | elevationM | 94,Athens International Airport is 94 metres abov...
48329,webNLG,Aaron_Turner | associatedBand/associatedMusica...,"Aaron Turner, who plays Drone music, played wi..."
55274,webNLG,Asher_and_Mary_Isabelle_Richardson_House | arc...,Alfred Giles was born in England and died in K...
37642,webNLG,"AMC_Matador | alternativeName | ""Rambler Matador""",The AMC Matador is also known as the Rambler M...
37201,webNLG,"1101_Clematis | avgSpeed | ""16.54""^^<http://db...",The celestial body known as 1101 Clematis has ...
...,...,...,...
36794,webNLG,"101_Helena | averageSpeed | ""66384.0""^^xsd:double",101 Helena has an average speed of 18.44 km pe...
25324,webNLG,"Adisham_Hall | architecturalStyle | ""Tudor and...","The architecture style of Adisham Hall, locate..."
65689,webNLG,Alan_Frew | genre | Rock_music && Alan_Frew | ...,"Alan Frew is a solo singer, who plays rock mus..."
33634,webNLG,"Aleksander_Barkov,_Jr. | weight | 96.1632 (kil...",Aleksander Barkov Jr's weight is 96.1632 kg.


In [None]:
def parse(example):
  """Normalise a ' && '-separated string of 'subject | predicate | object' triples.

  Steps, in order: drop '@en' language tags, turn underscores and double
  quotes into spaces, drop '<http...>' datatype URIs, split camelCase
  predicates into lower-case words, drop 'xsd:...' datatype suffixes and
  remove '^' characters.

  Args:
    example: raw triple string, e.g. 'A_B | avgSpeed | "16.54"^^<http://...>'.

  Returns:
    The cleaned string. Triples without a usable predicate field (no '|', or a
    predicate shorter than two characters) are left as they are.
  """
  # remove @en
  example = re.sub('@en','', example)

  # change _ and " to ' '
  example = re.sub('[_"]',' ', example)

  # remove urls; [^>]* stops at the first '>' so that several typed literals in
  # one example are removed one by one instead of swallowing the triples
  # that sit between the first '<http' and the last '>'
  example = re.sub("<http[^>]*>", '',example)

  # split predicates according to uppercase tokens
  triplets = re.split("&&", example)
  for triple in triplets:
    fields = re.split(r"\|", triple)
    if len(fields) < 2 or len(fields[1]) < 2:
      # malformed triple: nothing to rewrite
      continue
    entity = fields[1]
    # the field starts with the space that followed '|', so index 1 is the
    # first letter of the predicate; capitalise it so it counts as a word start
    entity2 = entity[1].upper() + entity[2:]
    uppercase = re.findall(r'[A-Z](?:[A-Z]*(?![a-z])|[a-z]*)', entity2)
    if(len(uppercase)>1):
      uppercase = ' '.join(uppercase)
      # NOTE(review): the predicate text is used as a regex pattern unescaped, so
      # predicates containing regex metacharacters are not matched literally.
      example = re.sub("{}".format(entity), " {} ".format(uppercase.lower()), example)

  # remove xsd datatype suffixes, both mid-string and at the end
  example = re.sub(r"xsd:[^\s]*\s", "",example)
  example = re.sub(r"xsd:[^\s]*$", "",example)

  example = re.sub(r'\^','', example)
  return example


In [None]:
# Normalise every triple string in place with parse().
train_df['input_text'] = train_df['input_text'].map(parse)

In [None]:
train_df

Unnamed: 0,prefix,input_text,target_text
43118,webNLG,Athens International Airport | elevation m | 94,Athens International Airport is 94 metres abov...
48329,webNLG,Aaron Turner | associated band musical artist ...,"Aaron Turner, who plays Drone music, played wi..."
55274,webNLG,Asher and Mary Isabelle Richardson House | arc...,Alfred Giles was born in England and died in K...
37642,webNLG,AMC Matador | alternative name | Rambler Mata...,The AMC Matador is also known as the Rambler M...
37201,webNLG,1101 Clematis | avg speed | 16.54,The celestial body known as 1101 Clematis has ...
...,...,...,...
36794,webNLG,101 Helena | average speed | 66384.0,101 Helena has an average speed of 18.44 km pe...
25324,webNLG,Adisham Hall | architectural style | Tudor an...,"The architecture style of Adisham Hall, locate..."
65689,webNLG,Alan Frew | genre | Rock music && Alan Frew | ...,"Alan Frew is a solo singer, who plays rock mus..."
33634,webNLG,"Aleksander Barkov, Jr. | weight | 96.1632 (kil...",Aleksander Barkov Jr's weight is 96.1632 kg.


In [None]:
# Keep only the first 10000 shuffled rows (all columns) for training.
train_df = train_df.head(10000)

In [None]:
train_df

Unnamed: 0,prefix,input_text,target_text
43118,webNLG,Athens International Airport | elevation m | 94,Athens International Airport is 94 metres abov...
48329,webNLG,Aaron Turner | associated band musical artist ...,"Aaron Turner, who plays Drone music, played wi..."
55274,webNLG,Asher and Mary Isabelle Richardson House | arc...,Alfred Giles was born in England and died in K...
37642,webNLG,AMC Matador | alternative name | Rambler Mata...,The AMC Matador is also known as the Rambler M...
37201,webNLG,1101 Clematis | avg speed | 16.54,The celestial body known as 1101 Clematis has ...
...,...,...,...
62258,webNLG,Atatürk Monument (İzmir) | designer | Pietro C...,"Pietro Canonica designed the Ataturk Monument,..."
60017,webNLG,Accademia di Architettura di Mendrisio | count...,Accademia di Architettura di Mendrisio in Mend...
36621,webNLG,(66063) 1998 RO1 | discoverer | Lincoln Near-E...,(66063) 1998 RO1 was discovered by Lincoln Nea...
2638,webNLG,Abel Caballero | monarch | Juan Carlos I of Sp...,Abel Caballero was in office while Juan Carlos...


## Carga del modelo y tokenizer

In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from datasets import Dataset
from transformers import Trainer, TrainingArguments

In [None]:
# Tokenizer and configuration are taken from the `bert-base-uncased` checkpoint;
# the configuration additionally asks the model to return its hidden states.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
config = AutoConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)
# NOTE(review): per the transformers documentation, `from_config` builds the
# architecture from `config` only and does not load the checkpoint's weights.
# The trailing comment shows the alternative of loading a saved model.
model = AutoModelForMaskedLM.from_config(config)  # BertForMaskedLM.from_pretrained(path)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
# Wrap the DataFrame as a `datasets.Dataset`; the DataFrame index is carried
# along as the `__index_level_0__` column.
dataset = Dataset.from_pandas(train_df)

In [None]:
dataset

Dataset({
    features: ['prefix', 'input_text', 'target_text', '__index_level_0__'],
    num_rows: 10000
})

In [None]:
def tokenize_function(example):
    """Build the '<triples>[SEP]<sentence>' training text for one row and tokenize it.

    Args:
        example: one dataset row providing 'input_text' and 'target_text' strings.

    Returns:
        The same row with four added fields: 'text' (the joined string) and the
        tokenizer outputs 'input_ids', 'token_type_ids' and 'attention_mask'.
    """
    text = example['input_text'] + '[SEP]' + example['target_text']
    example['text'] = text
    # Pad to the tokenizer's maximum length and also truncate to it, so an
    # unusually long pair cannot produce a sequence longer than the model accepts.
    tokenize_text = tokenizer(text, padding='max_length', truncation=True)
    for key in ('input_ids', 'token_type_ids', 'attention_mask'):
        example[key] = tokenize_text[key]
    return example

In [None]:
# Collator for masked-language-model training, configured with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
# Tokenize row by row and drop the original columns, so only `text` and the
# tokenizer outputs remain.
tokenized_dataset = dataset.map(tokenize_function, remove_columns=["target_text","input_text","prefix","__index_level_0__"])

  0%|          | 0/10000 [00:00<?, ?ex/s]

In [None]:
tokenized_dataset

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10000
})

In [None]:
tokenizer.decode(tokenized_dataset["input_ids"][1])

'[CLS] aaron turner | associated band musical artist | twilight ( band ) & & aaron turner | associated band musical artist | mamiffer & & aaron turner | genre | drone music [SEP] aaron turner, who plays drone music, played with the band twilight and mamiffer. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [None]:
# Training configuration: 2 epochs, batch size 8 per device, a checkpoint every
# 10,000 steps and at most two checkpoints kept on disk.
training_args = TrainingArguments(output_dir="train_model/webnlg/bert/",
                                 overwrite_output_dir=True,
                                 num_train_epochs=2,
                                 per_device_train_batch_size=8,
                                 save_steps = 10_000,
                                 save_total_limit=2)

# Pass the whole tokenized dataset rather than only its `input_ids` column.
# With bare id lists the collator can only build `input_ids`/`labels`, so the
# model would train without `attention_mask` and attend to the [PAD] tail of
# every sequence. The Trainer drops columns the model's forward() does not
# accept (here `text`) by default.
trainer = Trainer(model = model, args = training_args, data_collator = data_collator, train_dataset = tokenized_dataset)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 10000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2500


Step,Training Loss
500,2.6191
1000,2.5307
1500,2.4108
2000,2.3657
2500,2.3778




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2500, training_loss=2.460812109375, metrics={'train_runtime': 4694.1474, 'train_samples_per_second': 4.261, 'train_steps_per_second': 0.533, 'total_flos': 5264096256000000.0, 'train_loss': 2.460812109375, 'epoch': 2.0})

In [None]:
# Save the trained model to the given directory.
# NOTE(review): the recorded log below reports a different directory
# (train_model/webnlg/bert/...), so the path was presumably edited after the run.
trainer.save_model("Prototipos/Prueba datos biográficos D2T/Modelos/bert/bert_12epoch_10000examples")

Saving model checkpoint to train_model/webnlg/bert/bert_12epoch_10000examples
Configuration saved in train_model/webnlg/bert/bert_12epoch_10000examples/config.json
Model weights saved in train_model/webnlg/bert/bert_12epoch_10000examples/pytorch_model.bin


In [None]:
# Reload the saved checkpoint into `model` (`from_pretrained` is called through the existing instance).
model = model.from_pretrained("Prototipos/Prueba datos biográficos D2T/Modelos/bert/bert_12epoch_10000examples")

loading configuration file train_model/webnlg/bert/bert_12epoch_10000examples/config.json
Model config BertConfig {
  "_name_or_path": "train_model/webnlg/bert/bert_10epoch_10000examples",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file train_model/webnlg/bert/bert_12epoch_10000examples/pytorch_model.bin
All model checkpoint weights were used when initializing BertForMa

## Prueba de resultados

In [None]:
from transformers import pipeline

In [None]:
# Fill-mask pipeline over the checkpoint saved after 2 epochs, reusing the notebook's tokenizer.
mlm = pipeline('fill-mask', model="Prototipos/Prueba datos biográficos D2T/Modelos/bert/bert_2epoch_10000examples", tokenizer=tokenizer)

loading configuration file train_model/webnlg/bert_2epoch_10000examples/config.json
Model config BertConfig {
  "_name_or_path": "train_model/webnlg/bert_2epoch_10000examples",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading configuration file train_model/webnlg/bert_2epoch_10000examples/config.json
Model config BertConfig {
  "_name_or_path": "train_model/webnlg/bert_2epoch_10000ex

In [None]:
# Mask token of the pipeline's tokenizer
mask = mlm.tokenizer.mask_token

# Test sentence in the training layout "<triples> [SEP] <sentence>".
# No explicit [CLS] / trailing [SEP]: the pipeline's tokenizer adds both itself,
# so writing them in the text would feed doubled special tokens to the model.
phrase = f'Alan Shepard | birth place | New Hampshire [SEP] Alan Shepard was {mask} in New Hampshire'

result = mlm(phrase)
result

[{'score': 0.5806906819343567,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was & in new hampshire',
  'token': 1004,
  'token_str': '&'},
 {'score': 0.021990709006786346,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was united in new hampshire',
  'token': 2142,
  'token_str': 'united'},
 {'score': 0.013880938291549683,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was states in new hampshire',
  'token': 2163,
  'token_str': 'states'},
 {'score': 0.013802804052829742,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was, in new hampshire',
  'token': 1010,
  'token_str': ','},
 {'score': 0.009649216197431087,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was new in new hampshire',
  'token': 2047,
  'token_str': 'new'}]

In [None]:
# Fill-mask pipeline over the checkpoint saved after 4 epochs, reusing the notebook's tokenizer.
mlm = pipeline('fill-mask', model="Prototipos/Prueba datos biográficos D2T/Modelos/bert/bert_4epoch_10000examples", tokenizer=tokenizer)

loading configuration file train_model/webnlg/bert/bert_4epoch_10000examples/config.json
Model config BertConfig {
  "_name_or_path": "train_model/webnlg/bert/bert_4epoch_10000examples",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading configuration file train_model/webnlg/bert/bert_4epoch_10000examples/config.json
Model config BertConfig {
  "_name_or_path": "train_model/webnlg/bert

In [None]:
# Mask token of the pipeline's tokenizer
mask = mlm.tokenizer.mask_token

# Test sentence in the training layout "<triples> [SEP] <sentence>".
# No explicit [CLS] / trailing [SEP]: the pipeline's tokenizer adds both itself,
# so writing them in the text would feed doubled special tokens to the model.
phrase = f'Alan Shepard | birth place | New Hampshire [SEP] Alan Shepard was {mask} in New Hampshire'

result = mlm(phrase)
result

[{'score': 0.04941052198410034,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was | in new hampshire',
  'token': 1064,
  'token_str': '|'},
 {'score': 0.03397693857550621,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was in in new hampshire',
  'token': 1999,
  'token_str': 'in'},
 {'score': 0.02579350583255291,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was of in new hampshire',
  'token': 1997,
  'token_str': 'of'},
 {'score': 0.024289539083838463,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was, in new hampshire',
  'token': 1010,
  'token_str': ','},
 {'score': 0.024125980213284492,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was birth in new hampshire',
  'token': 4182,
  'token_str': 'birth'}]

In [None]:
# Fill-mask pipeline over the checkpoint saved after 8 epochs, reusing the notebook's tokenizer.
# NOTE(review): the recorded run raised OSError here, so `mlm` was not rebound and
# the next cell's recorded output is identical to the 4-epoch result.
mlm = pipeline('fill-mask', model="Prototipos/Prueba datos biográficos D2T/Modelos/bert/bert_8epoch_10000examples", tokenizer=tokenizer)

OSError: ignored

In [None]:
# Mask token of the pipeline's tokenizer
mask = mlm.tokenizer.mask_token

# Test sentence in the training layout "<triples> [SEP] <sentence>".
# No explicit [CLS] / trailing [SEP]: the pipeline's tokenizer adds both itself,
# so writing them in the text would feed doubled special tokens to the model.
phrase = f'Alan Shepard | birth place | New Hampshire [SEP] Alan Shepard was {mask} in New Hampshire'

result = mlm(phrase)
result

[{'score': 0.04941052198410034,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was | in new hampshire',
  'token': 1064,
  'token_str': '|'},
 {'score': 0.03397693857550621,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was in in new hampshire',
  'token': 1999,
  'token_str': 'in'},
 {'score': 0.02579350583255291,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was of in new hampshire',
  'token': 1997,
  'token_str': 'of'},
 {'score': 0.024289539083838463,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was, in new hampshire',
  'token': 1010,
  'token_str': ','},
 {'score': 0.024125980213284492,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was birth in new hampshire',
  'token': 4182,
  'token_str': 'birth'}]

In [None]:
# Fill-mask pipeline over the checkpoint saved after 12 epochs, reusing the notebook's tokenizer.
mlm = pipeline('fill-mask', model="Prototipos/Prueba datos biográficos D2T/Modelos/bert/bert_12epoch_10000examples", tokenizer=tokenizer)

loading configuration file train_model/webnlg/bert/bert_12epoch_10000examples/config.json
Model config BertConfig {
  "_name_or_path": "train_model/webnlg/bert/bert_12epoch_10000examples",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading configuration file train_model/webnlg/bert/bert_12epoch_10000examples/config.json
Model config BertConfig {
  "_name_or_path": "train_model/webnlg/b

In [None]:
# Mask token of the pipeline's tokenizer
mask = mlm.tokenizer.mask_token

# Test sentence in the training layout "<triples> [SEP] <sentence>".
# No explicit [CLS] / trailing [SEP]: the pipeline's tokenizer adds both itself,
# so writing them in the text would feed doubled special tokens to the model.
phrase = f'Alan Shepard | birth place | New Hampshire [SEP] Alan Shepard was {mask} in New Hampshire'

result = mlm(phrase)
result

[{'score': 0.6572141051292419,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was born in new hampshire',
  'token': 2141,
  'token_str': 'born'},
 {'score': 0.0822424590587616,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was birth in new hampshire',
  'token': 4182,
  'token_str': 'birth'},
 {'score': 0.029422596096992493,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was was in new hampshire',
  'token': 2001,
  'token_str': 'was'},
 {'score': 0.022341931238770485,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was died in new hampshire',
  'token': 2351,
  'token_str': 'died'},
 {'score': 0.022058486938476562,
  'sequence': 'alan shepard | birth place | new hampshire alan shepard was alma in new hampshire',
  'token': 11346,
  'token_str': 'alma'}]

## Prueba para CLM

In [None]:
# Try causal (left-to-right) generation with the masked LM: append a mask after
# the current text, keep the top prediction, repeat SIZE times.
# The prompt carries no explicit [CLS] / trailing [SEP]: the pipeline's tokenizer
# adds both itself, so writing them in the text would double the special tokens.
original_phrase = 'Alan Shepard | birth date | 1923-11-18 [SEP] Alan'
SIZE = 1  # number of tokens to generate
phrase_aux = original_phrase

for _ in range(SIZE):
  print(phrase_aux)
  result = mlm(f'{phrase_aux} {mask}')
  print(result)
  # result is sorted by score; extend the text with the best candidate
  phrase_aux += " " + result[0]['token_str']

print(phrase_aux)

[CLS] Alan Shepard | birth date | 1923-11-18 [SEP] Alan
[{'score': 0.23580650985240936, 'token': 22189, 'token_str': 'shepard', 'sequence': 'alan shepard | birth date | 1923 - 11 - 18 alan shepard'}, {'score': 0.047308988869190216, 'token': 1011, 'token_str': '-', 'sequence': 'alan shepard | birth date | 1923 - 11 - 18 alan -'}, {'score': 0.045067016035318375, 'token': 5070, 'token_str': 'alan', 'sequence': 'alan shepard | birth date | 1923 - 11 - 18 alan alan'}, {'score': 0.03515009582042694, 'token': 1007, 'token_str': ')', 'sequence': 'alan shepard | birth date | 1923 - 11 - 18 alan )'}, {'score': 0.03253121301531792, 'token': 7035, 'token_str': 'hampshire', 'sequence': 'alan shepard | birth date | 1923 - 11 - 18 alan hampshire'}]
[CLS] Alan Shepard | birth date | 1923-11-18 [SEP] Alan shepard
[{'score': 0.15950722992420197, 'token': 22189, 'token_str': 'shepard', 'sequence': 'alan shepard | birth date | 1923 - 11 - 18 alan shepard shepard'}, {'score': 0.05258994549512863, 'token': 