## Treinando BERT do início (from scratch)

In [1]:
!pip install torch
!pip install tokenizers
!pip install transformers
!pip install accelerate>=0.21.1

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import tokenizers

from transformers import EarlyStoppingCallback
from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer, LineByLineTextDataset
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling

In [3]:
# List the files available in the data directory.
!ls '../data'

D_p-pts.pickle	exp1-trj.h5	exp1-trj.t   README.md	   train.trg  vocab.txt
D_q-pts.pickle	exp1-trj.label	exp2-trj.h5  saved_models  val.src
Dq-pts.pickle	exp1-trj.pts	porto.csv    train.src	   val.trg


In [4]:
# Compute the vocabulary size: the number of distinct cell ids in the training file.
# Ids are accumulated directly into a set, so memory grows with the number of
# distinct ids instead of with the total number of tokens in the corpus.
unique_cells = set()
with open('../data/train.src') as f:
    for line in f:
        # split() without arguments splits on any whitespace and ignores the
        # trailing newline; int() raises ValueError on a non-integer token.
        unique_cells.update(int(cel) for cel in line.split())

size_vocab = len(unique_cells)
print('vocabulary size:', size_vocab)

del unique_cells  # free RAM

vocabulary size: 18827


In [5]:
!head -2 '../data/train.src'

506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547
506 4846 506 112 144 148 250 258 384 106 15 4 1179 93 165 160 211 300 1245 547


In [6]:
# Count the lines in the training file.
!wc -l '../data/train.src'

13369584 ../data/train.src


In [108]:
# Train the WordPiece tokenizer on the training trajectories and write its
# vocabulary file into the data directory.
train_file = "../data/train.src"

bwpt = tokenizers.BertWordPieceTokenizer(vocab=None)
bwpt.train(files=[train_file], vocab_size=size_vocab, min_frequency=1, limit_alphabet=1000)

# Last expression of the cell: the list of written files is shown as the output.
bwpt.save_model('../data/')






['../data/vocab.txt']

In [7]:
# Load the tokenizer from the trained vocabulary file.
vocab_file_dir = '../data/vocab.txt'

# Build the tokenizer straight from the vocabulary file: passing a single file
# path to from_pretrained() is deprecated in transformers.
tokenizer = BertTokenizer(vocab_file=vocab_file_dir)

# Sanity check: tokenize a short sample trajectory and show the resulting tokens.
sentence = '506 112 144 148 250 258 384'

encoded_input = tokenizer.tokenize(sentence)
print(encoded_input)

['506', '112', '144', '148', '250', '258', '384']




In [8]:
%%time
# Carregando o df de treino
'''
transformers has a predefined class LineByLineTextDataset()
which reads your text line by line and converts them to tokens
'''

train_dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = "../data/train.src",
    block_size = 128  # maximum sequence length
)

print('No. of lines: ', len(train_dataset)) # No of lines in your datset



No. of lines:  13369584
CPU times: user 1h 17min 31s, sys: 14.5 s, total: 1h 17min 45s
Wall time: 1h 17min 45s


In [9]:
%%time
# Carregando o df de valida√ß√£o
'''
transformers has a predefined class LineByLineTextDataset()
which reads your text line by line and converts them to tokens
'''

eval_dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = "../data/val.src",
    block_size = 128  # maximum sequence length
)

print('No. of lines: ', len(eval_dataset)) # No of lines in your datset

No. of lines:  134016
CPU times: user 48.4 s, sys: 160 ms, total: 48.6 s
Wall time: 48.5 s


In [10]:
# Model configuration. The embedding table is sized from the tokenizer that
# feeds the model, so every token id the tokenizer can emit has an embedding row
# even if the vocabulary file on disk differs from a separately computed count.
config = BertConfig(
    vocab_size=len(tokenizer),
    hidden_size=2048,  # embedding dimension (default: 768); must be a multiple of num_attention_heads
    num_hidden_layers=6,
    num_attention_heads=16,
    max_position_embeddings=512,
)

# Freshly initialised (untrained) masked-language model built from the config.
model = BertForMaskedLM(config)
print('No of parameters: ', model.num_parameters())

# Collator configured for masked-language modelling with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

No of parameters:  220123531


In [11]:
# Early-stopping settings:
#   patience  - number of consecutive evaluations without improvement
#   threshold - minimum change in the metric for it to count as an improvement
# NOTE(review): the validation-loss improvements logged by the training run in
# this notebook are all smaller than 0.02, so this threshold may be ending
# training prematurely - confirm the intended value.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_threshold=0.02,
    early_stopping_patience=3,
)

In [12]:
# Training configuration.
# Evaluation cadence and the best-model metric are spelled out instead of being
# left to library defaults: load_best_model_at_end needs checkpoints to line up
# with evaluations, so eval_steps is pinned to the same value as save_steps
# rather than silently following logging_steps.
training_args = TrainingArguments(
    output_dir='../data/saved_models/BERT/',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    # Evaluation / checkpointing schedule (in optimizer steps).
    evaluation_strategy="steps",
    eval_steps=5000,
    logging_steps=5000,
    save_steps=5000,
    save_total_limit=3,
    # Best-model tracking, also used by the early-stopping callback.
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

In [13]:
# Assemble the trainer: model, arguments, datasets, the MLM collator and the
# early-stopping callback.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],  # attach the early-stopping callback
)

In [14]:
%%time
# Run training, then write the model held by the trainer to the best_model directory.
trainer.train()
trainer.save_model('../data/saved_models/BERT/best_model')



Step,Training Loss,Validation Loss
5000,8.4116,8.344678
10000,8.3372,8.326161
15000,8.3283,8.316226
20000,8.3189,8.310905


CPU times: user 2h 53min 15s, sys: 11.7 s, total: 2h 53min 27s
Wall time: 2h 53min 23s
