## Treinando BERT do início (from scratch)

In [1]:
!pip install torch
!pip install tokenizers
!pip install transformers
!pip install accelerate>=0.21.1

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import tokenizers

from transformers import EarlyStoppingCallback
from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer, LineByLineTextDataset
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling

In [3]:
!ls '../data'

df_tdrive_ALL-taxis.csv  exp1-trj.h5	 tdrive.csv		   val.mta
D_p-pts.pickle		 exp1-trj.label  tdrive_formato_t2vec.csv  val.src
D_q-pts.pickle		 exp1-trj.t	 train.mta		   val.trg
Dq-pts.pickle		 mv.csv		 train.src		   vocab.txt
exp1-querydb.h5		 saved_models	 train.trg


In [4]:
# Compute the vocabulary size: the number of distinct cell ids in the training file.
# Ids are folded into a set line by line, so memory grows with the number of distinct
# ids rather than with the total number of tokens in the file.
unique_cells = set()
with open('../data/train.src') as f:
    for line in f:
        # Tokens are whitespace-separated; int() raises on any non-numeric token.
        unique_cells.update(int(cel) for cel in line.split())

size_vocab = len(unique_cells)
print('vocabulary size:', size_vocab)

del unique_cells  # free RAM: only the count is used from here on

vocabulary size: 19690


In [5]:
!head -5 '../data/train.src'

17321 10721 11693
17321 10721 11693
17466 10721
17321 10721 17321 10721 11693
17321 10721 11693


In [6]:
!wc -lc '../data/train.src'

  6352096 289935842 ../data/train.src


In [12]:
# Train a WordPiece tokenizer on the training trajectories and write its vocabulary to disk.
# NOTE(review): `vocab_size` is the raw number of distinct cell ids; presumably the trained
# vocabulary also has to hold the special tokens and sub-word pieces, so some cell ids may
# not get a token of their own — confirm against the generated vocab.txt.
train_file = "../data/train.src"

bwpt = tokenizers.BertWordPieceTokenizer(vocab=None)
bwpt.train(files=[train_file], vocab_size=size_vocab, min_frequency=1, limit_alphabet=1000)

# save_model returns the list of files it wrote, which is shown as the cell output.
bwpt.save_model('../data/')






['../data/vocab.txt']

In [7]:
# Load the trained vocabulary into a BERT tokenizer and sanity-check it on one trajectory.
vocab_file_dir = '../data/vocab.txt'  # path to the vocabulary file itself, despite the name
tokenizer = BertTokenizer.from_pretrained(vocab_file_dir)

sentence = '17321 10721 17321 10721 11693'
encoded_input = tokenizer.tokenize(sentence)

# Each cell id of the sample is expected to come back as a single token.
print(encoded_input)

['17321', '10721', '17321', '10721', '11693']




In [8]:
%%time
# Carregando o df de treino
'''
transformers has a predefined class LineByLineTextDataset()
which reads your text line by line and converts them to tokens
'''

train_dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = "../data/train.src",
    block_size = 128  # maximum sequence length
)

print('No. of lines: ', len(train_dataset)) # No of lines in your datset



No. of lines:  6352096
CPU times: user 26min 15s, sys: 6.89 s, total: 26min 22s
Wall time: 26min 22s


In [66]:
%%time
# Carregando o df de validação
'''
transformers has a predefined class LineByLineTextDataset()
which reads your text line by line and converts them to tokens
'''

eval_dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = "../data/val2.src", # trajs com len >= 4 para evitar valores "nan" na Validation Loss
    block_size = 128  # maximum sequence length
)

print('No. of lines: ', len(eval_dataset)) # No of lines in your datset

No. of lines:  60858
CPU times: user 14.7 s, sys: 48 ms, total: 14.7 s
Wall time: 14.7 s


In [73]:
# Model configuration.
# The vocabulary size is taken from the tokenizer that feeds the model, so the embedding
# matrix and the MLM head have a row for every id the tokenizer can emit (cell tokens plus
# special tokens such as [MASK]). The raw count of distinct cells gives no such guarantee:
# any tokenizer id at or above it would index outside the embedding table.
config = BertConfig(
    vocab_size=len(tokenizer),
    hidden_size=768,  # embedding dimension (default 768); must be a multiple of num_attention_heads
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=512,  # author's note: 1024 — presumably an alternative setting
)

model = BertForMaskedLM(config)
print('No of parameters: ', model.num_parameters())

# Collator for the masked-language-modelling objective, with a masking probability of 0.30.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.30)

No of parameters:  58657258


In [74]:
# Defindo os valores do "early stopping"
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # Número de avaliações consecutivas sem melhora
    early_stopping_threshold=0.02,  # Mudança mínima na métrica para considerar melhora
)

In [82]:
# Configurações de treino:
training_args = TrainingArguments(
    output_dir='../data/saved_models/BERT/',
    load_best_model_at_end=True,
    overwrite_output_dir=True,
    num_train_epochs=10,
    evaluation_strategy="steps",
    per_device_train_batch_size=32,
    logging_steps=5000,
    save_steps=5000,
    save_total_limit=5,
)

In [83]:
# Assemble the Trainer from the model, the training arguments, both datasets,
# the MLM data collator and the early-stopping callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
)

In [84]:
%%time
# Run training, then write the model held by the trainer to a fixed directory.
# NOTE(review): with load_best_model_at_end=True in the training arguments, the saved model
# is presumably the best checkpoint rather than the last one — confirm.
trainer.train()
trainer.save_model('../data/saved_models/BERT/best_model')



Step,Training Loss,Validation Loss
5000,6.5828,5.945213
10000,5.5295,5.135664
15000,5.0567,4.734166
20000,4.8158,4.4688
25000,4.6836,4.33044
30000,4.5617,4.201778
35000,4.4986,4.114286
40000,4.4237,4.030437
45000,4.3827,4.008302
50000,4.3428,3.900366


CPU times: user 3h 32min 36s, sys: 1min 6s, total: 3h 33min 43s
Wall time: 5h 11min 25s
