
## Configurações do Ambiente:

In [1]:
import platform
print(platform.python_version())

3.7.12


In [2]:
import torch
print(torch.__version__)

1.13.1+cu117


In [3]:
torch.cuda.is_available()

True

In [4]:
import transformers
print(transformers.__version__)

4.30.2



## Treinando SBERT do início (from scratch)

In [6]:
!pip install -U sentence-transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     |████████████████████████████████| 85 kB 5.2 MB/s             
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting torchvision
  Downloading torchvision-0.14.1-cp37-cp37m-manylinux1_x86_64.whl (24.2 MB)
     |████████████████████████████████| 24.2 MB 22.0 MB/s            
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
     |████████████████████████████████| 1.3 MB 60.3 MB/s            
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125926 sha256=ddc346bc77be3ab488cc3e0d89ee4d5eca48b28ccc6c897f6ee4135189328108
  Stored in directory: /home

In [5]:
import sys

from collections import deque
from torch.utils.data import DataLoader
from sentence_transformers import losses
from sentence_transformers import InputExample
from sentence_transformers import SentenceTransformer, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

## Iniciando a rede SBERT:

In [6]:
ls '../data/saved_models/BERT/'

[0m[01;34mbest_model[0m/        [01;34mcheckpoint-15000[0m/  [01;34mcheckpoint-5000[0m/
[01;34mcheckpoint-10000[0m/  [01;34mcheckpoint-20000[0m/


In [7]:
# Step 1: Usando o modelo BERT treinado do zero:
word_embedding_model = models.Transformer('../data/saved_models/BERT/best_model/')

# Step 2: use a pool function over the token embeddings
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Join steps 1 and 2 using the modules argument
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at ../data/saved_models/BERT/best_model/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../data/saved_models/BERT/best_model/ and are newly initialized: ['bert.pooler.dense.bi

In [8]:
pooling_model

Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})

In [9]:
total_parameters = sum(p.numel() for p in model.parameters())
print("Total de Parâmetros:", total_parameters)

Total de Parâmetros: 57973248


## Carregando os dados:

In [10]:
ls '../data/'

exp1-trj.t  [0m[01;34msaved_models[0m/  train.trg  val.trg
README.md   train.src      val.src    vocab.txt


In [11]:
!head -n 2 '../data/train.src'

506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547
506 4846 506 112 144 148 250 258 384 106 15 4 1179 93 165 160 211 300 1245 547


In [12]:
!head -n 2 '../data/train.trg'

506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547
506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547


In [13]:
train_src = []
with open('../data/train.src', 'r') as file:
    for line in file:
        train_src.append(line.strip().replace("'", ''))

In [14]:
train_trg = []
with open('../data/train.trg', 'r') as file:
    for line in file:
        train_trg.append(line.strip().replace("'", ''))

In [15]:
train_src[:16]

['506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',
 '506 4846 506 112 144 148 250 258 384 106 15 4 1179 93 165 160 211 300 1245 547',
 '506 112 144 148 250 258 384 106 116 106 15 4 5112 1179 93 165 160 211 300 1245 547',
 '90 506 90 297 144 148 380 384 106 373 15 4 71 1179 9 165 1834 160 211 880 1245 4924 547',
 '506 112 144 250 106 4 71 1179 93 165 160 211 1245 547',
 '506 669 506 112 374 250 106 15 71 58 93 165 160 211 4875 547',
 '506 90 506 297 144 250 106 4 71 1179 93 165 160 211 1245 547',
 '506 669 90 112 374 250 106 116 106 4 71 1179 93 141 301 2334 1245 4875 547',
 '506 112 144 250 258 384 106 15 4 93 165 160 211 547',
 '506 112 374 250 258 384 106 15 4 93 165 160 211 547 4924',
 '669 112 144 250 258 384 106 15 4 93 165 160 300 547 4924',
 '90 112 21 250 258 1938 106 116 15 4 93 165 160 211 4875 547',
 '506 112 384 15 4 1179 93 165 160 547',
 '506 24 384 15 4 1179 93 165 160 547',
 '506 112 384 15 4 6 93 165 160 1245',
 '506 297 258 15 4 1179 93 165 1

In [16]:
train_trg[:16]

['506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',
 '506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',
 '506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',
 '506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',
 '506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',
 '506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',
 '506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',
 '506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',
 '506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',
 '506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',
 '506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',
 '506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',
 '506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547',

In [17]:
print(len(train_src))
print(len(train_trg))

13369584
13369584


In [18]:
# Função para taggear as similaridades entre os pares:
def label_simi(count):
    if count == 0:
        return 1.0
    if count == 1:
        return 0.98
    if count == 2:
        return 0.96
    if count == 3:
        return 0.94
    if count == 4:
        return 0.92
    if count == 5:
        return 0.90
    if count == 6:
        return 0.88
    if count == 7:
        return 0.86
    if count == 8:
        return 0.84
    if count == 9:
        return 0.82
    if count == 10:
        return 0.80
    if count == 11:
        return 0.78
    if count == 12:
        return 0.76
    if count == 13:
        return 0.74
    if count == 14:
        return 0.72
    if count == 15:
        return 0.70

In [19]:
# Criando os pares de treino (label, traj_source, traj_target)
train_examples = []
count = 0
for i in range(len(train_src)):
    train_examples.append(InputExample(texts=[train_src[i], train_trg[i]], label=label_simi(count)))
    count += 1
    
    if count == 16:
        count = 0

In [20]:
print(train_examples[0])

<InputExample> label: 1.0, texts: 506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547; 506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547


In [21]:
print(train_examples[15])

<InputExample> label: 0.7, texts: 506 297 258 15 4 1179 93 165 160 547; 506 112 144 148 250 258 384 106 15 4 71 1179 93 165 160 211 300 1245 547


In [22]:
# We wrap our training dataset into a Pytorch Dataloader to shuffle examples and get batch sizes.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)

In [23]:
# Loss functions for training a Sentence Transformers model
train_loss = losses.MultipleNegativesRankingLoss(model=model)

## Train a Sentence Transformer model

In [24]:
print("Qtde de dados de treino:", len(train_examples))
print("Qtde de dados de treino em 1 batch:", int(len(train_dataloader)))

Qtde de dados de treino: 13369584
Qtde de dados de treino em 1 batch: 417800


In [25]:
num_epochs = 3

warmup_steps = int(len(train_dataloader) * num_epochs * 0.01)
print("warmup_steps:", warmup_steps)

warmup_steps: 12534


In [26]:
!ls '../data/'

exp1-trj.t  saved_models  train.trg  val.trg
README.md   train.src	  val.src    vocab.txt


In [27]:
!head -n 2 '../data/val.src'

797 779 3631 1536 1527 475 842 492 170 132 1303 3366 2627 1945 415 1910 1896 471 233 1566 2225 5025 533 361 5008 3160 4553 4098 6123 7930 4197 4524 4148 48 8
797 3631 1527 475 1457 842 492 170 132 1303 3366 2627 1945 415 1910 1896 707 471 233 1566 2225 5025 533 361 5008 3160 4553 4098 6123 7930 4197 4524 4148 48 8


In [28]:
!head -n 2 '../data/val.trg'

797 779 3631 1536 1527 475 842 492 170 132 1303 3366 2627 1945 415 1910 1896 471 233 1566 2225 5025 533 361 5008 3160 4553 4098 6123 7930 4197 4524 4148 48 8
797 779 3631 1536 1527 475 842 492 170 132 1303 3366 2627 1945 415 1910 1896 471 233 1566 2225 5025 533 361 5008 3160 4553 4098 6123 7930 4197 4524 4148 48 8


In [29]:
val_src = []
with open('../data/val.src', 'r') as file:
    for line in file:
        val_src.append(line.strip().replace("'", ''))

In [30]:
val_trg = []
with open('../data/val.trg', 'r') as file:
    for line in file:
        val_trg.append(line.strip().replace("'", ''))

In [31]:
# Definindo o conjunto de validação com pares de sentenças e rótulos de similaridade
validation_examples = []
val_lbl = []
count = 0
for i in range(len(val_src)):   
    #validation_examples = [InputExample(texts=[val_src[i], val_trg[i], str(label_simi(count))])]
    validation_examples.append(InputExample(texts=[val_src[i], val_trg[i]], label=label_simi(count)))
    val_lbl.append(label_simi(count))
    count += 1
    if count == 16:
        count = 0


In [32]:
print(validation_examples[0])

<InputExample> label: 1.0, texts: 797 779 3631 1536 1527 475 842 492 170 132 1303 3366 2627 1945 415 1910 1896 471 233 1566 2225 5025 533 361 5008 3160 4553 4098 6123 7930 4197 4524 4148 48 8; 797 779 3631 1536 1527 475 842 492 170 132 1303 3366 2627 1945 415 1910 1896 471 233 1566 2225 5025 533 361 5008 3160 4553 4098 6123 7930 4197 4524 4148 48 8


In [33]:
print(validation_examples[15])

<InputExample> label: 0.7, texts: 797 700 5914 1527 475 842 492 1945 415 471 1328 4427 533 249 361 4098 7930 4524 4148 8; 797 779 3631 1536 1527 475 842 492 170 132 1303 3366 2627 1945 415 1910 1896 471 233 1566 2225 5025 533 361 5008 3160 4553 4098 6123 7930 4197 4524 4148 48 8


In [34]:
# Inicializando o avaliador de similaridade de incorporação com o conjunto de validação
evaluator = EmbeddingSimilarityEvaluator(val_src, val_trg, val_lbl)

In [35]:
# Inicializando uma fila para rastrear os últimos três scores
score_history = deque(maxlen=3)

In [36]:
# Função de retorno de chamada personalizada
def evaluation_callback(score, epoch, steps):
    print(f"Epoch: {epoch}, Step: {steps}, Score: {score}")
    
    score_history.append(score)
    
    if len(score_history) == 3:
        # Verifica se a diferença entre os últimos três scores são menores ou iguais a 0.001
        dif1 = score_history[0] - score_history[1] 
        dif2 = score_history[1] - score_history[2]
        if (dif1 <= 0.001 and dif2 <= 0.001):
            sys.exit(f"Best model saved in checkpoint {steps-2000}!")

In [37]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          evaluator=evaluator,
          evaluation_steps=1000,
          save_best_model=True,
          warmup_steps=warmup_steps,
          checkpoint_path='../data/saved_models/SBERT/',
          checkpoint_save_steps=1000,
          checkpoint_save_total_limit=10,
          callback=evaluation_callback,
          show_progress_bar=True)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/417800 [00:00<?, ?it/s]

Epoch: 0, Step: 1000, Score: 0.8145543245089251
Epoch: 0, Step: 2000, Score: 0.803974880663111
Epoch: 0, Step: 3000, Score: 0.7907480593542794
Epoch: 0, Step: 4000, Score: 0.7521753771178866
Epoch: 0, Step: 5000, Score: 0.7191932882815643
Epoch: 0, Step: 6000, Score: 0.7086832831400212
Epoch: 0, Step: 7000, Score: 0.6967796522916955
Epoch: 0, Step: 8000, Score: 0.7061234252761772
Epoch: 0, Step: 9000, Score: 0.7000015165948558
Epoch: 0, Step: 10000, Score: 0.6990967471524842
Epoch: 0, Step: 11000, Score: 0.7069309666494124


SystemExit: Best model saved in checkpoint 9000!

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
