In [3]:
"""
MODIFIED: (efv) Use STSb-multi-mt; the code below loads the Russian ("ru") configuration, not Spanish
source: https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/sts/training_stsbenchmark.py

---

This example fine-tunes a pre-trained transformer model (BERT, RoBERTa, DistilBERT, a sentence-transformers checkpoint, etc.) on the STS benchmark. It generates sentence embeddings
that can be compared using cosine similarity to measure the semantic similarity of two sentences.

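Usage:
Run the notebook cells in order (Restart Kernel -> Run All).

The model is selected through the MODEL constant below; the command-line
argument handling of the original script (training_stsbenchmark.py) is not used in this notebook.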
"""
from torch.utils.data import DataLoader
from torch import cuda
import math
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv

from datasets import load_dataset

# Print log messages (INFO and above) to stdout through sentence-transformers' LoggingHandler.
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

# Report the GPU only when one is present; the recorded run of this notebook
# used "pytorch device: cpu", where querying the device name is not possible.
if cuda.is_available():
    logging.info(f"CUDA Device Name:{cuda.get_device_name()}")

# Any Hugging Face / transformers pre-trained model can be used here, for example
# bert-base-uncased, roberta-base or xlm-roberta-base.
# (The original script read the model name from sys.argv; in this notebook it is fixed via MODEL.)
MODEL = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
model_name = MODEL

# Training hyper-parameters and output location.
train_batch_size = 32
# NOTE(review): the outputs stored in this notebook show "Epoch 0/4" and
# "Warmup-steps: 144", i.e. they were produced with num_epochs = 4.
# Re-run the notebook or adjust this value so code and recorded outputs agree.
num_epochs = 12
# One output directory per run: "/" in the model name is replaced by "-" and a
# timestamp is appended so that repeated runs do not overwrite each other.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = f"output/mine_embedding_model_{model_name.replace('/', '-')}-{run_timestamp}"

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

# Sentence encoder = token embeddings followed by mean pooling.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Announce the start of dataset loading; the splits themselves are loaded
# further down in this cell, after the sample-conversion helper is defined.
logging.info("Read stsb-multi-mt train dataset")

def samples_from_dataset(dataset):
    """Turn the rows of ``dataset`` into a list of ``InputExample`` sentence pairs.

    Every row is read through the keys ``'sentence1'``, ``'sentence2'`` and
    ``'similarity_score'``. The two sentences become the example's ``texts``
    and the score divided by 5 becomes its ``label``.
    (Presumably the raw score lies in 0-5 so the label ends up in [0, 1] --
    confirm against the dataset card.)

    Returns:
        A list with one ``InputExample`` per row, in iteration order.
    """
    pairs = []
    for row in dataset:
        sentence_pair = [row['sentence1'], row['sentence2']]
        gold_label = row['similarity_score'] / 5
        pairs.append(InputExample(texts=sentence_pair, label=gold_label))
    return pairs

# Load the three splits of STSb-multi-mt ("ru" configuration) with one loop
# instead of three copy-pasted calls; the order train -> dev -> test is kept.
train_samples, dev_samples, test_samples = (
    samples_from_dataset(load_dataset("stsb_multi_mt", name="ru", split=split_name))
    for split_name in ("train", "dev", "test")
)

# Shuffled mini-batches of training pairs and the cosine-similarity loss
# that is optimized during fine-tuning.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# Baseline: score the model on the test split before any fine-tuning so the
# effect of training can be compared against it. The evaluator gets a name so
# its log line identifies the dataset (it was logged with an empty name before);
# write_csv=False keeps this baseline run from writing a results file.
initial_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples, name='stsb-multi-mt-test-baseline', write_csv=False)
# Last expression of the cell: the evaluator's return value is displayed as the cell output.
initial_evaluator(model)



Downloading pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s][A
Downloading pytorch_model.bin:   2%|▏         | 10.5M/471M [00:02<02:03, 3.73MB/s][A
Downloading pytorch_model.bin:   4%|▍         | 21.0M/471M [00:06<02:13, 3.37MB/s][A
Downloading pytorch_model.bin:   7%|▋         | 31.5M/471M [00:09<02:14, 3.27MB/s][A
Downloading pytorch_model.bin:   9%|▉         | 41.9M/471M [00:12<02:13, 3.22MB/s][A
Downloading pytorch_model.bin:  11%|█         | 52.4M/471M [00:16<02:10, 3.19MB/s][A
Downloading pytorch_model.bin:  13%|█▎        | 62.9M/471M [00:19<02:08, 3.18MB/s][A
Downloading pytorch_model.bin:  16%|█▌        | 73.4M/471M [00:22<02:05, 3.18MB/s][A
Downloading pytorch_model.bin:  18%|█▊        | 83.9M/471M [00:26<02:02, 3.17MB/s][A
Downloading pytorch_model.bin:  20%|██        | 94.4M/471M [00:29<01:58, 3.17MB/s][A
Downloading pytorch_model.bin:  22%|██▏       | 105M/471M [00:32<01:56, 3.15MB/s] [A
Downloading pytorch_model.bin:  25%|██▍       | 115M/471M [00:

2023-05-08 13:30:13 - Use pytorch device: cpu
2023-05-08 13:30:13 - Read stsb-multi-mt train dataset


Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [04:04<01:26, 3.16MB/s]

2023-05-08 13:30:14 - Found cached dataset stsb_multi_mt (C:/Users/maria/.cache/huggingface/datasets/stsb_multi_mt/ru/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)


Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [04:06<01:26, 3.16MB/s]

2023-05-08 13:30:16 - Found cached dataset stsb_multi_mt (C:/Users/maria/.cache/huggingface/datasets/stsb_multi_mt/ru/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)


Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [04:07<01:26, 3.16MB/s]

2023-05-08 13:30:18 - Found cached dataset stsb_multi_mt (C:/Users/maria/.cache/huggingface/datasets/stsb_multi_mt/ru/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)
2023-05-08 13:30:18 - EmbeddingSimilarityEvaluator: Evaluating the model on  dataset:


Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [04:28<01:26, 3.16MB/s]

2023-05-08 13:30:38 - Cosine-Similarity :	Pearson: 0.7867	Spearman: 0.7932
2023-05-08 13:30:38 - Manhattan-Distance:	Pearson: 0.7727	Spearman: 0.7725
2023-05-08 13:30:38 - Euclidean-Distance:	Pearson: 0.7725	Spearman: 0.7724
2023-05-08 13:30:38 - Dot-Product-Similarity:	Pearson: 0.6188	Spearman: 0.6078


0.7932018056116618

In [4]:

logging.info("Read stsb-multi-mt dev dataset")
# Evaluator built from the dev split; it is handed to fit() below and is run
# during training (the recorded log shows it after every epoch).
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Configure the training.
# Warm-up covers 10% of all optimizer steps (batches per epoch * epochs).
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info(f"Warmup-steps: {warmup_steps}")


## Train the model
# NOTE(review): evaluation_steps=1000 is larger than the 360 iterations per
# epoch shown in the recorded output, so presumably no mid-epoch evaluation is
# triggered and the dev evaluator only runs at the end of each epoch -- confirm
# against the sentence-transformers fit() documentation.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

# Evaluate the checkpoint that was actually written to model_save_path rather
# than the in-memory weights left after the last epoch: fit() saves to
# output_path when the dev score improves (save_best_model defaults to True),
# so the two can differ. A separate name keeps `model` itself untouched.
best_model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='stsb-multi-mt-test')
# Last expression of the cell: the test score is displayed as the cell output,
# and the evaluator also writes its results under model_save_path.
test_evaluator(best_model, output_path=model_save_path)


Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [04:58<01:26, 3.16MB/s]

2023-05-08 13:31:08 - Read stsb-multi-mt dev dataset
2023-05-08 13:31:08 - Warmup-steps: 144



Epoch:   0%|          | 0/4 [00:00<?, ?it/s][A

Iteration:   0%|          | 0/360 [00:00<?, ?it/s][A[A

Iteration:   0%|          | 1/360 [00:02<16:28,  2.75s/it][A[A

Iteration:   1%|          | 2/360 [00:05<15:37,  2.62s/it][A[A

Iteration:   1%|          | 3/360 [00:07<15:06,  2.54s/it][A[A

Iteration:   1%|          | 4/360 [00:09<14:00,  2.36s/it][A[A

Iteration:   1%|▏         | 5/360 [00:11<13:18,  2.25s/it][A[A

Iteration:   2%|▏         | 6/360 [00:13<12:59,  2.20s/it][A[A

Iteration:   2%|▏         | 7/360 [00:15<12:42,  2.16s/it][A[A

Iteration:   2%|▏         | 8/360 [00:17<12:32,  2.14s/it][A[A

Iteration:   2%|▎         | 9/360 [00:19<12:49,  2.19s/it][A[A

Iteration:   3%|▎         | 10/360 [00:21<12:40,  2.17s/it][A[A

Iteration:   3%|▎         | 11/360 [00:24<12:48,  2.20s/it][A[A

Iteration:   3%|▎         | 12/360 [00:26<12:36,  2.17s/it][A[A

Iteration:   4%|▎         | 13/360 [00:28<12:35,  2.18s/it][A[A

Iteration:   4%|▍         | 14

2023-05-08 13:43:57 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 0:


                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [18:13<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [18:13<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [18:13<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [18:13<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [18:13<01:26, 3.16MB/s]
Epoch:   0%|          | 0/4 [13:15<?, ?it/s][A

2023-05-08 13:44:23 - Cosine-Similarity :	Pearson: 0.8610	Spearman: 0.8589
2023-05-08 13:44:23 - Manhattan-Distance:	Pearson: 0.8226	Spearman: 0.8286
2023-05-08 13:44:23 - Euclidean-Distance:	Pearson: 0.8228	Spearman: 0.8293
2023-05-08 13:44:23 - Dot-Product-Similarity:	Pearson: 0.7536	Spearman: 0.7786
2023-05-08 13:44:23 - Save model to output/mine_embedding_model_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2-2023-05-08_13-27-39



Epoch:  25%|██▌       | 1/4 [13:16<39:50, 796.91s/it][A

Iteration:   0%|          | 0/360 [00:00<?, ?it/s][A[A

Iteration:   0%|          | 1/360 [00:02<16:24,  2.74s/it][A[A

Iteration:   1%|          | 2/360 [00:05<15:03,  2.52s/it][A[A

Iteration:   1%|          | 3/360 [00:07<14:30,  2.44s/it][A[A

Iteration:   1%|          | 4/360 [00:09<13:42,  2.31s/it][A[A

Iteration:   1%|▏         | 5/360 [00:11<13:49,  2.34s/it][A[A

Iteration:   2%|▏         | 6/360 [00:13<13:33,  2.30s/it][A[A

Iteration:   2%|▏         | 7/360 [00:16<13:50,  2.35s/it][A[A

Iteration:   2%|▏         | 8/360 [00:18<13:30,  2.30s/it][A[A

Iteration:   2%|▎         | 9/360 [00:20<12:56,  2.21s/it][A[A

Iteration:   3%|▎         | 10/360 [00:22<12:42,  2.18s/it][A[A

Iteration:   3%|▎         | 11/360 [00:24<12:47,  2.20s/it][A[A

Iteration:   3%|▎         | 12/360 [00:26<12:47,  2.20s/it][A[A

Iteration:   4%|▎         | 13/360 [00:29<13:02,  2.25s/it][A[A

Iteration:   4%|▍    

2023-05-08 13:57:12 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 1:


                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [31:27<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [31:27<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [31:27<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [31:27<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [31:27<01:26, 3.16MB/s]
Epoch:  25%|██▌       | 1/4 [26:29<39:50, 796.91s/it][A

2023-05-08 13:57:38 - Cosine-Similarity :	Pearson: 0.8620	Spearman: 0.8604
2023-05-08 13:57:38 - Manhattan-Distance:	Pearson: 0.8202	Spearman: 0.8259
2023-05-08 13:57:38 - Euclidean-Distance:	Pearson: 0.8208	Spearman: 0.8271
2023-05-08 13:57:38 - Dot-Product-Similarity:	Pearson: 0.7574	Spearman: 0.7789
2023-05-08 13:57:38 - Save model to output/mine_embedding_model_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2-2023-05-08_13-27-39



Epoch:  50%|█████     | 2/4 [26:30<26:30, 795.24s/it][A

Iteration:   0%|          | 0/360 [00:00<?, ?it/s][A[A

Iteration:   0%|          | 1/360 [00:02<15:13,  2.55s/it][A[A

Iteration:   1%|          | 2/360 [00:04<13:34,  2.28s/it][A[A

Iteration:   1%|          | 3/360 [00:06<13:41,  2.30s/it][A[A

Iteration:   1%|          | 4/360 [00:08<13:12,  2.22s/it][A[A

Iteration:   1%|▏         | 5/360 [00:11<13:03,  2.21s/it][A[A

Iteration:   2%|▏         | 6/360 [00:12<12:36,  2.14s/it][A[A

Iteration:   2%|▏         | 7/360 [00:14<12:21,  2.10s/it][A[A

Iteration:   2%|▏         | 8/360 [00:17<12:24,  2.12s/it][A[A

Iteration:   2%|▎         | 9/360 [00:18<12:00,  2.05s/it][A[A

Iteration:   3%|▎         | 10/360 [00:20<11:46,  2.02s/it][A[A

Iteration:   3%|▎         | 11/360 [00:22<12:04,  2.08s/it][A[A

Iteration:   3%|▎         | 12/360 [00:24<11:44,  2.02s/it][A[A

Iteration:   4%|▎         | 13/360 [00:27<11:57,  2.07s/it][A[A

Iteration:   4%|▍    

2023-05-08 14:10:31 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 2:


                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [44:47<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [44:47<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [44:47<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [44:47<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [44:47<01:26, 3.16MB/s]
Epoch:  50%|█████     | 2/4 [39:49<26:30, 795.24s/it][A

2023-05-08 14:10:57 - Cosine-Similarity :	Pearson: 0.8641	Spearman: 0.8629
2023-05-08 14:10:57 - Manhattan-Distance:	Pearson: 0.8214	Spearman: 0.8284
2023-05-08 14:10:57 - Euclidean-Distance:	Pearson: 0.8221	Spearman: 0.8293
2023-05-08 14:10:57 - Dot-Product-Similarity:	Pearson: 0.7565	Spearman: 0.7817
2023-05-08 14:10:58 - Save model to output/mine_embedding_model_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2-2023-05-08_13-27-39



Epoch:  75%|███████▌  | 3/4 [39:50<13:17, 797.34s/it][A

Iteration:   0%|          | 0/360 [00:00<?, ?it/s][A[A

Iteration:   0%|          | 1/360 [00:01<09:07,  1.52s/it][A[A

Iteration:   1%|          | 2/360 [00:03<09:28,  1.59s/it][A[A

Iteration:   1%|          | 3/360 [00:05<11:04,  1.86s/it][A[A

Iteration:   1%|          | 4/360 [00:07<11:10,  1.88s/it][A[A

Iteration:   1%|▏         | 5/360 [00:09<11:11,  1.89s/it][A[A

Iteration:   2%|▏         | 6/360 [00:11<11:35,  1.96s/it][A[A

Iteration:   2%|▏         | 7/360 [00:13<11:08,  1.89s/it][A[A

Iteration:   2%|▏         | 8/360 [00:16<11:54,  2.03s/it][A[A

Iteration:   2%|▎         | 9/360 [00:17<11:41,  2.00s/it][A[A

Iteration:   3%|▎         | 10/360 [00:19<11:45,  2.02s/it][A[A

Iteration:   3%|▎         | 11/360 [00:22<11:59,  2.06s/it][A[A

Iteration:   3%|▎         | 12/360 [00:24<11:42,  2.02s/it][A[A

Iteration:   4%|▎         | 13/360 [00:25<11:36,  2.01s/it][A[A

Iteration:   4%|▍    

2023-05-08 14:23:41 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 3:


                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [57:57<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [57:57<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [57:57<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [57:57<01:26, 3.16MB/s]
                                                                                 
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [57:57<01:26, 3.16MB/s]
Epoch:  75%|███████▌  | 3/4 [52:59<13:17, 797.34s/it][A

2023-05-08 14:24:07 - Cosine-Similarity :	Pearson: 0.8643	Spearman: 0.8630
2023-05-08 14:24:07 - Manhattan-Distance:	Pearson: 0.8215	Spearman: 0.8285
2023-05-08 14:24:07 - Euclidean-Distance:	Pearson: 0.8222	Spearman: 0.8296
2023-05-08 14:24:07 - Dot-Product-Similarity:	Pearson: 0.7561	Spearman: 0.7818
2023-05-08 14:24:07 - Save model to output/mine_embedding_model_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2-2023-05-08_13-27-39



Epoch: 100%|██████████| 4/4 [53:00<00:00, 795.02s/it][A
Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [57:58<01:26, 3.16MB/s]

2023-05-08 14:24:08 - EmbeddingSimilarityEvaluator: Evaluating the model on stsb-multi-mt-test dataset:


Downloading pytorch_model.bin:  42%|████▏     | 199M/471M [58:18<01:26, 3.16MB/s]

2023-05-08 14:24:29 - Cosine-Similarity :	Pearson: 0.8232	Spearman: 0.8214
2023-05-08 14:24:29 - Manhattan-Distance:	Pearson: 0.7980	Spearman: 0.7977
2023-05-08 14:24:29 - Euclidean-Distance:	Pearson: 0.7978	Spearman: 0.7981
2023-05-08 14:24:29 - Dot-Product-Similarity:	Pearson: 0.7037	Spearman: 0.7115


0.821449405018268