# SimCSE with SBERT

In [1]:
# !pip install sentence-transformers

In [2]:
import torch
torch.cuda.is_available()

True

# Dataset

In [3]:
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader

dataset = load_dataset("tweet_eval", "sentiment")

Reusing dataset tweet_eval (/home/rak/.cache/huggingface/datasets/tweet_eval/emoji/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

# show data
def show_random_elements(dataset, num_examples=20):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [5]:
show_random_elements(dataset['train'])

Unnamed: 0,text,label
0,Long lines mean good food. Tacos overload in Grand Central Market. #LA2016 #nomnom…,❤
1,"Aonair - hands down amazing wine, amazing vineyard and of course, the owner- Grant Long Jr. ️ him.…",❤
2,"""Who Likes Peanuts"" Good Morning Mi Gente Linda Y Fea, Today Is A Beautiful Morning Because…",😂
3,Exploring the Hollywood sign and all around there today! More on my Insta story #TheOne…,😎
4,Yahsha and his baby cousin Kaniyah meeting for the first time. They created a bond instantly @user,💕
5,Found this picture of us. XOXO ️ #blackandmild #artists #friends #love @ Casa 0101 Theater,❤
6,Cotton candy California sunset . #SUNSET #california #cottoncandyclouds #calilife…,😍
7,#tbt to the day I made it on TV doing what I love ️ #BeaverNation @ Oregon State University,❤
8,️Sun-Kissed #balayage for #irelandbaldwin using @user #bondultim8 and #lightmaster…,☀
9,Such an amazing live performance! P A N I C A T T H E D I S C O ️ @ Mayan Nightclub,❤


In [6]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, TripletEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout


#You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = 'bert-base-uncased'

model_save_path = 'output/training_simcse-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=64)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


2022-05-18 13:26:43 - Use pytorch device: cuda


For SimCSE, we create our train_samples with InputExamples that consists of two times the same sentences, i.e.
```
train_samples = [InputExample(texts=["sent1", "sent1"]), InputExample(texts=["sent2", "sent2"]), ...]
````


In [7]:
import tqdm

def get_triplets_input_example(dataset): 
    triplets_input_examples = []
   
    dataset_list = pd.DataFrame(dataset).values.tolist()
    for index, anchor_data in enumerate(dataset):
        positive_dataset = dataset_list.copy()
        negative_dataset = dataset_list.copy()

        positive_dataset.pop(index)
        positive_list = [x[0] for x in positive_dataset if x[1] == anchor_data['label']]
        positive_item = random.choice(positive_list)

        negative_dataset.pop(index)
        negative_list = [x[0] for x in negative_dataset if x[1] != anchor_data['label']]
        negative_item = random.choice(negative_list)
        
        triplets_input_examples.append(InputExample(texts = [anchor_data['text'], positive_item, negative_item], label = anchor_data['label']))

    return triplets_input_examples

def get_input_example(dataset):
    input_examples = []
   
    for index, anchor_data in enumerate(dataset):
        input_examples.append(InputExample(texts = [anchor_data['text']], label = anchor_data['label']))

    return input_examples

In [8]:
# Read the AllNLI.tsv.gz file and create the training dataset
# We just pass sentence1 two times to InputExample and use MultipleNegativesRankingLoss
train_samples = []
dev_samples = []

print(type(dataset['train']))
logging.info("get train triplets input")
train_samples = get_input_example(dataset['train'])
logging.info("get validation triplets input")
eval_val_examples  = get_triplets_input_example(dataset['validation'])
logging.info("setting triplet evaluator")
dev_evaluator = TripletEvaluator.from_input_examples(eval_val_examples, batch_size=8, name='my_dev')
# dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(eval_val_examples, batch_size=8, name='my_dev')

<class 'datasets.arrow_dataset.Dataset'>
2022-05-18 13:26:43 - get train triplets input
2022-05-18 13:26:45 - get validation triplets input
2022-05-18 13:26:49 - setting triplet evaluator


In [9]:
print(len(dataset['train']))
print(len(dataset['test']))
print(len(dataset['validation']))
print(dataset['validation'][0])

45000
50000
5000
{'text': 'A little throwback with my favourite person @ Water Wall', 'label': 0}


# Train

As loss, we use: MultipleNegativesRankingLoss

Here, texts[0] and texts[1] are considered as positive pair, while all others are negatives in a batch

In [10]:
from transformers.optimization import AdamW

# Configure the training
train_batch_size = 32
num_epochs = 5

# Use MultipleNegativesRankingLoss for SimCSE
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.BatchAllTripletLoss(model)


warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

logging.info("Performance before training")
dev_evaluator(model)

#optimizer
optimizer = AdamW(model.parameters(),
                    lr=5e-5,
                    eps=1e-8)

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=100,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
        #   optimizer_class=  ,
          )


2022-05-18 13:26:49 - Warmup-steps: 704
2022-05-18 13:26:49 - Performance before training
2022-05-18 13:26:49 - TripletEvaluator: Evaluating the model on my_dev dataset:
2022-05-18 13:27:06 - Accuracy Cosine Distance:   	53.24
2022-05-18 13:27:06 - Accuracy Manhattan Distance:	52.84
2022-05-18 13:27:06 - Accuracy Euclidean Distance:	53.00



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1407 [00:00<?, ?it/s]

2022-05-18 13:27:12 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 0 after 100 steps:
2022-05-18 13:27:25 - Accuracy Cosine Distance:   	53.24
2022-05-18 13:27:25 - Accuracy Manhattan Distance:	52.80
2022-05-18 13:27:25 - Accuracy Euclidean Distance:	52.68

2022-05-18 13:27:25 - Save model to output/training_simcse-2022-05-18_13-26-37
2022-05-18 13:27:32 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 0 after 200 steps:
2022-05-18 13:27:45 - Accuracy Cosine Distance:   	53.50
2022-05-18 13:27:45 - Accuracy Manhattan Distance:	54.26
2022-05-18 13:27:45 - Accuracy Euclidean Distance:	53.92

2022-05-18 13:27:46 - Save model to output/training_simcse-2022-05-18_13-26-37
2022-05-18 13:27:52 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 0 after 300 steps:
2022-05-18 13:28:06 - Accuracy Cosine Distance:   	56.14
2022-05-18 13:28:06 - Accuracy Manhattan Distance:	55.92
2022-05-18 13:28:06 - Accuracy Euclidean Distance:	56.04

2022-

Iteration:   0%|          | 0/1407 [00:00<?, ?it/s]

2022-05-18 13:32:14 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 1 after 100 steps:
2022-05-18 13:32:28 - Accuracy Cosine Distance:   	63.14
2022-05-18 13:32:28 - Accuracy Manhattan Distance:	62.28
2022-05-18 13:32:28 - Accuracy Euclidean Distance:	62.46

2022-05-18 13:32:28 - Save model to output/training_simcse-2022-05-18_13-26-37
2022-05-18 13:32:34 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 1 after 200 steps:
2022-05-18 13:32:49 - Accuracy Cosine Distance:   	63.52
2022-05-18 13:32:49 - Accuracy Manhattan Distance:	62.72
2022-05-18 13:32:49 - Accuracy Euclidean Distance:	62.76

2022-05-18 13:32:49 - Save model to output/training_simcse-2022-05-18_13-26-37
2022-05-18 13:32:55 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 1 after 300 steps:
2022-05-18 13:33:09 - Accuracy Cosine Distance:   	63.26
2022-05-18 13:33:09 - Accuracy Manhattan Distance:	62.26
2022-05-18 13:33:09 - Accuracy Euclidean Distance:	62.62

2022-

Iteration:   0%|          | 0/1407 [00:00<?, ?it/s]

2022-05-18 13:37:14 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 2 after 100 steps:
2022-05-18 13:37:29 - Accuracy Cosine Distance:   	63.92
2022-05-18 13:37:29 - Accuracy Manhattan Distance:	62.72
2022-05-18 13:37:29 - Accuracy Euclidean Distance:	62.72

2022-05-18 13:37:34 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 2 after 200 steps:
2022-05-18 13:37:49 - Accuracy Cosine Distance:   	64.64
2022-05-18 13:37:49 - Accuracy Manhattan Distance:	63.50
2022-05-18 13:37:49 - Accuracy Euclidean Distance:	63.68

2022-05-18 13:37:49 - Save model to output/training_simcse-2022-05-18_13-26-37
2022-05-18 13:37:55 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 2 after 300 steps:
2022-05-18 13:38:09 - Accuracy Cosine Distance:   	64.02
2022-05-18 13:38:09 - Accuracy Manhattan Distance:	63.44
2022-05-18 13:38:09 - Accuracy Euclidean Distance:	63.24

2022-05-18 13:38:15 - TripletEvaluator: Evaluating the model on my_dev dataset in ep

Iteration:   0%|          | 0/1407 [00:00<?, ?it/s]

2022-05-18 13:42:11 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 3 after 100 steps:
2022-05-18 13:42:26 - Accuracy Cosine Distance:   	64.00
2022-05-18 13:42:26 - Accuracy Manhattan Distance:	64.10
2022-05-18 13:42:26 - Accuracy Euclidean Distance:	64.04

2022-05-18 13:42:32 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 3 after 200 steps:
2022-05-18 13:42:46 - Accuracy Cosine Distance:   	64.02
2022-05-18 13:42:46 - Accuracy Manhattan Distance:	63.52
2022-05-18 13:42:46 - Accuracy Euclidean Distance:	63.72

2022-05-18 13:42:52 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 3 after 300 steps:
2022-05-18 13:43:06 - Accuracy Cosine Distance:   	64.02
2022-05-18 13:43:06 - Accuracy Manhattan Distance:	63.56
2022-05-18 13:43:06 - Accuracy Euclidean Distance:	63.68

2022-05-18 13:43:12 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 3 after 400 steps:
2022-05-18 13:43:26 - Accuracy Cosine Distance:   	64.08

Iteration:   0%|          | 0/1407 [00:00<?, ?it/s]

2022-05-18 13:47:11 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 4 after 100 steps:
2022-05-18 13:47:25 - Accuracy Cosine Distance:   	65.02
2022-05-18 13:47:25 - Accuracy Manhattan Distance:	64.80
2022-05-18 13:47:25 - Accuracy Euclidean Distance:	64.96

2022-05-18 13:47:25 - Save model to output/training_simcse-2022-05-18_13-26-37
2022-05-18 13:47:31 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 4 after 200 steps:
2022-05-18 13:47:45 - Accuracy Cosine Distance:   	64.80
2022-05-18 13:47:45 - Accuracy Manhattan Distance:	64.86
2022-05-18 13:47:45 - Accuracy Euclidean Distance:	64.78

2022-05-18 13:47:51 - TripletEvaluator: Evaluating the model on my_dev dataset in epoch 4 after 300 steps:
2022-05-18 13:48:05 - Accuracy Cosine Distance:   	64.30
2022-05-18 13:48:05 - Accuracy Manhattan Distance:	64.38
2022-05-18 13:48:05 - Accuracy Euclidean Distance:	64.22

2022-05-18 13:48:11 - TripletEvaluator: Evaluating the model on my_dev dataset in ep

In [11]:


##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

model = SentenceTransformer(model_save_path)
logging.info("get test triplets input")
eval_test_examples  = get_triplets_input_example(dataset['test'])
# test_evaluator = TripletEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='sts-test')
test_evaluator = TripletEvaluator.from_input_examples(eval_test_examples, batch_size=train_batch_size, name='my-test')
test_evaluator(model, output_path=model_save_path)

2022-05-18 13:52:06 - Load pretrained SentenceTransformer: output/training_simcse-2022-05-18_13-26-37
2022-05-18 13:52:08 - Use pytorch device: cuda
2022-05-18 13:52:08 - get test triplets input
2022-05-18 14:02:07 - TripletEvaluator: Evaluating the model on my-test dataset:
2022-05-18 14:03:02 - Accuracy Cosine Distance:   	64.85
2022-05-18 14:03:02 - Accuracy Manhattan Distance:	64.74
2022-05-18 14:03:02 - Accuracy Euclidean Distance:	64.81



0.6485