# SimCSE with SBERT

In [1]:
# !pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp38-cp38-win_amd64.whl (1.1 MB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp38-cp38-win_amd64.whl (3.3 MB)
Collecting regex!=2019.12.17
  Downloading regex-2022.4.24-cp38-cp38-win_amd64.whl (262 kB)
Collecting click
  Downloading click-8.1.3-py3-none-any.whl (96 kB)
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py): started
  Building wheel for sentence-transformers (setup.py): finished with status 'done'
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.0-py3-none-any.whl size=120747 sha256=2afa92df88776a3e8e5375cdbe8fe704b2c006914c2490e2fe3aeb892e3c

In [2]:
import torch
# Show whether PyTorch can see a CUDA-capable GPU (displayed as the cell's output).
torch.cuda.is_available()

  from .autonotebook import tqdm as notebook_tqdm


True

# Dataset

In [3]:
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader

# Load the "emotion" configuration of the tweet_eval dataset; later cells read
# its 'train', 'validation' and 'test' splits.
dataset = load_dataset("tweet_eval", "emotion")

Reusing dataset tweet_eval (C:\Users\user\.cache\huggingface\datasets\tweet_eval\emotion\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)
100%|██████████| 3/3 [00:00<00:00, 751.40it/s]


In [4]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

# show data
def show_random_elements(dataset, num_examples=20):
    """Display a random sample of distinct rows from ``dataset`` as an HTML table.

    Columns whose feature type is a ``datasets.ClassLabel`` are shown with their
    class names instead of the integer ids.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to show; must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
        ValueError: If ``num_examples`` is negative (raised by ``random.sample``).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no manual duplicate check
    # (and no potentially long retry loop) is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Bind the names explicitly so the lambda does not rely on the loop variable.
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    display(HTML(df.to_html()))

In [5]:
# Preview a random sample of rows from the training split.
show_random_elements(dataset['train'])

Unnamed: 0,text,label
0,Fun pizza night last night with the @user crew. What a gorgeous bunch of newbies ❤️ #lively #exciting 💪🏼,joy
1,"God, I've been so physically weak the whole day. So much shaking :(",sadness
2,"Oh that cheery fucking note, good night shit heads X",anger
3,If I spend even 5 minutes with you and you already irritate me I seriously will bitch you out until you shut up,anger
4,@user It's so sad! There's always such optimism with a new year. This is...not good. @user @user,sadness
5,@user wallah my blood is boiling I need to take a nap ugh,anger
6,Historically Japanese have always been into #jazz and #blues. The 70s dark age of jazz big names like C.C. &amp; M.D. were surviving on Tokyo.,joy
7,"@user BLM was outraged by the shooting in NC the other day, turns out the guy pointed gun at police. Wait for facts before outrage",anger
8,snap: hiAleshia 😃,joy
9,I absolutely love having an anxiety attack halfway through a family meal,joy


In [6]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, TripletEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout


#You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = 'bert-base-uncased'

# Timestamped output directory, so every run of this cell gets its own save path.
model_save_path = 'output/training_simcse-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=64)

# Apply mean pooling to get one fixed sized sentence vector
# (no pooling mode is passed, so this relies on models.Pooling's default -- presumably mean pooling; TODO confirm)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
# Chain the two modules: token embeddings -> pooled sentence embedding.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


Downloading: 100%|██████████| 570/570 [00:00<00:00, 572kB/s]
Downloading: 100%|██████████| 420M/420M [00:12<00:00, 36.1MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████|

2022-05-18 14:40:37 - Use pytorch device: cuda


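For SimCSE, we would create our train_samples from InputExamples that each contain the same sentence twice, i.e.
```
train_samples = [InputExample(texts=["sent1", "sent1"]), InputExample(texts=["sent2", "sent2"]), ...]
```

This notebook instead builds labeled single-sentence examples (`InputExample(texts=[text], label=label)`) for training, and (anchor, positive, negative) triplets for evaluation.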


In [7]:
import tqdm

def get_triplets_input_example(dataset):
    """Build one (anchor, positive, negative) triplet per row of ``dataset``.

    For every row, the positive is a randomly chosen text from a *different*
    row with the same label, and the negative is a randomly chosen text from a
    row with a different label.

    Args:
        dataset: Iterable of rows exposing ``row['text']`` and ``row['label']``.
            Labels must be hashable.

    Returns:
        A list of ``InputExample`` objects whose ``texts`` are
        ``[anchor, positive, negative]`` and whose ``label`` is the anchor's
        label, in dataset order.

    Raises:
        IndexError: If some row has no other row with the same label, or no
            row with a different label (``random.choice`` on an empty list).
    """
    # Read columns by name rather than by position, so the result does not
    # depend on the column order of the dataset.
    rows = [(row['text'], row['label']) for row in dataset]

    # Group the texts per label once, instead of copying and rescanning the
    # whole dataset twice for every anchor.
    texts_by_label = {}
    for text, label in rows:
        texts_by_label.setdefault(label, []).append(text)
    negatives_by_label = {
        label: [text for text, other_label in rows if other_label != label]
        for label in texts_by_label
    }

    triplets_input_examples = []
    # Position of the next anchor inside its own label group.
    position_in_label = dict.fromkeys(texts_by_label, 0)
    for anchor_text, anchor_label in rows:
        same_label_texts = texts_by_label[anchor_label]
        position = position_in_label[anchor_label]
        position_in_label[anchor_label] = position + 1

        # Exclude the anchor row itself by position, so identical texts in
        # other rows remain eligible as positives.
        positive_list = same_label_texts[:position] + same_label_texts[position + 1:]
        positive_item = random.choice(positive_list)
        negative_item = random.choice(negatives_by_label[anchor_label])

        triplets_input_examples.append(
            InputExample(texts=[anchor_text, positive_item, negative_item], label=anchor_label)
        )

    return triplets_input_examples

def get_input_example(dataset):
    """Wrap each row of ``dataset`` in a single-text, labeled ``InputExample``.

    Args:
        dataset: Iterable of rows exposing ``row['text']`` and ``row['label']``.

    Returns:
        A list with one ``InputExample(texts=[text], label=label)`` per row,
        in dataset order.
    """
    return [
        InputExample(texts = [row['text']], label = row['label'])
        for row in dataset
    ]

In [8]:
# Build the training examples (single labeled sentences from the train split)
# and a triplet evaluator from the validation split.
train_samples = []
dev_samples = []

print(type(dataset['train']))
# NOTE: the log text says "triplets", but get_input_example returns
# single-text labeled examples, not triplets.
logging.info("get train triplets input")
train_samples = get_input_example(dataset['train'])
logging.info("get validation triplets input")
eval_val_examples  = get_triplets_input_example(dataset['validation'])
logging.info("setting triplet evaluator")
dev_evaluator = TripletEvaluator.from_input_examples(eval_val_examples, batch_size=8, name='my_dev')

<class 'datasets.arrow_dataset.Dataset'>
2022-05-18 14:40:37 - get train triplets input
2022-05-18 14:40:37 - get validation triplets input
2022-05-18 14:40:38 - setting triplet evaluator


In [9]:
# Report the size of the train, test and validation splits (one per line),
# then show the first validation row.
print(*(len(dataset[split]) for split in ('train', 'test', 'validation')), sep='\n')
print(dataset['validation'][0])

3257
1421
374
{'text': '@user @user Oh, hidden revenge and anger...I rememberthe time,she rebutted you.', 'label': 0}


# Train

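As loss, we use `BatchAllTripletLoss`.

It takes single sentences with class labels and forms all valid (anchor, positive, negative) triplets within each batch: sentences that share a label are positives, sentences with a different label are negatives. (The SimCSE recipe would instead use `MultipleNegativesRankingLoss`, where texts[0] and texts[1] are considered a positive pair, while all others in the batch are negatives.)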

In [10]:
from transformers.optimization import AdamW

# Configure the training
# NOTE: the recorded run of this cell ended in "CUDA out of memory" on a
# 4 GiB GPU; if that recurs, lower train_batch_size.
train_batch_size = 32
num_epochs = 1

# BatchAllTripletLoss works on single labeled sentences (see get_input_example)
# rather than on sentence pairs.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.BatchAllTripletLoss(model)


warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

logging.info("Performance before training")
dev_evaluator(model)

# Optimizer settings. model.fit() constructs its own optimizer from
# optimizer_class / optimizer_params, so the settings have to be passed to
# fit(); an optimizer instance created in this cell would never be used.
optimizer_params = {'lr': 5e-5, 'eps': 1e-8}

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=100,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          optimizer_class=AdamW,
          optimizer_params=optimizer_params,
          )


2022-05-18 14:40:38 - Warmup-steps: 11
2022-05-18 14:40:38 - Performance before training
2022-05-18 14:40:38 - TripletEvaluator: Evaluating the model on my_dev dataset:




2022-05-18 14:40:46 - Accuracy Cosine Distance:   	51.87
2022-05-18 14:40:46 - Accuracy Manhattan Distance:	51.60
2022-05-18 14:40:46 - Accuracy Euclidean Distance:	50.80



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/102 [00:00<?, ?it/s][A
Iteration:   1%|          | 1/102 [00:00<00:43,  2.32it/s][A
Iteration:   2%|▏         | 2/102 [00:01<01:06,  1.50it/s][A
Iteration:   3%|▎         | 3/102 [00:01<01:06,  1.50it/s][A
Iteration:   4%|▍         | 4/102 [00:02<01:06,  1.48it/s][A
Iteration:   5%|▍         | 5/102 [00:03<01:06,  1.46it/s][A
Iteration:   6%|▌         | 6/102 [00:03<01:04,  1.50it/s][A
Iteration:   7%|▋         | 7/102 [00:04<01:02,  1.53it/s][A
Iteration:   8%|▊         | 8/102 [00:05<01:01,  1.54it/s][A
Iteration:   9%|▉         | 9/102 [00:05<00:59,  1.56it/s][A
Iteration:  10%|▉         | 10/102 [00:06<00:58,  1.58it/s][A
Iteration:  11%|█         | 11/102 [00:07<01:04,  1.40it/s][A
Epoch:   0%|          | 0/1 [00:07<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 4.00 GiB total capacity; 2.83 GiB already allocated; 0 bytes free; 2.95 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:


##############################################################################
#
# Load the stored model and evaluate its performance on triplets sampled
# from the tweet_eval "emotion" test split
#
##############################################################################

# Assumes the training cell completed and wrote a model to model_save_path.
model = SentenceTransformer(model_save_path)
logging.info("get test triplets input")
eval_test_examples  = get_triplets_input_example(dataset['test'])
test_evaluator = TripletEvaluator.from_input_examples(eval_test_examples, batch_size=train_batch_size, name='my-test')
test_evaluator(model, output_path=model_save_path)