## Downloading Dataset

In [1]:
!wget https://zenodo.org/record/2787612/files/SICK.zip?download=1

--2023-08-02 17:53:35--  https://zenodo.org/record/2787612/files/SICK.zip?download=1
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 217584 (212K) [application/octet-stream]
Saving to: ‘SICK.zip?download=1’


2023-08-02 17:53:36 (3.08 MB/s) - ‘SICK.zip?download=1’ saved [217584/217584]



In [2]:
!unzip SICK.zip?download=1

Archive:  SICK.zip?download=1
  inflating: readme.txt              
  inflating: SICK.txt                


## Importing Libraries

In [5]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m61.4/86.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[

In [6]:
import os
import csv
import math
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample

In [7]:
df=pd.read_csv('/content/SICK.txt', delimiter='\t', encoding='utf8')
df.head()

Unnamed: 0,pair_ID,sentence_A,sentence_B,entailment_label,relatedness_score,entailment_AB,entailment_BA,sentence_A_original,sentence_B_original,sentence_A_dataset,sentence_B_dataset,SemEval_set
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,NEUTRAL,4.5,A_neutral_B,B_neutral_A,"A group of children playing in a yard, a man i...","A group of children playing in a yard, a man i...",FLICKR,FLICKR,TRAIN
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,NEUTRAL,3.2,A_contradicts_B,B_neutral_A,"A group of children playing in a yard, a man i...","A group of children playing in a yard, a man i...",FLICKR,FLICKR,TRAIN
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,ENTAILMENT,4.7,A_entails_B,B_entails_A,"The children are playing outdoors, while a man...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRAIN
3,4,The young boys are playing outdoors and the ma...,There is no boy playing outdoors and there is ...,CONTRADICTION,3.6,A_contradicts_B,B_contradicts_A,"The children are playing outdoors, while a man...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRIAL
4,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,NEUTRAL,3.4,A_neutral_B,B_neutral_A,"A group of children playing in a yard, a man i...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRAIN


In [8]:
df.isnull().sum()

pair_ID                0
sentence_A             0
sentence_B             0
entailment_label       0
relatedness_score      0
entailment_AB          0
entailment_BA          0
sentence_A_original    0
sentence_B_original    0
sentence_A_dataset     0
sentence_B_dataset     0
SemEval_set            0
dtype: int64

In [9]:
df.dtypes

pair_ID                  int64
sentence_A              object
sentence_B              object
entailment_label        object
relatedness_score      float64
entailment_AB           object
entailment_BA           object
sentence_A_original     object
sentence_B_original     object
sentence_A_dataset      object
sentence_B_dataset      object
SemEval_set             object
dtype: object

In [10]:
df.SemEval_set.unique()

array(['TRAIN', 'TRIAL', 'TEST'], dtype=object)

## Preparing Input Sentences for Training

In [11]:
dataset_path = '/content/SICK.txt'

# Using pretrained model
model_name = 'nli-distilroberta-base-v2'
train_batch_size = 16
model_save_path = 'output/training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Loading a pre-trained sentence transformer model
model = SentenceTransformer(model_name)

# Preparing data for training objectives
train_samples = []
dev_samples = []
test_samples = []
with open(dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['relatedness_score'])/ 5.0  # Normalize score to range 0 ... 1
        inp_example = InputExample(texts=[row['sentence_A'], row['sentence_B']], label=score)
        if row['SemEval_set'] == 'TRIAL':
            dev_samples.append(inp_example)
        elif row['SemEval_set'] == 'TEST':
            test_samples.append(inp_example)
        else:
            train_samples.append(inp_example)


Downloading (…)7023f/.gitattributes:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)433037023f/README.md:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading (…)3037023f/config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)33037023f/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)7023f/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)33037023f/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)037023f/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

## Performance of base model

In [12]:
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sick-test')
print('Base model performance:', test_evaluator(model))

Base model performance: 0.7877062927879224


## Fine-tuning our model for better embeddings

In [14]:
import shutil
shutil.make_archive('training-nli-distilroberta-base-v2-2023-08-02_17-54-58','zip','training-nli-distilroberta-base-v2-2023-08-02_17-54-58')

'/content/training-nli-distilroberta-base-v2-2023-08-02_17-54-58.zip'

In [13]:
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


# Evaluate a model based on the similarity of the embeddings by calculating the Spearman and Pearson rank correlation
#in comparison to the gold standard labels.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sick-dev')


num_epochs=20
# Training the fine tuned model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          output_path=model_save_path)


new_model = SentenceTransformer(model_save_path)

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

Iteration:   0%|          | 0/278 [00:00<?, ?it/s]

## Performance of new fine-tuned model

In [15]:
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sick-test')
print('New fined-tuned model performance:', test_evaluator(new_model, output_path=model_save_path))

New fined-tuned model performance: 0.8522323018043951


* We can see that after fine-tuning our model's performace increased from base 0.787 to 0.8522

### Few examples of similarity between sentences using our model

In [16]:
def cosine_sim(sent1, sent2):
  emb=new_model.encode([sent1,sent2])
  sen1_emb=emb[0]
  sent2_emb=emb[1]
  cos_sim=np.dot(emb[0],emb[1])/(np.linalg.norm(emb[0])*np.linalg.norm(emb[1]))
  return cos_sim

In [17]:
# Few cosine similarity examples
q=cosine_sim('How do I read my YouTube comments?',	'How do I see my YouTube comments?')
if q>0.90:
  print(f'The sentences are similar with cosine score',q)
else:
  print('They sentences have difference, the cosine score is',q)

The sentences are similar with cosine score 0.95589316


In [18]:
q=cosine_sim('Is God existence itself?	', 'Does God exist?')
if q>0.90:
  print(f'The sentences are similar with cosine score',q)
else:
  print('They sentences have difference, the cosine score is',q)

The sentences are similar with cosine score 0.9546137


In [19]:
q=cosine_sim('The boys played very well in match.', 'The boys did not played very well in match.')
if q>0.90:
  print(f'The sentences are similar with cosine score',q)
else:
  print('They sentences have difference, the cosine score is',q)

They sentences have difference, the cosine score is 0.7821773


## Comparing our model with most downloaded model in Hugging-face library for sentence-similarity

In [20]:
# Comparing our fine-tuned model with the most popular sentence-similarity model from hugging face library
from sentence_transformers import SentenceTransformer
sentences = ['How do I read my YouTube comments?',	'How do I see my YouTube comments?']

popular_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

Downloading (…)0fe39/.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)83e900fe39/README.md:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading (…)e900fe39/config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading (…)900fe39/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [21]:
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
print('Most popular model simiarity performance:',test_evaluator(popular_model))

Most popular model simiarity performance: 0.7924763887870493


* We can that our fine-tuned model exceeded the performace of the most popular model in hugging-face library `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` by 6.94%

In [22]:
# Similarity evaluation results for our fine-tuned model
test_result=pd.read_csv('/content/output/training-nli-distilroberta-base-v2-2023-08-02_17-54-58/similarity_evaluation_sick-test_results.csv')
test_result

Unnamed: 0,epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
0,-1,-1,0.892631,0.852232,0.870371,0.848208,0.869966,0.846926,0.840596,0.785297


In [25]:
a="""QuillBot's paraphraser takes your sentences and makes changes, helping you to rework and rephrase your content quickly and efficiently!"""
b="""With the help of QuillBot's paraphraser, you can rapidly and effectively rework and rephrase your content by taking your sentences and making adjustments!"""
q=cosine_sim(a,b)
if q>0.90:
  print(f'The sentences are similar with cosine score',q)
else:
  print('They sentences have difference, the cosine score is',q)

The sentences are similar with cosine score 0.959372
