In [38]:
!pip install -U sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [39]:
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset
import torch 
from sklearn.metrics import mean_squared_error

In [40]:
# Fetch the English configuration of the "stsb_multi_mt" dataset.
dataset = load_dataset("stsb_multi_mt", "en")

# Convert each split to a pandas DataFrame, with its similarity_score column
# divided by 5.0.
train_df = dataset['train'].to_pandas().assign(
    similarity_score=lambda frame: frame['similarity_score'] / 5.0
)
eval_df = dataset['dev'].to_pandas().assign(
    similarity_score=lambda frame: frame['similarity_score'] / 5.0
)
test_df = dataset['test'].to_pandas().assign(
    similarity_score=lambda frame: frame['similarity_score'] / 5.0
)

In [41]:
# Preview the test split; its similarity_score column was divided by 5.0 above.
test_df

Unnamed: 0,sentence1,sentence2,similarity_score
0,A girl is styling her hair.,A girl is brushing her hair.,0.50
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,0.72
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,1.00
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,0.84
4,A man is playing a harp.,A man is playing a keyboard.,0.30
...,...,...,...
1374,"Philippines, Canada pledge to further boost re...",Philippines saves 100 after ferry sinks,0.00
1375,Israel bars Palestinians from Jerusalem's Old ...,"Two-state solution between Palestinians, Israe...",0.20
1376,How much do you know about Secret Service?,Lawmakers from both sides express outrage at S...,0.20
1377,Obama Struggles to Soothe Saudi Fears As Iran ...,Myanmar Struggles to Finalize Voter Lists for ...,0.00


In [43]:

# Loaded models keyed by model name, so repeated calls (e.g. one call per
# DataFrame row) reuse the same SentenceTransformer instead of re-loading it.
_MODEL_CACHE = {}


def compute_similarity(text1, text2, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        text1: First text to embed.
        text2: Second text to embed.
        model_name: Name or path passed to ``SentenceTransformer``. Each
            distinct name is loaded once and then reused on later calls.

    Returns:
        The cosine similarity of the two embeddings, as a Python number
        obtained via ``.item()``.
    """
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        # Constructing the model is the expensive step; do it once per name.
        model = _MODEL_CACHE[model_name] = SentenceTransformer(model_name)

    # Request tensors straight from encode() instead of wrapping the returned
    # arrays in torch.tensor([...]), which goes through a slow list-of-arrays
    # conversion path.
    emb1 = model.encode(text1, convert_to_tensor=True)
    emb2 = model.encode(text2, convert_to_tensor=True)

    # For two single embeddings the result holds exactly one value.
    cos_sim = util.cos_sim(emb1, emb2)

    return cos_sim.item()

In [44]:
# Spot-check: score the same first sentence against two different second
# sentences and print each result.
sim_score1 = compute_similarity(
    text1="I had a bad day",
    text2="I had so much fun",
)
print(sim_score1)

sim_score2 = compute_similarity(
    text1="I had a bad day",
    text2="Everything was terrible today",
)
print(sim_score2)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.34685951471328735


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.6562914252281189


In [None]:
# Score every test pair: compute_similarity is called once per row on the
# row's sentence1 / sentence2 values.
predicted_scores = test_df.apply(
    lambda pair: compute_similarity(pair['sentence1'], pair['sentence2']),
    axis='columns',
)

In [46]:
# Compare the predicted similarities with the similarity_score column of the
# test split using mean squared error.
actual_scores = test_df['similarity_score']
mse = mean_squared_error(y_true=actual_scores, y_pred=predicted_scores)
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 0.03681892654408509


In [48]:
# Display the test split again (the same frame that was previewed earlier).
test_df

Unnamed: 0,sentence1,sentence2,similarity_score
0,A girl is styling her hair.,A girl is brushing her hair.,0.50
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,0.72
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,1.00
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,0.84
4,A man is playing a harp.,A man is playing a keyboard.,0.30
...,...,...,...
1374,"Philippines, Canada pledge to further boost re...",Philippines saves 100 after ferry sinks,0.00
1375,Israel bars Palestinians from Jerusalem's Old ...,"Two-state solution between Palestinians, Israe...",0.20
1376,How much do you know about Secret Service?,Lawmakers from both sides express outrage at S...,0.20
1377,Obama Struggles to Soothe Saudi Fears As Iran ...,Myanmar Struggles to Finalize Voter Lists for ...,0.00


In [49]:
# Display the per-row scores produced by applying compute_similarity to test_df.
predicted_scores

0       0.805222
1       0.788625
2       0.946493
3       0.882043
4       0.355616
          ...   
1374    0.244586
1375    0.577014
1376    0.471657
1377    0.074634
1378    0.406679
Length: 1379, dtype: float64