In [19]:
# Download the spaCy model `en_core_web_md`, which a later cell loads with spacy.load('en_core_web_md').
# NOTE(review): the model version is not pinned here; the recorded run below installed 3.7.1.
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [20]:
import spacy
import numpy as np
from scipy.spatial.distance import cosine
from datasets import load_dataset
from sklearn.metrics import mean_squared_error

In [21]:
# Fetch the English configuration of the "stsb_multi_mt" dataset.
dataset = load_dataset("stsb_multi_mt", "en")

# Convert each split to a pandas DataFrame and divide its gold
# 'similarity_score' column by 5.0 (presumably mapping the 0-5 annotation
# scale onto 0-1 - confirm against the dataset card).
train_df, eval_df, test_df = (
    dataset[split_name].to_pandas().assign(
        similarity_score=lambda frame: frame['similarity_score'] / 5.0
    )
    for split_name in ('train', 'dev', 'test')
)

In [22]:
# Load the spaCy pipeline once; preprocess_text and compute_similarity both
# use this module-level `nlp` object.
nlp = spacy.load('en_core_web_md')

In [23]:
def preprocess_text(text):
    """Return `text` lower-cased with spaCy stop-word tokens removed.

    The text is tokenized with the module-level `nlp` pipeline; every token
    that is not flagged `is_stop` is lower-cased, and the survivors are joined
    with single spaces.

    Args:
        text: The raw input string.

    Returns:
        str: The space-joined, lower-cased non-stop-word tokens (an empty
        string when nothing survives the filter).
    """
    kept_tokens = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept_tokens.append(tok.text.lower())
    return ' '.join(kept_tokens)

In [24]:
def compute_similarity(text1, text2):
    """Cosine similarity between the mean word vectors of two texts.

    Each text is cleaned with `preprocess_text`, re-parsed with `nlp`, and
    represented by the mean of the vectors of its tokens that report
    `has_vector`.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        `1 - cosine(mean1, mean2)` for the two mean vectors, or `0.0` when
        either text yields no token vectors at all (for example, it consists
        only of stop words) or when either mean vector is entirely zero.
    """
    def _mean_vector(text):
        # Mean vector of the preprocessed text, or None when no token has one.
        vectors = [token.vector for token in nlp(preprocess_text(text)) if token.has_vector]
        if not vectors:
            # np.mean of an empty list is NaN (plus a RuntimeWarning), and NaN
            # is truthy for np.any, so it would slip past the zero-vector
            # guard below and reach cosine(). Signal "no embedding" explicitly.
            return None
        return np.mean(vectors, axis=0)

    avg_embedding1 = _mean_vector(text1)
    avg_embedding2 = _mean_vector(text2)

    # No usable vectors on either side: similarity is defined as 0.0.
    if avg_embedding1 is None or avg_embedding2 is None:
        return 0.0

    # An all-zero mean has no direction, so cosine distance is undefined.
    if not (np.any(avg_embedding1) and np.any(avg_embedding2)):
        return 0.0

    return 1 - cosine(avg_embedding1, avg_embedding2)

In [25]:

# Sanity check: score one reference sentence against two different candidates.
sim_score1, sim_score2 = (
    compute_similarity("I had a bad day", candidate)
    for candidate in ("I had so much fun", "Everything was terrible today")
)

print("Similarity score 1:", sim_score1)
print("Similarity score 2:", sim_score2)

Similarity score 1: 0.287041295253869
Similarity score 2: 0.5933464621935196


In [26]:
# Score every (sentence1, sentence2) pair of the test split, in row order.
test_df['computed_similarity'] = [
    compute_similarity(first, second)
    for first, second in zip(test_df['sentence1'], test_df['sentence2'])
]

In [29]:
# Mean squared error of the computed similarities against the gold scores
# on the test split.
mse = mean_squared_error(
    y_true=test_df['similarity_score'],
    y_pred=test_df['computed_similarity'],
)
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 0.16366424145989153
