### Package Imports

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import os
from gensim.models import FastText

In [2]:
# Load the File
df_str_rel = pd.read_csv(os.path.join('..','data','raw','eng_train.csv'))
df_str_rel.head()

Unnamed: 0,PairID,Text,Score
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0
3,ENG-train-0003,If he is good looking and has a good personali...,1.0
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0


In [3]:
df_str_rel['Text'].values

array(['It that happens, just pull the plug.\nif that ever happens, just pull the plug.',
       'A black dog running through water.\nA black dog is running through some water.',
       "I've been searchingthe entire abbey for you.\nI'm looking for you all over the abbey.",
       ...,
       "I actually read a chapter or two beyond that point, but my heart wasn't in it any more.\nLets say she's a blend of two types of beings.",
       'A boy gives being in the snow two thumbs up.\nA satisfied cat is perched beside a crystal lamp.',
       'Perhaps it is strange to think about sex constantly these days.\nFew people know how to shoot pool these days.'],
      dtype=object)

In [5]:
# Creating a column "Split_Text" which is a list of two sentences.
df_str_rel['Split_Text'] = df_str_rel['Text'].apply(lambda x: x.split("\n"))
df_str_rel['Split_Text'].loc[0]

['It that happens, just pull the plug.',
 'if that ever happens, just pull the plug.']

## load static FastText Embeddings

In [6]:
#loading the Fasttext Embeddings downloaded eng .bin file from https://fasttext.cc/docs/en/crawl-vectors.html
fasttext_model = FastText.load_fasttext_format('/Users/lemarx/Documents/01_projects/SentencesRelatedness24/data/embeddings/cc.en.300.bin')

  fasttext_model = FastText.load_fasttext_format('/Users/lemarx/Documents/01_projects/SentencesRelatedness24/data/embeddings/cc.en.300.bin')


In [7]:
#function to apply the embeddings to one sentence
def to_sent_emb(sentence):
    sentence_emb = np.array([fasttext_model.wv[word] for word in sentence.split() if word in fasttext_model.wv]).mean(axis=0)
    return sentence_emb

In [8]:
#adding the application of the embeddings to the cosine_similarity computation function
def cosine_similarity(vector_a, vector_b):
    vector_a = to_sent_emb(vector_a)
    vector_b = to_sent_emb(vector_b)
    dot_product = np.dot(vector_a, vector_b)
    norm_a = np.linalg.norm(vector_a)
    norm_b = np.linalg.norm(vector_b)

    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [9]:
#jointly applying the embeddings and cosine similarity to the whole dataset
df_str_rel['cos_sim'] = df_str_rel.apply(lambda row: cosine_similarity(row['Split_Text'][0],row['Split_Text'][1]), axis= 1)

In [11]:
#evaluating the performance of the FastText Embeddings on the whole dataset
true_scores = df_str_rel['Score'].values
pred_scores = df_str_rel['cos_sim'].values
print("Spearman Correlation:", round(spearmanr(true_scores,pred_scores)[0],2))