In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
from scipy.stats import spearmanr, pearsonr
from sent2vec.vectorizer import Vectorizer


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
PATH = os.path.join("..", "data", "raw")

train = pd.read_csv(os.path.join(PATH, 'eng_train.csv'))
train["Split_Text"] = train["Text"].apply(lambda x: x.replace("\n", " "))
train['Split_Text'] = train['Split_Text'].apply(lambda x: x.split("\r"))
train['Split_Text'] = train['Split_Text'].apply(lambda x: [re.sub(r"[^a-zA-Z0-9]+", ' ', k) for k in x])

train["sen_1"] = train["Split_Text"].apply(lambda x: x[0])
train["sen_2"] = train["Split_Text"].apply(lambda x: x[1])
train.drop(["Split_Text"], axis=1, inplace=True)

train.head()

Unnamed: 0,PairID,Text,Score,sen_1,sen_2
0,ENG-train-0000,"It that happens, just pull the plug.\r\nif tha...",1.0,It that happens just pull the plug,if that ever happens just pull the plug
1,ENG-train-0001,A black dog running through water.\r\nA black ...,1.0,A black dog running through water,A black dog is running through some water
2,ENG-train-0002,I've been searchingthe entire abbey for you.\r...,1.0,I ve been searchingthe entire abbey for you,I m looking for you all over the abbey
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,If he s good looking and a good personality h...
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,She does not hate you she is just annoyed with...,She doesn t hate you she is just annoyed


Distilbert-base-uncased

In [47]:
sen_1_list = train["sen_1"].to_list()
sen_2_list = train["sen_2"].to_list()

# RUNNING ONE COLUMN AT A TIME BECAUSE IT TAKES ~ 14GB of RAM per column
# Get model ~ 0,3 GB download but only with the first run
vectorizer = Vectorizer(pretrained_weights='distilbert-base-uncased', ensemble_method='average')

vectorizer.run(sen_1_list)

print(f"Run 1 done, {len(sen_1_list)} sentences vectorized")
vectors = np.array(vectorizer.vectors)

train["sen_1_vec"] = list(vectors)


# Second column
vectorizer_2 = Vectorizer(pretrained_weights='distilbert-base-uncased', ensemble_method='average')
vectorizer_2.run(sen_2_list)

print(f"Run 2 done, {len(sen_2_list)} sentences vectorized")
vectors_2 = np.array(vectorizer_2.vectors)
train["sen_2_vec"] = list(vectors_2)

Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Run 1 done, 5500 sentences vectorized
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Run 2 done, 5500 sentences vectorized


In [15]:
from scipy import spatial

train["Prediction"] = train.apply(lambda x: 1 - spatial.distance.cosine(x["sen_1_vec"], x["sen_2_vec"]), axis=1)

display(train.head())
print("Spearman Correlation:", round(spearmanr(train["Score"], train["Prediction"])[0],2))

Unnamed: 0,PairID,Text,Score,sen_1,sen_2,sen_1_vec,sen_2_vec,Prediction
0,ENG-train-0000,"It that happens, just pull the plug.\r\nif tha...",1.0,It that happens just pull the plug,if that ever happens just pull the plug,"[0.043268822, -0.34856343, 0.28903472, -0.1526...","[0.18558623, -0.29072785, 0.30751923, -0.11294...",0.983931
1,ENG-train-0001,A black dog running through water.\r\nA black ...,1.0,A black dog running through water,A black dog is running through some water,"[0.032298513, -0.29267886, 0.22542144, -0.1273...","[0.1487496, -0.27362612, 0.23198934, -0.070322...",0.985127
2,ENG-train-0002,I've been searchingthe entire abbey for you.\r...,1.0,I ve been searchingthe entire abbey for you,I m looking for you all over the abbey,"[0.050184645, -0.20874508, 0.2368095, -0.16057...","[0.13193569, -0.30554605, 0.3308887, -0.128524...",0.974711
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,If he s good looking and a good personality h...,"[-0.19122285, -0.3755346, 0.21169132, -0.11737...","[-0.003060758, -0.3280269, 0.26041013, -0.0622...",0.961116
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,She does not hate you she is just annoyed with...,She doesn t hate you she is just annoyed,"[0.024173506, -0.34374678, 0.25788307, -0.1318...","[0.19522768, -0.29358017, 0.27051896, -0.09734...",0.971275


Spearman Correlation: 0.16


distilbert-base-multilingual-cased

In [21]:
# Other weights try

# glove-wiki-gigaword-300
# bert-base-nli-mean-tokens

# RUNNING ONE COLUMN AT A TIME BECAUSE IT TAKES ~ 14GB of RAM per column
# Get model ~ 0,5 GB download but only with the first run
vectorizer = Vectorizer(pretrained_weights='distilbert-base-multilingual-cased', ensemble_method='average') 

vectorizer.run(sen_1_list)

print(f"Run 1 done, {len(sen_1_list)} sentences vectorized")
vectors = np.array(vectorizer.vectors)

train["sen_1_vec"] = list(vectors)


# # Second column
vectorizer_2 = Vectorizer(pretrained_weights='distilbert-base-multilingual-cased', ensemble_method='average')
vectorizer_2.run(sen_2_list)

print(f"Run 2 done, {len(sen_2_list)} sentences vectorized")
vectors_2 = np.array(vectorizer_2.vectors)
train["sen_2_vec"] = list(vectors_2)

Initializing Bert distilbert-base-multilingual-cased
Vectorization done on cpu


Downloading tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<?, ?B/s]
Downloading vocab.txt: 100%|██████████| 996k/996k [00:00<00:00, 17.0MB/s]
Downloading tokenizer.json: 100%|██████████| 1.96M/1.96M [00:00<00:00, 14.5MB/s]
Downloading config.json: 100%|██████████| 466/466 [00:00<?, ?B/s] 
Downloading model.safetensors: 100%|██████████| 542M/542M [00:34<00:00, 15.6MB/s] 


Run 1 done, 5500 sentences vectorized
Initializing Bert distilbert-base-multilingual-cased
Vectorization done on cpu
Run 2 done, 5500 sentences vectorized


In [27]:
from scipy import spatial

train["Prediction"] = train.apply(lambda x: 1 - spatial.distance.cosine(x["sen_1_vec"], x["sen_2_vec"]), axis=1)

display(train.head())
print("Spearman Correlation:", round(spearmanr(train["Score"], train["Prediction"])[0],3))


train["Prediction"] = train.apply(lambda x: 1 - spatial.distance.euclidean(x["sen_1_vec"], x["sen_2_vec"]), axis=1)

# display(train.head())
print("Spearman Correlation:", round(spearmanr(train["Score"], train["Prediction"])[0],3))



train["Prediction"] = train.apply(lambda x: 1 - spatial.distance.correlation(x["sen_1_vec"], x["sen_2_vec"]), axis=1)

# display(train.head())
print("Spearman Correlation:", round(spearmanr(train["Score"], train["Prediction"])[0],2))

Unnamed: 0,PairID,Text,Score,sen_1,sen_2,sen_1_vec,sen_2_vec,Prediction
0,ENG-train-0000,"It that happens, just pull the plug.\r\nif tha...",1.0,It that happens just pull the plug,if that ever happens just pull the plug,"[0.288757, 0.017465338, 0.37414962, 0.17203511...","[0.24385595, -0.0225325, 0.2959763, 0.21476698...",0.988541
1,ENG-train-0001,A black dog running through water.\r\nA black ...,1.0,A black dog running through water,A black dog is running through some water,"[0.22300872, -0.0035058856, 0.42560536, 0.2109...","[0.18355757, -0.035942025, 0.30828395, 0.24548...",0.987399
2,ENG-train-0002,I've been searchingthe entire abbey for you.\r...,1.0,I ve been searchingthe entire abbey for you,I m looking for you all over the abbey,"[0.37595713, 0.0032268763, 0.33328888, 0.20933...","[0.27767316, -0.041070223, 0.30190146, 0.21894...",0.98724
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,If he s good looking and a good personality h...,"[0.50591886, -0.013606593, 0.4011759, 0.185587...","[0.33724537, 0.018533602, 0.32457644, 0.266543...",0.964153
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,She does not hate you she is just annoyed with...,She doesn t hate you she is just annoyed,"[0.40013552, 0.056739792, 0.37590942, 0.173118...","[0.28692865, -0.030071296, 0.2977713, 0.223826...",0.96625


Spearman Correlation: 0.109
Spearman Correlation: 0.108
Spearman Correlation: 0.11


distilbert-base-multilingual-cased + XGBRegressor

In [38]:
# Try to predict similarity using XGBoost

# prepare the difference matrix
train["diff"] = train.apply(lambda x: np.array(x["sen_1_vec"]) - np.array(x["sen_2_vec"]), axis=1)

df_xgb = pd.DataFrame(train["diff"].tolist()).copy()
df_xgb["Score"] = train["Score"]

# Predict using XGBoost
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer

# import evaluate CV
from sklearn.model_selection import cross_val_score
model = XGBRegressor()

def spearmanr_score(y_true, y_pred):
    return spearmanr(y_true, y_pred)[0]

spearmanr_score = make_scorer(spearmanr_score)

scores = cross_val_score(model, df_xgb.drop(["Score"], axis=1), df_xgb["Score"], cv = 5, scoring=spearmanr_score)

print("Spearman Correlation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Spearman Correlation: 0.09 (+/- 0.14)
