In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
from scipy.stats import spearmanr, pearsonr
from gensim.models import Word2Vec as w2v

In [2]:
# download pretrained word2vec model ~1.5GB
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [3]:
PATH = os.path.join("..", "data", "raw")

df_train = pd.read_csv(os.path.join(PATH, 'eng_train.csv'))
df_train["Split_Text"] = df_train["Text"].apply(lambda x: x.replace("\n", " "))
df_train['Split_Text'] = df_train['Split_Text'].apply(lambda x: x.split("\r"))
df_train['Split_Text'] = df_train['Split_Text'].apply(lambda x: [re.sub(r"[^a-zA-Z0-9]+", ' ', k) for k in x])

df_train["sen_1"] = df_train["Split_Text"].apply(lambda x: x[0])
df_train["sen_2"] = df_train["Split_Text"].apply(lambda x: x[1])
df_train.drop(["Split_Text"], axis=1, inplace=True)

def vector_if_available(list):
    vectors = []
    for word in list:
        try:
            vectors.append(wv[word])
        except KeyError:
            pass

    return vectors
    
df_train["embed_1"] = df_train["sen_1"].apply(lambda x: vector_if_available(x))
df_train["embed_2"] = df_train["sen_2"].apply(lambda x: vector_if_available(x))
df_train.head()

Unnamed: 0,PairID,Text,Score,sen_1,sen_2,embed_1,embed_2
0,ENG-train-0000,"It that happens, just pull the plug.\r\nif tha...",1.0,It that happens just pull the plug,if that ever happens just pull the plug,"[[0.07910156, -0.0050354004, 0.111816406, 0.21...","[[-0.22558594, -0.01953125, 0.09082031, 0.2373..."
1,ENG-train-0001,A black dog running through water.\r\nA black ...,1.0,A black dog running through water,A black dog is running through some water,"[[-0.10595703, 0.21386719, 0.118652344, -0.031...","[[-0.10595703, 0.21386719, 0.118652344, -0.031..."
2,ENG-train-0002,I've been searchingthe entire abbey for you.\r...,1.0,I ve been searchingthe entire abbey for you,I m looking for you all over the abbey,"[[0.07910156, -0.0050354004, 0.111816406, 0.21...","[[0.07910156, -0.0050354004, 0.111816406, 0.21..."
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,If he s good looking and a good personality h...,"[[0.07910156, -0.0050354004, 0.111816406, 0.21...","[[0.07910156, -0.0050354004, 0.111816406, 0.21..."
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,She does not hate you she is just annoyed with...,She doesn t hate you she is just annoyed,"[[-0.026977539, 0.067871094, 0.106933594, 0.05...","[[-0.026977539, 0.067871094, 0.106933594, 0.05..."


In [11]:
# Mean-pooling of word vectors
df_train["mean_1"] = df_train["embed_1"].apply(lambda x: np.mean(x, axis=0))
df_train["mean_2"] = df_train["embed_2"].apply(lambda x: np.mean(x, axis=0))

# Use cosine similarity between the vectors as a score
df_train["PredictMean"] = df_train.apply(lambda x: np.dot(x["mean_1"], x["mean_2"])/(np.linalg.norm(x["mean_1"])*np.linalg.norm(x["mean_2"])), axis=1)
# display(df_train.head())

print("Cosine distance, Spearman Correlation:", round(spearmanr(df_train["Score"], df_train["PredictMean"])[0],3))

# Try different similarity measures

from scipy import spatial


# Use cosine similarity between the vectors as a score
df_train["PredictMean"] = df_train.apply(lambda x: 1 - spatial.distance.euclidean(x["mean_1"], x["mean_2"]), axis=1)
# display(df_train.head())

print("Euclidean distance, Spearman Correlation:", round(spearmanr(df_train["Score"], df_train["PredictMean"])[0],3))

# Use correlation between the vectors as a score
df_train["PredictMean"] = df_train.apply(lambda x: np.corrcoef(x["mean_1"], x["mean_2"])[0][1], axis=1)
# display(df_train.head())

print("Correlation, Spearman Correlation:", round(spearmanr(df_train["Score"], df_train["PredictMean"])[0],3))

Cosine distance, Spearman Correlation: 0.375
Euclidean distance, Spearman Correlation: 0.374
Correlation, Spearman Correlation: 0.375


In [98]:
# Max-pooling of word vectors
df_train["max_1"] = df_train["embed_1"].apply(lambda x: np.max(x, axis=0))
df_train["max_2"] = df_train["embed_2"].apply(lambda x: np.max(x, axis=0))

# Use cosine similarity between the vectors as a score
df_train["PredictMax"] = df_train.apply(lambda x: np.dot(x["max_1"], x["max_2"])/(np.linalg.norm(x["max_1"])*np.linalg.norm(x["max_2"])), axis=1)
display(df_train.head())

print("Spearman Correlation:", round(spearmanr(df_train["Score"], df_train["PredictMax"])[0],2))

Unnamed: 0,PairID,Text,Score,sen_1,sen_2,embed_1,embed_2,mean_1,mean_2,Predict,max_1,max_2,PredictMax
0,ENG-train-0000,"It that happens, just pull the plug.\r\nif tha...",1.0,It that happens just pull the plug,if that ever happens just pull the plug,"[[0.07910156, -0.0050354004, 0.111816406, 0.21...","[[-0.22558594, -0.01953125, 0.09082031, 0.2373...","[-0.19144146, 0.14930256, -0.055718057, 0.1323...","[-0.18121542, 0.14402872, -0.025543213, 0.1314...",0.989086,"[0.07910156, 0.29296875, 0.1640625, 0.38085938...","[0.1640625, 0.29296875, 0.3046875, 0.38085938,...",0.95041
1,ENG-train-0001,A black dog running through water.\r\nA black ...,1.0,A black dog running through water,A black dog is running through some water,"[[-0.10595703, 0.21386719, 0.118652344, -0.031...","[[-0.10595703, 0.21386719, 0.118652344, -0.031...","[-0.20457341, 0.095846325, 0.0060800407, 0.128...","[-0.2035389, 0.1010685, 0.009536743, 0.130867,...",0.99349,"[0.040771484, 0.29296875, 0.2265625, 0.3261718...","[0.040771484, 0.29296875, 0.2265625, 0.3261718...",0.995175
2,ENG-train-0002,I've been searchingthe entire abbey for you.\r...,1.0,I ve been searchingthe entire abbey for you,I m looking for you all over the abbey,"[[0.07910156, -0.0050354004, 0.111816406, 0.21...","[[0.07910156, -0.0050354004, 0.111816406, 0.21...","[-0.16130336, 0.10294566, 0.02094942, 0.149026...","[-0.14080593, 0.09958867, 0.04020909, 0.120962...",0.96378,"[0.1640625, 0.29296875, 0.3046875, 0.23730469,...","[0.1640625, 0.29296875, 0.3046875, 0.32617188,...",0.971155
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,If he s good looking and a good personality h...,"[[0.07910156, -0.0050354004, 0.111816406, 0.21...","[[0.07910156, -0.0050354004, 0.111816406, 0.21...","[-0.18789212, 0.11101083, 0.019651648, 0.13283...","[-0.1727731, 0.103882805, 0.023776583, 0.13874...",0.994563,"[0.07910156, 0.29296875, 0.2265625, 0.38085938...","[0.07910156, 0.29296875, 0.25390625, 0.3808593...",0.97965
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,She does not hate you she is just annoyed with...,She doesn t hate you she is just annoyed,"[[-0.026977539, 0.067871094, 0.106933594, 0.05...","[[-0.026977539, 0.067871094, 0.106933594, 0.05...","[-0.18909495, 0.12531802, -0.005154258, 0.1236...","[-0.18012899, 0.13189697, -0.00040893554, 0.12...",0.995886,"[-0.013977051, 0.29296875, 0.1640625, 0.237304...","[-0.013977051, 0.29296875, 0.1640625, 0.237304...",0.994848


Spearman Correlation: 0.3


XGBoost prediction

In [8]:
# Try to predict similarity using XGBoost
train = df_train.copy()
# prepare the difference matrix
train["diff"] = train.apply(lambda x: np.array(x["mean_1"]) - np.array(x["mean_2"]), axis=1)

df_xgb = pd.DataFrame(train["diff"].tolist()).copy()
df_xgb["Score"] = train["Score"]

# Predict using XGBoost
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer

# import evaluate CV
from sklearn.model_selection import cross_val_score
model = XGBRegressor()

def spearmanr_score(y_true, y_pred):
    return spearmanr(y_true, y_pred)[0]

spearmanr_score = make_scorer(spearmanr_score)

scores = cross_val_score(model, df_xgb.drop(["Score"], axis=1), df_xgb["Score"], cv = 5, scoring=spearmanr_score)

print("Spearman Correlation: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Spearman Correlation: 0.088 (+/- 0.041)
