In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
from scipy.stats import spearmanr, pearsonr
from scipy import spatial
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
PATH = os.path.join("..", "data", "raw")

df_train = pd.read_csv(os.path.join(PATH, 'eng_train.csv'))
df_train["Split_Text"] = df_train["Text"].apply(lambda x: x.replace("\n", " "))
df_train['Split_Text'] = df_train['Split_Text'].apply(lambda x: x.split("\r"))
df_train['Split_Text'] = df_train['Split_Text'].apply(lambda x: [re.sub(r"[^a-zA-Z0-9]+", ' ', k) for k in x])

df_train["sen_1"] = df_train["Split_Text"].apply(lambda x: x[0])
df_train["sen_2"] = df_train["Split_Text"].apply(lambda x: x[1])
df_train.drop(["Split_Text"], axis=1, inplace=True)
df_train.head()

# For testing puposes:
# df_train = df_train.iloc[[212, 23,1578, 4000, 1230, 1, 2 ,4, 4500]]

Unnamed: 0,PairID,Text,Score,sen_1,sen_2
0,ENG-train-0000,"It that happens, just pull the plug.\r\nif tha...",1.0,It that happens just pull the plug,if that ever happens just pull the plug
1,ENG-train-0001,A black dog running through water.\r\nA black ...,1.0,A black dog running through water,A black dog is running through some water
2,ENG-train-0002,I've been searchingthe entire abbey for you.\r...,1.0,I ve been searchingthe entire abbey for you,I m looking for you all over the abbey
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,If he s good looking and a good personality h...
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,She does not hate you she is just annoyed with...,She doesn t hate you she is just annoyed


# 1. Simple BART without data preprocessing

In [3]:
# Load BART tokenizer and model
# ~ 1.6GB download
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Example sentences
sentence_i = "This is the first sentence."

def get_bert_embeddings(sentence):
    # Tokenize and encode the sentences
    tokens_i = tokenizer(sentence, return_tensors="pt")

    # Get BART embeddings
    with torch.no_grad():
        embeddings_i = model(**tokens_i).last_hidden_state.mean(dim=1)

    # Convert embeddings to numpy arrays
    embeddings_i = embeddings_i.numpy()
    return list(embeddings_i[0])

# Get embeddings for the example sentences
embeddings_i = get_bert_embeddings(sentence_i)
embeddings_i

[-0.30514315,
 -0.2292133,
 0.23413709,
 -0.34856534,
 0.06824613,
 -0.09749609,
 0.35988426,
 0.7686615,
 -0.032881413,
 -0.49449062,
 -0.042543553,
 -0.3318028,
 -0.13952154,
 0.15522552,
 -0.13557294,
 0.2265741,
 0.4182347,
 0.0011433437,
 -0.012813231,
 0.10374844,
 -0.05894299,
 0.31845438,
 -0.22073671,
 0.3675331,
 0.6689993,
 -0.118647255,
 -0.014847621,
 -0.1748038,
 -0.31859198,
 -0.39284933,
 0.13670088,
 0.10032763,
 -0.47933918,
 0.117123984,
 -0.2462191,
 -0.4814206,
 0.07749182,
 -0.40006036,
 -0.19473492,
 0.109304614,
 -0.65909964,
 -0.080104195,
 -0.14316034,
 -0.21717468,
 -0.1279759,
 -0.3917843,
 0.07767913,
 -0.3379607,
 0.0055684857,
 -0.022800405,
 -0.8193708,
 0.24718958,
 0.15658988,
 0.18555883,
 -0.43917695,
 0.7213313,
 -0.13825467,
 -0.4140617,
 -0.58948195,
 -0.16100958,
 0.35640478,
 -0.084027134,
 -0.0026356839,
 -0.30997878,
 0.03056789,
 -0.102107614,
 0.0043818494,
 0.27329773,
 -1.0201566,
 0.1610376,
 -0.8640444,
 -0.4488918,
 0.25736737,
 0.01305

In [4]:
# 10 minutes to run
df_train["embed_1"] = df_train["sen_1"].progress_apply(lambda x: get_bert_embeddings(x))
df_train["embed_2"] = df_train["sen_2"].progress_apply(lambda x: get_bert_embeddings(x))
df_train.tail()

  0%|          | 0/5500 [00:00<?, ?it/s]

  0%|          | 0/5500 [00:00<?, ?it/s]

Unnamed: 0,PairID,Text,Score,sen_1,sen_2,embed_1,embed_2
5495,ENG-train-5495,A young boy pounding on an anvil.\r\nWoman sit...,0.0,A young boy pounding on an anvil,Woman sits on the curb talking on a cellphone,"[0.31648403, 0.25111917, -0.29609928, -0.05665...","[0.43188465, -0.19199376, 0.12269094, -0.04211..."
5496,ENG-train-5496,I love how he recognized his wife tempered his...,0.0,I love how he recognized his wife tempered his...,Torpedo Ink is Viktor s Band of Brothers the ...,"[0.094828896, 0.077845246, -0.11653494, 0.1789...","[-0.19185084, -0.17047605, -0.1986252, 0.32793..."
5497,ENG-train-5497,I actually read a chapter or two beyond that p...,0.0,I actually read a chapter or two beyond that p...,Lets say she s a blend of two types of beings,"[-0.053749584, -0.34628856, -0.21478921, -0.04...","[0.55557835, -0.033709772, 0.014751155, 0.0558..."
5498,ENG-train-5498,A boy gives being in the snow two thumbs up.\r...,0.0,A boy gives being in the snow two thumbs up,A satisfied cat is perched beside a crystal l...,"[0.073898986, -0.11729693, 0.30308354, -0.0264...","[0.31168067, -0.07857962, 0.09063501, 0.180331..."
5499,ENG-train-5499,Perhaps it is strange to think about sex const...,0.0,Perhaps it is strange to think about sex const...,Few people know how to shoot pool these days,"[-0.005585519, 0.40072262, -0.19272114, -0.050...","[0.32973552, 0.21052323, 0.02112219, 0.0848861..."


In [5]:
# Use cosine similarity between the vectors as a score
df_train["Prediction"] = df_train.apply(lambda x: 1 - spatial.distance.cosine(x["embed_1"], x["embed_2"]), axis=1)
display(df_train.head())

print("Cosine distance, Spearman Correlation:", round(spearmanr(df_train["Score"], df_train["Prediction"])[0],3))

Unnamed: 0,PairID,Text,Score,sen_1,sen_2,embed_1,embed_2,Prediction
0,ENG-train-0000,"It that happens, just pull the plug.\r\nif tha...",1.0,It that happens just pull the plug,if that ever happens just pull the plug,"[0.19992355, -0.19264238, 0.35428113, -0.16218...","[0.22918944, -0.13682392, 0.36472526, -0.46745...",0.832529
1,ENG-train-0001,A black dog running through water.\r\nA black ...,1.0,A black dog running through water,A black dog is running through some water,"[0.08456387, -0.16079591, -0.5148684, 0.043007...","[0.30121833, -0.033289216, -0.3586976, 0.11303...",0.887316
2,ENG-train-0002,I've been searchingthe entire abbey for you.\r...,1.0,I ve been searchingthe entire abbey for you,I m looking for you all over the abbey,"[0.06486244, 0.15821557, 0.18189488, -0.127942...","[0.2661249, 0.013692539, 0.23186632, -0.183504...",0.7474
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,If he s good looking and a good personality h...,"[-0.04202984, -0.43217644, 0.011997626, 0.1277...","[0.36093086, -0.32214803, 0.21463105, 0.016391...",0.931207
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,She does not hate you she is just annoyed with...,She doesn t hate you she is just annoyed,"[0.22149947, -0.24875253, -0.032242805, -0.181...","[0.40925542, -0.186313, 0.049004916, -0.057169...",0.945836


Cosine distance, Spearman Correlation: 0.596
