In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
from scipy.stats import spearmanr, pearsonr
from scipy import spatial
from transformers import BartTokenizer, BartModel
import torch
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
PATH = os.path.join("..", "data", "raw")

df_train = pd.read_csv(os.path.join(PATH, 'eng_train.csv'))
df_train["Split_Text"] = df_train["Text"].apply(lambda x: x.replace("\n", " "))
df_train['Split_Text'] = df_train['Split_Text'].apply(lambda x: x.split("\r"))
df_train['Split_Text'] = df_train['Split_Text'].apply(lambda x: [re.sub(r"[^a-zA-Z0-9]+", ' ', k) for k in x])

df_train["sen_1"] = df_train["Split_Text"].apply(lambda x: x[0])
df_train["sen_2"] = df_train["Split_Text"].apply(lambda x: x[1])
df_train.drop(["Split_Text"], axis=1, inplace=True)
df_train.head()

# For testing puposes:
# df_train = df_train.iloc[[212, 23,1578, 4000, 1230, 1, 2 ,4, 4500]]

Unnamed: 0,PairID,Text,Score,sen_1,sen_2
0,ENG-train-0000,"It that happens, just pull the plug.\r\nif tha...",1.0,It that happens just pull the plug,if that ever happens just pull the plug
1,ENG-train-0001,A black dog running through water.\r\nA black ...,1.0,A black dog running through water,A black dog is running through some water
2,ENG-train-0002,I've been searchingthe entire abbey for you.\r...,1.0,I ve been searchingthe entire abbey for you,I m looking for you all over the abbey
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,If he s good looking and a good personality h...
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,She does not hate you she is just annoyed with...,She doesn t hate you she is just annoyed


# 1. Simple BART without data preprocessing

In [3]:
# Load BART tokenizer and model
# ~ 1.6GB download
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartModel.from_pretrained("facebook/bart-large-cnn")

# Example sentences
sentence_i = "This is the first sentence."

def get_bart_embeddings(sentence):
    # Tokenize and encode the sentences
    tokens_i = tokenizer(sentence, return_tensors="pt")

    # Get BART embeddings
    with torch.no_grad():
        embeddings_i = model(**tokens_i).last_hidden_state.mean(dim=1)

    # Convert embeddings to numpy arrays
    embeddings_i = embeddings_i.numpy()
    return list(embeddings_i[0])

# Get embeddings for the example sentences
embeddings_i = get_bart_embeddings(sentence_i)
embeddings_i

[0.211976,
 -0.13496348,
 -0.15495235,
 0.06478819,
 -0.145485,
 -0.043188006,
 -0.47269195,
 0.12503225,
 -0.015076393,
 -0.269134,
 0.07640724,
 -0.26634884,
 -0.4174175,
 -0.04174583,
 -0.28923738,
 -0.45047167,
 -0.340972,
 -0.124985576,
 0.1844579,
 -0.09033446,
 3.7348526,
 -0.11281986,
 0.33048213,
 -0.4693795,
 -0.34390354,
 -0.09340204,
 0.08813639,
 -0.25314727,
 -0.15623412,
 -0.22234684,
 -0.005976487,
 -0.17191175,
 0.21635824,
 -0.24469408,
 0.053853482,
 -0.02555412,
 -0.2888717,
 -0.18868993,
 0.082022265,
 -0.08427289,
 -0.23295067,
 -0.04301316,
 -0.1754452,
 -0.35685754,
 -0.1614235,
 -0.09065042,
 -0.63767666,
 0.3297166,
 -0.28048125,
 0.009388361,
 -0.47773015,
 0.3266214,
 -0.21748286,
 0.2752797,
 -0.10208789,
 -0.5947888,
 0.29443136,
 -0.081161946,
 -0.12102787,
 -0.2651376,
 -0.03741174,
 -0.5452968,
 -0.07648764,
 0.20594825,
 -0.22157034,
 -0.014294356,
 -0.39832026,
 0.24391893,
 -0.43440762,
 0.105321705,
 -0.16991358,
 -0.45593402,
 -0.38780457,
 -0.2638

In [4]:
# 30 minutes to run
df_train["embed_1"] = df_train["sen_1"].progress_apply(lambda x: get_bart_embeddings(x))
df_train["embed_2"] = df_train["sen_2"].progress_apply(lambda x: get_bart_embeddings(x))
df_train.tail()

  0%|          | 0/5500 [00:00<?, ?it/s]

  0%|          | 0/5500 [00:00<?, ?it/s]

Unnamed: 0,PairID,Text,Score,sen_1,sen_2,embed_1,embed_2
5495,ENG-train-5495,A young boy pounding on an anvil.\r\nWoman sit...,0.0,A young boy pounding on an anvil,Woman sits on the curb talking on a cellphone,"[0.09755159, -0.4822137, -0.16296929, -0.32676...","[-0.23564994, -0.1651132, -0.373366, 0.2031315..."
5496,ENG-train-5496,I love how he recognized his wife tempered his...,0.0,I love how he recognized his wife tempered his...,Torpedo Ink is Viktor s Band of Brothers the ...,"[0.06294839, 0.14637762, -0.34401453, -0.02745...","[-0.1694183, -0.31326413, 0.006729871, -0.1742..."
5497,ENG-train-5497,I actually read a chapter or two beyond that p...,0.0,I actually read a chapter or two beyond that p...,Lets say she s a blend of two types of beings,"[0.09683492, 0.39988258, -0.190917, -0.0704128...","[0.046493202, 0.18600889, -0.3208649, -0.00230..."
5498,ENG-train-5498,A boy gives being in the snow two thumbs up.\r...,0.0,A boy gives being in the snow two thumbs up,A satisfied cat is perched beside a crystal l...,"[0.14005427, -0.30437708, -0.47931513, 0.10546...","[0.104509, -0.40117535, -0.71972233, -0.122540..."
5499,ENG-train-5499,Perhaps it is strange to think about sex const...,0.0,Perhaps it is strange to think about sex const...,Few people know how to shoot pool these days,"[-0.27250853, 0.8595137, -0.1390178, -0.141094...","[0.03904477, 0.42368385, -0.16985238, 0.117221..."


In [5]:
# Use cosine similarity between the vectors as a score
df_train["Prediction"] = df_train.apply(lambda x: 1 - spatial.distance.cosine(x["embed_1"], x["embed_2"]), axis=1)
display(df_train.head())

print("Cosine distance, Spearman Correlation:", round(spearmanr(df_train["Score"], df_train["Prediction"])[0],3))

Unnamed: 0,PairID,Text,Score,sen_1,sen_2,embed_1,embed_2,Prediction
0,ENG-train-0000,"It that happens, just pull the plug.\r\nif tha...",1.0,It that happens just pull the plug,if that ever happens just pull the plug,"[-0.11991139, 0.58756006, 0.09463254, -0.03265...","[-0.20186402, 0.7138717, 0.00923355, -0.136259...",0.927955
1,ENG-train-0001,A black dog running through water.\r\nA black ...,1.0,A black dog running through water,A black dog is running through some water,"[0.07583185, -0.46020386, -0.47407135, -0.2750...","[-0.08202204, -0.42917526, -0.62733215, 0.1499...",0.943377
2,ENG-train-0002,I've been searchingthe entire abbey for you.\r...,1.0,I ve been searchingthe entire abbey for you,I m looking for you all over the abbey,"[0.2032519, -0.11944655, -0.024208633, -0.0364...","[0.06615189, 0.18228194, -0.14900401, 0.113741...",0.901551
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,If he s good looking and a good personality h...,"[0.11519185, -0.2673091, -0.41681337, -0.41138...","[0.01850178, -0.16432671, -0.46576646, -0.2831...",0.953265
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,She does not hate you she is just annoyed with...,She doesn t hate you she is just annoyed,"[0.055690456, 0.37848297, -0.25579396, 0.05901...","[0.07870173, 0.19003592, -0.32398522, -0.14323...",0.947812


Cosine distance, Spearman Correlation: 0.673


# BERT with Preprocessing

In [6]:
# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

# Function for preprocessing a single sentence
def preprocess_sentence(sentence):
    # Tokenization
    words = word_tokenize(sentence)
    
    # Lowercasing
    words = [word.lower() for word in words]
    
    # Removing stop words and punctuation
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.isalnum() and word not in stop_words]
    
    # Stemming (optional)
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    
    # Join the words back into a sentence
    processed_sentence = ' '.join(words)
    
    return processed_sentence

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dchro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dchro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
df_train["sen_1_processed"] = df_train["sen_1"].progress_apply(lambda x: preprocess_sentence(x))
df_train["sen_2_processed"] = df_train["sen_2"].progress_apply(lambda x: preprocess_sentence(x))

  0%|          | 0/5500 [00:00<?, ?it/s]

  0%|          | 0/5500 [00:00<?, ?it/s]

In [8]:
# 30 minutes to run
df_train["embed_1_processed"] = df_train["sen_1_processed"].progress_apply(lambda x: get_bart_embeddings(x))
df_train["embed_2_processed"] = df_train["sen_2_processed"].progress_apply(lambda x: get_bart_embeddings(x))
df_train.head()

  0%|          | 0/5500 [00:00<?, ?it/s]

  0%|          | 0/5500 [00:00<?, ?it/s]

Unnamed: 0,PairID,Text,Score,sen_1,sen_2,embed_1,embed_2,Prediction,sen_1_processed,sen_2_processed,embed_1_processed,embed_2_processed
0,ENG-train-0000,"It that happens, just pull the plug.\r\nif tha...",1.0,It that happens just pull the plug,if that ever happens just pull the plug,"[-0.11991139, 0.58756006, 0.09463254, -0.03265...","[-0.20186402, 0.7138717, 0.00923355, -0.136259...",0.927955,happen pull plug,ever happen pull plug,"[0.103418395, 0.25857654, -0.37682882, 0.00904...","[0.25954938, 0.4175096, -0.6953409, -0.0292761..."
1,ENG-train-0001,A black dog running through water.\r\nA black ...,1.0,A black dog running through water,A black dog is running through some water,"[0.07583185, -0.46020386, -0.47407135, -0.2750...","[-0.08202204, -0.42917526, -0.62733215, 0.1499...",0.943377,black dog run water,black dog run water,"[0.09684483, -0.4108062, -0.8034242, -0.517948...","[0.09684483, -0.4108062, -0.8034242, -0.517948..."
2,ENG-train-0002,I've been searchingthe entire abbey for you.\r...,1.0,I ve been searchingthe entire abbey for you,I m looking for you all over the abbey,"[0.2032519, -0.11944655, -0.024208633, -0.0364...","[0.06615189, 0.18228194, -0.14900401, 0.113741...",0.901551,searchingth entir abbey,look abbey,"[0.21051075, -0.1410264, -0.4149496, -0.063323...","[0.34044912, -0.2988389, -0.49687386, -0.30214..."
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,If he s good looking and a good personality h...,"[0.11519185, -0.2673091, -0.41681337, -0.41138...","[0.01850178, -0.16432671, -0.46576646, -0.2831...",0.953265,good look good person might straight like bisexu,good look good person might straight like bi,"[-0.04059158, 0.07944068, -0.61094266, -0.3878...","[-0.020841777, 0.09724368, -0.7336934, -0.6219..."
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,She does not hate you she is just annoyed with...,She doesn t hate you she is just annoyed,"[0.055690456, 0.37848297, -0.25579396, 0.05901...","[0.07870173, 0.19003592, -0.32398522, -0.14323...",0.947812,hate annoy,hate annoy,"[0.09215502, 0.076552846, -0.5966915, -0.14657...","[0.09215502, 0.076552846, -0.5966915, -0.14657..."


In [9]:
# Use cosine similarity between the vectors as a score
df_train["Prediction_processed"] = df_train.apply(lambda x: 1 - spatial.distance.cosine(x["embed_1_processed"], x["embed_2_processed"]), axis=1)
# display(df_train.head())

print("Cosine distance, Spearman Correlation:", round(spearmanr(df_train["Score"], df_train["Prediction_processed"])[0],3))

Cosine distance, Spearman Correlation: 0.572
