## Importing Packages

In [1]:
import numpy as np
import pandas as pd

from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec
from nltk.stem import WordNetLemmatizer
from gensim.models import FastText

from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

## Loading the Dataset

In [2]:
data_ACL = pd.read_csv("Data/ACL/train.csv")
data_arxiv = pd.read_csv("Data/arXiv/train.csv")

df = pd.concat([data_ACL,data_arxiv],ignore_index=True)

## Preprocessing the Data

In [3]:
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = simple_preprocess(remove_stopwords(text))
    text = [lemmatizer.lemmatize(word) for word in text]
    return text

In [4]:
for i in df:
    df[i] = df[i].apply(preprocess)

In [5]:
sentences1 = df['sentence1'].values.tolist()
sentences2 = df['sentence2'].values.tolist()

sentences = sentences1 + sentences2

## Feature Representation

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
test = [" ".join(sen) for sen in sentences1[:1000]]
test2 =  [" ".join(sen) for sen in sentences2[:1000]]
tests = test + test2

In [8]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(tests)

In [9]:
sample_tfidf = tfidf_matrix.toarray()
sample_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Word2Vec - CBoW

The CBOW model learns the embedding by predicting the current word based on its context. CBOW is faster and has better representations for more frequent words.

In [10]:
num_features = 300  
min_word_count = 10
num_workers = 4    
context = 10

In [11]:
# cbow = Word2Vec(sentences, workers=num_workers, vector_size=num_features, min_count=min_word_count, window=context)
cbow = Word2Vec.load("cbow.model")

In [12]:
vocab = list(cbow.wv.index_to_key)

def get_mean_vector(model, words):
    words = [word for word in words if word in vocab]
    if len(words) >= 1:
        return np.mean(model.wv[words], axis=0)
    else:
        return []

In [13]:
cbow.wv.most_similar(cbow.wv['neural'])

[('neural', 1.0),
 ('learning', 0.5339397192001343),
 ('deep', 0.5212929844856262),
 ('vision', 0.49717044830322266),
 ('recognition', 0.49365249276161194),
 ('classification', 0.4671529531478882),
 ('cnns', 0.4524872899055481),
 ('task', 0.4393136203289032),
 ('nlp', 0.4372183382511139),
 ('cnn', 0.4124053120613098)]

In [14]:
cbow_score = []
for i in range(10):
    v1 = get_mean_vector(cbow, sentences1[i])
    v2 = get_mean_vector(cbow, sentences2[i])

    cbow_score.extend(cbow.wv.cosine_similarities(v1,[v2]))

print(f"Average Cosine Similarity for CBoW: {np.mean(cbow_score)}")

Average Cosine Similarity for CBoW: 0.8438852429389954


In [15]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def tsne_plot(model):
    labels = []
    tokens = []
    for word in vocab[:50]:
            tokens.append(model.wv[word])
            labels.append(word)
    
    tsne_model = TSNE(perplexity=7, n_components=2, init='pca',      n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(8,6)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [69]:
# tsne_plot(cbow)

### Word2Vec - SkipGram

 The skip-gram model learns by predicting the surrounding words given a current word. Skipgram works well with small amount of data and is found to represent rare words well.

 Since the corpus is smaller and contains unique scientific words, Skipgram performs slightly better than CBoW.

In [17]:
# sg = Word2Vec(sentences, sg=1, workers=num_workers, vector_size=num_features, min_count=min_word_count, window=context)
sg = Word2Vec.load("sg.model")

In [18]:
sg.wv.most_similar(sg.wv['neural'])

[('neural', 1.0),
 ('deep', 0.7168455719947815),
 ('convolutional', 0.6565815210342407),
 ('network', 0.6542420387268066),
 ('vision', 0.6304519772529602),
 ('recognition', 0.6088976860046387),
 ('task', 0.5937486290931702),
 ('learning', 0.5822420120239258),
 ('classification', 0.5733896493911743),
 ('supremely', 0.5633379220962524)]

In [19]:
sg_score = []
for i in range(10):
    v1 = get_mean_vector(sg, sentences1[i])
    v2 = get_mean_vector(sg, sentences2[i])

    sg_score.extend(sg.wv.cosine_similarities(v1,[v2]))

print(f"Average Cosine Similarity for Skipgram: {np.mean(sg_score)}")

Average Cosine Similarity for Skipgram: 0.8762563467025757


In [1]:
tsne_plot(sg)

NameError: name 'tsne_plot' is not defined

In [21]:
cbow.save("cbow.model")
sg.save("sg.model")

### GloVe

Word2Vec only captures the local context of words. During training, it only considers neighboring words to capture the context. GloVe considers the entire corpus and creates a large matrix that can capture the co-occurrence of words within the corpus.

In [22]:
# from gensim.scripts.glove2word2vec import glove2word2vec
# glove_file = 'glove.6B.300d.txt'
# word2vec_file = 'glove.6B.300d.txt.word2vec'
# glove2word2vec(glove_file, word2vec_file)

In [23]:
from gensim.models import KeyedVectors
file_name = "glove.6B.300d.txt.word2vec"
model = KeyedVectors.load_word2vec_format(file_name, binary=False)

In [24]:
glove_vocab = model.key_to_index

def glove_mean_vector(model, words):
    words = [word for word in words if word in glove_vocab]
    if len(words) >= 1:
        return np.mean(model[words], axis=0)
    else:
        return []

In [25]:
model.most_similar(model['neural'])

[('neural', 1.0),
 ('neuronal', 0.6541045904159546),
 ('neurons', 0.6144998073577881),
 ('cortical', 0.5799639821052551),
 ('circuitry', 0.5606817603111267),
 ('plasticity', 0.5572713017463684),
 ('pathways', 0.5520570874214172),
 ('brain', 0.5319003462791443),
 ('cognitive', 0.5172109007835388),
 ('neuron', 0.5144911408424377)]

In [26]:
glove_score = []

for i in range(10):
    v1 = glove_mean_vector(model, sentences1[i])
    v2 = glove_mean_vector(model, sentences2[i])

    glove_score.extend(model.cosine_similarities(v1,[v2]))

print(f"Average Cosine Similarity for GloVe: {np.mean(glove_score)}")

Average Cosine Similarity for GloVe: 0.8457534909248352


In [71]:
def tsne_plot_glove(model):
    labels = []
    tokens = []
    for word in vocab[:50]:
            tokens.append(model[word])
            labels.append(word)
    
    tsne_model = TSNE(perplexity=7, n_components=2, init='pca',      n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(8,6)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

# tsne_plot_glove(model)

### FastText

The working logic of FastText algorithm is similar to Word2Vec, but the biggest difference is that it also uses N-grams of words during training. While this increases the size and processing time of the model, it also gives the model the ability to predict different variations of words.

FastText provides a great advantage in obtaining vectors of even words that are not directly in its own vocabulary.

In [28]:
# fasttext = FastText(sentences, sg=1, workers=num_workers, vector_size=num_features, min_count=min_word_count, window=context)
fasttext = Word2Vec.load("fasttext.model")

In [29]:
fasttext.wv.most_similar(fasttext.wv['model'])

[('model', 0.9999999403953552),
 ('modelbased', 0.6032783389091492),
 ('modeling', 0.4718541204929352),
 ('blockmodel', 0.46743205189704895),
 ('metamodel', 0.46408745646476746),
 ('selectfrommodel', 0.4546191990375519),
 ('mpn', 0.44286206364631653),
 ('modelling', 0.4332956075668335),
 ('facescrub', 0.41293075680732727),
 ('infersent', 0.40839967131614685)]

In [30]:
fasttext_score = []

for i in range(10):
    v1 = get_mean_vector(fasttext, sentences1[i])
    v2 = get_mean_vector(fasttext, sentences2[i])

    fasttext_score.append(fasttext.wv.cosine_similarities(v1,[v2])) 

print(f"Average Cosine Similarity for FastText: {np.mean(fasttext_score)}")

Average Cosine Similarity for FastText: 0.8827802538871765


In [73]:
# tsne_plot(fasttext)

In [None]:
fasttext.save("fasttext.model")

## Evaluation

In [66]:
def identify_paraphrase(idx1, idx2, sent1, sent2, model):
    print("S1:", data_ACL['sentence1'][idx1])
    print("S2:", data_ACL['sentence2'][idx2])
    v1 = get_mean_vector(fasttext, sent1)
    v2 = get_mean_vector(fasttext, sent2)
    if model == fasttext:
        score = model.wv.cosine_similarities(v1,[v2])[0]
    elif model == model:
        score = model.cosine_similarities(v1,[v2])[0]

    if score > 0.75:
        print("\nIt is a PARAPHRASE of the first sentence.")
    else:
        print("\nIt is NOT A PARAPHRASE of the first sentence. ")
    print(f"Cosine Similarity: {score}")

In [67]:
identify_paraphrase(0, 0, sentences1[0], sentences2[0], fasttext)

S1: for all methods , the tweets were tokenized with the cmu twitter nlp tool .
S2: the tweets were tokenized and part-ofspeech tagged with the cmu ark twitter nlp tool and stanford corenlp .

It is a PARAPHRASE of the first sentence.
Cosine Similarity: 0.8875526189804077


In [68]:
identify_paraphrase(0, 1, sentences1[0], sentences2[1], model)

S1: for all methods , the tweets were tokenized with the cmu twitter nlp tool .
S2: nederhof et al , for instance , show that prefix probabilities , and therefore surprisal , can be estimated from tree adjoining grammars .

It is NOT A PARAPHRASE of the first sentence. 
Cosine Similarity: 0.4238167107105255
