In [1]:
# import libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
#from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd
import numpy as np
import re

In [2]:
# read data and preprocessings
Data = pd.read_csv('data.tsv', sep='\t', error_bad_lines=False)
Data = Data.dropna()
# drop the duplicate rows in question2
Data = Data.drop_duplicates(subset=['question2'])

# fillter the first 100 'question1' with 'is_duplicate' == 1.0
Q1 = Data[Data['is_duplicate'] == 1.0].head(100)
# keep only the 'question1' column
Q1 = Q1['question1']

Q2= Data['question2']
# test Q2 as the first 1000 'question2'
#Q2 = Data['question2'].head(1000)
Q2 = Q2.astype(str) # make sure the type is string

#Process the review column line by line to do text preprocessing
def process_review(review):
    # remove the punctuations and numbers
    review = re.sub(r'[^A-Za-z]+', ' ', review)
    # convert the review to lower case
    review = review.lower()
    # remove the stopwords
    stop_words = set(stopwords.words('english'))
    # tokenize the words
    word_tokens = word_tokenize(review)
    filtered_review = [w for w in word_tokens if not w in stop_words]
    # lemmatize the words
    lemmatizer = WordNetLemmatizer()
    lemmatized_review = [lemmatizer.lemmatize(w) for w in filtered_review]
    # return the processed review
    return lemmatized_review

# process the train and test reviews
Q1 = Q1.apply(process_review)
Q2 = Q2.apply(process_review)



  Data = pd.read_csv('data.tsv', sep='\t', error_bad_lines=False)
b'Skipping line 83032: expected 6 fields, saw 7\n'
b'Skipping line 154657: expected 6 fields, saw 7\n'
b'Skipping line 323916: expected 6 fields, saw 7\n'
  Data = pd.read_csv('data.tsv', sep='\t', error_bad_lines=False)


In [23]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 272959 entries, 0 to 363190
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            272959 non-null  object 
 1   qid1          272959 non-null  object 
 2   qid2          272959 non-null  float64
 3   question1     272959 non-null  object 
 4   question2     272959 non-null  object 
 5   is_duplicate  272959 non-null  float64
dtypes: float64(2), object(4)
memory usage: 14.6+ MB


In [3]:
# create a vocabulary
vocabulary = set()
for q in Q2:
    for w in q:
        vocabulary.add(w)

vocabulary = list(vocabulary)
len(vocabulary)

49511

## TF-IDF

In [4]:
TF = {}
DF = {}
# count the TF and DF
for word in vocabulary:
        DF[word] = 0
        for index_q, q in enumerate(Q2):
            if word in q:
                DF[word] += 1
                if word not in TF:
                    TF[word] = [(index_q, q.count(word))]
                else:
                    TF[word].append((index_q, q.count(word)))

# iterate the TF and convert the count into log scale
for word in TF:
    for i in range(len(TF[word])):
        TF[word][i] = (TF[word][i][0], np.log10(1 + TF[word][i][1]))

#convert DF into IDF
N = len(Q2)
for word in DF:
    DF[word] = N / DF[word]

# convert TF, IDF into TF-IDF
for word in TF:
    for i in range(len(TF[word])):
        TF[word][i] = (TF[word][i][0], TF[word][i][1] * DF[word])

# TF is already the inverted file
inverted_file = TF

In [5]:
#get the top 5 similar questions
def get_top5_similar_questions(query):
    ranking = {}
    for word in query:
        if word in inverted_file:
            for index_q, score in inverted_file[word]:
                if index_q not in ranking:
                    ranking[index_q] = score
                else:
                    ranking[index_q] += score
    ranking = sorted(ranking.items(), key=lambda x: x[1], reverse=True)
    return ranking[:2], ranking[:5]

In [6]:
Q1_dict = Q1.to_dict()
acc_2 = 0
acc_5 = 0
for q in Q1_dict:
    real_index = q
    test_q = Q1_dict[q] 
    top2, top5 = get_top5_similar_questions(test_q)
    if real_index in [i[0] for i in top2]:
        acc_2 += 1
    if real_index in [i[0] for i in top5]:
        acc_5 += 1
    
print('top2 accuracy: ', acc_2/100)
print('top5 accuracy: ', acc_5/100)
   

top2 accuracy:  0.22
top5 accuracy:  0.36


## sentence embedding(averaging)

In [7]:
# load the pre-trained glove word embeddings
embeddings_dict = {}
with open("glove/glove.6B.50d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = vector

In [9]:
len(embeddings_dict)

400000

In [13]:
#convert the value of Q1_dict into sentence embedding
Q1_dict_vec = {}
for q_index in Q1_dict:
    q_vec = np.zeros(50)
    for word in Q1_dict[q_index]:
        if word in embeddings_dict:
            q_vec += embeddings_dict[word]/len(Q1_dict[q])
    Q1_dict_vec[q_index] = q_vec


#convert the value of Q2_dict into sentence embedding
Q2_dict = Q2.to_dict()
Q2_dict_vec = {}
for q_index in Q2_dict:
    q_vec = np.zeros(50)
    for word in Q2_dict[q_index]:
        if word in embeddings_dict:
            q_vec += embeddings_dict[word]/len(Q2_dict[q])
    Q2_dict_vec[q_index] = q_vec

{1: array([-0.07651999, -0.40656669, -0.02555669, -0.01735666,  0.3804933 ,
         0.52929198,  0.73242668,  0.07705334, -0.10654001,  0.22238002,
         0.19640333,  0.06668266,  0.15760332,  0.23630699, -0.40978   ,
         0.07523668, -0.27992132,  0.38652966,  0.11597901, -0.62419003,
        -0.45305001,  0.23469997,  0.26793998, -0.20339999,  0.48094998,
        -0.87481002, -0.22028667,  0.42598   ,  0.78249668, -0.15155867,
         1.76064331,  0.24309   , -0.11562409, -0.52126664, -0.388485  ,
         0.19397934, -0.17605334, -0.28364933,  0.29486334,  0.046     ,
        -0.17036234,  0.11212333, -0.24511967,  0.47757998, -0.16052333,
         0.09089434,  0.160248  , -0.36126126,  0.03312267, -0.08165666]),
 3: array([-0.0443    ,  0.42972334,  0.02970267, -0.18370666, -0.18396002,
         0.33099332,  0.23058002,  0.303102  ,  0.20789999,  0.26293668,
         0.04635667, -0.061596  , -0.03999999,  0.13095   , -0.01612335,
        -0.12790567, -0.02333667,  0.463156

In [16]:
# calculate cosine similarity between 2 sentence embeddings
def cosine_similarity(vector_a, vector_b):
    dot_product = np.dot(vector_a, vector_b)
    norm_a = np.linalg.norm(vector_a)
    norm_b = np.linalg.norm(vector_b)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [17]:
# get the top 5 similar questions
def get_top5_similar_questions2(query):
    ranking = {}
    for q in Q2_dict:
        ranking[q] = cosine_similarity(query, Q2_dict[q])
    ranking = sorted(ranking.items(), key=lambda x: x[1], reverse=True)
    return ranking[:2], ranking[:5]

In [22]:
# get the accuracy
acc_2 = 0
acc_5 = 0
for q in Q1_dict:
    real_index = q
    test_q = Q1_dict[q] 
    top2, top5 = get_top5_similar_questions2(test_q)
    if real_index in [i[0] for i in top2]:
        acc_2 += 1
    if real_index in [i[0] for i in top5]:
        acc_5 += 1

print('top2 accuracy: ', acc_2/100)
print('top5 accuracy: ', acc_5/100)

  similarity = dot_product / (norm_a * norm_b)


top2 accuracy:  0.56
top5 accuracy:  0.65


## sentence embedding(based on thesis)

In [25]:
corpus = []
for Q in Q2:
    for word in Q:
        corpus.append(word)

# calculate the unigram probability of a word in the corpus
def calculate_unigram_probability(word):
    word_count = corpus.count(word)
    total_words = len(corpus)
    unigram_probability = word_count / total_words
    return unigram_probability

# create a dictionary to store the unigram probability of each word
unigram_probabilities = {}
for word in vocabulary:
    unigram_probabilities[word] = calculate_unigram_probability(word)

In [43]:
len(unigram_probabilities)

49511

In [45]:
# Q1_dict = Q1.to_dict() # in case, might need to run again
# Q2_dict = Q2.to_dict()

def sentence_embedding(word_embeddings, sentences, a, word_probabilities):
    sentence_embeddings = {}
    for index, s in sentences.items():
        vs = np.zeros(50)  # Initialize sentence embedding as zero vector
        for w in s:
            try:
                a_value = a / (a + word_probabilities[w])  # Smooth inverse frequency, SIF
                vs += a_value * word_embeddings[w] * (1/len(s)) # vs += sif * word_vector
                #vs += ((word_embeddings[w] * a)/(a + word_probabilities[w]))* (1/len(s))
            except KeyError:
                continue
        sentence_embeddings[index] = vs

    sentence_list = list(sentence_embeddings.values())
    num_sentences = len(sentence_list)
    embedding_dim = sentence_list[0].shape[0]  # Assuming all embeddings have the same dimension
    X = np.zeros((embedding_dim, num_sentences))

    for i, embedding in enumerate(sentence_list):
        X[:, i] = embedding

    # Perform singular value decomposition
    u, _, _ = np.linalg.svd(X, full_matrices=False)  #full_matrices=False ensures that only the necessary number of singular vectors is returned
    u = u[:, 0]  # Extract first singular vector

    for index, s in sentences.items():
        vs = sentence_embeddings[index]
        uuT = np.outer(u, u)  # Compute the outer product of u with itself
        vs = vs - np.dot(uuT, vs)  # Subtract the product of uuT and vs from vs
        sentence_embeddings[index] = vs

    return sentence_embeddings


In [46]:
Q1_dict_vec2 = sentence_embedding(embeddings_dict, Q1_dict, 0.5, unigram_probabilities)

In [49]:
Q2_dict_vec2 = sentence_embedding(embeddings_dict, Q2_dict, 0.5, unigram_probabilities)

In [52]:
# get the top 5 similar questions
def get_top5_similar_questions3(query):
    ranking = {}
    for q in Q2_dict_vec2:
        ranking[q] = cosine_similarity(query, Q2_dict_vec2[q])
    ranking = sorted(ranking.items(), key=lambda x: x[1], reverse=True)
    return ranking[:2], ranking[:5]

# get the accuracy
acc_2 = 0
acc_5 = 0
for q in Q1_dict_vec2:
    real_index = q
    test_q = Q1_dict_vec2[q] 
    top2, top5 = get_top5_similar_questions3(test_q)
    if real_index in [i[0] for i in top2]:
        acc_2 += 1
    if real_index in [i[0] for i in top5]:
        acc_5 += 1

print('top2 accuracy: ', acc_2/100)
print('top5 accuracy: ', acc_5/100)

  similarity = dot_product / (norm_a * norm_b)


top2 accuracy:  0.62
top5 accuracy:  0.65


In [53]:
# try different values of a
for a in [0.01, 0.1, 1, 5, 10]:
    Q1_dict_vec2 = sentence_embedding(embeddings_dict, Q1_dict, a, unigram_probabilities)
    Q2_dict_vec2 = sentence_embedding(embeddings_dict, Q2_dict, a, unigram_probabilities)
    acc_2 = 0
    acc_5 = 0
    for q in Q1_dict_vec2:
        real_index = q
        test_q = Q1_dict_vec2[q] 
        top2, top5 = get_top5_similar_questions3(test_q)
        if real_index in [i[0] for i in top2]:
            acc_2 += 1
        if real_index in [i[0] for i in top5]:
            acc_5 += 1
    print('a = ', a)
    print('top2 accuracy: ', acc_2/100)
    print('top5 accuracy: ', acc_5/100)

  similarity = dot_product / (norm_a * norm_b)


a =  0.01
top2 accuracy:  0.59
top5 accuracy:  0.66
a =  0.1
top2 accuracy:  0.62
top5 accuracy:  0.66
a =  1
top2 accuracy:  0.62
top5 accuracy:  0.65
a =  5
top2 accuracy:  0.62
top5 accuracy:  0.65
a =  10
top2 accuracy:  0.62
top5 accuracy:  0.65
