## imports

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import math
from numpy.linalg import norm

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MoHaMmAd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
english_stopwords = stopwords.words('english')
len(english_stopwords)

179

## read train data

In [3]:
df_train = pd.read_csv('./data/train_data.csv')
print('f_train.shape:', df_train.shape)
df_train

f_train.shape: (37250, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,14,29,30,What are the laws to change your status from a...,What are the laws to change your status from a...,0
1,18,37,38,Why are so many Quora users posting questions ...,Why do people ask Quora questions which can be...,1
2,38,77,78,How do we prepare for UPSC?,How do I prepare for civil service?,1
3,58,117,118,I was suddenly logged off Gmail. I can't remem...,I can't remember my Gmail password or my recov...,1
4,60,121,122,How do I download content from a kickass torre...,Is Kickass Torrents trustworthy?,0
...,...,...,...,...,...,...
37245,404258,83107,222572,What is the difference between a psychologist ...,What is the difference between psychologist an...,1
37246,404276,24305,308365,What is copilotsearch.com?,What is ContenVania.com?,0
37247,404281,99131,81495,Why is Manaphy childish in Pokémon Ranger and ...,Why is Manaphy annoying in Pokemon ranger and ...,1
37248,404282,1931,16773,How does a long distance relationship work?,How are long distance relationships maintained?,1


## create total_term_count & most_total_term_count

In [4]:
total_term_count = dict()
uniqueQ2 = df_train.question2.unique()
N = len(uniqueQ2)
print('number of Q2:', N)

for i in uniqueQ2:
    for j in i.replace('?', '').replace(',', '').split(" "):
        if j not in english_stopwords:
            if j in total_term_count.keys():
                total_term_count[j] += 1
            else:
                total_term_count[j] = 1

total_term_count = dict(sorted(total_term_count.items(), key=lambda item: item[1], reverse=True))

print('number of terms:', len(total_term_count))

number of Q2: 16658
number of terms: 13288


In [5]:
most_total_term_count = dict(list(total_term_count.items())[:2000])
len(most_total_term_count)

2000

## compute TF

In [6]:
def computeDocLogTF(document): # compute Log TF of 
    terms = most_total_term_count.keys()
    tf = dict()
    docTerms = document.replace('?', '').replace(',', '').split(" ")
    wordDict = dict.fromkeys(terms, 0)
    for term in docTerms:
        if term in wordDict.keys():
            wordDict[term] += 1
    for term, count in wordDict.items():
        if count != 0:
            tf[term] = 1 + math.log(count)
        else:
            tf[term] = 0
    return tf

In [7]:
LogTF = dict()
for i in uniqueQ2:
    LogTF[df_train[df_train.question2 == i].qid2.iloc[0]] = computeDocLogTF(i)

LogTF = pd.DataFrame(LogTF)
LogTF

Unnamed: 0,30,38,78,118,122,126,146,160,174,202,...,537751,41380,537796,121048,315898,537844,394117,222572,308365,155606
What,1.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
I,0.0,0.0,1.0,1.693147,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
How,1.0,0.0,1.0,1.000000,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
best,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Why,0.0,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
viewing,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
strangest,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gallery,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Snapdragon,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## compute IDF

In [8]:
IDF = dict.fromkeys(most_total_term_count, 0)
for i in most_total_term_count:
    IDF[i] = math.log(N / float(most_total_term_count[i]))
IDF

{'What': 1.0664772140519707,
 'I': 1.196278465331048,
 'How': 1.2293860511055528,
 'best': 2.2636137713729054,
 'Why': 2.519474977213607,
 'Is': 2.8009620106478748,
 'get': 2.8547547856118474,
 'Which': 3.091282607057837,
 'Quora': 3.175296200160866,
 'Can': 3.30391357798296,
 'people': 3.3154174024644445,
 'India': 3.3354514614975606,
 'way': 3.524201732700765,
 'life': 3.600348441544335,
 'know': 3.6113982777309204,
 'money': 3.63160098504844,
 'would': 3.6568606518076776,
 'Trump': 3.659188941567269,
 'one': 3.670912405263328,
 'make': 3.7020526459990513,
 'good': 3.711832675052691,
 'like': 3.7392316492408053,
 'If': 3.7778464853685847,
 'time': 3.8570146848971887,
 'Who': 3.85985963702942,
 'learn': 3.927632252111142,
 'Donald': 3.9399023447029564,
 'new': 3.9523248647015135,
 'without': 3.9776426726858034,
 'ever': 4.010218843120416,
 'someone': 4.020202287104599,
 'notes': 4.023552373989881,
 'Instagram': 4.040473251478218,
 'account': 4.0611636447356645,
 '500': 4.1259344808934

## compute train data TF-IDF

In [9]:
def computeTFIDF(qid2):
    tf = LogTF[qid2]
    tfidf = np.array(tf.values) * np.array(list(IDF.values()))
    return tfidf

In [10]:
train_TFIDF = dict()
for i in uniqueQ2:
    qid2 = df_train[df_train.question2 == i].qid2.iloc[0]
    train_TFIDF[qid2] = computeTFIDF(qid2)

len(train_TFIDF)

train_TFIDF = pd.DataFrame(train_TFIDF)
train_TFIDF

Unnamed: 0,30,38,78,118,122,126,146,160,174,202,...,537751,41380,537796,121048,315898,537844,394117,222572,308365,155606
0,1.066477,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.066477,1.066477,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,1.066477,1.066477,1.066477,0.0
1,0.000000,0.000000,1.196278,2.025476,0.0,0.000000,1.196278,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,1.196278,0.0,0.0,0.000000,0.000000,0.000000,0.0
2,1.229386,0.000000,1.229386,1.229386,0.0,1.229386,1.229386,0.000000,0.000000,0.0,...,0.000000,0.0,1.229386,1.229386,0.0,0.0,0.000000,0.000000,0.000000,0.0
3,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
4,0.000000,2.519475,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,2.519475,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
1996,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
1997,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
1998,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0


## read test data

In [11]:
df_test = pd.read_csv('./data/test_data.csv')
df_test

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,249,499,500,How will the implementation of GST bill impact...,What exactly is GST bill and how exactly will ...,1
1,1287,2566,2567,What are some popular method to do suicide?,Is there some painless way to suicide?,1
2,2244,4464,4465,I am poor in English grammar so how should I i...,What can I do to improve my English Grammar?,1
3,2417,4804,4805,What are some good psychological hacks?,What is the coolest psychological trick?,1
4,2572,5110,5111,What is your best sexual experience?,What was your most favorite sexual experience?,1
...,...,...,...,...,...,...
975,403162,113631,2502,What places should I visit during my visit to ...,What are the places to visit for a honeymoon i...,1
976,403236,74247,6804,Is it possible to stop masturbating?,How should I stop masturbating?,1
977,403388,21982,51853,How do I learn quickly?,How can l learn faster?,1
978,403553,68327,68796,How can I trace phone calls from a cell phone?,How do I track a cell phone by number for free?,1


## vector-space model functions

In [12]:
def computeQTFIDF(question1): # compute TF-IDF of a single query
    logtf = np.array(list(computeDocLogTF(question1).values()))
    tfidf = logtf * np.array(list(IDF.values()))
    return tfidf

#==============================================================================================

def cosineSimilarity(A, B):
    A = np.array(A)
    B = np.array(B)
    cosine = np.dot(A, B)/(norm(A) * norm(B))
    return cosine

#==============================================================================================
    
def computeSimilarTFIDF(document): # final vector space model weight
    doc_TFIDF = computeQTFIDF(document)
    VSMW = dict()
    for i in train_TFIDF.items():
        VSMW[df_train[df_train.qid2 == i[0]].question2.iloc[0]] = cosineSimilarity(doc_TFIDF, i[1])
    return dict(list(dict(sorted(VSMW.items(), key=lambda item: item[1], reverse=True)).items())[:10])


In [13]:
len(IDF)

2000

## metric functions

In [14]:
def p_at_k(predict, actual = df_test, k=10):
    p_at_ks = list()
    for i in actual.qid1.unique():
        actuals = list(actual[actual.qid1 == i].question2)
        thisP = 0
        thisPredicts = list(predict[i].keys())
        for l in range(k):
            if thisPredicts[l] in actuals:
                thisP += 1
        p_at_ks.append(thisP / k)
    return sum(p_at_ks) / len(p_at_ks)


In [15]:
def meanAveragePrecision(predict, actual = df_test, k=10):
    MAPs = list()
    for i in actual.qid1.unique():
        actuals = list(actual[actual.qid1 == i].question2)
        thisP = 0
        thisPList = list()
        thisPredicts = list(predict[i].keys())
        for l in range(k):
            if thisPredicts[l] in actuals:
                thisP += 1
                thisPList.append(thisP / (l + 1))
        MAPs.append(sum(thisPList) / len(actuals))
    return sum(MAPs) / len(MAPs)
    

In [16]:
def meanReciprocalRank(predict, actual = df_test):
    mrrs = list()
    for i in actual.qid1.unique():
        actuals = list(actual[actual.qid1 == i].question2)
        thisPredicts = list(predict[i].keys())
        for l in range(10):
            f = True
            if thisPredicts[l] in actuals:
                mrrs.append(1 / (l + 1))
                f = False
                break
        if f:
            mrrs.append(0)

    return sum(mrrs) / len(mrrs)



## TEST vector-space model

In [17]:
results = dict()
for i, j in zip(df_test.qid1.unique(), df_test.question1.unique()):
    results[i] = computeSimilarTFIDF(j)
len(results)

  cosine = np.dot(A, B)/(norm(A) * norm(B))


146

In [18]:
print("P@5:", p_at_k(predict=results, k=5))
print("P@10:", p_at_k(predict=results))
print("MAP5:", meanAveragePrecision(predict=results, k=5))
print("MAP10:", meanAveragePrecision(predict=results))
print("MRR:", meanReciprocalRank(predict=results))

P@5: 0.4397260273972603
P@10: 0.29931506849315076
MAP5: 0.309167494544774
MAP10: 0.36400944393928447
MRR: 0.6573222439660796


## P@5: 0.4397260273972603<br>
## P@10: 0.29931506849315076<br>
## MAP5: 0.309167494544774<br>
## MAP10: 0.36400944393928447<br>
## MRR: 0.6573222439660796