In [1]:
import numpy as np 
import pandas as pd 

In [17]:
# Folder that holds the input data; the training set is read straight from the zipped CSV.
path = 'datasets/'
train = pd.read_csv(f'{path}train.csv.zip')

In [18]:
# Summarise the training frame: column names, dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404290 non-null  int64 
 1   qid1          404290 non-null  int64 
 2   qid2          404290 non-null  int64 
 3   question1     404289 non-null  object
 4   question2     404288 non-null  object
 5   is_duplicate  404290 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [19]:
# Cast both question columns to plain Python strings.
# str() is applied to every cell, so a missing value (NaN) becomes the text 'nan'
# and every row ends up holding a str.
for question_col in ('question1', 'question2'):
    train[question_col] = train[question_col].apply(str)

In [20]:
# Pool the text of both question columns into a single corpus (question1 rows first).
questions = train['question1'].tolist() + train['question2'].tolist()

from sklearn.feature_extraction.text import TfidfVectorizer

# lowercase=False keeps the vocabulary case-sensitive.
# The fitted vectorizer is what later cells use; the matrix returned here is only displayed.
tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit_transform(questions)

<808580x109679 sparse matrix of type '<class 'numpy.float64'>'
	with 8146555 stored elements in Compressed Sparse Row format>

In [21]:
# Lookup table: key = vocabulary term, value = that term's IDF weight (`tfidf.idf_`).
# These are corpus-level inverse-document-frequency weights, not per-document TF-IDF scores.
#
# The term list is rebuilt from `vocabulary_` (term -> column index) rather than
# `get_feature_names()`, which newer scikit-learn releases no longer provide.
# Sorting the terms by their column index keeps them aligned with `idf_`
# and preserves the same key order as before.
terms_by_column = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)
word2tfidf = dict(zip(terms_by_column, tfidf.idf_))
word2tfidf

{'00': 10.04915924690928,
 '000': 7.304591037001674,
 '0000': 13.909888957949875,
 '000000': 13.909888957949875,
 '00000000': 13.909888957949875,
 '0000000000': 13.50442384984171,
 '0000001': 13.909888957949875,
 '00000074': 13.909888957949875,
 '0000021210': 13.909888957949875,
 '00001': 13.909888957949875,
 '0001': 13.909888957949875,
 '0002': 13.909888957949875,
 '000INR': 13.909888957949875,
 '000K': 13.909888957949875,
 '000Rs': 12.811276669281765,
 '000USD': 13.909888957949875,
 '000WebHost': 13.909888957949875,
 '000ft': 13.909888957949875,
 '000rs': 13.909888957949875,
 '000s': 13.909888957949875,
 '000webhost': 13.909888957949875,
 '001': 12.811276669281765,
 '0015': 13.909888957949875,
 '0019': 12.99359822607572,
 '002': 13.21674177738993,
 '0021': 13.909888957949875,
 '0035': 13.909888957949875,
 '003SC': 13.909888957949875,
 '005': 13.909888957949875,
 '0051': 13.50442384984171,
 '007': 13.50442384984171,
 '008': 13.50442384984171,
 '008801703772104': 13.909888957949875,
 '

* Now that we have the TF-IDF scores, we convert each question into a weighted average of its word vectors, using these scores as the weights.
* We use the `pre-trained GloVe model` that comes free with [spaCy](https://spacy.io/usage/vectors-similarity).
* It is trained on Wikipedia and is therefore stronger in terms of word semantics.

In [23]:
# Load the spaCy pipeline that supplies a vector for every token.
# NOTE(review): the earlier comment here described `en_vectors_web_lg` ("over 1 million
# unique vectors"), but the model actually loaded below is `en_core_web_sm`.
# The feature frames built later in this notebook have columns 0-95, i.e. 96 values
# per question. Presumably the small model does not ship the large pretrained
# word-vector table — confirm against the spaCy model documentation before relying
# on these features as pretrained word embeddings.
import spacy

nlp = spacy.load('en_core_web_sm')

In [24]:
from tqdm import tqdm

# Build one IDF-weighted vector per `question1` entry.
#
# For every token, its spaCy vector is scaled by the token's IDF weight and the
# scaled vectors are added up. Tokens that are not in `word2tfidf` get weight 0
# and therefore contribute nothing.
#
# Despite the `mean_vec1` name, the result is the weighted *sum* of the token
# vectors: the previous (n_tokens, dim) accumulator received the same increment in
# every row, so averaging over rows simply returned that sum. A 1-D accumulator
# produces the same vector (up to floating-point rounding) without the extra memory.
vecs1 = []
for qu1 in tqdm(list(train['question1'])):
    doc1 = nlp(qu1)
    # float64 accumulator sized from the first token's vector
    mean_vec1 = np.zeros(len(doc1[0].vector))
    for word1 in doc1:
        vec1 = word1.vector
        # dict.get replaces a bare `except:`, which would also have hidden
        # unrelated errors (and interrupts) raised during the lookup
        idf = word2tfidf.get(str(word1), 0)
        mean_vec1 += vec1 * idf
    vecs1.append(mean_vec1)

# new column: one vector per row
train['q1_feats_m'] = vecs1

100%|██████████| 404290/404290 [1:05:27<00:00, 102.93it/s]


In [25]:
# Build one IDF-weighted vector per `question2` entry.
#
# For every token, its spaCy vector is scaled by the token's IDF weight and the
# scaled vectors are added up. Tokens that are not in `word2tfidf` get weight 0.
#
# Fix: the accumulator used to be sized with `len(doc1)`, a leftover variable from
# the question1 cell. That made this cell depend on stale kernel state (NameError on
# a fresh kernel if the question1 cell has not run). It now depends only on `doc2`.
#
# Despite the `mean_vec2` name, the result is the weighted *sum* of the token
# vectors: every row of the old 2-D accumulator received the same increment, so the
# row-wise mean returned that sum. A 1-D accumulator gives the same vector
# (up to floating-point rounding).
vecs2 = []
for qu2 in tqdm(list(train['question2'])):
    doc2 = nlp(qu2)
    # float64 accumulator sized from the first token's vector
    mean_vec2 = np.zeros(len(doc2[0].vector))
    for word2 in doc2:
        vec2 = word2.vector
        # dict.get replaces a bare `except:`, which would also have hidden
        # unrelated errors (and interrupts) raised during the lookup
        idf = word2tfidf.get(str(word2), 0)
        mean_vec2 += vec2 * idf
    vecs2.append(mean_vec2)

# new column: one vector per row
train['q2_feats_m'] = vecs2

100%|██████████| 404290/404290 [1:03:07<00:00, 106.75it/s]


In [27]:
# Preview the first rows, now including the q1_feats_m / q2_feats_m vector columns.
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_feats_m,q2_feats_m
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[-6.179506778717041, 37.45073118805885, -67.92...","[-14.616980731487274, 59.75548753142357, -53.2..."
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[9.236667931079865, -80.37141644954681, -45.78...","[-3.5657422859221697, -16.844570636749268, -13..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[97.54683184623718, 22.97219370305538, -39.558...","[156.8336295336485, 59.99189615249634, -8.4143..."
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[57.58697843551636, -22.017089188098907, -4.59...","[41.47243919968605, 56.71731689572334, 31.5306..."
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[83.1857842206955, -40.50698482990265, -83.403...","[-14.446974992752075, -4.33825546503067, -70.1..."


In [28]:
#save vector form
# Writes the full frame (original columns plus the two vector columns) without the index.
# NOTE(review): q1_feats_m / q2_feats_m hold array objects, so the CSV presumably stores
# them as text — verify they can be parsed back before relying on this file.

train.to_csv('datasets/3-train_vectors.csv',index=False)

In [29]:
# Drop the question ids, the raw text and the label; every other column of `train` is kept.
df3 = train.drop(columns=['qid1', 'qid2', 'question1', 'question2', 'is_duplicate'])

In [30]:
# Preview the first two rows of the reduced frame.
df3.head(2)

Unnamed: 0,id,q1_feats_m,q2_feats_m
0,0,"[-6.179506778717041, 37.45073118805885, -67.92...","[-14.616980731487274, 59.75548753142357, -53.2..."
1,1,"[9.236667931079865, -80.37141644954681, -45.78...","[-3.5657422859221697, -16.844570636749268, -13..."


---------
Next, go to the notebook `4-Extracting_final_train_dataset`.

In [31]:
# Question 1: expand each row's IDF-weighted vector into one numeric column per
# dimension (the previews show columns 0-95, i.e. 96 dimensions).
df3_q1 = pd.DataFrame(df3['q1_feats_m'].tolist(), index=df3.index)

# Question 2: same expansion.
df3_q2 = pd.DataFrame(df3['q2_feats_m'].tolist(), index=df3.index)

In [34]:
# Preview the expanded question1 vectors (one column per dimension).
df3_q1.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,-6.179507,37.450731,-67.929894,32.224274,143.348826,135.374574,17.865208,54.562352,81.618936,232.909839,...,-71.834689,-60.222858,-22.026407,103.33672,-68.477445,-54.976584,-67.802663,116.269999,60.515897,-12.245916
1,9.236668,-80.371416,-45.785907,78.291656,183.568221,100.894077,74.344804,48.360802,127.297421,112.987302,...,-32.130515,-98.080325,19.11379,-20.507508,-76.981011,82.665075,41.085582,129.377781,115.868467,4.383543


In [35]:
# Preview the expanded question2 vectors (one column per dimension).
df3_q2.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,-14.616981,59.755488,-53.263745,19.514497,113.916473,101.657056,8.561499,66.232769,32.888127,210.812733,...,-72.266625,-37.072086,-31.14273,94.064854,-45.053242,-34.155221,-76.548099,99.282776,50.791731,-17.566246
1,-3.565742,-16.844571,-130.911785,0.320254,79.350278,23.562028,79.124551,84.119839,128.684135,279.539877,...,6.193171,-65.084229,-15.654534,-3.475828,26.999802,170.172613,-57.038953,194.269546,128.207803,55.490061


In [36]:
#save vector form q1
# One column per vector dimension, written without the index.
df3_q1.to_csv('datasets/3.1-train_tfidf_weighted_word2vec_96_Q1.csv',index=False)

In [37]:
#save vector form q2
# One column per vector dimension, written without the index.
# NOTE(review): this file name says "95" while the question1 file says "96"; the frame
# previews show columns 0-95 (96 values). The name is left unchanged in case another
# notebook reads this exact path — confirm before renaming.
df3_q2.to_csv('datasets/3.2-train_tfidf_weighted_word2vec_95_Q2.csv',index=False)

Continue with the fourth notebook, which extracts the final train dataset.