# NLP Features Extraction

# 5. Featurizing text data with tfidf weighted word-vectors:

In [1]:
# import required liabraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
import os 
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import spacy

In [2]:
# load data
df = pd.read_csv('train.csv')

In [3]:
# convert all questions into string
df['question1'] = df['question1'].apply(lambda x: str(x))
df['question2'] = df['question2'].apply(lambda x: str(x))

## 5.1 TF-IDF vectorization of text data:

In [4]:
# get tf-idf for all words
# merge texts
questions = list(df['question1']) + list(df['question2'])

tfidf = TfidfVectorizer()
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

In [5]:
word2tfidf

{'000': 5.204692619390966,
 '10': 5.61015772749913,
 '11': 5.61015772749913,
 '11th': 5.61015772749913,
 '14': 5.204692619390966,
 '1822': 5.61015772749913,
 '19': 5.204692619390966,
 '1952': 5.61015772749913,
 '20s': 5.61015772749913,
 '21': 5.61015772749913,
 '23': 5.61015772749913,
 '24': 5.61015772749913,
 '25': 5.61015772749913,
 '30': 5.204692619390966,
 '3768': 5.61015772749913,
 '3d': 5.204692619390966,
 '425': 5.61015772749913,
 '50': 5.61015772749913,
 '60000': 5.61015772749913,
 '60k': 5.61015772749913,
 '855': 5.61015772749913,
 'about': 4.357394759003762,
 'abstract': 5.204692619390966,
 'account': 5.204692619390966,
 'active': 5.61015772749913,
 'actually': 5.61015772749913,
 'address': 5.61015772749913,
 'aerodynamically': 5.61015772749913,
 'affect': 5.61015772749913,
 'affected': 5.61015772749913,
 'affects': 5.61015772749913,
 'after': 4.693866995624976,
 'aircraft': 5.204692619390966,
 'alive': 5.61015772749913,
 'all': 4.693866995624976,
 'already': 5.61015772749913

- After we find TF-IDF scores, we convert each question to a weighted average of word2vec vectors by these scores.
- here we use a pre-trained GLOVE model which comes free with "Spacy". https://spacy.io/usage/vectors-similarity
- It is trained on Wikipedia and therefore, it is stronger in terms of word semantics.
- In spacy we convert text to vector by considering language context.

## 5.2 Convert text to vector using spacy:

In [12]:
# load GLOVE model 
nlp = spacy.load('en_core_web_lg')

### 5.2.1 Try word-vector on a random question

In [24]:
x=nlp(df['question1'][11])
x.vector

array([ 1.0431499e+00,  1.3612869e+00, -3.4183400e+00, -3.7782669e+00,
       -1.8615010e+00,  1.3851540e+00,  6.2170732e-01,  3.0585918e+00,
       -5.3122711e+00,  4.4488840e+00,  3.5047364e+00,  3.1845810e+00,
       -3.8897929e+00,  6.6143703e-01,  3.6515746e+00, -3.3975768e+00,
        2.8977280e+00, -5.2005253e+00, -1.7853858e+00,  2.0627396e-01,
        7.6412398e-01, -5.5799484e-03,  9.6745394e-02, -4.6930485e+00,
       -1.6440490e+00, -1.1981790e+00, -1.0980819e+00, -4.7003001e-01,
       -2.2188442e+00,  3.2277062e+00,  1.3228920e+00, -2.2118580e+00,
       -5.9487885e-01,  2.2155700e+00,  2.8506119e+00,  2.5766900e-01,
       -1.0633795e+00,  1.6896591e-01,  5.3619795e+00,  2.8589251e+00,
       -1.3337719e+00,  1.1357700e+00,  4.4796991e-01, -2.9051621e+00,
        1.0272939e+00,  6.4028829e-01, -1.3912159e+00, -3.1668720e+00,
       -1.3765252e+00, -5.8780123e-02,  1.9106687e+00,  8.0875713e-01,
        6.2229997e-01, -4.4842696e+00, -2.7786410e+00, -2.4545054e+00,
      

### Check length of vector

In [11]:
len(x.vector)

300

Yes, we are getting 300 dimension vector for a question.

### 5.2.2 Convert each question to a weighted average of word2vec vectors

In [15]:
# convert each question to a weighted average of word2vec vectors
vecs1 = []
# tqdm is used to print the progrss bar
for qu1 in tqdm(list(df['question1'])):
    doc1 = nlp(qu1) 
    # 300 is the number of dimensions of vectors 
    mean_vec1 = np.zeros([len(doc1), len(doc1[0].vector)])
    for word1 in doc1:
      # word2vec
        vec1 = word1.vector
        # fetch TF-IDF score
        try:
            idf = word2tfidf[str(word1)]
        except:
            idf = 0
        # compute final vec
        mean_vec1 += vec1 * idf
    mean_vec1 = mean_vec1.mean(axis=0)
    vecs1.append(mean_vec1)
df['q1_feats_m'] = list(vecs1)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 52.74it/s]


In [16]:
# convert each question to a weighted average of word2vec vectors
vecs2 = []
for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2) 
    mean_vec2 = np.zeros([len(doc2), len(doc2[0].vector)])
    for word2 in doc2:
        # word2vec
        vec2 = word2.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word2)]
        except:
            idf = 0
        # compute final vec
        mean_vec2 += vec2 * idf
    mean_vec2 = mean_vec2.mean(axis=0)
    vecs2.append(mean_vec2)
df['q2_feats_m'] = list(vecs2)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 52.19it/s]


In [21]:
df.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_feats_m,q2_feats_m
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[-33.71826529502869, 52.07591262459755, -190.2...","[-21.711020708084106, 58.97545304894447, -176...."
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[-24.400290727615356, -14.92566442489624, 4.37...","[-60.13240456581116, 53.79257869720459, -82.05..."


In [18]:
# convert spacy vectors into dataframe
df3 = df.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
df3_q1 = pd.DataFrame(df3.q1_feats_m.values.tolist(), index= df3.index)
df3_q2 = pd.DataFrame(df3.q2_feats_m.values.tolist(), index= df3.index)

In [19]:
df3_q1.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-33.718265,52.075913,-190.222739,-6.310965,146.169675,37.858362,-50.408487,143.494773,-131.796247,-18.780596,...,99.124639,-89.743293,-95.871759,59.600663,-13.629919,45.095505,-22.753867,-141.951119,-103.940463,78.642948
1,-24.400291,-14.925664,4.378237,-35.705061,102.353249,-18.293517,39.875559,101.587643,9.124969,-19.278654,...,40.453911,13.793268,17.980357,15.9176,-18.91358,-9.426158,10.21447,-39.973645,-38.393773,4.155893


In [20]:
df3_q2.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-21.711021,58.975453,-176.973742,-12.810946,97.421693,48.931264,-54.897399,117.241863,-161.155633,-7.480803,...,73.738798,-71.788306,-88.084183,70.597865,10.619926,9.00564,-18.064971,-106.613731,-92.589156,81.796279
1,-60.132405,53.792579,-82.058676,-12.650138,61.951646,64.926366,15.032457,138.998489,34.174074,20.495804,...,56.765928,16.049368,57.089808,61.829167,-57.72912,28.564083,-57.742117,47.643656,-75.674653,54.595995


In [None]:
# storing the q1 vector and q2 vector to csv file
df3_q1.to_csv("df3_q1.csv", index = False)
df3_q2.to_csv("df3_q2.csv", index = False)

## 5.3 Store all final features to csv file (basic + advanced + nlp):

In [68]:
# load basic + adv features data
df1 = pd.read_csv("basic+adv features.csv.csv")

In [80]:
# basic extracted features
df1.head(2)

Unnamed: 0,id,is_duplicate,q1len,q2len,q1+q2_len,q1-q2_len,q1_words,q2_words,total_words,words_difference,simillar_words,simillar_words_count,word_share,first_word_same
0,0,0,66,57,123,9,14,12,26,2,"{'What', 'invest', 'to', 'guide', 'step', 'is'...",10,0.384615,1
1,1,0,51,88,139,37,8,13,21,5,"{'What', '(Koh-i-Noor)', 'the', 'Kohinoor'}",4,0.190476,1


In [62]:
# spacy vector dataframe of question1
df3_q1.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-33.718265,52.075913,-190.222739,-6.310965,146.169675,37.858362,-50.408487,143.494773,-131.796247,-18.780596,...,99.124639,-89.743293,-95.871759,59.600663,-13.629919,45.095505,-22.753867,-141.951119,-103.940463,78.642948
1,-24.400291,-14.925664,4.378237,-35.705061,102.353249,-18.293517,39.875559,101.587643,9.124969,-19.278654,...,40.453911,13.793268,17.980357,15.9176,-18.91358,-9.426158,10.21447,-39.973645,-38.393773,4.155893


In [63]:
# spacy vector dataframe of question2
df3_q2.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-21.711021,58.975453,-176.973742,-12.810946,97.421693,48.931264,-54.897399,117.241863,-161.155633,-7.480803,...,73.738798,-71.788306,-88.084183,70.597865,10.619926,9.00564,-18.064971,-106.613731,-92.589156,81.796279
1,-60.132405,53.792579,-82.058676,-12.650138,61.951646,64.926366,15.032457,138.998489,34.174074,20.495804,...,56.765928,16.049368,57.089808,61.829167,-57.72912,28.564083,-57.742117,47.643656,-75.674653,54.595995


In [None]:
# merge all dataframes
df3_q1['id']=df1['id']
df3_q2['id']=df1['id']

df2  = df3_q1.merge(df3_q2, on='id',how='left')
res  = df1.merge(df2, on='id',how='left')

In [None]:
# storing the final features to csv file
res.to_csv("all final features.csv", index = False)