# NLP Features Extraction

# 5. Featurizing text data with tfidf weighted word-vectors:

In [1]:
# import required libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
import os 
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import spacy

In [3]:
# Load the Quora question-pairs training set.
# The path is written as a raw string: in a normal string literal the backslash
# sequences (\D, \R, \A, \M, \Q) are invalid escapes that trigger a warning and
# are slated to become errors. The resulting path value is unchanged.
# NOTE(review): this is a hardcoded absolute local path; consider a configurable data directory.
df = pd.read_csv(r'G:\DS\Revision\Applied AI\Module - Copy (4)\Quara\Quora/train.csv')

In [4]:
# Make sure both question columns hold plain Python strings
# (str() is applied to every cell, so a missing value would become the text 'nan').
for col in ('question1', 'question2'):
    df[col] = df[col].apply(str)

## 5.1 TF-IDF vectorization of text data:

In [5]:
# Fit a TF-IDF vectorizer on every question to obtain one IDF weight per vocabulary word.
# (TfidfVectorizer is already imported in the notebook's import cell.)

# merge the texts of both question columns into a single corpus
questions = list(df['question1']) + list(df['question2'])

# lowercase=False keeps the original casing of the words in the vocabulary
tfidf = TfidfVectorizer(lowercase=False)
# Only the fitted vocabulary and idf_ are used below, so fit() is sufficient;
# the document-term matrix returned by fit_transform() was never used.
tfidf.fit(questions)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() and fall back to the old name on older releases.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# dict key: word, value: IDF score of that word
word2tfidf = dict(zip(feature_names, tfidf.idf_))

- After fitting TF-IDF, we convert each question into a single vector by combining the word vectors of its tokens, weighted by the IDF scores obtained above.
- Here we use the pre-trained GloVe vectors that come free with spaCy: https://spacy.io/usage/vectors-similarity
- The model is trained on Wikipedia and is therefore strong in terms of word semantics.
- In spaCy, text is converted to vectors by taking the language context into account.

## 5.2 Convert text to vector using spacy:

In [2]:
# Load the large English spaCy pipeline, then vectorize one sample word
# and display how many dimensions its vector has.
nlp = spacy.load('en_core_web_lg')
probe = nlp('man')
probe.vector.shape[0]

300

In [7]:
# Convert every question1 text into a single IDF-weighted spaCy vector.
vecs1 = []
# Width of the spaCy word vectors, read once from the vocabulary so that an
# empty document (no tokens) no longer raises IndexError on doc1[0].
vec_dim1 = nlp.vocab.vectors_length
# tqdm is used to print the progress bar
for qu1 in tqdm(list(df['question1'])):
    doc1 = nlp(qu1)
    # NOTE: the previous version accumulated into an (n_tokens, dim) array whose
    # rows were all identical and then averaged the rows, which simply returns
    # one row. The feature is therefore the IDF-weighted *sum* of the token
    # vectors (not divided by the token count); that result is kept here,
    # without allocating the redundant 2-D array.
    mean_vec1 = np.zeros(vec_dim1)
    for word1 in doc1:
        # tokens that are not in the TF-IDF vocabulary get weight 0
        # (dict.get replaces the former bare `except:` that hid every error)
        idf = word2tfidf.get(str(word1), 0)
        # accumulate the weighted word vector
        mean_vec1 += word1.vector * idf
    vecs1.append(mean_vec1)
df['q1_feats_m'] = list(vecs1)

100%|████████████████████████████████████████████████████████████████████████| 404290/404290 [4:38:10<00:00, 24.22it/s]


In [7]:
# Convert every question2 text into a single IDF-weighted spaCy vector.
vecs2 = []
# Width of the spaCy word vectors, read once from the vocabulary so that an
# empty document (no tokens) no longer raises IndexError on doc2[0].
vec_dim2 = nlp.vocab.vectors_length
# tqdm is used to print the progress bar
for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2)
    # NOTE: the previous version accumulated into an (n_tokens, dim) array whose
    # rows were all identical and then averaged the rows, which simply returns
    # one row. The feature is therefore the IDF-weighted *sum* of the token
    # vectors (not divided by the token count); that result is kept here,
    # without allocating the redundant 2-D array.
    mean_vec2 = np.zeros(vec_dim2)
    for word2 in doc2:
        # tokens that are not in the TF-IDF vocabulary get weight 0
        # (dict.get replaces the former bare `except:` that hid every error)
        idf = word2tfidf.get(str(word2), 0)
        # accumulate the weighted word vector
        mean_vec2 += word2.vector * idf
    vecs2.append(mean_vec2)
df['q2_feats_m'] = list(vecs2)

100%|████████████████████████████████████████████████████████████████████████| 404290/404290 [2:17:29<00:00, 49.01it/s]


In [15]:
# Spread the per-question vectors out into one numeric column per dimension,
# keeping the row index of the source frame.
df3 = df.drop(columns=['qid1','qid2','question1','question2','is_duplicate'])
df3_q1 = pd.DataFrame(df3['q1_feats_m'].tolist(), index=df3.index)
df3_q2 = pd.DataFrame(df3['q2_feats_m'].tolist(), index=df3.index)

In [None]:
# Persist the question1 and question2 vector frames as CSV files (row index not written).
for frame, out_name in ((df3_q1, "df3_q1.csv"), (df3_q2, "df3_q2.csv")):
    frame.to_csv(out_name, index=False)

## 5.3 Store all final features to csv file (basic + advanced + nlp):

In [68]:
# Read the previously saved basic + advanced feature table.
basic_adv_csv = "basic+adv features.csv.csv"
df1 = pd.read_csv(basic_adv_csv)

In [80]:
# preview the first two rows of the basic + advanced feature table
df1.head(n=2)

Unnamed: 0,id,is_duplicate,q1len,q2len,q1+q2_len,q1-q2_len,q1_words,q2_words,total_words,words_difference,simillar_words,simillar_words_count,word_share,first_word_same
0,0,0,66,57,123,9,14,12,26,2,"{'What', 'invest', 'to', 'guide', 'step', 'is'...",10,0.384615,1
1,1,0,51,88,139,37,8,13,21,5,"{'What', '(Koh-i-Noor)', 'the', 'Kohinoor'}",4,0.190476,1


In [78]:
# preview the first two rows of the question1 vector frame
df3_q1.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-17.302041,65.077906,-262.939456,-20.686562,168.056481,37.079087,-62.802002,163.003715,-256.633855,-5.613477,...,120.913699,-132.868682,-111.14414,76.800932,-17.031813,18.101131,-26.370254,-169.293018,-136.936277,95.165242
1,-21.271277,42.341498,84.177521,-106.393414,88.151337,-43.99884,45.112466,109.73658,21.342126,-31.136926,...,51.731451,21.67201,50.035608,-10.191149,-110.361197,60.418664,36.341111,-174.373243,-63.843372,79.621566


In [79]:
# preview the first two rows of the question2 vector frame
df3_q2.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-5.951647,77.032716,-248.923202,-25.578063,109.20947,59.628984,-78.496038,139.783484,-279.317965,9.203245,...,89.509871,-110.533625,-100.479424,90.664803,19.507948,-15.140525,-19.345136,-129.408695,-118.284831,89.844395
1,-111.667871,108.646075,-29.244612,-92.022102,76.001256,50.877741,28.651991,189.795767,68.172862,6.925361,...,81.915579,52.01094,91.056819,-0.052428,-165.972075,103.07754,-83.073252,-68.480301,-117.666431,173.873648


In [None]:
# Attach the pair id from the basic/advanced table to both vector frames,
# then left-join everything on that id.
for vec_frame in (df3_q1, df3_q2):
    vec_frame['id'] = df1['id']

# question1 vectors joined with question2 vectors (overlapping column names get pandas' default suffixes)
df2 = pd.merge(df3_q1, df3_q2, how='left', on='id')
# basic + advanced features joined with both sets of vectors
res = pd.merge(df1, df2, how='left', on='id')

In [None]:
# Write the merged feature table to disk as CSV, leaving out the row index.
res.to_csv(path_or_buf="all final features.csv", index=False)