In [1]:
%matplotlib inline

import matplotlib
import pandas as pd
import xgboost as xgb
import numpy as np
import scipy.sparse
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from ast import literal_eval



In [2]:
# Root folder for every file this notebook reads or writes. It ends with a
# slash because paths are built by plain string concatenation.
data_dir = '/home/uasa/Desktop/data/'

# Vector files (not read by any of the cells shown here).
google_vec_filepath = '%s%s' % (data_dir, 'GoogleNews-vectors-negative300.bin')
lexvec_filepath = '%s%s' % (data_dir, 'lexvec.enwiki+newscrawl.300d.W+C.pos.vectors')

# Training CSV and the pre-generated feature table derived from it.
quora_train_filepath = '%s%s' % (data_dir, 'train.csv')
quora_features_filepath = '%s%s' % (data_dir, 'generated_features.csv')

In [3]:
# Columns of the feature table whose cells hold stringified Python lists.
cols = ['question1', 'question2', 'first_tfidf', 'second_tfidf', 'word2vec']

# One parser per list column: every column maps to ast.literal_eval.
# (Neither name is referenced by the other cells visible in this notebook.)
types = dict((column, literal_eval) for column in cols)

In [4]:
# Read the pre-generated feature table, discard every row that has a missing
# value, then keep at most the first 404000 of the remaining rows.
data = pd.read_csv(quora_features_filepath).dropna()[:404000]

In [5]:
# Preview the first rows of the freshly loaded table (list columns are still strings here).
data.head()

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,first_tfidf,second_tfidf,word2vec
0,0,0,1,2,"['what', 'is', 'the', 'step', 'by', 'step', 'g...","['what', 'is', 'the', 'step', 'by', 'step', 'g...",0,[0.011015513827648302],[],[]
1,1,1,3,4,"['what', 'is', 'the', 'story', 'of', 'kohinoor...","['what', 'would', 'happen', 'if', 'the', 'indi...",0,"[0.0015611079155654546, 0, 0]","[0, 0.0020456996460423661, 0, 0.00530077916237...","[0.12108668801077602, -0.0026093271023792399, ..."
2,2,2,5,6,"['how', 'can', 'i', 'increase', 'the', 'speed'...","['how', 'can', 'internet', 'speed', 'be', 'inc...",0,"[0, 0, 0, 0.00094116567542333314, 0.0027663976...","[0, 0.0027663976319769424, 0, 2.95835727559128...","[0.54921196737500488, 0.18742697288149188, 0.3..."
3,3,3,7,8,"['why', 'am', 'i', 'mentally', 'very', 'lonely...","['find', 'the', 'remainder', 'when', 'math2324...",0,"[0.00025379717272249169, 0, 0, 0, 0, 0.0011458...","[0, 0, 0, 0, 0, 0, 0.00010966302079046566, 0, ...","[0.1766012788280199, 0, 0.18414095180310547, 0..."
4,4,4,9,10,"['which', 'one', 'dissolve', 'in', 'water', 'q...","['which', 'fish', 'would', 'survive', 'in', 's...",0,"[0, 2.4650131616824657e-05, 0, 0.0001634787768...","[0, 0.00057215682507888921, 0.0003454791477683...","[0.44114308460482676, 0.24632687319343469, 0.2..."


In [6]:
# Row count after dropna() and the 404000-row cap.
len(data)

404000

In [7]:
def num_string_to_list(x):
    """Parse a stringified list of numbers such as ``'[0.5, 0, 2]'``.

    Parameters
    ----------
    x : str
        Text of a list literal, including the surrounding brackets.

    Returns
    -------
    list of float, or the int 0
        ``0`` is the placeholder for strings of at most two characters
        (e.g. ``'[]'``); the helper functions applied to these columns
        check for that int explicitly.

    Raises
    ------
    ValueError
        If an element cannot be converted with ``float``.
    """
    if len(x) <= 2:
        return 0
    # Build a real list: on Python 3 a bare map() is a one-shot iterator that
    # supports neither len() nor repeated iteration, both of which the
    # feature helpers need.  Splitting on ',' alone is enough because float()
    # ignores surrounding whitespace, so '[1,2]' parses as well as '[1, 2]'.
    return [float(token) for token in x[1:-1].split(',')]

def string_to_list(x):
    """Split the text of a stringified list into its raw item strings.

    The first and last characters (the brackets) are dropped and the rest is
    split on ``', '``.  Items are not parsed any further, so quote characters
    stay part of each item, and ``'[]'`` yields ``['']``.
    """
    inner = x[1:-1]
    return inner.split(', ')

In [8]:
# Replace the stringified numeric lists with the values parsed by
# num_string_to_list (the int 0 stands in for an empty list).
data['first_tfidf'] = data['first_tfidf'].apply(num_string_to_list)
data['second_tfidf'] = data['second_tfidf'].apply(num_string_to_list)
data['word2vec'] = data['word2vec'].apply(num_string_to_list)

In [9]:
# Replace the stringified token lists with lists of raw item strings.
data['question1'] = data['question1'].apply(string_to_list)
data['question2'] = data['question2'].apply(string_to_list)

In [10]:
# Remove the first four columns of `data`, selected by position, in place.
# Because the selection is positional, re-running this cell would drop four
# more columns.
data.drop(labels=data.columns[[0, 1, 2, 3]], axis='columns', inplace=True)
data.head()

Unnamed: 0,question1,question2,is_duplicate,first_tfidf,second_tfidf,word2vec
0,"['what', 'is', 'the', 'step', 'by', 'step', 'g...","['what', 'is', 'the', 'step', 'by', 'step', 'g...",0,[0.0110155138276],0,0
1,"['what', 'is', 'the', 'story', 'of', 'kohinoor...","['what', 'would', 'happen', 'if', 'the', 'indi...",0,"[0.00156110791557, 0.0, 0.0]","[0.0, 0.00204569964604, 0.0, 0.00530077916237,...","[0.121086688011, -0.00260932710238, 0.19284702..."
2,"['how', 'can', 'i', 'increase', 'the', 'speed'...","['how', 'can', 'internet', 'speed', 'be', 'inc...",0,"[0.0, 0.0, 0.0, 0.000941165675423, 0.002766397...","[0.0, 0.00276639763198, 0.0, 2.95835727559e-05...","[0.549211967375, 0.187426972881, 0.36903241766..."
3,"['why', 'am', 'i', 'mentally', 'very', 'lonely...","['find', 'the', 'remainder', 'when', 'math2324...",0,"[0.000253797172722, 0.0, 0.0, 0.0, 0.0, 0.0011...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0001096630207...","[0.176601278828, 0.0, 0.184140951803, 0.281655..."
4,"['which', 'one', 'dissolve', 'in', 'water', 'q...","['which', 'fish', 'would', 'survive', 'in', 's...",0,"[0.0, 2.46501316168e-05, 0.0, 0.00016347877681...","[0.0, 0.000572156825079, 0.000345479147768]","[0.441143084605, 0.246326873193, 0.22918478869..."


In [11]:
# Empty frame that collects the engineered feature columns assigned in the cells below.
new_features = pd.DataFrame()

In [12]:
def length(x):
    """Return ``len(x)``, or 0 when ``x`` is an int.

    An int is the placeholder ``num_string_to_list`` produces for an empty
    list, so it counts as zero elements.
    """
    return 0 if type(x) is int else len(x)

def summ(x):
    """Return the sum of ``x`` multiplied by 100, or 0 when ``x`` is an int.

    The int case covers the placeholder that stands in for an empty list.
    """
    is_placeholder = type(x) is int
    if is_placeholder:
        return 0
    total = sum(x)
    return total * 100

def maxx(x):
    """Return the largest element of ``x`` multiplied by 100, or 0 for an int.

    The int case covers the placeholder that stands in for an empty list.
    A genuinely empty sequence is not handled: ``max`` raises ``ValueError``.
    """
    if type(x) is not int:
        return max(x) * 100
    return 0

def w2v_sqr_score(x):
    """Return the sum of the squared elements of ``x``, or 0 for an int.

    The int case covers the placeholder that stands in for an empty list.
    The squares are accumulated one by one, left to right, starting from 0.
    """
    squared_total = 0
    if type(x) is not int:
        for value in x:
            squared_total += value * value
    return squared_total

def w2v_score(x):
    """Return the plain sum of the elements of ``x``, or 0 for an int.

    The int case covers the placeholder that stands in for an empty list.
    The elements are accumulated one by one, left to right, starting from 0.
    """
    # Treat the int placeholder as "nothing to add".
    values = () if type(x) is int else x
    total = 0
    for value in values:
        total += value
    return total

In [13]:
# Number of items in each question's token list.
new_features['len_q1'] = data['question1'].apply(len)
new_features['len_q2'] = data['question2'].apply(len)

In [14]:
# Number of TF-IDF values stored per question (0 for the empty placeholder).
new_features['len_tf1'] = data['first_tfidf'].apply(length)
new_features['len_tf2'] = data['second_tfidf'].apply(length)

In [15]:
# Sum of the stored TF-IDF values per question, scaled by 100 inside summ().
new_features['sum_tf1'] = data['first_tfidf'].apply(summ)
new_features['sum_tf2'] = data['second_tfidf'].apply(summ)

In [16]:
# Largest stored TF-IDF value per question, scaled by 100 inside maxx().
new_features['max_tf1'] = data['first_tfidf'].apply(maxx)
new_features['max_tf2'] = data['second_tfidf'].apply(maxx)

In [17]:
# Plain sum and sum of squares of the values in the word2vec column.
new_features['w2v_score'] = data['word2vec'].apply(w2v_score)
new_features['w2v_sqr_score'] = data['word2vec'].apply(w2v_sqr_score)

In [18]:
# Term -> weight lookup table: only file columns 1 and 2 are read, and the
# 'term' column becomes the index that get_score() searches.
tfidf = pd.read_csv(
    data_dir + 'tfidf.csv',
    usecols=[1, 2],
    index_col='term',
)
tfidf.head()

Unnamed: 0_level_0,weight
term,Unnamed: 1_level_1
0,4.8e-05
0,1e-05
0,6e-06
0,2e-06
2,2e-06


In [19]:
# English Snowball stemmer; tfidf_sum() stems every token with it before
# looking the result up in the tfidf table.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

def get_score(x):
    """Return the ``'weight'`` stored for term ``x`` in the global ``tfidf`` table.

    Terms that are not present in ``tfidf.index`` score 0.
    """
    is_known_term = x in tfidf.index
    return tfidf.loc[x, 'weight'] if is_known_term else 0

def tfidf_sum(df):
    """Sum the looked-up TF-IDF weights of both questions for every row.

    The token lists are taken from the first and second column of ``df``
    (by position).  Each token is stemmed with the module-level ``stemmer``
    and scored with ``get_score``; the scores are added up per question.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first two columns hold iterables of tokens.

    Returns
    -------
    pandas.DataFrame
        Columns ``first_tfidf_count`` and ``second_tfidf_count``, carrying
        the same index as ``df``.  Keeping the index matters because callers
        join the result back by index label; a fresh 0..n-1 index would pair
        scores with the wrong rows whenever ``df`` has gaps in its index
        (for instance after ``dropna``).
    """
    def question_score(tokens):
        # Total weight of one question's tokens.
        total = 0
        for token in tokens:
            # Only byte strings need decoding before stemming; text strings
            # (Python 3 str / Python 2 unicode) are passed through unchanged.
            if isinstance(token, bytes):
                token = token.decode('utf-8')
            total += get_score(stemmer.stem(token))
        return total

    l1 = []
    l2 = []
    for row in df.itertuples():
        # row[0] is the index label; row[1] and row[2] are the first two columns.
        l1.append(question_score(row[1]))
        l2.append(question_score(row[2]))
    return pd.DataFrame(
        {'first_tfidf_count': l1,
         'second_tfidf_count': l2},
        index=df.index,
    )

In [20]:
# Append the looked-up TF-IDF sums.  join() matches rows by index label, so
# the frame returned by tfidf_sum must carry the same index as new_features.
new_features = new_features.join(tfidf_sum(data), how='left')

  return key in self._engine
  return self._engine.get_loc(key)


In [21]:
# Preview the feature table now that the looked-up TF-IDF sums are joined in.
new_features.head()

Unnamed: 0,len_q1,len_q2,len_tf1,len_tf2,sum_tf1,sum_tf2,max_tf1,max_tf2,w2v_score,w2v_sqr_score,first_tfidf_count,second_tfidf_count
0,14,12,1,0,1.101551,0.0,1.101551,0.0,0.0,0.0,0.019313,0.008297
1,8,13,3,7,0.156111,1.206826,0.156111,0.530078,4.572842,1.379041,0.001758,0.012265
2,14,10,10,6,1.334607,0.542084,0.94177,0.27664,14.256037,5.456765,0.016111,0.008186
3,11,9,10,9,0.341648,0.040415,0.14548,0.029449,17.697194,7.083129,0.003416,0.000404
4,13,7,9,3,0.093417,0.091764,0.033918,0.057216,2.966856,0.621951,0.003176,0.003159


In [22]:
# Scale both looked-up sums by 100 and add their total as a new column.
# The scaling overwrites the columns, so running this cell twice scales twice.
new_features['first_tfidf_count'] = new_features['first_tfidf_count'].mul(100)
new_features['second_tfidf_count'] = new_features['second_tfidf_count'].mul(100)
new_features['tfidf_sum'] = new_features['first_tfidf_count'].add(
    new_features['second_tfidf_count']
)
new_features.head()

Unnamed: 0,len_q1,len_q2,len_tf1,len_tf2,sum_tf1,sum_tf2,max_tf1,max_tf2,w2v_score,w2v_sqr_score,first_tfidf_count,second_tfidf_count,tfidf_sum
0,14,12,1,0,1.101551,0.0,1.101551,0.0,0.0,0.0,1.93126,0.829709,2.76097
1,8,13,3,7,0.156111,1.206826,0.156111,0.530078,4.572842,1.379041,0.175778,1.226493,1.402271
2,14,10,10,6,1.334607,0.542084,0.94177,0.27664,14.256037,5.456765,1.611102,0.818578,2.42968
3,11,9,10,9,0.341648,0.040415,0.14548,0.029449,17.697194,7.083129,0.341648,0.040415,0.382063
4,13,7,9,3,0.093417,0.091764,0.033918,0.057216,2.966856,0.621951,0.317592,0.315938,0.63353


In [23]:
# Total of the two per-question sums taken from the stored TF-IDF lists.
# NOTE(review): the column name says "dif" but the values are added, not
# subtracted - confirm which was intended.
new_features['tfidf_dif_sum'] = new_features['sum_tf1'].add(new_features['sum_tf2'])

In [24]:
# Attach the is_duplicate column (matched by index label) and write the
# finished training table next to the other data files.
new_features.join(data['is_duplicate']).to_csv(
    path_or_buf=data_dir + 'training_with_new_features.csv'
)