## Quora Question Pairs

This notebook predicts if two questions have the similar meaning.

Load libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
# nltk.download()

Define tokenizer

In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

reg_tknzr = RegexpTokenizer('\w+')

def tokenize_sentence(sentence, tokenizer='word'):
    if tokenizer == 'word':
        return word_tokenize(str(sentence))
    if tokenizer == 'regexp':
        return reg_tknzr.tokenize(str(sentence))

example_str = 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'
print(tokenize_sentence(example_str))
print(tokenize_sentence(example_str, 'regexp'))

['Find', 'the', 'remainder', 'when', '[', 'math', ']', '23^', '{', '24', '}', '[', '/math', ']', 'is', 'divided', 'by', '24,23', '?']
['Find', 'the', 'remainder', 'when', 'math', '23', '24', 'math', 'is', 'divided', 'by', '24', '23']


In [38]:
from nltk.tokenize import WordPunctTokenizer, RegexpTokenizer

# def tokenize_sentence(sentence, tokenizer='word'):
#     if tokenizer == 'word':
#         return word_tokenize(str(sentence))
#     if tokenizer == 'regexp':
#         return reg_tknzr.tokenize(str(sentence))
    
def tokenize_sentence(sentence, tokenizer=WordPunctTokenizer()):
    return tokenizer.tokenize(sentence)

reg_tknzr = RegexpTokenizer('\w+')
word_tknzr = WordPunctTokenizer()

example_str = 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'
print(tokenize_sentence(example_str))
print(tokenize_sentence(example_str, reg_tknzr))

['Find', 'the', 'remainder', 'when', '[', 'math', ']', '23', '^{', '24', '}[/', 'math', ']', 'is', 'divided', 'by', '24', ',', '23', '?']
['Find', 'the', 'remainder', 'when', 'math', '23', '24', 'math', 'is', 'divided', 'by', '24', '23']


Define a function to pre-process questions.

In [2]:
def text_preprocess(question, tokenizer, remove_stopwords=True, stemming=True):
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    # tokenize sentence
    words = tokenizer.tokenize(str(question))
    # remove stopwords
    if remove_stopwords:
        stopwords = set(stopwords.words('english'))
        words = [x.lower() for x in words if x.lower() not in stopwords]
    # stemming
    if stemming:
        ps = PorterStemmer()
        words = [ps.stem(x) for x in words]
    return words

Example

In [3]:
from nltk.tokenize import RegexpTokenizer, WordPunctTokenizer

reg_tknzr = RegexpTokenizer('\w+')
word_tknzr = WordPunctTokenizer()

example_str = 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'
print(text_preprocess(example_str, word_tknzr))
print(text_preprocess(example_str, reg_tknzr))

['find', 'remaind', '[', 'math', ']', '23', '^{', '24', '}[/', 'math', ']', 'divid', '24', ',', '23', '?']
['find', 'remaind', 'math', '23', '24', 'math', 'divid', '24', '23']


### Load data

In [4]:
data = pd.read_csv('./train.csv')

In [5]:
data.dtypes

id               int64
qid1             int64
qid2             int64
question1       object
question2       object
is_duplicate     int64
dtype: object

In [6]:
data.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [7]:
nquestions = len(data)

In [8]:
nquestions

300000

Pre-process each question.

In [9]:
import multiprocessing
from multiprocessing import Pool

print(multiprocessing.cpu_count())

8


In [10]:
with Pool(4) as p:
    question_tokens = p.starmap(text_preprocess, zip(data['question1'].values, [word_tknzr] * nquestions))
    data['q1_tokens'] = question_tokens
    
with Pool(4) as p:
    question_tokens = p.starmap(text_preprocess, zip(data['question2'].values, [word_tknzr] * nquestions))
    data['q2_tokens'] = question_tokens

In [11]:
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_tokens,q2_tokens
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[step, step, guid, invest, share, market, indi...","[step, step, guid, invest, share, market, ?]"
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[stori, kohinoor, (, koh, -, -, noor, ), diamo...","[would, happen, indian, govern, stole, kohinoo..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[increas, speed, internet, connect, use, vpn, ?]","[internet, speed, increas, hack, dn, ?]"
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[mental, lone, ?, solv, ?]","[find, remaind, [, math, ], 23, ^{, 24, }[/, m..."
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[one, dissolv, water, quikli, sugar, ,, salt, ...","[fish, would, surviv, salt, water, ?]"


Combine all the tokenized questions together and train a word2vec model

In [12]:
sentences_w2v = np.hstack((data['q1_tokens'].values, data['q2_tokens'].values))

In [13]:
from gensim.models import Word2Vec

model = Word2Vec(sentences_w2v, min_count=1, workers=6)

In [14]:
model.save('./word2vec_model')

In [15]:
model = Word2Vec.load('./word2vec_model')

In [16]:
word_vectors = model.wv
vector_size = model.vector_size
del model

For each question, each word is a vector of length 100, use element-wise summation of each word vector to represent a question vector.

For each question pair, combine two question vectors to form a vector of length 200.

In [17]:
def question2vector(question_tokens):
    vector = np.zeros(vector_size)
    for token in question_tokens:
        if token in word_vectors:
            vector += word_vectors[token]
    return vector

In [18]:
def qpair2vector(question_pair):
    q1_tokens = question_pair[0]
    q2_tokens = question_pair[1]
    return np.hstack((question2vector(q1_tokens), question2vector(q2_tokens)))

In [62]:
def print_row(question_pair):
#     print(question_pair)
    q1_tokens = question_pair['q1_tokens']
    q2_tokens = question_pair['q2_tokens']
#     print(question_pair[0])
    print(np.hstack((question2vector(q1_tokens), question2vector(q2_tokens))).shape)

In [30]:
data[['q1_tokens', 'q2_tokens']].head().apply(qpair2vector, axis=1, result_type='expand')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,2.71449,-1.889232,6.313279,-2.514522,8.944314,-2.902041,3.089314,-5.209131,10.915428,-5.649148,...,-6.647331,-6.039927,1.730403,-3.567446,-3.031167,5.482224,0.470136,5.073454,-0.033691,5.039478
1,7.049123,-1.390078,0.920012,-0.655459,2.292866,2.158312,2.059679,-4.494858,8.220999,-0.36138,...,-12.07235,-5.861679,0.854648,1.267382,1.767869,-0.670569,6.786275,1.822016,0.677717,-6.960064
2,-2.164086,-1.286572,5.425792,0.241514,1.255902,1.259389,2.981848,-1.324762,3.025864,0.509957,...,-1.279771,-4.86266,1.645046,-4.889607,3.412172,4.415983,-2.126426,2.846926,0.580243,-0.312583
3,-1.200881,1.62828,1.452742,3.400235,0.926025,4.084927,0.196195,-1.380525,2.067565,0.287943,...,3.636922,-19.423926,5.47617,17.641724,-21.450284,-8.547889,-2.729999,0.930419,-2.607553,-7.117223
4,5.993781,6.72943,-3.04296,9.272593,-1.25082,4.452209,-2.823149,-1.734193,-4.676058,-3.729774,...,-3.296289,-0.927679,6.030814,4.330371,0.276846,2.90717,0.225902,-5.039953,-6.598514,-0.455986


In [31]:
from functools import reduce
from numpy import linalg as LA
vectors = []
for idx in range(len(train)):
    q1vec = np.zeros(100)
    for word in q1split[idx]:
        q1vec += model[word] 
    q2vec = np.zeros(100)
    for word in q2split[idx]:
        q2vec += model[word]
    vector = np.hstack((q1vec,q2vec))
    vectors.append(vector)    

In [28]:
train_vectors = pd.DataFrame.from_records(data[['q1_tokens', 'q2_tokens']].head().apply(qpair2vector, axis=1))

In [29]:
train_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,2.71449,-1.889232,6.313279,-2.514522,8.944314,-2.902041,3.089314,-5.209131,10.915428,-5.649148,...,-6.647331,-6.039927,1.730403,-3.567446,-3.031167,5.482224,0.470136,5.073454,-0.033691,5.039478
1,7.049123,-1.390078,0.920012,-0.655459,2.292866,2.158312,2.059679,-4.494858,8.220999,-0.36138,...,-12.07235,-5.861679,0.854648,1.267382,1.767869,-0.670569,6.786275,1.822016,0.677717,-6.960064
2,-2.164086,-1.286572,5.425792,0.241514,1.255902,1.259389,2.981848,-1.324762,3.025864,0.509957,...,-1.279771,-4.86266,1.645046,-4.889607,3.412172,4.415983,-2.126426,2.846926,0.580243,-0.312583
3,-1.200881,1.62828,1.452742,3.400235,0.926025,4.084927,0.196195,-1.380525,2.067565,0.287943,...,3.636922,-19.423926,5.47617,17.641724,-21.450284,-8.547889,-2.729999,0.930419,-2.607553,-7.117223
4,5.993781,6.72943,-3.04296,9.272593,-1.25082,4.452209,-2.823149,-1.734193,-4.676058,-3.729774,...,-3.296289,-0.927679,6.030814,4.330371,0.276846,2.90717,0.225902,-5.039953,-6.598514,-0.455986


In [33]:
label = train.loc[:,'is_duplicate'].reset_index(drop=True)

In [34]:
train_vectors['label'] = label

In [35]:
train_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,label
0,3.737285,-1.076791,1.692858,-7.499455,-9.055481,-4.709146,2.997658,0.653588,-2.253008,3.467384,...,-6.56596,7.983998,8.465503,1.630318,-0.577622,3.999445,5.490358,-1.736918,7.677512,0
1,0.456444,-1.556494,-0.258488,-1.483942,-0.150758,-2.063886,-0.544661,-0.528503,2.829182,2.483006,...,3.284165,7.190341,4.746468,6.300947,0.987304,-0.897003,0.883633,-0.426671,3.077626,0
2,1.444396,-1.210374,-2.42313,-3.792901,0.425228,-2.014401,3.214325,4.528892,3.701342,-6.312274,...,-5.982558,3.102681,-0.257002,5.310513,0.167242,-0.095607,-0.634812,-6.138686,6.374227,0
3,1.864265,-1.934835,3.492548,-8.790129,7.653095,-2.400476,1.638999,1.04316,3.483204,0.576606,...,-5.443393,0.158519,0.205542,6.941473,-1.402384,-11.223416,5.195816,-1.278976,5.416936,0
4,3.837334,-4.155881,-2.965749,-6.012306,8.96973,3.857528,-7.284541,5.774252,12.329093,-0.99123,...,0.17841,5.452885,-5.131496,7.208042,-0.060435,-5.2396,1.320953,4.231333,-2.743696,0


In [36]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [37]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score



In [38]:
x = train_vectors.iloc[:, 0:200]
y = train_vectors.loc[:,'label']

In [39]:
xgb_model = XGBClassifier()
parameters = {'max_depth':12, 'n_estimators': 150, 'nthread': 8}
xgb_model.set_params(**parameters)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=12,
       min_child_weight=1, missing=None, n_estimators=150, nthread=8,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [41]:
xgb_model.fit(x, y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=12,
       min_child_weight=1, missing=None, n_estimators=150, nthread=8,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [66]:
np.sum(xgb_model.predict(x) == y.values) / len(y)

0.96839666666666668

In [62]:
xgb_model._Booster.save_model('quota_question_mapping_xgb.model')

Or save to pickle

In [74]:
import pickle as pkl

with open('xgb_model.pkl', 'wb') as f:
    pkl.dump(xgb_model, f)

In [75]:
with open('xgb_model.pkl', 'rb') as f:
    xgb_model_load_pkl = pkl.load(f)

In [76]:
np.sum(xgb_model_load_pkl.predict(x) == y.values) / len(y)

0.96839666666666668

Load model

In [72]:
from xgboost import Booster
from sklearn.preprocessing import LabelEncoder

bst = Booster()
bst.load_model('quota_question_mapping_xgb.model')
model_loaded = XGBClassifier()
model_loaded._Booster = bst
model_loaded._le = LabelEncoder().fit(y)

In [73]:
np.sum(model_loaded.predict(x) == y.values) / len(y)

0.96839666666666668

test

In [47]:
model = gensim.models.Word2Vec.load('./word2vecModel_RMStop_Lower')

In [48]:
q1 = test.loc[:, 'question1'].values
q2 = test.loc[:, 'question2'].values
q1split = [str(x) for x in q1]
q2split = [str(x) for x in q2]
q1split = [re.split('\W+', x) for x in q1split]
q2split = [re.split('\W+', x) for x in q2split]

In [49]:
q1split = [[x for x in sen if x not in stopWords] for sen in q1split]
q2split = [[x for x in sen if x not in stopWords] for sen in q2split]

In [50]:
q1split = [[x.lower() for x in sen] for sen in q1split]
q2split = [[x.lower() for x in sen] for sen in q2split]

In [51]:
q1split = [[ps.stem(x) for x in sen] for sen in q1split]
q2split = [[ps.stem(x) for x in sen] for sen in q2split]

In [54]:
vectors = []
from functools import reduce
from numpy import linalg as LA
for idx in range(len(test)):
    q1vec = np.zeros(100)
    for word in q1split[idx]:
        if word in model.wv.vocab:
            q1vec += model[word] 
    q2vec = np.zeros(100)
    for word in q2split[idx]:
        if word in model.wv.vocab:
            q2vec += model[word]
    vector = np.hstack((q1vec,q2vec))
    vectors.append(vector)

In [55]:
test_vectors = pd.DataFrame.from_records(vectors)

In [56]:
test_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,4.41996,-1.5437,0.15023,-6.12723,8.095272,-0.542514,-0.443302,5.059507,-1.878882,2.207661,...,-2.178357,-2.234044,-4.043585,-1.034116,0.925721,0.61935,-0.352615,1.390167,-3.487625,2.823298
1,5.749731,-4.656629,1.051494,-9.750124,19.239902,-0.606078,4.005198,-0.360755,-6.190962,-0.391755,...,1.300382,-3.638298,3.762069,7.266968,5.804556,-3.441204,-2.662728,-0.9223,-3.02226,9.835109
2,15.812096,-15.134778,9.85599,-9.319308,23.659766,-21.7143,13.739175,-3.099473,8.334608,8.396172,...,8.443089,-17.689333,16.891249,-2.333238,10.577844,-1.070029,4.096502,-6.346195,-3.393667,9.532553
3,-0.032283,1.154847,3.585843,-7.15704,9.037101,-4.455307,2.756896,-4.226796,0.785194,3.53327,...,6.362015,-11.998778,6.599363,4.008316,12.246902,1.58172,-4.093464,-0.041705,-4.961662,10.674281
4,-1.715576,-3.158687,0.130192,-3.463692,3.310188,-2.552888,1.07098,10.039039,2.901516,2.071235,...,6.555799,-1.293155,4.129984,1.293328,0.948797,5.200791,-1.121328,0.250134,1.477031,-3.717828


In [55]:
test_pred = xgb_model.predict(test_vectors)
predictions = [round(pred) for pred in test_pred]

In [56]:
is_duplicate = [int(x) for x in predictions]
test['is_duplicate'] = is_duplicate
submission = test.loc[:, ['id', 'is_duplicate']]
submission.to_csv('./submission.csv', index=False)

In [57]:
submission = pd.read_csv('./submission.csv')

In [58]:
submission.head(5)

Unnamed: 0,id,is_duplicate
0,300000,0
1,300001,1
2,300002,0
3,300003,0
4,300004,1
