## Quora Question Pairs

This notebook predicts whether two questions have the same meaning.

### Load libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
# One-time setup: text_preprocess below reads NLTK's English stopword list.
# If that corpus is not installed yet, run `import nltk` and then uncomment:
# nltk.download()

Define a function to pre-process a question sentence.

In [3]:
# Per-process cache so the stopword corpus is read from disk only once,
# not once per question.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it once."""
    if 'english' not in _STOPWORD_CACHE:
        # Imported lazily so nltk's corpus reader is only needed when
        # stopword removal is actually requested.
        from nltk.corpus import stopwords
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def text_preprocess(question, tokenizer, remove_stopwords=True, stemming=True):
    """Turn one question into a list of normalized tokens.

    Parameters
    ----------
    question : object
        The question text. Non-string values are converted with ``str``;
        ``None`` and float NaN are treated as "no text".
    tokenizer : object
        Anything exposing ``tokenize(str) -> list of str``.
    remove_stopwords : bool, default True
        Lower-case the tokens and drop English stopwords.
    stemming : bool, default True
        Apply the Porter stemmer to every remaining token.

    Returns
    -------
    list of str
        The processed tokens; an empty list for a missing question.
    """
    # A missing value would otherwise be stringified to 'nan' (or 'None')
    # and end up as a spurious token.
    if question is None or (isinstance(question, float) and question != question):
        return []

    # tokenize sentence
    words = tokenizer.tokenize(str(question))

    # remove stopwords (tokens are lower-cased as part of this step)
    if remove_stopwords:
        stop_set = _english_stopwords()
        lowered = (word.lower() for word in words)
        words = [word for word in lowered if word not in stop_set]

    # stemming
    if stemming:
        from nltk.stem import PorterStemmer
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    return words

Example

In [4]:
from nltk.tokenize import RegexpTokenizer, WordPunctTokenizer

# Raw string: in an ordinary literal '\w' is an invalid escape sequence that
# newer Python versions warn about; the pattern itself is unchanged.
reg_tknzr = RegexpTokenizer(r'\w+')
word_tknzr = WordPunctTokenizer()

# Same sentence through both tokenizers, to compare how punctuation is handled.
example_str = 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'
print(text_preprocess(example_str, word_tknzr))
print(text_preprocess(example_str, reg_tknzr))

['find', 'remaind', '[', 'math', ']', '23', '^{', '24', '}[/', 'math', ']', 'divid', '24', ',', '23', '?']
['find', 'remaind', 'math', '23', '24', 'math', 'divid', '24', '23']


### Load data

In [5]:
# Read the question pairs; the path is relative to the current working directory.
data = pd.read_csv('data/train.csv')

In [6]:
# Inspect the column types pandas inferred for the loaded frame.
data.dtypes

id               int64
qid1             int64
qid2             int64
question1       object
question2       object
is_duplicate     int64
dtype: object

In [7]:
# Preview the first five raw rows.
data.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [8]:
# Number of rows (question pairs) in the loaded frame.
nquestions = len(data)
nquestions

300000

Pre-process each question.

In [9]:
import multiprocessing
from multiprocessing import Pool

# Show the CPU count reported by multiprocessing, as a guide for sizing worker pools.
print(multiprocessing.cpu_count())

8


In [10]:
from itertools import repeat

# Tokenize both question columns with one shared pool of 4 worker processes
# instead of starting a fresh pool for each column.
with Pool(4) as p:
    for source_col, token_col in (('question1', 'q1_tokens'), ('question2', 'q2_tokens')):
        # repeat() pairs every question with the same tokenizer; zip() stops at
        # the end of the column, so no separately tracked row count is needed.
        question_tokens = p.starmap(text_preprocess, zip(data[source_col].values, repeat(word_tknzr)))
        data[token_col] = question_tokens

In [11]:
# Preview the frame again, now including the q1_tokens / q2_tokens columns.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_tokens,q2_tokens
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[step, step, guid, invest, share, market, indi...","[step, step, guid, invest, share, market, ?]"
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[stori, kohinoor, (, koh, -, -, noor, ), diamo...","[would, happen, indian, govern, stole, kohinoo..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[increas, speed, internet, connect, use, vpn, ?]","[internet, speed, increas, hack, dn, ?]"
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[mental, lone, ?, solv, ?]","[find, remaind, [, math, ], 23, ^{, 24, }[/, m..."
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[one, dissolv, water, quikli, sugar, ,, salt, ...","[fish, would, surviv, salt, water, ?]"


Combine all the tokenized questions into a single corpus and train a word2vec model on it.

In [12]:
# One flat array of token lists: every question1 entry followed by every question2 entry.
sentences_w2v = np.concatenate([data[col].to_numpy() for col in ('q1_tokens', 'q2_tokens')])

In [13]:
from gensim.models import Word2Vec

# Train word embeddings on the combined token lists with 6 worker threads.
# NOTE(review): min_count=1 presumably keeps tokens that occur only once — confirm against the gensim docs.
model = Word2Vec(sentences_w2v, min_count=1, workers=6)

In [14]:
# Persist the trained embeddings so later runs can load them instead of retraining.
model.save('data/word2vec.model')

In [15]:
# Reload the embeddings from disk; this rebinds `model` to the loaded copy.
model = Word2Vec.load('data/word2vec.model')

In [16]:
# Keep only what the feature code below needs: the token -> vector lookup
# and the embedding dimensionality.
word_vectors = model.wv
vector_size = model.vector_size
# Drop the name for the full model; only the two objects above are used from here on.
del model

Each word is represented by a vector of length 100. A question is represented by the element-wise sum of the vectors of its words.

For each question pair, the two question vectors are concatenated into a vector of length 200, which is the final feature vector used for classification.

In [17]:
def question2vector(question_tokens, vectors=None, size=None):
    """Represent a tokenized question as the element-wise sum of its word vectors.

    Parameters
    ----------
    question_tokens : iterable of str
        Tokens of one question.
    vectors : mapping, optional
        Token -> vector lookup supporting ``in`` and ``[]``. Defaults to the
        notebook-level ``word_vectors``.
    size : int, optional
        Length of the returned vector. Defaults to the notebook-level
        ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Array of length ``size``; all zeros when no token is found in
        ``vectors`` (including an empty question).
    """
    # Fall back to the notebook globals at call time, so existing one-argument
    # calls keep working unchanged.
    if vectors is None:
        vectors = word_vectors
    if size is None:
        size = vector_size

    vector = np.zeros(size)
    for token in question_tokens:
        # Tokens without an embedding contribute nothing to the sum.
        if token in vectors:
            vector += vectors[token]
    return vector

In [18]:
def qpair2vector(question_pair):
    """Build the feature vector for one question pair.

    Parameters
    ----------
    question_pair : iterable
        Its first two items are the token lists of question 1 and question 2,
        e.g. a tuple, a list, or a DataFrame row.

    Returns
    -------
    numpy.ndarray
        The question-1 vector followed by the question-2 vector.
    """
    # Unpack by iteration rather than by integer key: on a row whose index
    # holds column labels, question_pair[0] relies on pandas' deprecated
    # positional fallback.
    q1_tokens, q2_tokens, *_ = question_pair
    return np.hstack((question2vector(q1_tokens), question2vector(q2_tokens)))

In [19]:
# One row of features per question pair. Iterating the two token columns
# directly avoids building a pandas Series for every row, as a row-wise
# DataFrame.apply would; the frame keeps the original row index.
qpair_vectors = pd.DataFrame(
    [qpair2vector(pair) for pair in zip(data['q1_tokens'], data['q2_tokens'])],
    index=data.index,
)

In [20]:
# Append the label as the last column; the X / y extraction further down
# depends on it being last.
qpair_vectors['is_duplicate'] = data['is_duplicate']

In [21]:
# Preview the feature columns together with the label column.
qpair_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,is_duplicate
0,-2.156157,6.6435,3.054326,3.907453,0.852425,2.267368,-1.701995,-2.221284,-0.63758,5.213196,...,-0.413092,0.6279,-0.835702,2.323166,-1.759867,-2.9835,-2.15458,3.719437,-3.788028,0
1,1.687733,10.222186,-1.882403,5.861205,-0.445831,-1.331147,-4.375886,-0.660034,0.43865,-5.562514,...,-2.523814,3.205657,-0.279237,-16.469647,2.580349,-10.018491,3.994422,1.330824,2.27318,0
2,0.178269,1.808385,6.385694,6.790597,-2.070033,-0.507843,0.109221,1.580514,-0.253203,0.754769,...,-0.747845,-3.370323,2.483994,-5.634591,-0.134453,5.830806,-3.231422,0.276338,-1.593215,0
3,0.46288,-0.37331,0.579856,-0.331945,3.005991,2.953964,2.868057,1.087494,0.571178,-0.829834,...,-3.385631,-7.298527,-17.116871,1.785401,17.851413,-13.004873,-7.684263,11.530005,4.63031,0
4,-0.616648,0.357837,0.408068,3.266895,0.307053,4.126242,-4.163109,-1.199611,6.447889,-5.604953,...,-7.234048,1.582192,1.020403,-4.499362,-0.646023,2.78196,-2.008184,3.021362,2.155658,0


Get numerical features and labels of the data.

In [22]:
# Every column except the last is a feature; the last column is the label.
X, y = (
    qpair_vectors.iloc[:, :-1].to_numpy(),
    qpair_vectors.iloc[:, -1].to_numpy(),
)

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the pairs for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

Train an XGBoost classifier and evaluate it on the test data.

In [24]:
from xgboost import XGBClassifier

# Hyper-parameters that differ from the library defaults.
parameters = dict(max_depth=12, n_estimators=150, nthread=6)
xgb_clf = XGBClassifier(**parameters)
# Last expression of the cell: display the configured estimator.
xgb_clf



XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=12,
       min_child_weight=1, missing=None, n_estimators=150, nthread=6,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [25]:
# Fit the classifier on the training split.
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=12,
       min_child_weight=1, missing=None, n_estimators=150, nthread=6,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [26]:
from sklearn.metrics import accuracy_score
# Score the held-out split with scikit-learn's accuracy metric.
y_pred = xgb_clf.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy score on test data is ', acc_score)

# Cross-check by hand: the fraction of predictions that equal the labels.
print((y_pred == y_test).mean())

accuracy score on test data is  0.7968666666666666
0.7968666666666666


  if diff:


Save the trained model and load it again using the xgboost package's own methods.

In [27]:
# Save the underlying booster, reached here through the wrapper's private
# `_Booster` attribute, in xgboost's own model format.
xgb_clf._Booster.save_model('data/quota_question_mapping_word2vec200_xgb.model')

In [28]:
from xgboost import Booster
from sklearn.preprocessing import LabelEncoder

# Load the saved booster into a fresh Booster object.
bstr = Booster()
bstr.load_model('data/quota_question_mapping_word2vec200_xgb.model')
# Rebuild a classifier wrapper around it by filling in private attributes by hand.
xgb_clf = XGBClassifier()
xgb_clf._Booster = bstr
# NOTE(review): presumably predict() needs this fitted label encoder to map
# outputs back to class labels — confirm for the installed xgboost version.
xgb_clf._le = LabelEncoder().fit(y_train)

In [29]:
# Re-score the reloaded classifier on the same test split; the accuracy
# should match the value obtained before saving.
y_pred = xgb_clf.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy score on test data is ', acc_score)

accuracy score on test data is  0.7968666666666666


  if diff:


Alternatively, save the classifier with pickle.

In [30]:
import pickle as pkl

# Serialize the whole classifier object (not just the booster) to a pickle file.
with open('data/quota_question_mapping_word2vec200_xgb.pkl', 'wb') as f:
    pkl.dump(xgb_clf, f)

In [31]:
# Restore the classifier from the pickle file written in the previous cell.
# NOTE: unpickling can execute arbitrary code — only load files you created
# yourself or otherwise trust.
with open('data/quota_question_mapping_word2vec200_xgb.pkl', 'rb') as f:
    xgb_clf = pkl.load(f)

In [32]:
# Accuracy of the unpickled classifier on the test split, computed by hand.
np.sum(xgb_clf.predict(X_test) == y_test) / len(y_test)

  if diff:


0.7968666666666666