In [4]:
#Importing packages

import pandas as pd
import numpy as np
import gensim
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm_notebook
from nltk import word_tokenize
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize
stop_words = stopwords.words('english')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score



In [2]:
#Reading the files

train= pd.read_csv("train.csv")
test= pd.read_csv("test.csv")
sample_sub=pd.read_csv("sample_submission.csv")

In [3]:
#Size of the dataframes

print(train.shape)
print(test.shape)

(404290, 6)
(2345796, 3)


In [4]:
#Missing Values count

train.isnull().sum()

id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

In [5]:
#Remove the entire row if any column has missing values

train=train.dropna(how='any').reset_index(drop=True)
train.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [6]:
#Target Label Counts

train['is_duplicate'].value_counts()

0    255024
1    149263
Name: is_duplicate, dtype: int64

In [7]:
#Drop ID, qid1,qid2

train=train.drop(['id','qid1','qid2'],axis=1)
train.columns

Index(['question1', 'question2', 'is_duplicate'], dtype='object')

In [8]:
#Sample examples from each class

ques1= train.iloc[0,0]
ques2= train.iloc[0,1]

ques3= train.iloc[7,0]
ques4= train.iloc[7,1]

print(ques1)
print(ques2)

print(ques3)
print(ques4)

What is the step by step guide to invest in share market in india?
What is the step by step guide to invest in share market?
How can I be a good geologist?
What should I do to be a great geologist?


In [9]:
ques1=ques1.lower().split()
ques2=ques2.lower().split()
ques3=ques3.lower().split()
ques4=ques4.lower().split()


ques1= [i for i in ques1 if i not in stop_words]
ques2= [i for i in ques2 if i not in stop_words]
ques3= [i for i in ques3 if i not in stop_words]
ques4= [i for i in ques4 if i not in stop_words]

### Length based Features

In [10]:
#Sentence length based features 
train['len_q1']= train['question1'].apply(lambda x: len(str(x)))
train['len_q2']= train['question2'].apply(lambda x: len(str(x)))
train['len_diff'] = train['len_q1'] - train['len_q2']

# character length based features
train['len_char_q1'] = train.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
train['len_char_q2'] = train.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))

# word length based features
train['len_word_q1'] = train.question1.apply(lambda x: len(str(x).split()))
train['len_word_q2'] = train.question2.apply(lambda x: len(str(x).split()))

# common words in the two questions
train['common_words'] = train.apply(lambda x: 
len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)

In [39]:
#Features set 1

fs_1= ['len_q1','len_q2', 'len_char_q1','len_char_q2','len_word_q2','len_word_q2','common_words']

### FuzzyWuzzy is a library of Python which is used for string matching. Fuzzy string matching is the process of finding strings that match a given pattern. Basically it uses Levenshtein Distance to calculate the differences between sequences.

In [12]:
#Features from FuzzyWuzzy

train['fuzz_qratio'] = train.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
train['fuzz_WRatio'] = train.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
train['fuzz_partial_ratio'] = train.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
train['fuzz_partial_token_set_ratio'] = train.apply(lambda x:fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
train['fuzz_partial_token_sort_ratio'] = train.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
train['fuzz_token_set_ratio'] = train.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
train['fuzz_token_sort_ratio'] = train.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
train['fuzz_token_sort_ratio'] = train.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

In [11]:
#Pickle dumped

infile = open(filename,'rb')
train = pickle.load(infile)
infile.close()

In [12]:
train.columns

Index(['question1', 'question2', 'is_duplicate', 'len_q1', 'len_q2',
       'len_diff', 'len_char_q1', 'len_char_q2', 'len_word_q1', 'len_word_q2',
       'common_words', 'fuzz_qratio', 'fuzz_WRatio', 'fuzz_partial_ratio',
       'fuzz_partial_token_set_ratio', 'fuzz_partial_token_sort_ratio',
       'fuzz_token_set_ratio', 'fuzz_token_sort_ratio'],
      dtype='object')

In [38]:
#Feature set 2

fs_2= ['fuzz_qratio', 'fuzz_WRatio', 'fuzz_partial_ratio',
       'fuzz_partial_token_set_ratio', 'fuzz_partial_token_sort_ratio',
       'fuzz_token_set_ratio', 'fuzz_token_sort_ratio']

### Tf-IDF Sparse Matrix of the Corpus

In [14]:
#Each word in Ques1 and Ques2 assigned a TF-IDF Score separately

from sklearn.feature_extraction.text import TfidfVectorizer
tfv_q1 = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', 
token_pattern=r'w{1,}',ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,stop_words='english')

tfv_q2 = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', 
token_pattern=r'w{1,}',ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,stop_words='english')

In [15]:
q1_tfidf = tfv_q1.fit_transform(train.question1.fillna(""))
q2_tfidf = tfv_q2.fit_transform(train.question2.fillna(""))

  'stop_words.' % sorted(inconsistent))


In [17]:
#Feature set 3.1

from scipy import sparse
# obtain features by stacking the sparse matrices together
fs3_1 = sparse.hstack((q1_tfidf, q2_tfidf))

In [18]:
tfv = TfidfVectorizer(min_df=3, 
                      max_features=None, 
                      strip_accents='unicode', 
                      analyzer='word', 
                      token_pattern=r'w{1,}',
                      ngram_range=(1, 2), 
                      use_idf=1, 
                      smooth_idf=1, 
                      sublinear_tf=1,
                      stop_words='english')

In [19]:
#Feature set 3.2
# combine questions and calculate tf-idf
q1q2 = train.question1.fillna("") 
q1q2 += " " + train.question2.fillna("")
fs3_2 = tfv.fit_transform(q1q2)

### Pretrained Model to train Word2Vec imported from Google News

In [20]:
#Using a pretrained model for word2vec

import gensim
model = gensim.models.KeyedVectors.load_word2vec_format(
'GoogleNews-vectors-negative300.bin.gz', binary=True)

In [21]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jyoti.prakash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jyoti.prakash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
from nltk.corpus import stopwords
from nltk import word_tokenize
stop_words = set(stopwords.words('english'))
 
#Summing all the word2vecs of a question to obtain sent2vec    
    
def sent2vec(s,model):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    if len(M) > 0:
        v = M.sum(axis=0)
        return v / np.sqrt((v ** 2).sum())
    else:
        return np.zeros(300)

In [23]:
#word2vec for q1 and q2

w2v_q1 = np.array([sent2vec(q, model) 
                   for q in train.question1])

w2v_q2 = np.array([sent2vec(q, model) 
                   for q in train.question2])

In [24]:
#Word2vec distances

from scipy.spatial.distance import cosine, cityblock,jaccard, canberra, euclidean, minkowski, braycurtis

train['cosine_distance'] = [cosine(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
train['cityblock_distance'] = [cityblock(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
train['jaccard_distance'] = [jaccard(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
train['canberra_distance'] = [canberra(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
train['euclidean_distance'] = [euclidean(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
train['minkowski_distance'] = [minkowski(x,y,3) for (x,y) in zip(w2v_q1, w2v_q2)]
train['braycurtis_distance'] = [braycurtis(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]

  dist = 1.0 - uv / np.sqrt(uu * vv)
  return l1_diff.sum() / l1_sum.sum()


In [40]:
#Feature set 4.1

fs4_1 = ['cosine_distance', 'cityblock_distance', 
         'jaccard_distance', 'canberra_distance', 
         'euclidean_distance', 'minkowski_distance',
         'braycurtis_distance']

In [26]:
w2v = np.hstack((w2v_q1, w2v_q2))

### WMD use word embeddings to calculate the distance so that it can calculate even though there is no common word. The assumption is that similar words should have similar vectors.

In [27]:
#word movers distance

def wmd(s1, s2, model):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words('english')
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return model.wmdistance(s1, s2)

In [28]:
train['wmd'] = train.apply(lambda x: wmd(x['question1'], x['question2'], model), axis=1)
model.init_sims(replace=True)  #Normalizes the vectors in word2vec class
train['norm_wmd'] = train.apply(lambda x: wmd(x['question1'], x['question2'], model), axis=1)

In [41]:
#feature set 4.2

fs4_2 = ['wmd', 'norm_wmd']

In [30]:
#Pickle dump

import pickle
filename = 'trainv2'
infile = open(filename,'rb')
train = pickle.load(infile)
infile.close()

In [31]:
train.columns

Index(['question1', 'question2', 'is_duplicate', 'len_q1', 'len_q2',
       'len_diff', 'len_char_q1', 'len_char_q2', 'len_word_q1', 'len_word_q2',
       'common_words', 'fuzz_qratio', 'fuzz_WRatio', 'fuzz_partial_ratio',
       'fuzz_partial_token_set_ratio', 'fuzz_partial_token_sort_ratio',
       'fuzz_token_set_ratio', 'fuzz_token_sort_ratio', 'cosine_distance',
       'cityblock_distance', 'jaccard_distance', 'canberra_distance',
       'euclidean_distance', 'minkowski_distance', 'braycurtis_distance',
       'wmd', 'norm_wmd'],
      dtype='object')

In [32]:
#Cleaning up the memory

import gc
import psutil
#del([tfv_q1, tfv_q2, tfv, q1q2,q1_tfidf, q2_tfidf])
del([w2v_q1, w2v_q2])
del([model])
gc.collect()
psutil.virtual_memory()

svmem(total=17057210368, available=10876678144, percent=36.2, used=6180532224, free=10876678144)

### Modelling Techniques

In [33]:
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [42]:
#Normalizing the data

scaler = StandardScaler()
y = train.is_duplicate.values
y = y.astype('float32').reshape(-1, 1)
X = train[fs_1+fs_2+fs4_1+fs4_2]
X = X.replace([np.inf, -np.inf], np.nan).fillna(0).values
X = scaler.fit_transform(X)

In [43]:
#Preparing the validation set

np.random.seed(42)
n_all, _ = y.shape
idx = np.arange(n_all)
np.random.shuffle(idx)
n_split = n_all // 10
idx_val = idx[:n_split]
idx_train = idx[n_split:]
x_train = X[idx_train]
y_train = np.ravel(y[idx_train])
x_val = X[idx_val]
y_val = np.ravel(y[idx_val])

In [44]:
#Logistic Regression

logres = linear_model.LogisticRegression(C=0.1, solver='sag', max_iter=1000)
logres.fit(x_train, y_train)
lr_preds = logres.predict(x_val)
log_res_accuracy = np.sum(lr_preds == y_val) / len(y_val)
print("Logistic regr accuracy: %0.3f" % log_res_accuracy)

Logistic regr accuracy: 0.682


In [45]:
#XGBoost

params = dict()
params['objective'] = 'binary:logistic'
params['eval_metric'] = ['logloss', 'error']
params['eta'] = 0.02
params['max_depth'] = 4
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_val, label=y_val)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
bst = xgb.train(params, d_train, 5000, watchlist, 
                early_stopping_rounds=50, verbose_eval=100)
xgb_preds = (bst.predict(d_valid) >= 0.5).astype(int)
xgb_accuracy = np.sum(xgb_preds == y_val) / len(y_val)
print("Xgb accuracy: %0.3f" % xgb_accuracy)

[0]	train-logloss:0.687385	train-error:0.297709	valid-logloss:0.687579	valid-error:0.295661
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 50 rounds.
[100]	train-logloss:0.513457	train-error:0.278424	valid-logloss:0.513574	valid-error:0.27758
[200]	train-logloss:0.491254	train-error:0.273098	valid-logloss:0.491917	valid-error:0.273103
[300]	train-logloss:0.483873	train-error:0.270756	valid-logloss:0.484876	valid-error:0.270159
[400]	train-logloss:0.478312	train-error:0.266779	valid-logloss:0.479858	valid-error:0.267216
[500]	train-logloss:0.474123	train-error:0.262552	valid-logloss:0.476123	valid-error:0.264445
[600]	train-logloss:0.470899	train-error:0.259999	valid-logloss:0.473332	valid-error:0.262096
[700]	train-logloss:0.46757	train-error:0.257047	valid-logloss:0.47042	valid-error:0.259845
[800]	train-logloss:0.464768	train-error:0.254442	valid-logloss:0.4681	valid-error:0.257346
[900]	train-lo