In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

In [None]:
# Load the cleaned training table from SQLite.
# sqlite3's own connection context manager only commits/rolls back a transaction;
# it does not close the connection, so closing() is used to release the handle.
from contextlib import closing

with closing(sqlite3.connect('dataset/Cleaned.db')) as conn:
    train = pd.read_sql_query('SELECT * FROM train', conn)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Duplicate flags, converted to a plain list of Python ints.
labels = train.is_duplicate
y_true = [int(flag) for flag in labels]

In [None]:
# Stratified 70/30 split. random_state pins the shuffle so that a rerun
# produces the same rows in each split.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    train, y_true, stratify=y_true, test_size=0.3, random_state=42
)

In [None]:
# Show the (rows, columns) of the training split.
Xtrain.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer with the built-in English stop-word list; it is fitted later on the question columns.
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
from pyemd import emd
from gensim.models import KeyedVectors
# Load word vectors stored in binary word2vec format.
# NOTE(review): the path is absolute and machine-specific.
model = KeyedVectors.load_word2vec_format(
    '/home/paperspace/w2v/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
def sent2vec(sentence, model, method='tfidf', **kwargs):
    """
    Convert a sentence to a single vector built from its word embeddings.

    The sentence is tokenized, stopwords are removed and repeated tokens are
    collapsed, so each distinct word contributes once. Words that are not in
    ``model`` are ignored.

    :param sentence: Sentence to be converted.
    :param model: The word2vec model; must support ``word in model`` and
        ``model[word]``.
    :param method: ``'avg'`` (sum of the word vectors scaled to unit L2 norm)
        or ``'tfidf'`` (TF-IDF weighted average of the word vectors).
        Matched case-insensitively.
    :param stopwords: (keyword) collection of words to drop. Defaults to
        scikit-learn's ``ENGLISH_STOP_WORDS``.
    :param tokenizer: (keyword) object exposing ``tokenize(text)``. Defaults
        to an NLTK ``RegexpTokenizer`` matching runs of word characters.
    :param tfidf_model: (keyword) fitted TF-IDF vectorizer exposing
        ``transform``, ``vocabulary_`` and ``idf_``. Required for
        ``method='tfidf'``.
    :param shape: (keyword) size of the zero vector returned when no word of
        the sentence can be embedded. Defaults to 300.
    :return: 1-D numpy array.
    :raises ValueError: if ``method='tfidf'`` and no ``tfidf_model`` is given,
        or if ``method`` is neither ``'avg'`` nor ``'tfidf'``.
    """

    ##### It is recommended to pass separate stopwords #####
    stopwords = kwargs.get('stopwords')
    if stopwords is None:
        # Imported from the text module: the standalone stop_words module
        # is not available in newer scikit-learn releases.
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
        stopwords = ENGLISH_STOP_WORDS

    ##### It is recommended to pass separate tokenizers #####
    tokenizer = kwargs.get('tokenizer')
    if tokenizer is None:
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'\w+')

    tokens = tokenizer.tokenize(sentence)
    # Drop stopwords and duplicates. dict.fromkeys keeps first-seen order, so
    # the result does not depend on set iteration order.
    words = list(dict.fromkeys(tok for tok in tokens if tok not in stopwords))

    # Only words the embedding model knows can contribute.
    known = [word for word in words if word in model]

    # Blank sentence, or none of its words are in the model:
    # return an all-zero vector (default size 300, overridable via `shape`).
    if not known:
        return np.zeros(kwargs.get('shape', 300))

    V = np.array([model[word] for word in known])

    mode = method.lower()

    if mode == 'avg':
        summed = V.sum(axis=0)
        norm = np.sqrt((summed ** 2).sum())
        # Vectors that cancel out exactly would otherwise divide by zero.
        if norm == 0:
            return summed
        return summed / norm

    if mode == 'tfidf':
        tfidf_model = kwargs.get('tfidf_model')
        if tfidf_model is None:
            raise ValueError('No tfidf model is present')

        tfidf_vec = tfidf_model.transform([sentence])  # TF-IDF row for this sentence
        vocabulary = tfidf_model.vocabulary_

        tfidfs = []
        for word in known:
            # The column index must be looked up per word; a single lookup
            # outside the loop would give every word the same weight.
            indx = vocabulary.get(word)
            if indx is None:
                # TfidfVectorizer lowercases tokens by default, so retry
                # with the lowercased form before giving up.
                indx = vocabulary.get(word.lower())
            tfidfs.append(tfidf_vec[0, indx] if indx is not None else 0.0)
        tfidfs = np.array(tfidfs)

        denominator = tfidfs.sum()
        if denominator == 0.0:  # No word is represented in both tfidf and w2v
            # Better than skipping that sentence
            denominator = tfidf_model.idf_.min() * 0.01

        numerator = (V * tfidfs.reshape(V.shape[0], 1)).sum(axis=0)
        return numerator / denominator

    raise ValueError(f"Unknown method {method!r}; expected 'avg' or 'tfidf'")

In [None]:
from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer
# ENGLISH_STOP_WORDS is imported from the text module, which exposes it in both
# old and new scikit-learn releases; the standalone stop_words module was removed.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Tokenizer shared by all sent2vec calls below: matches runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# Fit the TF-IDF vectorizer on the training split's question1 column only.
tfidf.fit(Xtrain.question1)

In [None]:
# Embed every training question1 as a TF-IDF weighted word2vec vector (300 columns).
# The progress-bar total comes from the data rather than a hard-coded row count.
question1_vectors_train = np.zeros((Xtrain.shape[0], 300))
for i, q in tqdm(enumerate(Xtrain.question1.values), total=Xtrain.shape[0]):
    question1_vectors_train[i, :] = sent2vec(q, model=model, tfidf_model=tfidf,
                                             tokenizer=tokenizer, stopwords=ENGLISH_STOP_WORDS)

In [None]:
# Embed every held-out question1 using the vectorizer fitted on the training question1 column.
# The progress-bar total comes from the data rather than a hard-coded row count.
question1_vectors_test = np.zeros((Xtest.shape[0], 300))
for i, q in tqdm(enumerate(Xtest.question1.values), total=Xtest.shape[0]):
    question1_vectors_test[i, :] = sent2vec(q, model=model, tfidf_model=tfidf,
                                            tokenizer=tokenizer, stopwords=ENGLISH_STOP_WORDS)

In [None]:
# Refit the same vectorizer object on question2 of the training split.
# NOTE(review): this replaces the question1 fit, so the question1 vector cells must run before this one.
tfidf.fit(Xtrain.question2)

In [None]:
# Embed every training question2 as a TF-IDF weighted word2vec vector (300 columns).
# The progress-bar total comes from the data rather than a hard-coded row count.
question2_vectors_train = np.zeros((Xtrain.shape[0], 300))
for i, q in tqdm(enumerate(Xtrain.question2.values), total=Xtrain.shape[0]):
    question2_vectors_train[i, :] = sent2vec(q, model=model, tfidf_model=tfidf,
                                             tokenizer=tokenizer, stopwords=ENGLISH_STOP_WORDS)

In [None]:
# Embed every held-out question2 using the vectorizer fitted on the training question2 column.
# The progress-bar total comes from the data rather than a hard-coded row count.
question2_vectors_test = np.zeros((Xtest.shape[0], 300))
for i, q in tqdm(enumerate(Xtest.question2.values), total=Xtest.shape[0]):
    question2_vectors_test[i, :] = sent2vec(q, model=model, tfidf_model=tfidf,
                                            tokenizer=tokenizer, stopwords=ENGLISH_STOP_WORDS)

In [None]:
# Check the shape of the question1 training embeddings.
question1_vectors_train.shape

In [None]:
# Check the shape of the question2 training embeddings.
question2_vectors_train.shape

In [None]:
# Remove identifiers, raw question text and the target from the training frame,
# leaving only the remaining feature columns.
Xtrain.drop(
    columns=['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'],
    inplace=True,
)

In [None]:
# Remove identifiers, raw question text and the target from the held-out frame,
# leaving only the remaining feature columns.
Xtest.drop(
    columns=['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'],
    inplace=True,
)

In [None]:
# Final design matrices: question1 embedding | question2 embedding | remaining frame columns.
X_train = np.concatenate(
    (question1_vectors_train, question2_vectors_train, np.array(Xtrain)), axis=1
)
X_test = np.concatenate(
    (question1_vectors_test, question2_vectors_test, np.array(Xtest)), axis=1
)

In [None]:
# Shape of the assembled training matrix.
X_train.shape

In [None]:
# Shape of the assembled held-out matrix.
X_test.shape

In [None]:
import xgboost as xgb
from sklearn.metrics import log_loss

In [None]:
# Candidate values for each hyper-parameter.
params = dict(
    max_depth=[2, 4, 6],                      # depth of the base learners -- typically a small value
    subsample=[0.5, 0.75, 1.0],               # row sampling, like Random Forest
    colsample_bytree=[0.3, 0.5, 0.7, 1.0],    # column sampling, like Random Forest
)


In [None]:
# scikit-learn style XGBoost classifier with 400 estimators and a fixed seed.
# NOTE(review): `estimator` is not referenced again in the visible cells; the tuning below uses xgb.train directly.
estimator = xgb.XGBClassifier(random_state=42, objective='binary:logistic',  n_jobs=-1, n_estimators=400)

In [None]:
# List the contents of the dataset directory.
!ls dataset

In [None]:
# Persist the feature matrices and labels under exactly the paths that the
# loading cell reads back. np.save appends ".npy" to any other suffix, so the
# earlier "*.npz" names produced files that were never loaded, and the labels
# were not saved at all.
np.save('dataset/w2v_xtrain.npy', X_train)
np.save('dataset/w2v_xtest.npy', X_test)
np.save('dataset/w2v_.ytrain.npy', np.asarray(ytrain))
np.save('dataset/w2v_ytest.npy', np.asarray(ytest))

In [None]:
# Reload the cached feature matrices and labels from disk.
X_train, X_test = np.load('dataset/w2v_xtrain.npy'), np.load('dataset/w2v_xtest.npy')
ytrain, ytest = np.load('dataset/w2v_.ytrain.npy'), np.load('dataset/w2v_ytest.npy')

In [None]:
# Shape of the reloaded training matrix.
X_train.shape

In [None]:
from sklearn.model_selection import StratifiedKFold

### Learning Rate & max_depth

In [None]:
# Grid of learning rates (passed as `eta`) and tree depths explored by the next cell.
learning_rates = [10, 1, 0.1, 0.01, 0.001]
depths = [2, 4, 6]

In [None]:
# Grid search over learning rate x max_depth, recording the held-out log loss of
# each run in the same order as the nested loops.
# The DMatrix objects do not depend on the hyper-parameters, so build them once.
d_train = xgb.DMatrix(X_train, label=ytrain)
d_test = xgb.DMatrix(X_test, label=ytest)
watchlist = [(d_train, 'train'), (d_test, 'valid')]

log_losses = []
for lr in learning_rates:
    for dep in depths:
        params = {
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'eta': lr,
            'max_depth': dep,
        }

        # Up to 400 rounds; stop once the 'valid' loss has not improved for 20 rounds.
        clf = xgb.train(params, d_train, 400, watchlist, early_stopping_rounds=20, verbose_eval=2)
        # NOTE(review): depending on the xgboost version, predict() may use every
        # trained round rather than the best early-stopped one -- confirm.
        predict_y = clf.predict(d_test)
        # eps is left at its default: newer scikit-learn releases reject the argument.
        log_losses.append(log_loss(ytest, predict_y))
        print(f"Done for lr = {lr} and depth = {dep}")

In [None]:
# Positions of the four lowest held-out log losses, best first.
np.argsort(log_losses)[0:4]

In [None]:
# Log loss stored at position 8.
# NOTE(review): the index is hard-coded, presumably from the argsort output above -- confirm after a rerun.
log_losses[8]

In [None]:
# (learning rate, depth) pairs in the same order the grid search visited them,
# so that positions in `comb` line up with positions in `log_losses`.
comb = [(lr, dep) for lr in learning_rates for dep in depths]

In [None]:
# (learning rate, depth) pair at position 8 of the grid.
comb[8]

In [None]:
# (learning rate, depth) pair at position 7 of the grid.
comb[7]

In [None]:
# (learning rate, depth) pair at position 3 of the grid.
comb[3]

In [None]:
# (learning rate, depth) pair at position 6 of the grid.
comb[6]

**Best Learning Rate:** 0.1

It seems that 0.1 is the best learning rate. Let's fine-tune `max_depth` to find the value that decreases the validation loss while overfitting the least.

In [None]:
# Finer grid of tree depths for the max_depth sweep below.
depths = [1, 2, 3, 4, 5, 6]

In [None]:
# Sweep max_depth at eta = 0.1, recording train and held-out log loss for each depth.
# The DMatrix objects do not depend on max_depth, so build them once.
d_train = xgb.DMatrix(X_train, label=ytrain)
d_test = xgb.DMatrix(X_test, label=ytest)
watchlist = [(d_train, 'train'), (d_test, 'valid')]

result = []
for dep in depths:
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'eta': 0.1,
        'max_depth': dep,
        'silent': 1,
    }

    # Up to 400 rounds; stop once the 'valid' loss has not improved for 20 rounds.
    clf = xgb.train(params, d_train, 400, watchlist, early_stopping_rounds=20, verbose_eval=1)
    predict_y = clf.predict(d_test)
    predict_y_train = clf.predict(d_train)
    # eps is left at its default: newer scikit-learn releases reject the argument.
    res = (dep, log_loss(ytrain, predict_y_train), log_loss(ytest, predict_y))
    result.append(res)
    print(f"Done for depth = {dep}")

In [None]:
# Turn the list of (depth, train loss, test loss) tuples into a table.
result = pd.DataFrame(result, columns=['max_depth', 'train_logloss', 'test_logloss'])

In [None]:
# Absolute gap between train and held-out loss.
result['diff'] = (result['train_logloss'] - result['test_logloss']).abs()

In [None]:
# Rank depths by held-out loss first, then by train/test gap (both ascending).
result.sort_values(by=['test_logloss', 'diff'], ascending=(True, True))

**Best `max_depth`:** 5 is the best.

Here I am keeping both 5 and 6 as candidate values for `max_depth`. I believe good tuning of the regularisation parameters should resolve the slight overfitting. Now let's fix the number of estimators.

In [None]:
# Early-stopping tree limit of `clf`.
# NOTE(review): `clf` is whatever the depth loop trained last (its final depth is 6), not the depth-5 model.
clf.best_ntree_limit

In [None]:
# Candidate values for the two sampling hyper-parameters.
params = dict(
    subsample=[0.5, 0.75, 1.0],               # row sampling, like Random Forest
    colsample_bytree=[0.3, 0.5, 0.7, 1.0],    # column sampling, like Random Forest
)

### For `max_depth = 5`

In [None]:
# Grid search over subsample x colsample_bytree at eta = 0.1 and max_depth = 5,
# recording train and held-out log loss for each pair.
# The DMatrix objects do not depend on the hyper-parameters, so build them once.
d_train = xgb.DMatrix(X_train, label=ytrain)
d_test = xgb.DMatrix(X_test, label=ytest)
watchlist = [(d_train, 'train'), (d_test, 'valid')]

result = []
for ss in [0.5, 0.75, 1.0]:
    for cs in [0.3, 0.5, 0.7, 1.0]:
        params = {
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'eta': 0.1,
            'max_depth': 5,
            'silent': 1,
            'subsample': ss,
            'colsample_bytree': cs,
        }

        # Up to 400 rounds; stop once the 'valid' loss has not improved for 20 rounds.
        clf = xgb.train(params, d_train, 400, watchlist, early_stopping_rounds=20, verbose_eval=1)
        predict_y = clf.predict(d_test)
        predict_y_train = clf.predict(d_train)
        # eps is left at its default: newer scikit-learn releases reject the argument.
        res = (ss, cs, log_loss(ytrain, predict_y_train), log_loss(ytest, predict_y))
        result.append(res)
        print(f"Done for subsample = {ss} and colsample_bytree = {cs}")

In [None]:
# Turn the list of (subsample, colsample_bytree, train loss, test loss) tuples into a table.
result = pd.DataFrame(result, columns=['subsample', 'colsample_bytree', 'train_logloss', 'test_logloss'])

In [None]:
# Absolute gap between train and held-out loss.
result['diff'] = (result['train_logloss'] - result['test_logloss']).abs()

In [None]:
# Rank the sampling settings by train/test gap, smallest first.
result.sort_values('diff', ascending=True)

### Finally trying to remove some probable overfitting

In [None]:
# Grid search over reg_alpha x reg_lambda with the other parameters held fixed,
# recording train and held-out log loss for each pair.
# The DMatrix objects do not depend on the hyper-parameters, so build them once.
d_train = xgb.DMatrix(X_train, label=ytrain)
d_test = xgb.DMatrix(X_test, label=ytest)
watchlist = [(d_train, 'train'), (d_test, 'valid')]

result = []
for ra in [1e-3, 1e-2, 0.1, 1, 100]:
    for rl in [1e-3, 1e-2, 0.1, 1, 100]:
        params = {
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'eta': 0.1,
            'max_depth': 5,
            'silent': 1,
            'subsample': 1,
            'colsample_bytree': 0.3,
            'reg_alpha': ra,
            'reg_lambda': rl,
        }

        # Up to 400 rounds; stop once the 'valid' loss has not improved for 20 rounds.
        clf = xgb.train(params, d_train, 400, watchlist, early_stopping_rounds=20, verbose_eval=1)
        predict_y = clf.predict(d_test)
        predict_y_train = clf.predict(d_train)
        # eps is left at its default: newer scikit-learn releases reject the argument.
        res = (ra, rl, log_loss(ytrain, predict_y_train), log_loss(ytest, predict_y))
        result.append(res)
        print(f"Done for reg_alpha = {ra} and reg_lambda = {rl}")

In [None]:
# Turn the list of (reg_alpha, reg_lambda, train loss, test loss) tuples into a table.
result = pd.DataFrame(result, columns=['reg_alpha', 'reg_lambda', 'train_logloss', 'test_logloss'])

In [None]:
# Absolute gap between train and held-out loss.
result['diff'] = (result['train_logloss'] - result['test_logloss']).abs()

In [None]:
# Rank the regularisation settings by train/test gap first, then by held-out loss (both ascending).
result.sort_values(['diff', 'test_logloss'], ascending=(True, True))

**Best `reg_alpha` and `reg_lambda`:** 100, 100

**Final loss for all best params:** 0.326

### Leaky features

https://www.kaggle.com/jturkewitz/magic-features-0-03-gain/notebook

In [None]:
# Give every distinct question text (from either column) one integer id, then
# count how often each id appears as question1 and as question2.
df1 = train[['question1']].copy()

df2 = train[['question2']].copy()
df2.rename(columns = {'question2':'question1'},inplace=True)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train_questions = pd.concat([df1, df2])
train_questions.drop_duplicates(subset = ['question1'], inplace=True)
# Both halves keep the row labels of `train`, so labels repeat after stacking.
# Renumber the rows; otherwise different questions would share the same id.
train_questions.reset_index(drop=True, inplace=True)
questions_dict = pd.Series(train_questions.index.values, index=train_questions.question1.values).to_dict()
train.drop(['qid1','qid2'], axis=1, inplace=True)
train['q1_hash'] = train['question1'].map(questions_dict)
train['q2_hash'] = train['question2'].map(questions_dict)
q1_vc = train.q1_hash.value_counts().to_dict()
q2_vc = train.q2_hash.value_counts().to_dict()

In [None]:
def try_apply_dict(x, dict_to_apply):
    """Return ``dict_to_apply[x]``, or 0 when the lookup raises ``KeyError``."""
    try:
        value = dict_to_apply[x]
    except KeyError:
        value = 0
    return value

# Map each question id to its total number of occurrences across both question columns.
def _total_occurrences(question_id):
    """Occurrences of the id as question1 plus occurrences as question2."""
    return try_apply_dict(question_id, q1_vc) + try_apply_dict(question_id, q2_vc)

train['q1_freq'] = train['q1_hash'].map(_total_occurrences)
train['q2_freq'] = train['q2_hash'].map(_total_occurrences)

In [None]:
# Preview the frame with the new hash and frequency columns.
train.head()

In [None]:
# Keep only the id, the hash/frequency features and the target for the correlation check.
train_comb = train[['id', 'q1_hash', 'q2_hash', 'q1_freq', 'q2_freq', 'is_duplicate']]

In [None]:
# Pairwise correlations between the selected columns; show the first rows.
corr_mat = train_comb.corr()
corr_mat.head()