In [1]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 500)
import seaborn as sns
sns.set_style("dark")
plt.rcParams['figure.figsize'] = 16, 12
from tqdm import tqdm, tqdm_notebook
import itertools as it
import pickle
import glob
import os
import string

from scipy import sparse

import nltk
import spacy

from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, make_scorer
from sklearn.decomposition import TruncatedSVD

from scipy.optimize import minimize

import eli5
from IPython.display import display

import xgboost as xgb



In [2]:
df_train = pd.read_csv('data/train.csv', 
                       dtype={
                           'question1': np.str,
                           'question2': np.str
                       })
df_train['test_id'] = -1
df_test = pd.read_csv('data/test.csv', 
                      dtype={
                          'question1': np.str,
                          'question2': np.str
                      })
df_test['id'] = -1
df_test['qid1'] = -1
df_test['qid2'] = -1
df_test['is_duplicate'] = -1

df = pd.concat([df_train, df_test])
df['question1'] = df['question1'].fillna('')
df['question2'] = df['question2'].fillna('')
df['uid'] = np.arange(df.shape[0])
df = df.set_index(['uid'])
print (df.dtypes)
del(df_train, df_test)

id               int64
is_duplicate     int64
qid1             int64
qid2             int64
question1       object
question2       object
test_id          int64
dtype: object


In [3]:
ix_train = np.where(df['id'] >= 0)[0]
ix_test = np.where(df['id'] == -1)[0]
ix_is_dup = np.where(df['is_duplicate'] == 1)[0]
ix_not_dup = np.where(df['is_duplicate'] == 0)[0]
print (df[df['is_duplicate'] >= 0]['is_duplicate'].value_counts(normalize=True))

0    0.630802
1    0.369198
Name: is_duplicate, dtype: float64


In [4]:
score0 = 6.01888
score1 = 28.52056
score05 = 0.69315
score025 = 0.47913
score075 = 1.19485
A = (score0 + score1)*np.log(0.5)/score05
print ('A =', A)
print ('eps_0 =', (1 + np.sqrt(1 - 4*np.exp(A)))/2)
print ('eps_1 =', (1 - np.sqrt(1 - 4*np.exp(A)))/2)
eps = 10e-16
print ('eps =', eps)
B = np.log(1 - eps)
print ('B =', B)
C = np.log(eps)
print ('C =', C)
r1 = (score1 - (C/B)*score0) / ((C*C/B) - B)
print ('r1 =', r1)
r0 = (-score1 - r1*B)/C
print ('r0 =', r0)
print ('r0 + r1 =', r0 + r1)
d = df[df['is_duplicate'] >= 0]['is_duplicate'].value_counts(normalize=True).to_dict()
print ('P(y = 0) =', d[0])
print ('P(y = 1) =', d[1])
print ('P(y\' = 0) =', r0)
print ('P(y\' = 1) =', r1)
gamma_0 = r0/d[0]
gamma_1 = r1/d[1]
print ('gamma_0 =', gamma_0)
print ('gamma_1 =', gamma_1)

def link_function(x):
    return gamma_1*x/(gamma_1*x + gamma_0*(1 - x))

support = np.linspace(0, 1, 1000)
values = link_function(support)

df['len1'] = df['question1'].str.len().astype(np.float32)
df['len2'] = df['question2'].str.len().astype(np.float32)
df['abs_diff_len1_len2'] = np.abs(df['len1'] - df['len2'])

A = -34.5392995082
eps_0 = 1.0
eps_1 = 9.99200722163e-16
eps = 1e-15
B = -9.99200722163e-16
C = -34.5387763949
r1 = 0.174264424749
r0 = 0.825754788586
r0 + r1 = 1.00001921334
P(y = 0) = 0.630802146974
P(y = 1) = 0.369197853026
P(y' = 0) = 0.825754788586
P(y' = 1) = 0.174264424749
gamma_0 = 1.30905513329
gamma_1 = 0.472008228977


In [5]:
max_in_dup = df.loc[ix_is_dup]['abs_diff_len1_len2'].max()
print ('Maximum among duplicates:       ', max_in_dup)
max_in_not_dups = df.loc[ix_not_dup]['abs_diff_len1_len2'].max()
print ('Maximum among non-duplicates:     ', max_in_not_dups)
print ('Maximum among non-duplicates: ', (df.loc[ix_train]['abs_diff_len1_len2'] > max_in_dup).sum())
std_in_dups = df.loc[ix_is_dup]['abs_diff_len1_len2'].std()
print ('Standard deviation in duplicates:', std_in_dups)
replace_value = max_in_dup + 2*std_in_dups
print ('New value:              ', replace_value)

df['abs_diff_len1_len2'] = df['abs_diff_len1_len2'].apply(lambda x: x if x < replace_value else replace_value)
df['log_abs_diff_len1_len2'] = np.log(df['abs_diff_len1_len2'] + 1)
df['ratio_len1_len2'] = df['len1'].apply(lambda x: x if x > 0.0 else 1.0)/\
                        df['len2'].apply(lambda x: x if x > 0.0 else 1.0)

Maximum among duplicates:        196.0
Maximum among non-duplicates:      1080.0
Maximum among non-duplicates:  394
Standard deviation in duplicates: 14.3821
New value:               224.764198303


In [6]:
max_in_dup = df.loc[ix_is_dup]['ratio_len1_len2'].max()
print ('Maximum among duplicates:        ', max_in_dup)
max_in_not_dups = df.loc[ix_not_dup]['ratio_len1_len2'].max()
print ('Maximum among non-duplicates:      ', max_in_not_dups)
print ('Number of lines greater than threshold: ', (df.loc[ix_train]['ratio_len1_len2'] > max_in_dup).sum())
std_in_dups = df.loc[ix_is_dup]['ratio_len1_len2'].std()
print ('Number of lines greater than threshold: ', std_in_dups)
replace_value = max_in_dup + 2*std_in_dups
print ('New value:               ', replace_value)

df['ratio_len1_len2'] = df['ratio_len1_len2'].apply(lambda x: x if x < replace_value else replace_value)
df['log_ratio_len1_len2'] = np.log(df['ratio_len1_len2'] + 1)


Maximum among duplicates:         6.66666666667
Maximum among non-duplicates:       117.0
Number of lines greater than threshold:  152
Number of lines greater than threshold:  0.376106045115
New value:                7.4188787569


In [7]:
predictors = df.columns[7:].tolist()
print (predictors)

def check_model(predictors):
    classifier = lambda: SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        n_iter=100, 
        shuffle=True, 
        n_jobs=-1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.02, 0.1, 0.5, 0.9, 1],
        'en__l1_ratio': [0, 0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.75, 0.9, 1]
    }

    folder = StratifiedKFold(n_splits=5, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(df.loc[ix_train][predictors], 
                                  df.loc[ix_train]['is_duplicate'])
    
    return grid_search

if not os.path.isfile('tmp/1_model.pkl'):
    model = check_model(predictors)
    print (model.best_score_)
    print (model.best_params_)
    with open('tmp/1_model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('tmp/1_model.pkl', 'rb') as f:
        model = pickle.load(f) 

['len1', 'len2', 'abs_diff_len1_len2', 'log_abs_diff_len1_len2', 'ratio_len1_len2', 'log_ratio_len1_len2']


In [8]:
y_test_pred = model.predict_proba(df.loc[ix_test][predictors])[:, 1]
y_test_pred_fixed = link_function(y_test_pred)

if not os.path.isfile('submits/1_pred.csv'):
    pd.DataFrame.from_records(
        zip(df.loc[ix_test]['test_id'].values, 
            y_test_pred), 
        columns=['test_id', 'is_duplicate']).to_csv('submits/1_pred.csv', index=False)
    
if not os.path.isfile('submits/1_pred_fixed.csv'):
    pd.DataFrame.from_records(
        zip(df.loc[ix_test]['test_id'].values, 
            y_test_pred_fixed), 
        columns=['test_id', 'is_duplicate']).to_csv('submits/1_pred_fixed.csv', index=False)

In [None]:
%%time
if os.path.isfile('tmp/cv_charW.pkl') and os.path.isfile('tmp/ch_freqW.pkl'):
    with open('tmp/cv_charW.pkl', 'rb') as f:
        cv_char = pickle.load(f)
    with open('tmp/ch_freqW.pkl', 'rb') as f:
        ch_freq = pickle.load(f)
else:
    cv_char = CountVectorizer(ngram_range=(1, 3), analyzer='word')
    ch_freq = np.array(cv_char.fit_transform(df['question1'].tolist() + df['question2'].tolist()).sum(axis=0))[0, :]
    with open('tmp/cv_charW.pkl', 'wb') as f:
        pickle.dump(cv_char, f)
    with open('tmp/ch_freqW.pkl', 'wb') as f:
        pickle.dump(ch_freq, f)

In [24]:
unigrams = dict([(k, v) for (k, v) in cv_char.vocabulary_.items() if len(k) == 1])
ix_unigrams = np.sort(unigrams.values(),axis = None)
print ('Unigrams:', len(unigrams))
bigrams = dict([(k, v) for (k, v) in cv_char.vocabulary_.items() if len(k) == 2])
ix_bigrams = np.sort(bigrams.values(), axis = None)
print ('Bigrams: ', len(bigrams))
print(ix_unigrams[:5])
print(ix_bigrams[:5])

Unigrams: 1779
Bigrams:  10723
[ dict_values([8243, 3608, 11630, 7473, 10732, 9740, 6829, 7022, 9527, 11797, 6450, 10207, 12279, 12105, 11644, 11895, 12285, 10742, 7998, 6708, 10295, 11116, 10819, 10897, 2729, 9895, 7192, 10665, 11096, 9551, 6559, 11905, 10899, 11295, 6549, 7032, 9540, 10220, 5986, 10916, 12028, 9828, 7035, 11232, 7514, 11750, 7947, 7971, 10878, 9340, 10773, 9960, 7941, 11734, 11234, 11360, 11109, 8595, 11935, 9852, 10061, 11277, 12164, 9778, 11238, 6519, 11388, 8610, 9709, 12138, 10253, 10647, 11910, 12268, 9994, 9662, 7882, 12385, 7408, 9664, 11145, 12402, 11788, 10008, 11494, 10527, 11055, 11804, 12421, 6043, 11839, 11523, 8655, 11947, 6544, 8674, 12111, 10086, 7888, 10402, 8632, 9660, 8663, 10377, 11370, 11950, 11958, 8803, 9448, 9722, 10494, 8280, 1582, 6405, 9102, 8680, 10214, 10375, 10560, 8809, 11983, 12088, 12223, 12201, 10517, 11271, 7869, 6842, 7359, 10256, 10632, 11721, 11572, 11550, 11930, 6742, 11257, 11915, 7953, 9502, 7485, 12130, 11175, 4634, 11474, 84

In [12]:
if os.path.isfile('tmp/cv_char4.pkl') and os.path.isfile('tmp/ch_freq4.pkl'):
    with open('tmp/cv_char4.pkl', 'rb') as f:
        cv_char = pickle.load(f)
    with open('tmp/ch_freq4.pkl', 'rb') as f:
        ch_freq = pickle.load(f)
else:
    cv_char = CountVectorizer(ngram_range=(3, 3), analyzer='char')
    ch_freq = np.array(cv_char.fit_transform(df['question1'].tolist() + df['question2'].tolist()).sum(axis=0))[0, :]
    with open('tmp/cv_char4.pkl', 'wb') as f:
        pickle.dump(cv_char, f)
    with open('tmp/ch_freq4.pkl', 'wb') as f:
        pickle.dump(ch_freq, f)

KeyboardInterrupt: 

In [None]:
trigrams = dict([(k, v) for (k, v) in cv_char.vocabulary_.items() if len(k) == 3])
ix_trigrams = np.sort(trigrams.values(),axis= None)
print ('Trigrams:', len(trigrams))

In [27]:
print(tamanho)

148938834
