In [1]:
import numpy as np
import pandas as pd
import sqlite3
import warnings
from collections import defaultdict
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

In [3]:
with sqlite3.connect('dataset/Cleaned.db') as conn:
    train = pd.read_sql_query('SELECT * FROM train', conn)

In [4]:
def get_summary_grid(grid):
    parms = list(grid.param_grid.keys())
    columns = ['Iter #']
    scorings = grid.scoring
    for scoring in scorings:
        for typ in ['train', 'test']:
            columns.append(f'{typ}_{scoring}')
    columns.extend(['fit_time', 'score_time'])
    for param in grid.param_grid.keys():
        columns.append(param)
    df = pd.DataFrame(columns=columns)
    cv_res = grid.cv_results_
    for col in columns[1:-len(parms)]:
        df[col] = cv_res['mean_' + col]
    items = 1
    for k, v in grid.param_grid.items():
        items *= len(v)
    df['Iter #'] = np.array(range(items)) + 1
    res = defaultdict(list)
    for each in grid.cv_results_['params']:
        for p in parms:
            res[p].append(each[p])
    for k, v in res.items():
        df[k] = v
    return df

In [5]:
train.shape

(404289, 46)

In [6]:
train.dropna(inplace=True)  # Sorry to drop nans but code was failing with fillna
# later try with imputations

In [7]:
labels = train.is_duplicate
y_true = list(map(int, labels))

In [8]:
train.drop([
    'id',
    'qid1',
    'qid2',
    'is_duplicate'
], axis=1, inplace=True)

In [9]:
Xtrain, Xtest, ytrain, ytest = train_test_split(train, y_true, stratify=y_true, test_size=0.3)

In [10]:
Xtrain.shape

(281967, 42)

In [11]:
Xtest.shape

(120843, 42)

In [12]:
tfidf = TfidfVectorizer(stop_words='english')

In [13]:
tfidf.fit(Xtrain.question1)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [14]:
q1_train_vec = tfidf.transform(Xtrain.question1)
q1_test_vec = tfidf.transform(Xtest.question1)

In [15]:
q1_train_vec.shape

(281967, 48696)

In [16]:
tfidf.fit(Xtrain.question2)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [17]:
q2_train_vec = tfidf.transform(Xtrain.question2)
q2_test_vec = tfidf.transform(Xtest.question2)

In [18]:
q2_train_vec.shape

(281967, 44357)

In [19]:
Xtrain.drop(['question1', 'question2'], axis=1, inplace=True)
Xtest.drop(['question1', 'question2'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [20]:
assert(q1_train_vec.shape[0] == Xtrain.shape[0])
assert(q2_train_vec.shape[0] == Xtrain.shape[0])
assert(q1_test_vec.shape[0] == Xtest.shape[0])
assert(q2_test_vec.shape[0] == Xtest.shape[0])

In [21]:
from scipy.sparse import hstack

In [22]:
X_train = hstack((q1_train_vec, q2_train_vec, np.array(Xtrain)))
assert(X_train.shape[1] == q1_train_vec.shape[1] + q2_train_vec.shape[1] + Xtrain.shape[1])
X_test = hstack((q1_test_vec, q2_test_vec, np.array(Xtest)))
assert(X_test.shape[1] == q1_test_vec.shape[1] + q2_test_vec.shape[1] + Xtest.shape[1])

In [23]:
from sklearn.metrics import log_loss

#### Logistic Regression with SGDClassifier

In [172]:
estimator = SGDClassifier(penalty='l2', loss='log', random_state=42, n_jobs=10, verbose=0)

In [170]:
params = {
    'alpha': np.logspace(-6, 10, 50)
}

In [173]:
grid = GridSearchCV(estimator=estimator,
                    param_grid=params,
                    scoring={'roc_auc', 'balanced_accuracy', 'neg_log_loss'},
                    refit='neg_log_loss', # Because we are using multiple evaluation metrics
                    cv=10,
                    return_train_score=True,
                    verbose=2,
                    n_jobs=10)

In [174]:
grid.fit(X_train, ytrain)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:   12.4s
[Parallel(n_jobs=10)]: Done 142 tasks      | elapsed:   59.6s
[Parallel(n_jobs=10)]: Done 345 tasks      | elapsed:  2.3min
[Parallel(n_jobs=10)]: Done 500 out of 500 | elapsed:  3.3min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=10, penalty='l2',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=10,
       param_grid={'alpha': array([1.00000e-06, 2.12095e-06, 4.49843e-06, 9.54095e-06, 2.02359e-05,
       4.29193e-05, 9.10298e-05, 1.93070e-04, 4.09492e-04, 8.68511e-04,
       1.84207e-03, 3.90694e-03, 8.28643e-03, 1.75751e-02, 3.72759e-02,
       7.90604e-02, 1.67683e-01, 3.55648e-01, 7.54312e-01, 1.59... 1.09854e+08, 2.32995e+08,
       4.94171e+08, 1.04811e+09, 2.22300e+09, 4.71487e+09, 1.00000e+10])},
       pre_dispatch='2*n_jobs', refit='neg_log_loss',
       return_train_s

In [182]:
good_alphas = get_summary_grid(grid).sort_values('test_neg_log_loss', ascending=False).alpha.head(10)

In [185]:
alpha = good_alphas
log_error_array=[]
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42, n_jobs=10, verbose=0)
    clf.fit(X_train, ytrain)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv=10)
    sig_clf.fit(X_train, ytrain)
    predict_y = sig_clf.predict_proba(X_test)
    log_error_array.append(log_loss(ytest, predict_y, labels=clf.classes_, eps=1e-15))
    print('For values of alpha = ', i, "The log loss is:",log_loss(ytest, predict_y, labels=clf.classes_, eps=1e-15))

For values of alpha =  0.07906043210907686 The log loss is: 0.40938657887021995
For values of alpha =  0.16768329368110066 The log loss is: 0.4107767631169659
For values of alpha =  0.03727593720314938 The log loss is: 0.4151317154424956
For values of alpha =  0.35564803062231287 The log loss is: 0.4172341712367717
For values of alpha =  0.7543120063354607 The log loss is: 0.42798150084206454
For values of alpha =  0.017575106248547894 The log loss is: 0.4328488141771995
For values of alpha =  1.5998587196060574 The log loss is: 0.441921111271788
For values of alpha =  3.393221771895323 The log loss is: 0.4571471236919364
For values of alpha =  7.196856730011514 The log loss is: 0.47196286694829365
For values of alpha =  15.264179671752302 The log loss is: 0.4870777622529951


In [206]:
log_error_array=[]
for l in ['l1', 'l2']:
    clf = SGDClassifier(alpha=0.07906043210907686, penalty=l, loss='log', random_state=42, n_jobs=10, verbose=0)
    clf.fit(X_train, ytrain)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv=10)
    sig_clf.fit(X_train, ytrain)
    predict_y = sig_clf.predict_proba(X_test)
    log_error_array.append(log_loss(ytest, predict_y, labels=clf.classes_, eps=1e-15))
    print('For values of penalty = ', l, "The log loss is:",log_loss(ytest, predict_y, labels=clf.classes_, eps=1e-15))

For values of penalty =  l1 The log loss is: 0.6585360623131012
For values of penalty =  l2 The log loss is: 0.40938657887021995


#### Linear SVC with SGDClassifier

In [212]:
alpha = good_alphas
log_error_array=[]
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l2', loss='hinge', random_state=42, n_jobs=10, verbose=0)
    clf.fit(X_train, ytrain)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv=10)
    sig_clf.fit(X_train, ytrain)
    predict_y = sig_clf.predict_proba(X_test)
    log_error_array.append(log_loss(ytest, predict_y, labels=clf.classes_, eps=1e-15))
    print('For values of alpha = ', i, "The log loss is:",log_loss(ytest, predict_y, labels=clf.classes_, eps=1e-15))

For values of alpha =  0.07906043210907686 The log loss is: 0.4187434911781894
For values of alpha =  0.16768329368110066 The log loss is: 0.41501052996812443
For values of alpha =  0.03727593720314938 The log loss is: 0.43035992421519814
For values of alpha =  0.35564803062231287 The log loss is: 0.4154973530006115
For values of alpha =  0.7543120063354607 The log loss is: 0.4192151082070379
For values of alpha =  0.017575106248547894 The log loss is: 0.4519119446375938
For values of alpha =  1.5998587196060574 The log loss is: 0.42635427124384584
For values of alpha =  3.393221771895323 The log loss is: 0.4378892370608103
For values of alpha =  7.196856730011514 The log loss is: 0.4565228248487436
For values of alpha =  15.264179671752302 The log loss is: 0.49261554104751976
