https://www.kaggle.com/c/dont-overfit-ii

https://www.kaggle.com/artgor/how-to-not-overfit

### Imports

In [30]:
import itertools
import eli5
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm, skew #for some statistics

from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn import metrics

import xgboost as xgb
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNet, Lasso, LogisticRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

Using TensorFlow backend.


# Data Preparation

### Load Data

In [34]:
df_raw = pd.read_csv('./data/train.csv', header=0)
df_raw.drop(columns=['id'], inplace=True)
print('Dataframe shape is: ', df_raw.shape)
print('Columns are: ', df_raw.columns)
df_raw.head()

Dataframe shape is:  (250, 301)
Columns are:  Index(['target', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '290', '291', '292', '293', '294', '295', '296', '297', '298', '299'],
      dtype='object', length=301)


Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
0,1.0,-1.067,-1.114,-0.616,0.376,1.09,0.467,-0.422,0.46,-0.443,...,0.22,-0.339,0.254,-0.179,0.352,0.125,0.347,0.436,0.958,-0.824
1,0.0,-0.831,0.271,1.716,1.096,1.731,-0.197,1.904,-0.265,0.557,...,-0.765,-0.735,-1.158,2.554,0.856,-1.506,0.462,-0.029,-1.932,-0.343
2,0.0,0.099,1.39,-0.732,-1.065,0.005,-0.081,-1.45,0.317,-0.624,...,-1.311,0.799,-1.001,1.544,0.575,-0.309,-0.339,-0.148,-0.646,0.725
3,1.0,-0.989,-0.916,-1.343,0.145,0.543,0.636,1.127,0.189,-0.118,...,-1.37,1.093,0.596,-0.589,-0.649,-0.163,-0.958,-1.081,0.805,3.401
4,0.0,0.811,-1.509,0.522,-0.36,-0.22,-0.959,0.334,-0.566,-0.656,...,-0.178,0.718,-1.017,1.249,-0.596,-0.445,1.751,1.442,-0.393,-0.643


In [48]:
df_test = pd.read_csv('./data/test.csv', header=0)
test_ids = df_test['id'].values
df_test.drop(columns=['id'], inplace=True)
print('Dataframe shape is: ', df_raw.shape)
print('Columns are: ', df_raw.columns)
df_test.head()

Dataframe shape is:  (250, 301)
Columns are:  Index(['target', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '290', '291', '292', '293', '294', '295', '296', '297', '298', '299'],
      dtype='object', length=301)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.677,1.721,-0.745,-0.838,0.149,-1.138,0.242,0.504,-1.829,-1.38,...,-1.18,-0.403,0.759,-0.6,0.951,-0.349,0.446,-0.819,-0.277,1.297
1,-0.731,-0.251,0.059,0.054,1.149,2.462,0.836,0.719,-2.269,0.584,...,1.297,1.114,0.657,0.76,0.899,-1.612,-1.701,1.107,-0.314,-0.641
2,1.119,1.036,1.22,1.518,0.265,-0.088,0.245,-0.533,-0.921,0.714,...,-0.864,-0.736,0.367,0.154,0.83,-1.352,0.914,0.377,0.588,-0.912
3,-0.933,0.212,-0.053,0.57,-1.54,-1.108,0.462,1.022,-0.215,-0.205,...,0.063,-0.958,0.762,-0.213,-2.171,0.83,1.435,0.125,2.782,0.619
4,-0.208,-0.556,2.641,0.853,-0.384,0.312,0.514,0.481,-1.929,-0.4,...,-0.689,0.213,0.568,-0.935,-0.015,0.267,0.739,1.34,-0.178,1.01


In [4]:
#Inspect data for NaN.
print(list(df_raw.isna().sum()))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### Create Correlation matrix and select the top most relevant features

In [5]:
corrs = df_raw.corr().abs().unstack().sort_values(kind="quicksort").reset_index()
corrs = corrs[corrs['level_0'] != corrs['level_1']]
corrs.tail(10)

Unnamed: 0,level_0,level_1,0
90290,279,219,0.238512
90291,219,279,0.238512
90292,23,20,0.246984
90293,20,23,0.246984
90294,146,228,0.258447
90295,228,146,0.258447
90296,39,6,0.267727
90297,6,39,0.267727
90298,127,target,0.33754
90299,target,127,0.33754


In [11]:
y_train = df_raw['target']
X_train = df_raw.drop(['target'], axis=1)
X_test = df_test

### Create Folds for validation

In [12]:
n_fold = 20
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
repeated_folds = RepeatedStratifiedKFold(n_splits=20, n_repeats=20, random_state=42)

### Define function for concise model training

In [25]:
def train_model(X, X_test, y, params, folds, averaging='usual', model=None):
    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        # print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
            
        model = model
        model.fit(X_train, y_train)
        y_pred_valid = model.predict(X_valid).reshape(-1,)
        score = metrics.roc_auc_score(y_valid, y_pred_valid)            
        y_pred = model.predict_proba(X_test)[:, 1]

        
        oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(metrics.roc_auc_score(y_valid, y_pred_valid))

        if averaging == 'usual':
            prediction += y_pred
        elif averaging == 'rank':
            prediction += pd.Series(y_pred).rank().values  

    prediction /= n_fold
    
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    return oof, prediction, scores

### Scale features

In [26]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Try Logistic regression

In [27]:
model = LogisticRegression(
  class_weight='balanced', penalty='l1', C=0.1, solver='liblinear')

In [28]:
oof_lr, prediction_lr, scores = train_model(
  X_train, X_test, y_train, params=None, folds=folds, model=model)

CV mean score: 0.7164, std: 0.1360.


### Feature selection with ELI5

In [61]:
eli5.show_weights(model, top=50)

Weight?,Feature
0.327,x18
0.114,x3
0.042,x10
0.025,x6
0.02,x25
-0.012,x17
-0.279,x16


In [62]:
(model.coef_ != 0).sum()

7

In [63]:
top_features = [i[1:] for i in eli5.formatters.as_dataframe.explain_weights_df(model).feature if 'BIAS' not in i]
X_train = df_raw[top_features]
X_test = df_test[top_features]

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [64]:
print(X_train.shape)
print(X_test.shape)

(250, 7)
(19750, 7)


In [65]:
model = LogisticRegression(class_weight='balanced', penalty='l1', C=0.1, solver='liblinear')
oof_lr1, prediction_lr1, _ = train_model(X_train, X_test, y_train, params=None, folds=folds, model=model)

CV mean score: 0.5932, std: 0.1431.


In [66]:
sub = pd.DataFrame()
sub['Id'] = test_ids
sub['target'] = np.round(prediction_lr1).astype(int)
sub.to_csv('submission.csv',index=False)