In [2]:
#!pip install eli5
#!pip install Boruta

Collecting Boruta
  Downloading https://files.pythonhosted.org/packages/35/03/ca2b7e352bf1f0e2dd0e17e1b8c92f75dbb9f218d36eba4e894efa2a0478/Boruta-0.1.5.tar.gz (55kB)
Building wheels for collected packages: Boruta
  Running setup.py bdist_wheel for Boruta: started
  Running setup.py bdist_wheel for Boruta: finished with status 'done'
  Stored in directory: C:\Users\JoonH\AppData\Local\pip\Cache\wheels\5c\5a\72\13e8ea10ba10e22e9ca7f76f8b451c9f98fa190d428c8857dd
Successfully built Boruta
Installing collected packages: Boruta
Successfully installed Boruta-0.1.5


In [3]:
import matplotlib.pyplot as plt
import cv2
import pandas as pd
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.layers import Layer
from keras.layers import *
from keras.metrics import *
from keras.models import Model
from keras.callbacks import *
from keras.optimizers import *
from keras.applications import *
from keras import activations
from keras import utils
from keras.regularizers import l2
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.metrics import accuracy_score, roc_auc_score
import json
import ast
import time
from sklearn import linear_model
import eli5
import gc
gc.enable()
gc.collect()

11

In [4]:
train_df = pd.read_csv("/Users/JoonH/dont-overfit-ii/train.csv")
test_df = pd.read_csv("/Users/JoonH/dont-overfit-ii/test.csv")

In [6]:
x_train = train_df.drop(['id','target'], axis = 1)
y_train = train_df['target']
X_test = test_df.drop(['id'], axis = 1)
n_fold = 20
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(X_test)

In [39]:
def train_model(X, X_test, y, params, folds=folds, model_type='lgb', plot_feature_importance=False, averaging='usual', model=None):
    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        # print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        
        if model_type == 'lgb':
            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_valid, label=y_valid)
            
            model = lgb.train(params,
                    train_data,
                    num_boost_round=2000,
                    valid_sets = [train_data, valid_data],
                    verbose_eval=500,
                    early_stopping_rounds = 200)
            
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            
        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X_tr.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X_tr.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=500, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X_tr.columns), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X_tr.columns), ntree_limit=model.best_ntree_limit)
        
        if model_type == 'cat':
            model = CatBoostClassifier(iterations=20000,  eval_metric='AUC', **params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)
            
            
        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            y_pred_valid = model.predict(X_valid).reshape(-1,)
            score = roc_auc_score(y_valid, y_pred_valid)
            # print(f'Fold {fold_n}. AUC: {score:.4f}.')
            # print('')
            
            y_pred = model.predict_proba(X_test)[:, 1]
            
            
        if model_type == 'glm':
            model = sm.GLM(y_train, X_train, family=sm.families.Binomial())
            model_results = model.fit()
            model_results.predict(X_test)
            y_pred_valid = model_results.predict(X_valid).reshape(-1,)
            score = roc_auc_score(y_valid, y_pred_valid)
            
            y_pred = model_results.predict(X_test)
        
        oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(roc_auc_score(y_valid, y_pred_valid))

        if averaging == 'usual':
            prediction += y_pred
        elif averaging == 'rank':
            prediction += pd.Series(y_pred).rank().values  
        
        if model_type == 'lgb':
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= n_fold
    
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    if model_type == 'lgb':
        feature_importance["importance"] /= n_fold
        if plot_feature_importance:
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
        
            return oof, prediction, feature_importance
        return oof, prediction, scores
    
    else:
        return oof, prediction, scores

In [36]:
model = linear_model.LogisticRegression(class_weight='balanced', penalty='l1', C=0.1, solver='liblinear')
oof_lr, prediction_lr, _ = train_model(X_train, X_test, y_train, params=None, model_type='sklearn', model=model)

CV mean score: 0.7112, std: 0.1198.


In [15]:
#permutation importance 
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(model, random_state=1).fit(X_train, y_train)
eli5.show_weights(perm, top=50)

Weight,Feature
0.1504  ± 0.0394,x33
0.0928  ± 0.0442,x65
0.0552  ± 0.0265,x217
0.0464  ± 0.0306,x199
0.0432  ± 0.0163,x91
0.0272  ± 0.0274,x189
0.0216  ± 0.0187,x133
0.0160  ± 0.0051,x165
0.0112  ± 0.0199,x117
0.0112  ± 0.0078,x43


In [32]:
#select only the 'important features'
from sklearn.feature_selection import SelectFromModel
sel = SelectFromModel(perm,threshold=0.005, prefit=True)
X_trans = sel.transform(X_train)
X_test_trans = sel.transform(X_test)

In [67]:
model = linear_model.LogisticRegression(class_weight='balanced', penalty='l1', C=0.09, solver='liblinear', max_iter = 50000)
oof_lr, prediction_lr, _ = train_model(X_trans, X_test_trans, y_train, params=None, model_type='sklearn', model=model)

CV mean score: 0.7731, std: 0.0917.


In [35]:
results = prediction_lr
predictions = pd.DataFrame(results, columns = ['target'])

ids = test_df['id']
predictions = pd.concat([ids, predictions], axis = 1, sort=False)
predictions.to_csv('dont_overfit_2_logreg_less_features1.csv',index = False)

# blending

In [66]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB(var_smoothing = 1.1)
oof_lr_BR, prediction_lr_BR, _ = train_model(X_train, X_test, y_train, params=None, model_type='sklearn', model=model)

CV mean score: 0.7000, std: 0.1566.


In [69]:
results = prediction_lr_BR * 0.2 + prediction_lr * 0.8

In [70]:
predictions = pd.DataFrame(results, columns = ['target'])

ids = test_df['id']
predictions = pd.concat([ids, predictions], axis = 1, sort=False)
predictions.to_csv('dont_overfit_2_blending.csv',index = False)

# Feature generation
Let's build some features automatically from the given dataframe, and see if we get anything usefl

In [52]:
#!pip install featuretools
import featuretools as ft

In [56]:
# initialize entityset
es = ft.EntitySet('data')
es2 = ft.EntitySet('test')

# add entities (application table itself)
es.entity_from_dataframe(
    entity_id='main', # define entity id
    dataframe=train_df.drop(['target'], axis=1), # select underlying data
    index='id', # define unique index column
    # specify some datatypes manually (if needed)
    variable_types={
        f: ft.variable_types.Categorical 
        for f in train_df.columns if f.startswith('FLAG_')
    }
)

es2.entity_from_dataframe(
    entity_id='test', # define entity id
    dataframe=test_df, # select underlying data
    index='id', # define unique index column
    # specify some datatypes manually (if needed)
    variable_types={
        f: ft.variable_types.Categorical 
        for f in train_df.columns if f.startswith('FLAG_')
    }
)


Entityset: test
  Entities:
    test [Rows: 19750, Columns: 301]
  Relationships:
    No relationships

In [27]:
# inspect list of all built-in primitives for feature construction
ft.list_primitives()

Unnamed: 0,name,type,description
0,any,aggregation,Test if any value is 'True'.
1,std,aggregation,Finds the standard deviation of a numeric feat...
2,sum,aggregation,Sums elements of a numeric or boolean feature.
3,count,aggregation,Counts the number of non null values.
4,time_since_last,aggregation,Time since last related instance.
5,skew,aggregation,Computes the skewness of a data set.
6,mean,aggregation,Computes the average value of a numeric feature.
7,median,aggregation,Finds the median value of any feature with wel...
8,percent_true,aggregation,Finds the percent of 'True' values in a boolea...
9,num_true,aggregation,Finds the number of 'True' values in a boolean.


In [28]:
# see feature set definitions (no actual computations yet)
# used for faster prototyping
feature_defs = ft.dfs(
    entityset=es, 
    target_entity="main", 
    features_only=True,
    agg_primitives=[
        "mean",
        "mode", 
        "max", 
        "min", 
        "sum", 
        "std"
        
    ],
    trans_primitives=[
        "not",
        "diff",
        "not",
        "percentile",
        "cum_sum"
    ],
    max_depth=1,
    #cutoff_time=cutoff_times,
    #training_window=ft.Timedelta(60, "d"), # use only last X days in computations
    max_features=1000,
    chunk_size=10000,
    verbose=True,
)

Built 600 features


In [57]:
#The actual feature construction
# see feature set definitions (no actual computations yet)
# used for faster prototyping
fm_train,feature_defs = ft.dfs(
    entityset=es, 
    target_entity="main", 
    features_only=False,
    agg_primitives=[
        "mean",
        "mode", 
        "max", 
        "min", 
        "sum", 
        "std"
        
    ],
    trans_primitives=[
        "not",
        "diff",
        "not",
        "percentile",
        "cum_sum"
    ],
    max_depth=2,
    #cutoff_time=cutoff_times,
    #training_window=ft.Timedelta(60, "d"), # use only last X days in computations
    max_features=1000,
    chunk_size=5000,
    verbose=True,
)

fm_test,feature_defs = ft.dfs(
    entityset=es2, 
    target_entity="test", 
    features_only=False,
    agg_primitives=[
        "mean",
        "mode", 
        "max", 
        "min", 
        "sum", 
        "std"
        
    ],
    trans_primitives=[
        "not",
        "diff",
        "not",
        "percentile",
        "cum_sum"
    ],
    max_depth=1,
    #cutoff_time=cutoff_times,
    #training_window=ft.Timedelta(60, "d"), # use only last X days in computations
    max_features=1000,
    chunk_size=10000,
    verbose=True,
)


Built 600 features
Elapsed: 00:00 | Remaining: 00:00 | Progress: 100%|████████████████████████████████████████████| Calculated: 1/1 chunks
Built 600 features
Elapsed: 00:02 | Remaining: 00:00 | Progress: 100%|████████████████████████████████████████████| Calculated: 2/2 chunks


In [58]:
# check sample of extracted features
fm_train = fm_train.drop_duplicates()
fm_test = fm_test.drop_duplicates()
print(fm.shape)
fm[50:100]

(250, 600)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,PERCENTILE(290),PERCENTILE(291),PERCENTILE(292),PERCENTILE(293),PERCENTILE(294),PERCENTILE(295),PERCENTILE(296),PERCENTILE(297),PERCENTILE(298),PERCENTILE(299)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50,-0.815,-1.832,-0.163,-0.727,0.053,-0.809,0.6,-0.096,0.151,-0.576,...,0.856,0.388,0.648,0.376,0.064,0.364,0.142,0.796,0.256,0.412
51,0.146,-1.676,2.139,1.038,0.481,0.491,-1.324,0.684,-1.035,-0.686,...,0.976,0.846,0.356,0.668,0.66,0.704,0.188,0.968,0.004,0.48
52,0.261,-0.874,-0.391,-0.24,-2.055,-0.322,-0.918,-0.053,1.228,0.978,...,0.956,0.462,0.724,0.388,0.308,0.612,0.484,0.064,0.362,0.464
53,-0.132,0.58,0.581,1.186,-1.514,0.518,0.418,0.662,-1.446,-1.773,...,0.432,0.556,0.426,0.444,0.012,0.796,0.54,0.944,0.372,0.096
54,-0.398,-0.695,0.019,1.317,-0.271,0.567,0.191,0.175,0.702,-2.361,...,0.572,0.428,0.548,0.98,0.428,0.68,0.884,0.336,0.412,0.744
55,0.312,0.39,-0.198,-0.113,-0.784,0.844,-0.709,2.846,-1.139,-0.045,...,0.592,0.636,0.2,0.312,0.136,0.892,0.512,0.32,0.278,0.868
56,-1.422,-1.067,0.384,0.465,0.055,0.97,-0.252,-0.758,-0.275,1.351,...,0.9,0.548,0.132,0.092,0.804,0.604,0.716,0.048,0.908,0.812
57,-0.227,-0.996,1.777,-1.39,0.413,0.459,1.497,-1.43,0.373,-0.373,...,0.024,0.336,0.616,0.248,0.692,0.31,0.788,0.188,0.512,0.736
58,0.077,-1.279,1.264,-0.13,0.063,0.462,0.678,0.746,-0.316,-0.013,...,0.34,0.084,0.024,0.616,0.724,0.652,0.432,0.812,0.736,0.42
59,-1.604,-0.893,1.406,1.571,0.728,0.701,1.149,0.52,0.398,0.681,...,0.472,0.216,0.504,0.152,0.52,0.7,0.024,0.316,0.764,0.676


In [60]:
#Let's train this new dataset with our logistic regression model and see what it thinks

n_fold = 20
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(fm_train)
X_test = scaler.transform(fm_test)

model = linear_model.LogisticRegression(class_weight='balanced', penalty='l1', C=0.1, solver='liblinear')
oof_lr, prediction_lr, _ = train_model(X_train, X_test, y_train, params=None, model_type='sklearn', model=model)

CV mean score: 0.7281, std: 0.1201.


# Boruta feature elimination

In [41]:
from boruta import BorutaPy

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [61]:
rfc = RandomForestClassifier(n_estimators = 100, n_jobs = -1, class_weight = 'balanced')
boruta_selector = BorutaPy(rfc, n_estimators = 'auto', verbose = 0)
boruta_selector.fit(X_train,y_train)

BorutaPy(alpha=0.05,
     estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=58, n_jobs=-1, oob_score=False,
            random_state=<mtrand.RandomState object at 0x00000200DCB0DAB0>,
            verbose=0, warm_start=False),
     max_iter=100, n_estimators='auto', perc=100,
     random_state=<mtrand.RandomState object at 0x00000200DCB0DAB0>,
     two_step=True, verbose=0)

In [62]:
print(boruta_selector.n_features_)

11


In [65]:
feature_df = pd.DataFrame(fm_train.columns.tolist(),columns = ['features'])
feature_df['rank'] = boruta_selector.ranking_
feature_df = feature_df.sort_values('rank',ascending=True).reset_index(drop=True)

In [66]:
feature_df

Unnamed: 0,features,rank
0,PERCENTILE(91),1
1,PERCENTILE(199),1
2,PERCENTILE(33),1
3,33,1
4,65,1
5,117,1
6,PERCENTILE(217),1
7,91,1
8,217,1
9,PERCENTILE(65),1


Given these ranks, we can find the most important features according to RFC

In [112]:
columns_to_keep = feature_df.features[0:300]

In [113]:
boruta_train = fm_train[columns_to_keep]
boruta_test = fm_test[columns_to_keep]


In [114]:
#Let's train this new dataset with our logistic regression model and see what it thinks

n_fold = 50
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(boruta_train)
X_test = scaler.transform(boruta_test)

model = linear_model.LogisticRegression(class_weight='balanced', penalty='l1', C=0.14, solver='liblinear')
oof_lr, prediction_lr, _ = train_model(X_train, X_test, y_train, params=None, model_type='sklearn', model=model)

CV mean score: 0.7512, std: 0.1034.


In [115]:
from sklearn.svm import SVC

svc = SVC(class_weight='balanced', C=15.0, kernel='rbf', degree = 2, probability = True, gamma = 'auto')
_, svc_prediction_lr, _ = train_model(X_train, X_test, y_train, params=None, model_type='sklearn', model=svc)

CV mean score: 0.7831, std: 0.1159.


In [116]:
from sklearn.ensemble import *
ada = AdaBoostClassifier(n_estimators = 75, learning_rate=1.0)
_, ada_prediction_lr, _ = train_model(X_train, X_test, y_train, params=None, model_type='sklearn', model=ada)

CV mean score: 0.6400, std: 0.0956.


In [106]:
rfc = RandomForestClassifier(class_weight='balanced', n_estimators = 25)
_, rfc_prediction_lr, _ = train_model(X_train, X_test, y_train, params=None, model_type='sklearn', model=rfc)

CV mean score: 0.6287, std: 0.1055.


In [117]:
results = prediction_lr
predictions = pd.DataFrame(results, columns = ['target'])

ids = test_df['id']
predictions = pd.concat([ids, predictions], axis = 1, sort=False)
predictions.to_csv('dont_overfit_2_logreg2.csv',index = False)