In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import progressbar
import catboost
from catboost import CatBoostClassifier
from catboost import Pool
from sklearn.preprocessing import LabelEncoder
from catboost import MetricVisualizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, f1_score
from sklearn.model_selection import train_test_split

import shap
shap.initjs()
import timeit

from sklearn.model_selection import train_test_split, GroupShuffleSplit, GridSearchCV, GroupKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, f1_score
from xgboost.sklearn import XGBRegressor, XGBClassifier
from imblearn.over_sampling import SMOTE

from catboost.utils import select_threshold
from catboost.utils import get_roc_curve
from catboost.utils import get_fpr_curve
from catboost.utils import get_fnr_curve

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_pickle('../../data/interim/final_last_view.pickle')

In [3]:
def label_encode_categories(df):
    '''
    label encodes gender and id, removing previous created one hot encoding    
    
    '''
    le = LabelEncoder()
    df.id = le.fit_transform(df.id)
    df.person = le.fit_transform(df.person)
    le.fit_transform(df.id)
    df['gender'] = df[['F', 'M', 'O']].idxmax(1)
    df.gender = le.fit_transform(df.gender)
    df = df.drop(['F', 'M', 'O'], axis=1)
    return df
df = label_encode_categories(df)
df.drop(['received_spend', 'viewed_spend', 'viewed_days_left', 'remaining to complete', 'viewed_in_valid', 'offer_spend'], axis=1, inplace=True)

In [4]:
def shuffled_datasets(X, y):
    '''
    Splits dataset 
    This checks whether folds are independent, positives per fold (stratification )
    '''

    train_fold=[]
    test_fold=[]
    total=[]
    intersect=[]
    positive_ratio=[]
    train_X=[]
    test_X=[]
    train_y, test_y = [], []
    test_lists = []
    test_overlap = []
    
    #for i,j in enumerate(GroupShuffleSplit(test_size=.2, n_splits=5, random_state=0).split(X, y, groups=X.person)):
    

    for i,j in enumerate(GroupKFold(n_splits=5).split(X, y, groups=X.person)):
        
        train_X.append(X.iloc[j[0]])
        train_y.append(y.iloc[j[0]])
        
        test_X.append(X.iloc[j[1]])
        test_y.append(y.iloc[j[1]])
        
        train_fold.append(X.iloc[j[0]].person)
        test_fold.append(X.iloc[j[1]].person)
        
        total.append(X.iloc[j[0]].person.nunique() + X.iloc[j[1]].person.nunique())
        
        intersect.append(np.intersect1d(X.iloc[j[0]].person, X.iloc[j[1]].person))
        
        positive_ratio.append(round(y_train.iloc[j[1]].sum() / y_train.iloc[j[1]].count(),3))
        
        test_lists.append(X.iloc[j[1]].person)
    
    for i in range(1,5):
        test_overlap.append(np.intersect1d(test_lists[0], test_lists[i]))
    
                        
    print('Total unique persons across train and test: ', total)
    print('Intersection of persons across train and test: ', intersect)
    print('Percentage of positive class per split: ', positive_ratio)
    print('Test overlap with first fold: ', test_overlap)
        
    return train_X, test_X, train_y, test_y

In [5]:
def train_test_by_time(df, split):
    '''
    Splits dataframe into 75% train and 25% test by signed_up date.
    Test data is taken from the 25% newest offers, this comprises a different customer base to the train set and requries prediction of the future.    
    '''
        
    test_data = df[df.signed_up >= df.signed_up.quantile(q=split)]
    train_data = df[df.signed_up < df.signed_up.quantile(q=split)]
    
    y_train = train_data.complete
    X_train = train_data.drop('complete', axis=1)

    y_test = test_data.complete
    X_test = test_data.drop('complete', axis=1)
    
    
    print('X_train', X_train.shape, round(X_train.shape[0] / (X_train.shape[0]+X_test.shape[0]), 4))
    print('X_test', X_test.shape, round(X_test.shape[0] / (X_train.shape[0]+X_test.shape[0]), 4))
    print('y_train', y_train.shape)
    print('y_test', y_test.shape)
    
    return X_train, X_test, y_train, y_test

In [6]:
def last_offer_split(df):
    '''
    Splits dataframe into 75% train and 25% test by signed_up date.
    Test data is taken from the 25% newest offers, this comprises a different customer base to the train set and requries prediction of the future.    
    '''
    
    # get indices with latest offer time for each person.
    idx = df.groupby(['person'])['time'].transform(max) == df['time']
        
    test_data = df[idx]
    train_data = df[~idx]
    
    y_train = train_data.complete
    X_train = train_data.drop('complete', axis=1)

    y_test = test_data.complete
    X_test = test_data.drop('complete', axis=1)
    
    
    print('X_train', X_train.shape, round(X_train.shape[0] / (X_train.shape[0]+X_test.shape[0]), 4))
    print('X_test', X_test.shape, round(X_test.shape[0] / (X_train.shape[0]+X_test.shape[0]), 4))
    print('y_train', y_train.shape)
    print('y_test', y_test.shape)
    
    return X_train, X_test, y_train, y_test

In [7]:
X_train, X_test, y_train, y_test = last_offer_split(df)

X_train (59283, 54) 0.7772
X_test (16994, 54) 0.2228
y_train (59283,)
y_test (16994,)


In [128]:
train_pool = Pool(data=X_train, label=y_train, cat_features=[0,4,53])
test_pool = Pool(data=X_test, label=y_test, cat_features=[0,4,53])

In [30]:
start_time = timeit.default_timer()

model5 = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.01,
    max_depth=5,
    early_stopping_rounds=200,
    scale_pos_weight=y_train.count() / y_train.sum(),
    task_type='GPU',
    #custom_loss=['AUC', 'Accuracy', 'F1', 'Recall', 'Precision'],
    cat_features=[0,4,53],
    
    #boosting_type = 'Plain',
    #gpu_cat_features_storage = 'CpuPinnedMemory',
    verbose=200
)
model5.fit(
    train_pool,
    eval_set=test_pool,
    verbose=200,
    plot=False
);

elapsed = timeit.default_timer() - start_time

0:	learn: 0.6910330	test: 0.6912348	best: 0.6912348 (0)	total: 34.1ms	remaining: 1m 42s
200:	learn: 0.5471025	test: 0.5862092	best: 0.5862092 (200)	total: 6.74s	remaining: 1m 33s
400:	learn: 0.5257079	test: 0.5755013	best: 0.5755013 (400)	total: 13.4s	remaining: 1m 26s
600:	learn: 0.4183640	test: 0.5743160	best: 0.5740644 (536)	total: 19.9s	remaining: 1m 19s
bestTest = 0.5740644176
bestIteration = 536
Shrink model to first 537 iterations.


In [31]:
elapsed

25.603012399999898

### Testing with reduced features

In [33]:
df = pd.read_pickle('../../data/interim/final_last_view.pickle')
df = label_encode_categories(df)
df.drop(['received_spend', 'viewed_spend', 'viewed_days_left', 'remaining to complete', 'viewed_in_valid', 'offer_spend'], axis=1, inplace=True)

In [34]:
df.drop(['weekday',
    'month',
    'ratio_viewed_complete',
    'hist_viewed_spend',
    't_14',
    'social',
    'discount',
    'hist_difficulty_completed',
    'year',
    't_7c',
    'rewarded',
    'web',
    'bogo',
    't_14c',
    't_28c',
    'hist_reward_completed',
    'hist_previous_completed',
    'hist_viewed_and_completed',
    'hist_complete_not_viewed',
    'hist_viewed'], axis=1, inplace=True)

In [48]:
X_train, X_test, y_train, y_test = last_offer_split(df)

X_train (59283, 54) 0.7772
X_test (16994, 54) 0.2228
y_train (59283,)
y_test (16994,)


In [53]:
train_pool = Pool(data=X_train, label=y_train, cat_features=[0,4,53])
test_pool = Pool(data=X_test, label=y_test, cat_features=[0,4,53])
cat_features=[0,4,53]

In [50]:
start_time = timeit.default_timer()

model5 = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.01,
    max_depth=10,
    early_stopping_rounds=200,
    scale_pos_weight=y_train.count() / y_train.sum(),
    task_type='GPU',
    #custom_loss=['AUC', 'Accuracy', 'F1', 'Recall', 'Precision'],
    cat_features=[0,4,53],
    
    #boosting_type = 'Plain',
    #gpu_cat_features_storage = 'CpuPinnedMemory',
    verbose=200
)
model5.fit(
    train_pool,
    eval_set=test_pool,
    verbose=200,
    plot=False
);

elapsed = timeit.default_timer() - start_time

0:	learn: 0.6895956	test: 0.6905185	best: 0.6905185 (0)	total: 117ms	remaining: 5m 49s
200:	learn: 0.4188764	test: 0.5942568	best: 0.5933509 (166)	total: 23.7s	remaining: 5m 30s
bestTest = 0.5933509069
bestIteration = 166
Shrink model to first 167 iterations.


In [51]:
elapsed

47.15411060000042

In [42]:
X_train.shape

(59283, 34)

In [43]:
start_time = timeit.default_timer()

model5 = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.01,
    max_depth=10,
    early_stopping_rounds=200,
    scale_pos_weight=y_train.count() / y_train.sum(),
    task_type='GPU',
    #custom_loss=['AUC', 'Accuracy', 'F1', 'Recall', 'Precision'],
    cat_features=[0,4,33],
    
    #boosting_type = 'Plain',
    #gpu_cat_features_storage = 'CpuPinnedMemory',
    verbose=200
)
model5.fit(
    train_pool,
    eval_set=test_pool,
    verbose=10,
    plot=False
);

elapsed = timeit.default_timer() - start_time

0:	learn: 0.6892492	test: 0.6906364	best: 0.6906364 (0)	total: 118ms	remaining: 5m 52s
200:	learn: 0.3944276	test: 0.5916147	best: 0.5900391 (174)	total: 21.9s	remaining: 5m 5s
bestTest = 0.590039124
bestIteration = 174
Shrink model to first 175 iterations.


In [44]:
elapsed

41.47813680000036

### Trying gridsearch with latest offer per person as test split

Lets keep all features in for now....


In [106]:
param_grid = {"learning_rate": [0.005, 0.01, 0.02],
              "iterations": [10],
              "max_depth": [5,6,7,8,9,10]}
              #"l2_leaf_reg":[1,3,5,10,100],
              #"border_count":[32,5,10,20,50,100,200],
              #"ctr_border_count":[50,5,10,20,100,200]}

In [108]:
model = CatBoostClassifier(
    cat_features=cat_features, 
    verbose=20,
    task_type='GPU'
)

grid_search = GridSearchCV(model, param_grid=param_grid, refit=True, cv=GroupKFold(n_splits=5).split(X_train, y_train, groups=X_train.person))

In [109]:
start_time = timeit.default_timer()

grid_search.fit(X_train, y=y_train)

elapsed = timeit.default_timer() - start_time

0:	learn: 0.6863176	total: 25.4ms	remaining: 229ms
9:	learn: 0.6295701	total: 215ms	remaining: 0us
0:	learn: 0.6863363	total: 15.4ms	remaining: 139ms
9:	learn: 0.6297755	total: 172ms	remaining: 0us
0:	learn: 0.6863977	total: 16.8ms	remaining: 151ms
9:	learn: 0.6308656	total: 175ms	remaining: 0us
0:	learn: 0.6863196	total: 16.8ms	remaining: 151ms
9:	learn: 0.6298921	total: 181ms	remaining: 0us
0:	learn: 0.6863277	total: 16.5ms	remaining: 148ms
9:	learn: 0.6302779	total: 164ms	remaining: 0us
0:	learn: 0.6862259	total: 21.6ms	remaining: 195ms
9:	learn: 0.6295771	total: 210ms	remaining: 0us
0:	learn: 0.6862609	total: 26.1ms	remaining: 235ms
9:	learn: 0.6299990	total: 204ms	remaining: 0us
0:	learn: 0.6863438	total: 23ms	remaining: 207ms
9:	learn: 0.6307446	total: 227ms	remaining: 0us
0:	learn: 0.6862301	total: 20.8ms	remaining: 187ms
9:	learn: 0.6295844	total: 200ms	remaining: 0us
0:	learn: 0.6862715	total: 17.5ms	remaining: 158ms
9:	learn: 0.6301738	total: 195ms	remaining: 0us
0:	learn: 0.

In [110]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_iterations,param_learning_rate,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.799666,0.099188,0.0378,0.005561,10,0.005,5,"{'iterations': 10, 'learning_rate': 0.005, 'ma...",0.940373,0.940035,0.947289,0.939356,0.943657,0.942142,0.00297,1
1,0.758221,0.012179,0.031483,0.003443,10,0.005,6,"{'iterations': 10, 'learning_rate': 0.005, 'ma...",0.940373,0.940035,0.947289,0.939356,0.943657,0.942142,0.00297,1
2,0.816615,0.018999,0.036899,0.009792,10,0.005,7,"{'iterations': 10, 'learning_rate': 0.005, 'ma...",0.940373,0.940035,0.947289,0.939356,0.943657,0.942142,0.00297,1
3,0.888763,0.012457,0.035306,0.002326,10,0.005,8,"{'iterations': 10, 'learning_rate': 0.005, 'ma...",0.940373,0.940035,0.947289,0.939356,0.943657,0.942142,0.00297,1
4,1.020631,0.018823,0.035688,0.002754,10,0.005,9,"{'iterations': 10, 'learning_rate': 0.005, 'ma...",0.940373,0.940035,0.947289,0.939356,0.943657,0.942142,0.00297,1
5,1.678764,0.840027,0.066767,0.046869,10,0.005,10,"{'iterations': 10, 'learning_rate': 0.005, 'ma...",0.940373,0.940035,0.947289,0.939356,0.943657,0.942142,0.00297,1
6,1.43373,0.223851,0.063713,0.013304,10,0.01,5,"{'iterations': 10, 'learning_rate': 0.01, 'max...",0.940373,0.940035,0.947289,0.939356,0.943657,0.942142,0.00297,1
7,1.043056,0.040634,0.050557,0.008667,10,0.01,6,"{'iterations': 10, 'learning_rate': 0.01, 'max...",0.940373,0.940035,0.947289,0.939356,0.943657,0.942142,0.00297,1
8,0.94739,0.03964,0.044378,0.002883,10,0.01,7,"{'iterations': 10, 'learning_rate': 0.01, 'max...",0.940373,0.940035,0.947289,0.939356,0.943657,0.942142,0.00297,1
9,0.983474,0.017775,0.04148,0.010772,10,0.01,8,"{'iterations': 10, 'learning_rate': 0.01, 'max...",0.940373,0.940035,0.947289,0.939356,0.943657,0.942142,0.00297,1


In [111]:
grid_search.best_estimator_

<catboost.core.CatBoostClassifier at 0x1576c9a0048>

In [112]:
grid_search.best_params_

{'iterations': 10, 'learning_rate': 0.005, 'max_depth': 5}

In [116]:
grid_search.refit_time_

0.911334753036499

In [None]:
model = CatBoostClassifier(
    iterations=2000,
    scale_pos_weight=y_train.count() / y_train.sum(),
    task_type='GPU',
    custom_loss=['AUC', 'Accuracy', 'F1', 'Recall', 'Precision'],
    cat_features=[0,4,33],
    
    #boosting_type = 'Plain',
    #gpu_cat_features_storage = 'CpuPinnedMemory',
    verbose=20
)
model5.fit(
    train_pool,
    eval_set=test_pool,
    verbose=False,
    plot=True
);

In [14]:
train_pool = Pool(data=X_train, label=y_train, cat_features=[0,4,53])
test_pool = Pool(data=X_test, label=y_test, cat_features=[0,4,53])
cat_features=[0,4,53]

In [11]:
param_grid = {"learning_rate": [0.5], #, [0.005, 0.01, 
              "max_depth": [5,6],
              "iterations": [200]
             } 
              #"l2_leaf_reg":[1,3,5,10,100],
              #"border_count":[32,5,10,20,50,100,200],
              #"ctr_border_count":[50,5,10,20,100,200]}

In [2]:
model = CatBoostClassifier(
    cat_features=cat_features,
    early_stopping_rounds=1,
    scale_pos_weight=y_train.count() / y_train.sum(),
    verbose=20,
    task_type='GPU'
)

grid_search = GridSearchCV(model, scoring='roc_auc', param_grid=param_grid, refit=True, cv=GroupKFold(n_splits=5).split(X_train, y_train, groups=X_train.person))

NameError: name 'CatBoostClassifier' is not defined

In [142]:
start_time = timeit.default_timer()

grid_search.fit(X_train, y=y_train)

elapsed = timeit.default_timer() - start_time

0:	learn: 0.6173306	total: 32.2ms	remaining: 6.4s
20:	learn: 0.3075878	total: 668ms	remaining: 5.7s
40:	learn: 0.1997521	total: 1.27s	remaining: 4.91s
60:	learn: 0.1708804	total: 1.86s	remaining: 4.23s
80:	learn: 0.1485835	total: 2.48s	remaining: 3.64s
100:	learn: 0.1224945	total: 3.1s	remaining: 3.04s
120:	learn: 0.1097060	total: 3.7s	remaining: 2.41s
140:	learn: 0.0986297	total: 4.3s	remaining: 1.8s
160:	learn: 0.0896862	total: 4.92s	remaining: 1.19s
180:	learn: 0.0827121	total: 5.55s	remaining: 582ms
199:	learn: 0.0762255	total: 6.17s	remaining: 0us
0:	learn: 0.6199492	total: 26.5ms	remaining: 5.27s
20:	learn: 0.4902503	total: 642ms	remaining: 5.47s
40:	learn: 0.4004076	total: 1.22s	remaining: 4.74s
60:	learn: 0.3512515	total: 1.83s	remaining: 4.17s
80:	learn: 0.1701213	total: 2.43s	remaining: 3.57s
100:	learn: 0.1426972	total: 3.1s	remaining: 3.04s
120:	learn: 0.1270439	total: 3.71s	remaining: 2.42s
140:	learn: 0.1101042	total: 4.37s	remaining: 1.83s
160:	learn: 0.0996216	total: 5.

In [158]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_iterations,param_learning_rate,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,7.056698,0.180455,0.047971,0.02361,200,0.5,5,"{'iterations': 200, 'learning_rate': 0.5, 'max...",0.544935,0.507377,0.58426,0.56725,0.567447,0.554253,0.026562,2
1,11.210505,3.019512,0.058857,0.027005,200,0.5,6,"{'iterations': 200, 'learning_rate': 0.5, 'max...",0.548099,0.555447,0.559794,0.566323,0.548832,0.555699,0.00685,1


In [149]:
grid_search.best_score_

0.5556989281488495

In [150]:
grid_search.best_params_

{'iterations': 200, 'learning_rate': 0.5, 'max_depth': 6}

In [151]:
grid_search.best_index_

1

In [152]:
grid_search.scorer_

make_scorer(roc_auc_score, needs_threshold=True)

In [153]:
grid_search.n_splits_

5

In [154]:
grid_search.refit_time_

11.524693012237549

In [156]:
pred = grid_search.predict(X_test)
confusion_matrix(y_test, pred)

array([[14888,   883],
       [ 1087,   136]], dtype=int64)

### Testing parameter grid method

In [8]:
from sklearn.model_selection import ParameterGrid

In [14]:
# need to loop through 5 test pool, train pool combinations 

train_pool = Pool(data=X_train, label=y_train, cat_features=[0,4,53])
test_pool = Pool(data=X_test, label=y_test, cat_features=[0,4,53])
cat_features=[0,4,53]

In [47]:
for i in range(5):
    test_pool = Pool(data=test_X[i], label=test_y[i], cat_features=[0,4,53])
    train_pool = Pool(data=train_X[i], label=train_y[i], cat_features=[0,4,53])
        
    

0
1
2
3
4


In [77]:
test_X[i]

Unnamed: 0,person,time,age,income,id,rewarded,difficulty,reward,duration,mobile,web,social,bogo,discount,informational,signed_up,date,weekday,month,year,day,last_transaction,last_transaction_days,last_amount,t_7,t_14,t_28,t_84,t_365,t_2000,t_7c,t_14c,t_28c,t_84c,t_365c,t_2000c,hist_reward_completed,hist_reward_possible,hist_difficulty_completed,hist_difficulty_possible,hist_previous_completed,hist_previous_offers,hist_viewed_and_completed,hist_complete_not_viewed,hist_failed_complete,ratio_reward/offered,ratio_difficulty/offered,completed_ratio,hist_viewed,hist_received_spend,hist_viewed_spend,ratio_viewed_complete,last_view_date,gender
13,9522,0,,,3,0.0,10.0,2.0,10.0,1.0,1.0,1.0,0.0,1.0,0.0,-304,-304,0,9,2017,25,,,0.00,0.0,0.0,0.0,0.00,0.00,0.00,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,,,,0,0.0,0.0,,,0
14,9522,168,,,2,0.0,10.0,10.0,5.0,1.0,1.0,1.0,1.0,0.0,0.0,-304,-136,0,3,2018,12,,,0.00,0.0,0.0,0.0,0.00,0.00,0.00,0,0,0,0,0,0,0.0,2.0,0.0,10.0,0,1,0,0,1,0.0,0.0,0.0,0,0.0,0.0,,,0
15,9522,336,,,8,0.0,0.0,0.0,4.0,1.0,1.0,0.0,0.0,0.0,1.0,-304,32,0,8,2018,27,,,0.00,0.0,0.0,0.0,0.00,0.00,0.00,0,0,0,0,0,0,0.0,12.0,0.0,20.0,0,2,0,0,2,0.0,0.0,0.0,0,0.0,0.0,,,0
16,9522,408,,,4,0.0,10.0,2.0,7.0,1.0,1.0,0.0,0.0,1.0,0.0,-304,104,2,11,2018,7,,,0.00,0.0,0.0,0.0,0.00,0.00,0.00,0,0,0,0,0,0,0.0,12.0,0.0,20.0,0,3,0,0,3,0.0,0.0,0.0,0,0.0,0.0,,,0
65,14188,0,88.0,53000.0,0,0.0,20.0,5.0,10.0,0.0,1.0,0.0,0.0,1.0,0.0,-434,-434,3,5,2017,18,,,0.00,0.0,0.0,0.0,0.00,0.00,0.00,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,,,,0,0.0,0.0,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76249,15296,504,56.0,43000.0,5,0.0,7.0,3.0,7.0,1.0,1.0,1.0,0.0,1.0,0.0,-313,191,5,2,2019,2,149.0,42.0,3.88,0.0,0.0,0.0,3.88,3.88,3.88,0,0,0,1,1,1,0.0,5.0,0.0,5.0,0,1,0,0,1,0.0,0.0,0.0,0,0.0,0.0,,,1
76253,9763,408,60.0,67000.0,2,0.0,10.0,10.0,5.0,1.0,1.0,1.0,1.0,0.0,0.0,-605,-197,2,1,2018,10,,,0.00,0.0,0.0,0.0,0.00,0.00,0.00,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,,,,0,0.0,0.0,,,0
76255,2435,408,75.0,116000.0,5,0.0,7.0,3.0,7.0,1.0,1.0,1.0,0.0,1.0,0.0,-423,-15,2,7,2018,11,,,0.00,0.0,0.0,0.0,0.00,0.00,0.00,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,,,,0,0.0,0.0,,,1
76270,8872,504,48.0,58000.0,2,0.0,10.0,10.0,5.0,1.0,1.0,1.0,1.0,0.0,0.0,-46,458,6,10,2019,27,,,0.00,0.0,0.0,0.0,0.00,0.00,0.00,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,,,,0,0.0,0.0,,,1


In [10]:
grid = {"learning_rate": [0.01, 0.02], #, [0.005, 0.01, 
        "max_depth": [5,6]}

In [None]:
test_pool = Pool(data=test_X[i], label=test_y[i], cat_features=[0,4,53])
train_pool = Pool(data=train_X[i], label=train_y[i], cat_features=[0,4,53])

In [84]:
test_X[0].shape, test_y[0].shape, train_X[0].shape, train_y[0].shape

((11857, 54), (11857,), (47426, 54), (47426,))

In [118]:
def generate_folds(cv):
    train_X, train_y, test_X, test_y = [], [], [], []

    for i in cv:
        train_X.append(X_train.iloc[i[0]])
        train_y.append(y_train.iloc[i[0]])

        test_X.append(X_train.iloc[i[1]])
        test_y.append(y_train.iloc[i[1]])
    
    return train_X, train_y, test_X, test_y

cv = GroupKFold(n_splits=5).split(X_train, y_train, groups=X_train.person)
train_X, train_y, test_X, test_y = generate_folds(cv)

In [133]:
grid = {"learning_rate": [0.1, 0.02, 0.01, 0.005],
        "max_depth": [5,6,7,8,9,10]}

In [132]:
def gridsearch_early_stopping(train_X, train_y, test_X, test_y, fold, grid):
    
    test_pool = Pool(data=test_X[fold], label=test_y[fold], cat_features=[0,4,53])
    train_pool = Pool(data=train_X[fold], label=train_y[fold], cat_features=[0,4,53])

    results_df = pd.DataFrame(columns=['params' + str(fold), 'logloss'+ str(fold), 'AUC'+ str(fold), 'iteration'+ str(fold)])

    best_score = 99999
    for params in ParameterGrid(grid):

        model = CatBoostClassifier(cat_features=cat_features,
                                    early_stopping_rounds=50,
                                    scale_pos_weight=y_train.count() / y_train.sum(),
                                    task_type='GPU',
                                    custom_loss=['AUC'],
                                   iterations=3000,
                                    **params)

        model.fit(train_pool, eval_set=test_pool, verbose=100)

        results_df = results_df.append(pd.DataFrame([[params, model.get_best_score()['validation']['Logloss:use_weights=true'], 
                                                      model.get_best_score()['validation']['AUC'], 
                                                      model.get_best_iteration()]], 
                                                    columns=['params' + str(fold), 'logloss' + str(fold), 'AUC' + str(fold), 'iteration' + str(fold)]))

        # save if best
        if model.get_best_score()['validation']['Logloss:use_weights=true'] < best_score:
            best_score = model.get_best_score()['validation']['Logloss:use_weights=true']
            best_grid = params

    print("Best logloss: ", best_score) 
    print("Grid:", best_grid)
    display(results_df)
    return results_df

In [134]:
results_df1 = gridsearch_early_stopping(train_X, train_y, test_X, test_y, 1, grid)

0:	learn: 0.6719602	test: 0.6852992	best: 0.6852992 (0)	total: 78.4ms	remaining: 3m 55s
100:	learn: 0.5407313	test: 0.6185525	best: 0.6183758 (81)	total: 7.58s	remaining: 3m 37s
200:	learn: 0.5354613	test: 0.6166576	best: 0.6163693 (189)	total: 14.9s	remaining: 3m 28s
bestTest = 0.6163692783
bestIteration = 189
Shrink model to first 190 iterations.
0:	learn: 0.6700874	test: 0.6797002	best: 0.6797002 (0)	total: 86.8ms	remaining: 4m 20s
100:	learn: 0.5140354	test: 0.6159941	best: 0.6159941 (100)	total: 9.48s	remaining: 4m 32s
bestTest = 0.615287982
bestIteration = 136
Shrink model to first 137 iterations.
0:	learn: 0.6684351	test: 0.6742885	best: 0.6742885 (0)	total: 279ms	remaining: 13m 56s
100:	learn: 0.4733362	test: 0.6111957	best: 0.6102300 (83)	total: 15.1s	remaining: 7m 13s
bestTest = 0.6102300426
bestIteration = 83
Shrink model to first 84 iterations.
0:	learn: 0.6684029	test: 0.6742872	best: 0.6742872 (0)	total: 178ms	remaining: 8m 52s
100:	learn: 0.3626749	test: 0.6101409	best: 

KeyboardInterrupt: 

In [131]:
results_df0

Unnamed: 0,params0,logloss0,AUC0,iteration0
0,"{'learning_rate': 0.1, 'max_depth': 5}",0.653899,0.726867,4
0,"{'learning_rate': 0.1, 'max_depth': 6}",0.647679,0.736308,5
0,"{'learning_rate': 0.1, 'max_depth': 7}",0.64331,0.737003,4
0,"{'learning_rate': 0.1, 'max_depth': 8}",0.647431,0.736705,2
0,"{'learning_rate': 0.1, 'max_depth': 9}",0.632965,0.736672,6
0,"{'learning_rate': 0.1, 'max_depth': 10}",0.629436,0.732582,7
0,"{'learning_rate': 0.02, 'max_depth': 5}",0.660207,0.722625,25
0,"{'learning_rate': 0.02, 'max_depth': 6}",0.66525,0.734688,13
0,"{'learning_rate': 0.02, 'max_depth': 7}",0.663853,0.726029,15
0,"{'learning_rate': 0.02, 'max_depth': 8}",0.656592,0.736671,14


In [115]:
model.get_best_score()

{'learn': {'Logloss:use_weights=true': 0.6349915993429311,
  'AUC': 0.7834212779998779},
 'validation': {'Logloss:use_weights=true': 0.6652498233629587,
  'AUC': 0.734688013792038}}

In [104]:
results_df

Unnamed: 0,params,logloss,iteration
0,"{'learning_rate': 0.01, 'max_depth': 5}",0.669595,29
0,"{'learning_rate': 0.01, 'max_depth': 6}",0.648677,60
0,"{'learning_rate': 0.02, 'max_depth': 5}",0.660207,25
0,"{'learning_rate': 0.02, 'max_depth': 6}",0.66525,13


In [76]:
results_df = pd.DataFrame(columns=['params', 'logloss', 'iteration'])

In [92]:
results_df

Unnamed: 0,params,logloss,iteration


In [93]:
results_df.append(pd.DataFrame([[params, model.get_best_score()['validation']['Logloss:use_weights=true'], model.get_best_iteration()]], columns=['params', 'logloss', 'iteration']))

Unnamed: 0,params,logloss,iteration
0,"{'learning_rate': 0.02, 'max_depth': 6}",0.66525,13


In [94]:
results_df

Unnamed: 0,params,logloss,iteration


In [88]:
results_df.append(pd.DataFrame([[1,2,3]], columns=['params', 'logloss', 'iteration']))

Unnamed: 0,params,logloss,iteration
0,1,2,3


In [84]:
results_df.append(pd.DataFrame([1,2,3]))

Unnamed: 0,0,iteration,logloss,params
0,1.0,,,
1,2.0,,,
2,3.0,,,


In [82]:
results_df

Unnamed: 0,params,logloss,iteration


In [58]:
pd.DataFrame(results)

Unnamed: 0,0,1,2
0,"{'learning_rate': 0.01, 'max_depth': 5}",0.669595,29
1,"{'learning_rate': 0.01, 'max_depth': 6}",0.648677,60
2,"{'learning_rate': 0.02, 'max_depth': 5}",0.660207,25
3,"{'learning_rate': 0.02, 'max_depth': 6}",0.66525,13


In [47]:
pd.DataFrame(results)

Unnamed: 0,0,1,2
0,"{'learning_rate': 0.01, 'max_depth': 5}",{'learn': {'Logloss:use_weights=true': 0.53840...,
1,"{'learning_rate': 0.01, 'max_depth': 6}",{'learn': {'Logloss:use_weights=true': 0.51460...,997.0
2,"{'learning_rate': 0.02, 'max_depth': 5}",{'learn': {'Logloss:use_weights=true': 0.54037...,630.0
3,"{'learning_rate': 0.02, 'max_depth': 6}",{'learn': {'Logloss:use_weights=true': 0.51817...,426.0


In [98]:
model.get_evals_result()['validation']['Logloss:use_weights=true']

In [None]:
model = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.01,
    early_stopping_rounds=50,
    scale_pos_weight=30,
    max_depth=8,    
    custom_loss=['AUC', 'Accuracy', 'F1', 'Recall', 'Precision']
)

model.fit(
    train_pool,
    eval_set=test_pool,
    verbose=False,
    plot=True
);

In [31]:
results

In [39]:
results[0].pop(-1)

583

In [41]:
pd.DataFrame(results[0])

Unnamed: 0,learning_rate,max_depth,learn,validation
0,0.01,5.0,,
1,,,{'Logloss:use_weights=true': 0.5384075827689264},{'Logloss:use_weights=true': 0.615135675801536}


In [105]:
pd.DataFrame(results)

Unnamed: 0,0,1,2
0,"{'learning_rate': 0.01, 'max_depth': 5}",{'learn': {'Logloss:use_weights=true': 0.45182...,536
1,"{'learning_rate': 0.01, 'max_depth': 6}",{'learn': {'Logloss:use_weights=true': 0.47043...,383
2,"{'learning_rate': 0.02, 'max_depth': 5}",{'learn': {'Logloss:use_weights=true': 0.46549...,248
3,"{'learning_rate': 0.02, 'max_depth': 6}",{'learn': {'Logloss:use_weights=true': 0.47954...,212


In [102]:
pd.DataFrame(results)

Unnamed: 0,0,1,2
0,"{'learning_rate': 0.1, 'max_depth': 5}",{'learn': {'Logloss:use_weights=true': 0.22154...,39
1,"{'learning_rate': 0.1, 'max_depth': 6}",{'learn': {'Logloss:use_weights=true': 0.22959...,29
2,"{'learning_rate': 0.2, 'max_depth': 5}",{'learn': {'Logloss:use_weights=true': 0.19027...,21
3,"{'learning_rate': 0.2, 'max_depth': 6}",{'learn': {'Logloss:use_weights=true': 0.20318...,18


In [99]:
pd.DataFrame(results)

Unnamed: 0,0,1,2
0,"{'learning_rate': 0.5, 'max_depth': 5}",{'learn': {'Logloss:use_weights=true': 0.16781...,6
1,"{'learning_rate': 0.5, 'max_depth': 6}",{'learn': {'Logloss:use_weights=true': 0.14435...,6
2,"{'learning_rate': 0.5, 'max_depth': 7}",{'learn': {'Logloss:use_weights=true': 0.12055...,4
3,"{'learning_rate': 0.1, 'max_depth': 5}",{'learn': {'Logloss:use_weights=true': 0.22154...,39
4,"{'learning_rate': 0.1, 'max_depth': 6}",{'learn': {'Logloss:use_weights=true': 0.22959...,29
5,"{'learning_rate': 0.1, 'max_depth': 7}",{'learn': {'Logloss:use_weights=true': 0.28921...,25


In [None]:
model.

In [17]:
model.fit(train_pool, eval_set=test_pool)

Learning rate set to 0.119787
0:	learn: 0.6677736	test: 0.6704917	best: 0.6704917 (0)	total: 44.8ms	remaining: 44.8s
20:	learn: 0.5085975	test: 0.5819325	best: 0.5817256 (19)	total: 919ms	remaining: 42.8s
bestTest = 0.5817255879
bestIteration = 19
Shrink model to first 20 iterations.


<catboost.core.CatBoostClassifier at 0x248ba5f6518>

In [18]:
model.get_best_score()

{'learn': {'Logloss:use_weights=true': 0.5085975424852496},
 'validation': {'Logloss:use_weights=true': 0.5817255879044358}}

In [19]:
model.get_best_iteration()

19

In [None]:
model.get_object_importance(train_pool,
    train_pool,
                      top_size=-1,
                      type='Average',
                      update_method='SinglePoint',
                      importance_values_sign='All',
                      thread_count=-1,
                      verbose=False)

In [20]:
model.get_test_eval()

In [21]:
model.get_evals_result()

{'learn': {'Logloss:use_weights=true': [0.6677735550394238,
   0.6437368332520323,
   0.6273966475382478,
   0.6147375023655787,
   0.6032615366785301,
   0.5800370336468236,
   0.5738188442758154,
   0.5680215226434986,
   0.5352593525768335,
   0.5318268971160399,
   0.5290954468319363,
   0.5259274883393682,
   0.5237151930844832,
   0.5207826898201713,
   0.5189144849002969,
   0.5163074459017865,
   0.5144634989773305,
   0.512844421974039,
   0.5112494333298343,
   0.5100658806442843,
   0.5085975424852496]},
 'validation': {'Logloss:use_weights=true': [0.6704917307558182,
   0.6529562784755173,
   0.6384501359818565,
   0.6269557346962599,
   0.618640134867416,
   0.614182055136338,
   0.6091115622731497,
   0.6042570782585983,
   0.6022805602513873,
   0.6001894231441892,
   0.5979519694111997,
   0.5944733908531418,
   0.5927953799295077,
   0.5893572831865422,
   0.586878420250348,
   0.5853928170125648,
   0.5839287511587589,
   0.5830425962887754,
   0.5825808919271006,
   

In [86]:
model.fit(train_pool, eval_set=test_pool)

CatBoostError: Can't deepcopy _PoolBase object

In [23]:
rf.set_params()

<catboost.core.CatBoostClassifier at 0x1f1f0e91a58>

In [107]:
model = CatBoostClassifier(
    cat_features=cat_features,
    early_stopping_rounds=1,
    scale_pos_weight=y_train.count() / y_train.sum(),
    verbose=20,
    task_type='GPU',
    custom_loss=['AUC', 'BalancedAccuracy']
)

In [31]:
rf.get_best_score()['learn']['Logloss:use_weights=true']

0.0927799791673014

In [137]:
results_df0 = pd.read_pickle('data.pkl')

In [141]:
results_df1 = pd.read_pickle('data1.pkl')

In [144]:
results_df2 = pd.read_pickle('data2.pkl')

In [148]:
results_df3 = pd.read_pickle('data3.pkl')

In [154]:
results_df4 = pd.read_pickle('data4.pkl')

In [138]:
results_df0

Unnamed: 0,params0,logloss0,AUC0,iteration0
0,"{'learning_rate': 0.1, 'max_depth': 5}",0.608287,0.734832,145
0,"{'learning_rate': 0.1, 'max_depth': 6}",0.601999,0.735076,109
0,"{'learning_rate': 0.1, 'max_depth': 7}",0.607333,0.748159,116
0,"{'learning_rate': 0.1, 'max_depth': 8}",0.606928,0.730487,104
0,"{'learning_rate': 0.1, 'max_depth': 9}",0.619681,0.714168,89
0,"{'learning_rate': 0.1, 'max_depth': 10}",0.610642,0.729771,63
0,"{'learning_rate': 0.02, 'max_depth': 5}",0.62116,0.731875,344
0,"{'learning_rate': 0.02, 'max_depth': 6}",0.608377,0.744995,461
0,"{'learning_rate': 0.02, 'max_depth': 7}",0.607477,0.735902,539
0,"{'learning_rate': 0.02, 'max_depth': 8}",0.607547,0.737098,537


In [142]:
results_df1

Unnamed: 0,params1,logloss1,AUC1,iteration1
0,"{'learning_rate': 0.1, 'max_depth': 5}",0.614636,0.741048,56
0,"{'learning_rate': 0.1, 'max_depth': 6}",0.618852,0.725555,81
0,"{'learning_rate': 0.1, 'max_depth': 7}",0.612979,0.726987,117
0,"{'learning_rate': 0.1, 'max_depth': 8}",0.632317,0.710375,70
0,"{'learning_rate': 0.1, 'max_depth': 9}",0.61724,0.738019,90
0,"{'learning_rate': 0.1, 'max_depth': 10}",0.614557,0.732774,30
0,"{'learning_rate': 0.02, 'max_depth': 5}",0.614626,0.744373,241
0,"{'learning_rate': 0.02, 'max_depth': 6}",0.615461,0.739815,215
0,"{'learning_rate': 0.02, 'max_depth': 7}",0.62317,0.725185,177
0,"{'learning_rate': 0.02, 'max_depth': 8}",0.631351,0.715685,102


In [143]:
results_df1.to_pickle('results_df1.pickle')

In [145]:
results_df2

Unnamed: 0,params2,logloss2,AUC2,iteration2
0,"{'learning_rate': 0.1, 'max_depth': 5}",0.612018,0.73128,118
0,"{'learning_rate': 0.1, 'max_depth': 6}",0.638834,0.715092,32
0,"{'learning_rate': 0.1, 'max_depth': 7}",0.616381,0.731701,56
0,"{'learning_rate': 0.1, 'max_depth': 8}",0.618958,0.728353,71
0,"{'learning_rate': 0.1, 'max_depth': 9}",0.620921,0.71956,73
0,"{'learning_rate': 0.1, 'max_depth': 10}",0.618597,0.718984,43
0,"{'learning_rate': 0.02, 'max_depth': 5}",0.615482,0.740213,755
0,"{'learning_rate': 0.02, 'max_depth': 6}",0.615426,0.730193,617
0,"{'learning_rate': 0.02, 'max_depth': 7}",0.6255,0.72793,151
0,"{'learning_rate': 0.02, 'max_depth': 8}",0.621934,0.725533,171


In [146]:
results_df2.to_pickle('results_df2.pickle')

In [149]:
results_df3

Unnamed: 0,params3,logloss3,AUC3,iteration3
0,"{'learning_rate': 0.1, 'max_depth': 5}",0.634773,0.707201,35
0,"{'learning_rate': 0.1, 'max_depth': 6}",0.628044,0.719755,57
0,"{'learning_rate': 0.1, 'max_depth': 7}",0.631853,0.708617,41
0,"{'learning_rate': 0.1, 'max_depth': 8}",0.636004,0.717468,18
0,"{'learning_rate': 0.1, 'max_depth': 9}",0.617004,0.716435,55
0,"{'learning_rate': 0.1, 'max_depth': 10}",0.629258,0.707765,23
0,"{'learning_rate': 0.02, 'max_depth': 5}",0.623315,0.721419,266
0,"{'learning_rate': 0.02, 'max_depth': 6}",0.628768,0.714292,151
0,"{'learning_rate': 0.02, 'max_depth': 7}",0.630626,0.715307,264
0,"{'learning_rate': 0.02, 'max_depth': 8}",0.632324,0.719181,140


In [150]:
results_df3.to_pickle('results_df3.pickle')

In [155]:
results_df4.to_pickle('results_df4.pickle')

### Setting up to try XGBoost and then do gridsearch

In [153]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [152]:
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)