This notebook handles all modeling. I tried most classification models on sklearn as well as some ensemble methods. Each model was tuned for precision using a gridsearch when available. the fitted models were saved to the models folder so that they can be analyzed in the roc_auc notebook.

In [8]:
import pandas as pd
import numpy as np
from joblib import dump, load

Increase regularization for all models
also try not using smote

In [2]:
X_train_sc = pd.read_csv('X_train_sc.csv')
X_val_sc = pd.read_csv('X_val_sc.csv')
y_train_enc = np.ravel(pd.read_csv('y_train_enc.csv'))
y_val_enc = np.ravel(pd.read_csv('y_val_enc.csv'))

#make train and val customer id frames and drop from data
X_train_id = X_train_sc['customerid']
X_train_sc.drop('customerid',axis=1,inplace=True)
X_val_id = X_val_sc['customerid']
X_val_sc.drop('customerid',axis=1,inplace=True)


In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

I tried using smote to resample the data because there is an imbalance between ones and zeros, but it did not help the scores.

In [4]:
# from imblearn.over_sampling import SMOTE
# sm = SMOTE()
# X_train_sc, y_train_enc = sm.fit_resample(X_train_sc,y_train_enc)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report



This function fits the model and scores the train and validation precision. It also gives a report on the model.

In [6]:
def modeler(est,X_train,y_train,X_val,y_val):
    
    est.fit(X_train,y_train)
    
    
    train_acc_score = np.mean(cross_val_score(est,X_train,y_train,scoring='precision',cv=7,n_jobs=-1))
    val_acc_score = np.mean(cross_val_score(est,X_val,y_val,scoring='precision',cv=7,n_jobs=4))
    diff = abs(train_acc_score - val_acc_score)
    
    y_val_pred = est.predict(X_val)
    
    
    print(f'train precision: {round(train_acc_score,5)}')
    print('-'*40)
    print(f'val precision: {round(val_acc_score,5)}')
    print('-'*40)
    print(f'difference: {round(diff,7)}')
    print('-'*40)
    print(f'val report: ')
    print(classification_report(y_val,y_val_pred))
    print('-'*40)
    return est

This is where the models begin.

In [7]:
lr = LogisticRegression(random_state=1,solver='liblinear',C=1,tol=0.001)

modeler(lr,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
dump(lr,'models/best_lr.joblib')

train precision: 0.67286
----------------------------------------
val precision: 0.65909
----------------------------------------
difference: 0.0137675
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       971
           1       0.67      0.58      0.62       350

    accuracy                           0.81      1321
   macro avg       0.76      0.74      0.75      1321
weighted avg       0.81      0.81      0.81      1321

----------------------------------------


['models/best_lr.joblib']

This one is less accurate, but scores are still close. Might be good for bagging.

In [70]:
#bag the above model

lr2 = LogisticRegression(penalty='l2',
                         random_state=1,
                         solver='saga',
                         C=0.002,
                         tol=0.01)

bag = BaggingClassifier(lr2,n_estimators=100,
                        max_features=0.9,
                        bootstrap_features=False,
                        oob_score=True,
                        n_jobs=4,
                        random_state=1)

lrbag = modeler(bag,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

dump(lrbag,'models/lr_bag.joblib')

train precision: 0.78172
----------------------------------------
val precision: 0.87245
----------------------------------------
difference: 0.0907296
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.81      0.95      0.87       971
           1       0.73      0.36      0.48       350

    accuracy                           0.79      1321
   macro avg       0.77      0.66      0.68      1321
weighted avg       0.78      0.79      0.77      1321

----------------------------------------


['models/lr_bag.joblib']

In [40]:
from sklearn.linear_model import RidgeClassifier

ridge = RidgeClassifier(random_state=1)


In [42]:
param_grid = {'alpha':[1000], 'solver':['saga']}

grid = GridSearchCV(ridge,param_grid,n_jobs=4,scoring='precision')



ridge_bag = BaggingClassifier(grid,n_estimators=100,n_jobs=4,max_features=0.8)




modeler(ridge_bag,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

# ridge_bag = ridge_bag.best_estimator_

dump(ridge_bag,'models/ridge_bag.joblib')

train precision: 0.73525
----------------------------------------
val precision: 0.77917
----------------------------------------
difference: 0.0439247
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.82      0.94      0.87       971
           1       0.70      0.42      0.53       350

    accuracy                           0.80      1321
   macro avg       0.76      0.68      0.70      1321
weighted avg       0.79      0.80      0.78      1321

----------------------------------------


['models/ridge_bag.joblib']

In [None]:
grid.best_params_

In [12]:
lr3 = LogisticRegression(random_state=1)
param_grid = {
    'penalty':['l1'],
    'C':[0.01],
    'solver':['saga']}


grid = GridSearchCV(lr3,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
lr3_best = grid.best_estimator_

dump(lr3_best,'models/3rdlr.joblib')

train precision: 0.74518
----------------------------------------
val precision: 0.0
----------------------------------------
difference: 0.7451807
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.82      0.94      0.87       971
           1       0.70      0.42      0.52       350

    accuracy                           0.80      1321
   macro avg       0.76      0.68      0.70      1321
weighted avg       0.79      0.80      0.78      1321

----------------------------------------


['models/3rdlr.joblib']

In [13]:
grid.best_params_

{'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}

In [14]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(random_state=1)

param_grid = {'loss':['modified_huber'],
             'penalty':['elasticnet'],
             'alpha':[1.5],
              'l1_ratio':[0.05],
             'learning_rate':['adaptive'],
             'eta0':[1]}
grid = GridSearchCV(sgd,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
sgd_best = grid.best_estimator_

dump(sgd_best,'models/sgd.joblib')

train precision: 0.80703
----------------------------------------
val precision: 0.77804
----------------------------------------
difference: 0.0289827
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.78      0.98      0.86       971
           1       0.77      0.21      0.34       350

    accuracy                           0.78      1321
   macro avg       0.77      0.60      0.60      1321
weighted avg       0.77      0.78      0.72      1321

----------------------------------------


['models/sgd.joblib']

In [15]:
grid.best_params_

{'alpha': 1.5,
 'eta0': 1,
 'l1_ratio': 0.05,
 'learning_rate': 'adaptive',
 'loss': 'modified_huber',
 'penalty': 'elasticnet'}

These are the params that Griffin found

In [10]:
from sklearn.svm import SVC

svc = SVC(random_state=10,probability=True)

param_grid = {'kernel':['poly'],
             'C':[0.01],
             'gamma':['scale'],
              'degree':[4]
             }
grid = GridSearchCV(svc,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
svc_best = grid.best_estimator_

dump(svc_best,'models/svc.joblib')

train precision: 0.86759
----------------------------------------
val precision: 0.0
----------------------------------------
difference: 0.867592
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.76      0.99      0.86       971
           1       0.84      0.13      0.23       350

    accuracy                           0.76      1321
   macro avg       0.80      0.56      0.55      1321
weighted avg       0.78      0.76      0.69      1321

----------------------------------------


['models/svc.joblib']

In [11]:
grid.best_params_

{'C': 0.01, 'degree': 4, 'gamma': 'scale', 'kernel': 'poly'}

In [18]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

param_grid = {'n_neighbors':[5],
             'weights':['uniform'],
             'algorithm':['auto'],
             'leaf_size':[90],
             'p':[3]}
grid = GridSearchCV(knn,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
knn_best = grid.best_estimator_

dump(knn_best,'models/knn.joblisb')

train precision: 0.56244
----------------------------------------
val precision: 0.5367
----------------------------------------
difference: 0.0257406
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       971
           1       0.59      0.52      0.55       350

    accuracy                           0.78      1321
   macro avg       0.71      0.70      0.70      1321
weighted avg       0.77      0.78      0.77      1321

----------------------------------------


['models/knn.joblisb']

In [19]:
grid.best_params_

{'algorithm': 'auto',
 'leaf_size': 90,
 'n_neighbors': 5,
 'p': 3,
 'weights': 'uniform'}

In [20]:
# from sklearn.gaussian_process import GaussianProcessClassifier

# gp = GaussianProcessClassifier()

# param_grid = {'n_restarts_optimizer':[0]}
# grid = GridSearchCV(gp,param_grid,n_jobs=-1,scoring='accuracy')
# modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
# gp_best = grid.best_estimator_


In [21]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

param_grid = {'var_smoothing':[2]}

grid = GridSearchCV(nb,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
nb_best = grid.best_estimator_

dump(nb_best,'models/nb.joblib')

train precision: 0.60452
----------------------------------------
val precision: 0.60108
----------------------------------------
difference: 0.0034369
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.85      0.86      0.85       971
           1       0.59      0.57      0.58       350

    accuracy                           0.78      1321
   macro avg       0.72      0.71      0.72      1321
weighted avg       0.78      0.78      0.78      1321

----------------------------------------


['models/nb.joblib']

In [22]:
grid.best_params_

{'var_smoothing': 2}

In [23]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=1)

param_grid = {'criterion':['entropy'],
             'splitter':['random'],
             'max_depth':[2],
             'max_features':['auto']}

grid = GridSearchCV(tree,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
tree_best = grid.best_estimator_

dump(tree_best,'models/tree.joblib')

train precision: 0.766
----------------------------------------
val precision: 0.45175
----------------------------------------
difference: 0.3142461
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.75      0.98      0.85       971
           1       0.69      0.10      0.17       350

    accuracy                           0.75      1321
   macro avg       0.72      0.54      0.51      1321
weighted avg       0.73      0.75      0.67      1321

----------------------------------------


['models/tree.joblib']

In [24]:
grid.best_params_

{'criterion': 'entropy',
 'max_depth': 2,
 'max_features': 'auto',
 'splitter': 'random'}

In [25]:
from sklearn.ensemble import RandomForestClassifier

rfr = RandomForestClassifier(random_state=1)

param_grid = {'n_estimators':[100],
             'criterion':['gini'],
             'max_depth':[2],
             'max_features':['auto'],
             'oob_score':[True]}

grid = GridSearchCV(rfr,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
rfr_best = grid.best_estimator_

dump(rfr_best,'models/rfr.joblib')

train precision: 0.75988
----------------------------------------
val precision: 0.65372
----------------------------------------
difference: 0.106158
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.79      0.96      0.86       971
           1       0.71      0.28      0.40       350

    accuracy                           0.78      1321
   macro avg       0.75      0.62      0.63      1321
weighted avg       0.77      0.78      0.74      1321

----------------------------------------


['models/rfr.joblib']

In [26]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 2,
 'max_features': 'auto',
 'n_estimators': 100,
 'oob_score': True}

In [27]:
from sklearn.ensemble import ExtraTreesClassifier

etree = ExtraTreesClassifier(random_state=2)
param_grid = {'n_estimators':[100],
             'criterion':['entropy'],
             'max_depth':[2],
             'max_features':['sqrt']}

grid = GridSearchCV(etree,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
etree_best = grid.best_estimator_



dump(etree_best,'models/etree.joblib')

train precision: 0.88956
----------------------------------------
val precision: 0.85204
----------------------------------------
difference: 0.037515
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.75      0.99      0.86       971
           1       0.85      0.08      0.15       350

    accuracy                           0.75      1321
   macro avg       0.80      0.54      0.50      1321
weighted avg       0.78      0.75      0.67      1321

----------------------------------------


['models/etree.joblib']

In [28]:
grid.best_params_

{'criterion': 'entropy',
 'max_depth': 2,
 'max_features': 'sqrt',
 'n_estimators': 100}

In [29]:

import xgboost as xgb

boost_model = xgb.XGBClassifier(booster='gbtree',random_state=1,tree_method='gpu_hist',
                               grow_policy='depthwise',
                               subsample=0.9,verbosity=1,
                               sampling_method='uniform',
                               colsample_bytree=0.7,
                               reg_lambda=10,
                               alpha=100)

xgb = modeler(boost_model,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

dump(xgb,'models/xgb.joblib')


train precision: 0.7647
----------------------------------------
val precision: 0.0
----------------------------------------
difference: 0.7647035
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.80      0.95      0.87       971
           1       0.72      0.33      0.45       350

    accuracy                           0.79      1321
   macro avg       0.76      0.64      0.66      1321
weighted avg       0.78      0.79      0.76      1321

----------------------------------------


['models/xgb.joblib']

In [30]:
from sklearn.ensemble import AdaBoostClassifier

adatree = DecisionTreeClassifier(criterion='entropy',
             splitter='random',
             max_depth=1,
             max_features=5)

ada = AdaBoostClassifier(base_estimator=adatree,n_estimators=100,learning_rate=0.1,random_state=1)

ada = modeler(ada,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

dump(ada,'models/ada.joblib')

train precision: 0.68978
----------------------------------------
val precision: 0.69476
----------------------------------------
difference: 0.0049777
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.84      0.91      0.87       971
           1       0.67      0.51      0.58       350

    accuracy                           0.80      1321
   macro avg       0.75      0.71      0.72      1321
weighted avg       0.79      0.80      0.79      1321

----------------------------------------


['models/ada.joblib']

After trying the above models, I tried adding various combinations to a stack. It ended up not working well enough to use.

In [44]:
from sklearn.ensemble import StackingClassifier
best_lr = load('models/best_lr.joblib')
lr_bag = load('models/lr_bag.joblib')
ridge_bag = load('models/ridge_bag.joblib')
sgd_best = load('models/sgd.joblib')
ridge_best = load('models/best_ridge.joblib')
first_lr = load('models/1stlr.joblib')
second_lr = load('models/2ndlr.joblib')
third_lr = load('models/3rdlr.joblib')
best_lr3 = load('models/best_lr3.joblib')
etree = load('models/etree.joblib')
knn = load('models/knn.joblisb')
nb = load('models/nb.joblib')
rfr_best = load('models/rfr.joblib')
svc = load('models/svc.joblib')
tree = load('models/tree.joblib')
boost_model = load('models/xgb.joblib')
ada = load('models/ada.joblib')
stack = load('models/stack.joblib')



best_estimators = [
                   ('lr_best',best_lr),
                   ('lr3_best',best_lr3)
                   ]

stack = StackingClassifier(estimators=best_estimators,n_jobs=4)

stack = modeler(stack,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

dump(stack,'models/stack.joblib')

#('tree',tree_best),('knn',knn_best),('svc',svc_best),('nb',nb_best),

train precision: 0.68597
----------------------------------------
val precision: 0.67879
----------------------------------------
difference: 0.0071759
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.85      0.90      0.88       971
           1       0.68      0.57      0.62       350

    accuracy                           0.81      1321
   macro avg       0.77      0.73      0.75      1321
weighted avg       0.81      0.81      0.81      1321

----------------------------------------


['models/stack.joblib']