In [6]:
import pandas as pd
import numpy as np
from joblib import dump, load

In [7]:
X_train_sc = pd.read_csv('X_train_sc.csv')
X_val_sc = pd.read_csv('X_val_sc.csv')
y_train_enc = np.ravel(pd.read_csv('y_train_enc.csv'))
y_val_enc = np.ravel(pd.read_csv('y_val_enc.csv'))

#make train and val customer id frames and drop from data
X_train_id = X_train_sc['customerid']
X_train_sc.drop('customerid',axis=1,inplace=True)
X_val_id = X_val_sc['customerid']
X_val_sc.drop('customerid',axis=1,inplace=True)


In [8]:
from sklearn.model_selection import GridSearchCV


In [9]:
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_train_sc, y_train_enc = sm.fit_resample(X_train_sc,y_train_enc)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

lr = LogisticRegression(solver='liblinear',C=1,tol=0.001)
dump(lr,'1stlr.joblib')

['1stlr.joblib']

In [11]:
def modeler(est,X_train,y_train,X_val,y_val):
    
    est.fit(X_train,y_train)
    
    
    train_acc_score = np.mean(cross_val_score(est,X_train,y_train,scoring='precision',cv=7,n_jobs=-1))
    val_acc_score = np.mean(cross_val_score(est,X_val,y_val,scoring='precision',cv=7,n_jobs=4))
    diff = abs(train_acc_score - val_acc_score)
    
    y_val_pred = est.predict(X_val)
    
    
    print(f'train precision: {round(train_acc_score,5)}')
    print('-'*40)
    print(f'val precision: {round(val_acc_score,5)}')
    print('-'*40)
    print(f'difference: {round(diff,7)}')
    print('-'*40)
    print(f'val report: ')
    print(classification_report(y_val,y_val_pred))
    print('-'*40)
    return est

In [12]:
modeler(lr,X_train_sc,y_train_enc,X_val_sc,y_val_enc)


train precision: 0.76871
----------------------------------------
val precision: 0.66031
----------------------------------------
difference: 0.1084043
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.90      0.73      0.81       971
           1       0.51      0.77      0.61       350

    accuracy                           0.74      1321
   macro avg       0.70      0.75      0.71      1321
weighted avg       0.79      0.74      0.76      1321

----------------------------------------


LogisticRegression(C=1, solver='liblinear', tol=0.001)

This one is less accurate, but scores are still close. Might be good for bagging.

In [7]:
lr2 = LogisticRegression(solver='liblinear',C=1,tol=5)
modeler(lr2,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

dump(lr2,'2ndlr.joblib')

train accuracy: 0.5
----------------------------------------
val accuracy: 0.73505
----------------------------------------
difference: 0.2350477
----------------------------------------
              precision    recall  f1-score   support

           0       0.74      1.00      0.85       971
           1       0.00      0.00      0.00       350

    accuracy                           0.74      1321
   macro avg       0.37      0.50      0.42      1321
weighted avg       0.54      0.74      0.62      1321

val report: None
----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


LogisticRegression(C=1, solver='liblinear', tol=5)

In [8]:
from sklearn.linear_model import RidgeClassifier

ridge = RidgeClassifier()


In [9]:
param_grid = {'alpha':[0.5,1], 'solver':['sparse_cg','lsqr','saga']}
grid = GridSearchCV(ridge,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

ridge_best = grid.best_estimator_

dump(ridge_best,'ridge.joblib')

train accuracy: 0.79948
----------------------------------------
val accuracy: 0.80093
----------------------------------------
difference: 0.0014443
----------------------------------------
              precision    recall  f1-score   support

           0       0.90      0.73      0.80       971
           1       0.51      0.77      0.61       350

    accuracy                           0.74      1321
   macro avg       0.70      0.75      0.71      1321
weighted avg       0.80      0.74      0.75      1321

val report: None
----------------------------------------


In [10]:
grid.best_params_

{'alpha': 1, 'solver': 'sparse_cg'}

In [11]:
lr3 = LogisticRegression()
param_grid = {'C':[0.01,1], 'solver':['liblinear']}
grid = GridSearchCV(lr3,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
lr3_best = grid.best_estimator_

dump(lr3_best.'3rdlr.joblib')

train accuracy: 0.80103
----------------------------------------
val accuracy: 0.80015
----------------------------------------
difference: 0.0008807
----------------------------------------
              precision    recall  f1-score   support

           0       0.90      0.73      0.81       971
           1       0.51      0.77      0.61       350

    accuracy                           0.74      1321
   macro avg       0.70      0.75      0.71      1321
weighted avg       0.80      0.74      0.76      1321

val report: None
----------------------------------------


In [12]:
grid.best_params_

{'C': 1, 'solver': 'liblinear'}

In [13]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()

param_grid = {'loss':['modified_huber'],
             'penalty':['l2'],
             'alpha':[0.001,1,10],
             'learning_rate':['adaptive'],
             'eta0':[1]}
grid = GridSearchCV(sgd,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
sgd_best = grid.best_estimator_

dump(sgd_best,'sgd.joblib')

train accuracy: 0.79914
----------------------------------------
val accuracy: 0.80089
----------------------------------------
difference: 0.0017523
----------------------------------------
              precision    recall  f1-score   support

           0       0.90      0.73      0.81       971
           1       0.51      0.78      0.61       350

    accuracy                           0.74      1321
   macro avg       0.70      0.75      0.71      1321
weighted avg       0.80      0.74      0.76      1321

val report: None
----------------------------------------


In [14]:
grid.best_params_

{'alpha': 0.001,
 'eta0': 1,
 'learning_rate': 'adaptive',
 'loss': 'modified_huber',
 'penalty': 'l2'}

In [15]:
from sklearn.svm import SVC

svc = SVC()

param_grid = {'kernel':['linear'],
             'C':[0.01,1]}
grid = GridSearchCV(svc,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
svc_best = grid.best_estimator_

dump(svc_best,'svc.joblib')

train accuracy: 0.80052
----------------------------------------
val accuracy: 0.79335
----------------------------------------
difference: 0.0071631
----------------------------------------
              precision    recall  f1-score   support

           0       0.90      0.71      0.79       971
           1       0.49      0.79      0.61       350

    accuracy                           0.73      1321
   macro avg       0.70      0.75      0.70      1321
weighted avg       0.79      0.73      0.74      1321

val report: None
----------------------------------------


In [16]:
grid.best_params_

{'C': 1, 'kernel': 'linear'}

In [17]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

param_grid = {'n_neighbors':[3,5,7],
             'weights':['uniform','distance'],
             'algorithm':['kd_tree'],
             'leaf_size':[15,30],
             'p':[1]}
grid = GridSearchCV(knn,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
knn_best = grid.best_estimator_

dump(knn_best,'knn.joblisb')

train accuracy: 0.82048
----------------------------------------
val accuracy: 0.74488
----------------------------------------
difference: 0.0756001
----------------------------------------
              precision    recall  f1-score   support

           0       0.88      0.72      0.79       971
           1       0.48      0.72      0.58       350

    accuracy                           0.72      1321
   macro avg       0.68      0.72      0.69      1321
weighted avg       0.77      0.72      0.74      1321

val report: None
----------------------------------------


In [18]:
grid.best_params_

{'algorithm': 'kd_tree',
 'leaf_size': 15,
 'n_neighbors': 5,
 'p': 1,
 'weights': 'uniform'}

In [19]:
# from sklearn.gaussian_process import GaussianProcessClassifier

# gp = GaussianProcessClassifier()

# param_grid = {'n_restarts_optimizer':[0]}
# grid = GridSearchCV(gp,param_grid,n_jobs=-1,scoring='accuracy')
# modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
# gp_best = grid.best_estimator_


In [20]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

param_grid = {'var_smoothing':[0.0001,0.01,1]}

grid = GridSearchCV(nb,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
nb_best = grid.best_estimator_

dump(nb_best,'nb.joblib')

train accuracy: 0.78141
----------------------------------------
val accuracy: 0.76461
----------------------------------------
difference: 0.0168048
----------------------------------------
              precision    recall  f1-score   support

           0       0.90      0.74      0.81       971
           1       0.51      0.77      0.61       350

    accuracy                           0.74      1321
   macro avg       0.70      0.75      0.71      1321
weighted avg       0.79      0.74      0.76      1321

val report: None
----------------------------------------


In [21]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()

param_grid = {'criterion':['gini','entropy'],
             'splitter':['best'],
             'max_depth':[2,3,4],
             'max_features':['auto','sqrt']}

grid = GridSearchCV(tree,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
tree_best = grid.best_estimator_

dump(tree_best,'tree.joblib')

train accuracy: 0.73666
----------------------------------------
val accuracy: 0.76834
----------------------------------------
difference: 0.0316807
----------------------------------------
              precision    recall  f1-score   support

           0       0.88      0.63      0.73       971
           1       0.43      0.75      0.54       350

    accuracy                           0.66      1321
   macro avg       0.65      0.69      0.64      1321
weighted avg       0.76      0.66      0.68      1321

val report: None
----------------------------------------


In [22]:
from sklearn.ensemble import RandomForestClassifier

rfr = RandomForestClassifier()

param_grid = {'n_estimators':[100,200],
             'criterion':['gini','entropy'],
             'max_depth':[2,3,4],
             'max_features':['sqrt']}

grid = GridSearchCV(rfr,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
rfr_best = grid.best_estimator_

dump(rfr_best,'rfr.joblib')

train accuracy: 0.78434
----------------------------------------
val accuracy: 0.77973
----------------------------------------
difference: 0.0046056
----------------------------------------
              precision    recall  f1-score   support

           0       0.91      0.71      0.80       971
           1       0.50      0.80      0.62       350

    accuracy                           0.74      1321
   macro avg       0.70      0.76      0.71      1321
weighted avg       0.80      0.74      0.75      1321

val report: None
----------------------------------------


In [23]:
from sklearn.ensemble import ExtraTreesClassifier

etree = ExtraTreesClassifier()
param_grid = {'n_estimators':[100],
             'criterion':['gini','entropy'],
             'max_depth':[2,3,4],
             'max_features':['sqrt']}

grid = GridSearchCV(etree,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
etree_best = grid.best_estimator_

dump(etree_best,'etree.joblib')

train accuracy: 0.78468
----------------------------------------
val accuracy: 0.76911
----------------------------------------
difference: 0.015572
----------------------------------------
              precision    recall  f1-score   support

           0       0.91      0.70      0.79       971
           1       0.49      0.80      0.61       350

    accuracy                           0.72      1321
   macro avg       0.70      0.75      0.70      1321
weighted avg       0.80      0.72      0.74      1321

val report: None
----------------------------------------


In [24]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'sqrt',
 'n_estimators': 100}

In [25]:
import xgboost as xgb
boost_model = xgb.XGBClassifier(max_depth=2,n_estimators=200, eta=0.01, gamma=1,n_jobs=4)
modeler(boost_model,X_train_sc,y_train_enc,X_val_sc,y_val_enc)


train accuracy: 0.78348
----------------------------------------
val accuracy: 0.77746
----------------------------------------
difference: 0.0060126
----------------------------------------
              precision    recall  f1-score   support

           0       0.91      0.64      0.76       971
           1       0.46      0.83      0.59       350

    accuracy                           0.69      1321
   macro avg       0.69      0.74      0.67      1321
weighted avg       0.79      0.69      0.71      1321

val report: None
----------------------------------------


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.01, gamma=1,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.00999999978, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [26]:
from sklearn.ensemble import AdaBoostClassifier

adatree = DecisionTreeClassifier(max_depth=1,criterion='gini',max_features='sqrt',random_state=1)

ada = AdaBoostClassifier(base_estimator=adatree,n_estimators=500,learning_rate=0.1,random_state=1)

modeler(ada,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

train accuracy: 0.80017
----------------------------------------
val accuracy: 0.80166
----------------------------------------
difference: 0.0014876
----------------------------------------
              precision    recall  f1-score   support

           0       0.90      0.73      0.81       971
           1       0.51      0.77      0.61       350

    accuracy                           0.74      1321
   macro avg       0.70      0.75      0.71      1321
weighted avg       0.80      0.74      0.76      1321

val report: None
----------------------------------------


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
                                                         max_features='sqrt',
                                                         random_state=1),
                   learning_rate=0.1, n_estimators=500, random_state=1)

In [27]:
from sklearn.ensemble import StackingClassifier

best_estimators = [('ridge',ridge_best),
                   ('lr',lr),
                   ('sgd',sgd_best),
                   ('svc',svc_best),
                   ('knn',knn_best),
                   ('nb',nb_best),
                   ('tree',tree_best),
                   ('rfr',rfr_best),
                   ('xgb',boost_model),
                   ('ada',ada),
                  ('etree',etree)]

stack = StackingClassifier(estimators=best_estimators,n_jobs=4)

modeler(stack,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

train accuracy: 0.85456
----------------------------------------
val accuracy: 0.80168
----------------------------------------
difference: 0.0528813
----------------------------------------
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       971
           1       0.59      0.59      0.59       350

    accuracy                           0.78      1321
   macro avg       0.72      0.72      0.72      1321
weighted avg       0.78      0.78      0.78      1321

val report: None
----------------------------------------


StackingClassifier(estimators=[('ridge',
                                RidgeClassifier(alpha=1, solver='sparse_cg')),
                               ('lr',
                                LogisticRegression(C=1, solver='liblinear',
                                                   tol=0.001)),
                               ('sgd',
                                SGDClassifier(alpha=0.001, eta0=1,
                                              learning_rate='adaptive',
                                              loss='modified_huber')),
                               ('svc', SVC(C=1, kernel='linear')),
                               ('knn',
                                KNeighborsClassifier(algorithm='kd_tree',
                                                     leaf_size=15, p=1)),
                               ('nb', GaussianNB...
                                              num_parallel_tree=1,
                                              random_state=0, reg_alpha=0,
     