In [1]:
import pandas as pd
import numpy as np
from joblib import dump, load

In [2]:
X_train_sc = pd.read_csv('X_train_sc.csv')
X_val_sc = pd.read_csv('X_val_sc.csv')
y_train_enc = np.ravel(pd.read_csv('y_train_enc.csv'))
y_val_enc = np.ravel(pd.read_csv('y_val_enc.csv'))

#make train and val customer id frames and drop from data
X_train_id = X_train_sc['customerid']
X_train_sc.drop('customerid',axis=1,inplace=True)
X_val_id = X_val_sc['customerid']
X_val_sc.drop('customerid',axis=1,inplace=True)


In [3]:
from sklearn.model_selection import GridSearchCV


In [4]:
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_train_sc, y_train_enc = sm.fit_resample(X_train_sc,y_train_enc)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

lr = LogisticRegression(solver='liblinear',C=1,tol=0.001)


In [None]:
def modeler(est,X_train,y_train,X_val,y_val):
    
    est.fit(X_train,y_train)
    
    
    train_acc_score = np.mean(cross_val_score(est,X_train,y_train,scoring='precision',cv=7,n_jobs=-1))
    val_acc_score = np.mean(cross_val_score(est,X_val,y_val,scoring='precision',cv=7,n_jobs=4))
    diff = abs(train_acc_score - val_acc_score)
    
    y_val_pred = est.predict(X_val)
    
    
    print(f'train precision: {round(train_acc_score,5)}')
    print('-'*40)
    print(f'val precision: {round(val_acc_score,5)}')
    print('-'*40)
    print(f'difference: {round(diff,7)}')
    print('-'*40)
    print(f'val report: ')
    print(classification_report(y_val,y_val_pred))
    print('-'*40)
    return est

In [None]:
modeler(lr,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
dump(lr,'1stlr.joblib')

train precision: 0.77363
----------------------------------------
val precision: 0.66031
----------------------------------------
difference: 0.1133237
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.90      0.73      0.81       971
           1       0.51      0.77      0.61       350

    accuracy                           0.74      1321
   macro avg       0.70      0.75      0.71      1321
weighted avg       0.79      0.74      0.75      1321

----------------------------------------


['1stlr.joblib']

This one is less accurate, but scores are still close. Might be good for bagging.

In [8]:
lr2 = LogisticRegression(solver='liblinear',C=1,tol=5)
modeler(lr2,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

dump(lr2,'2ndlr.joblib')

train precision: 0.0
----------------------------------------
val precision: 0.0
----------------------------------------
difference: 0.0
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.74      1.00      0.85       971
           1       0.00      0.00      0.00       350

    accuracy                           0.74      1321
   macro avg       0.37      0.50      0.42      1321
weighted avg       0.54      0.74      0.62      1321

----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['2ndlr.joblib']

In [9]:
from sklearn.linear_model import RidgeClassifier

ridge = RidgeClassifier()


In [10]:
param_grid = {'alpha':[0.5,1], 'solver':['sparse_cg','lsqr','saga']}
grid = GridSearchCV(ridge,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

ridge_best = grid.best_estimator_

dump(ridge_best,'ridge.joblib')

train precision: 0.77551
----------------------------------------
val precision: 0.67657
----------------------------------------
difference: 0.0989436
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.90      0.73      0.80       971
           1       0.51      0.77      0.61       350

    accuracy                           0.74      1321
   macro avg       0.70      0.75      0.71      1321
weighted avg       0.79      0.74      0.75      1321

----------------------------------------


['ridge.joblib']

In [11]:
grid.best_params_

{'alpha': 0.5, 'solver': 'lsqr'}

In [31]:
lr3 = LogisticRegression()
param_grid = {'C':[0.01], 'solver':['liblinear']}
grid = GridSearchCV(lr3,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
lr3_best = grid.best_estimator_

dump(lr3_best,'3rdlr.joblib')

train precision: 0.77209
----------------------------------------
val precision: 0.68464
----------------------------------------
difference: 0.0874491
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.90      0.74      0.81       971
           1       0.51      0.77      0.62       350

    accuracy                           0.75      1321
   macro avg       0.71      0.75      0.71      1321
weighted avg       0.80      0.75      0.76      1321

----------------------------------------


['3rdlr.joblib']

In [32]:
grid.best_params_

{'C': 0.01, 'solver': 'liblinear'}

In [15]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()

param_grid = {'loss':['modified_huber'],
             'penalty':['l2'],
             'alpha':[0.001,1,10],
             'learning_rate':['adaptive'],
             'eta0':[1]}
grid = GridSearchCV(sgd,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
sgd_best = grid.best_estimator_

dump(sgd_best,'sgd.joblib')

train precision: 0.77425
----------------------------------------
val precision: 0.69551
----------------------------------------
difference: 0.0787391
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.90      0.73      0.80       971
           1       0.50      0.77      0.61       350

    accuracy                           0.74      1321
   macro avg       0.70      0.75      0.71      1321
weighted avg       0.79      0.74      0.75      1321

----------------------------------------


['sgd.joblib']

In [16]:
grid.best_params_

{'alpha': 0.001,
 'eta0': 1,
 'learning_rate': 'adaptive',
 'loss': 'modified_huber',
 'penalty': 'l2'}

In [17]:
from sklearn.svm import SVC

svc = SVC()

param_grid = {'kernel':['linear'],
             'C':[0.01,1]}
grid = GridSearchCV(svc,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
svc_best = grid.best_estimator_

dump(svc_best,'svc.joblib')

train precision: 0.76881
----------------------------------------
val precision: 0.67478
----------------------------------------
difference: 0.0940281
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.90      0.72      0.80       971
           1       0.50      0.78      0.61       350

    accuracy                           0.74      1321
   macro avg       0.70      0.75      0.71      1321
weighted avg       0.80      0.74      0.75      1321

----------------------------------------


['svc.joblib']

In [18]:
grid.best_params_

{'C': 1, 'kernel': 'linear'}

In [19]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

param_grid = {'n_neighbors':[3,5,7],
             'weights':['uniform','distance'],
             'algorithm':['kd_tree'],
             'leaf_size':[15,30],
             'p':[1]}
grid = GridSearchCV(knn,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
knn_best = grid.best_estimator_

dump(knn_best,'knn.joblisb')

train precision: 0.78371
----------------------------------------
val precision: 0.56585
----------------------------------------
difference: 0.2178575
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.86      0.75      0.80       971
           1       0.49      0.67      0.57       350

    accuracy                           0.73      1321
   macro avg       0.68      0.71      0.68      1321
weighted avg       0.76      0.73      0.74      1321

----------------------------------------


['knn.joblisb']

In [20]:
grid.best_params_

{'algorithm': 'kd_tree',
 'leaf_size': 15,
 'n_neighbors': 3,
 'p': 1,
 'weights': 'distance'}

In [21]:
# from sklearn.gaussian_process import GaussianProcessClassifier

# gp = GaussianProcessClassifier()

# param_grid = {'n_restarts_optimizer':[0]}
# grid = GridSearchCV(gp,param_grid,n_jobs=-1,scoring='accuracy')
# modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
# gp_best = grid.best_estimator_


In [22]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

param_grid = {'var_smoothing':[0.0001,0.01,1]}

grid = GridSearchCV(nb,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
nb_best = grid.best_estimator_

dump(nb_best,'nb.joblib')

train precision: 0.75883
----------------------------------------
val precision: 0.57537
----------------------------------------
difference: 0.1834641
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.89      0.73      0.80       971
           1       0.50      0.75      0.60       350

    accuracy                           0.73      1321
   macro avg       0.69      0.74      0.70      1321
weighted avg       0.79      0.73      0.75      1321

----------------------------------------


['nb.joblib']

In [23]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()

param_grid = {'criterion':['gini','entropy'],
             'splitter':['best'],
             'max_depth':[2,3,4],
             'max_features':['auto','sqrt']}

grid = GridSearchCV(tree,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
tree_best = grid.best_estimator_

dump(tree_best,'tree.joblib')

train precision: 0.73558
----------------------------------------
val precision: 0.6326
----------------------------------------
difference: 0.1029794
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       971
           1       0.52      0.49      0.50       350

    accuracy                           0.74      1321
   macro avg       0.67      0.66      0.67      1321
weighted avg       0.74      0.74      0.74      1321

----------------------------------------


['tree.joblib']

In [24]:
from sklearn.ensemble import RandomForestClassifier

rfr = RandomForestClassifier()

param_grid = {'n_estimators':[100,200],
             'criterion':['gini','entropy'],
             'max_depth':[2,3,4],
             'max_features':['sqrt']}

grid = GridSearchCV(rfr,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
rfr_best = grid.best_estimator_

dump(rfr_best,'rfr.joblib')

train precision: 0.76126
----------------------------------------
val precision: 0.7078
----------------------------------------
difference: 0.0534629
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.90      0.73      0.81       971
           1       0.51      0.79      0.62       350

    accuracy                           0.74      1321
   macro avg       0.71      0.76      0.71      1321
weighted avg       0.80      0.74      0.76      1321

----------------------------------------


['rfr.joblib']

In [25]:
from sklearn.ensemble import ExtraTreesClassifier

etree = ExtraTreesClassifier()
param_grid = {'n_estimators':[100],
             'criterion':['gini','entropy'],
             'max_depth':[2,3,4],
             'max_features':['sqrt']}

grid = GridSearchCV(etree,param_grid,n_jobs=4,scoring='precision')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
etree_best = grid.best_estimator_



dump(etree_best,'etree.joblib')

train precision: 0.75489
----------------------------------------
val precision: 0.79494
----------------------------------------
difference: 0.040052
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.90      0.71      0.80       971
           1       0.49      0.77      0.60       350

    accuracy                           0.73      1321
   macro avg       0.70      0.74      0.70      1321
weighted avg       0.79      0.73      0.74      1321

----------------------------------------


['etree.joblib']

In [26]:
grid.best_params_

{'criterion': 'entropy',
 'max_depth': 4,
 'max_features': 'sqrt',
 'n_estimators': 100}

In [35]:
import xgboost as xgb
boost_model = xgb.XGBClassifier(max_depth=2,n_estimators=200, eta=0.01, gamma=1,n_jobs=4)
xgb = modeler(boost_model,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

dump(xgb,'xgb.joblib')

train precision: 0.73366
----------------------------------------
val precision: 0.64185
----------------------------------------
difference: 0.0918145
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.91      0.66      0.77       971
           1       0.47      0.82      0.59       350

    accuracy                           0.70      1321
   macro avg       0.69      0.74      0.68      1321
weighted avg       0.79      0.70      0.72      1321

----------------------------------------


['xgb.joblib']

In [36]:
from sklearn.ensemble import AdaBoostClassifier

adatree = DecisionTreeClassifier(max_depth=1,criterion='gini',max_features='sqrt',random_state=1)

ada = AdaBoostClassifier(base_estimator=adatree,n_estimators=500,learning_rate=0.1,random_state=1)

ada = modeler(ada,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

dump(ada,'ada.joblib')

train precision: 0.77052
----------------------------------------
val precision: 0.64999
----------------------------------------
difference: 0.1205249
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.90      0.74      0.81       971
           1       0.52      0.78      0.62       350

    accuracy                           0.75      1321
   macro avg       0.71      0.76      0.72      1321
weighted avg       0.80      0.75      0.76      1321

----------------------------------------


['ada.joblib']

In [37]:
from sklearn.ensemble import StackingClassifier

best_estimators = [('ridge',ridge_best),
                   ('lr',lr),
                   ('sgd',sgd_best),
                   ('svc',svc_best),
                   ('knn',knn_best),
                   ('nb',nb_best),
                   ('tree',tree_best),
                   ('rfr',rfr_best),
                   ('xgb',boost_model),
                   ('ada',ada),
                  ('etree',etree)]

stack = StackingClassifier(estimators=best_estimators,n_jobs=4)

stack = modeler(stack,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

dump(stack,'stack.joblib')

train precision: 0.83879
----------------------------------------
val precision: 0.67361
----------------------------------------
difference: 0.165176
----------------------------------------
val report: 
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       971
           1       0.56      0.60      0.58       350

    accuracy                           0.77      1321
   macro avg       0.71      0.72      0.71      1321
weighted avg       0.78      0.77      0.77      1321

----------------------------------------


['stack.joblib']

array([[0.9865013 , 0.0134987 ],
       [0.86873178, 0.13126822],
       [0.85128411, 0.14871589],
       ...,
       [0.22850614, 0.77149386],
       [0.99349563, 0.00650437],
       [0.13091009, 0.86908991]])