In [1]:
import pandas as pd
import numpy as np


In [2]:
X_train_sc = pd.read_csv('X_train_sc.csv')
X_val_sc = pd.read_csv('X_val_sc.csv')
y_train_enc = np.ravel(pd.read_csv('y_train_enc.csv'))
y_val_enc = np.ravel(pd.read_csv('y_val_enc.csv'))
from sklearn.model_selection import GridSearchCV


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

lr = LogisticRegression(solver='liblinear',C=1,tol=0.001)


In [4]:
def modeler(est,X_train,y_train,X_val,y_val):
    
    est.fit(X_train,y_train)
    
    
    train_score = np.mean(cross_val_score(est,X_train,y_train,scoring='accuracy',cv=7,n_jobs=-1))
    val_score = np.mean(cross_val_score(est,X_val,y_val,scoring='accuracy',cv=7,n_jobs=4))
    diff = abs(train_score - val_score)
    print(f'train accuracy: {round(train_score,5)}')
    print('-'*40)
    print(f'val accuracy: {round(val_score,5)}')
    print('-'*40)
    print(f'difference: {round(diff,7)}')
    print('-'*40)
    return est

In [5]:
modeler(lr,X_train_sc,y_train_enc,X_val_sc,y_val_enc)


train accuracy: 0.80156
----------------------------------------
val accuracy: 0.79182
----------------------------------------
difference: 0.0097461
----------------------------------------


LogisticRegression(C=1, solver='liblinear', tol=0.001)

This one is less accurate, but scores are still close. Might be good for bagging.

In [6]:
lr2 = LogisticRegression(solver='liblinear',C=1,tol=5)
modeler(lr2,X_train_sc,y_train_enc,X_val_sc,y_val_enc)


train accuracy: 0.7334
----------------------------------------
val accuracy: 0.73505
----------------------------------------
difference: 0.0016467
----------------------------------------


LogisticRegression(C=1, solver='liblinear', tol=5)

In [7]:
from sklearn.linear_model import RidgeClassifier

ridge = RidgeClassifier()


In [8]:
param_grid = {'alpha':[0.001,1,10,100], 'solver':['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}
grid = GridSearchCV(ridge,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

ridge_best = grid.best_estimator_


train accuracy: 0.80056
----------------------------------------
val accuracy: 0.7956
----------------------------------------
difference: 0.004951
----------------------------------------


In [9]:
lr3 = LogisticRegression()
param_grid = {'C':[0.001,1,10], 'solver':['liblinear','lbfgs','newton-cg','sag','saga']}
grid = GridSearchCV(lr3,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
lr3_best = grid.best_estimator_


train accuracy: 0.80207
----------------------------------------
val accuracy: 0.79182
----------------------------------------
difference: 0.0102522
----------------------------------------


In [10]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()

param_grid = {'loss':['hinge','modified_huber'],
             'penalty':['l2','l1','elasticnet'],
             'alpha':[0.001,1,10],
             'learning_rate':['constant','invscaling','adaptive'],
             'eta0':[0.01,0.5,1,10]}
grid = GridSearchCV(sgd,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
sgd_best = grid.best_estimator_


train accuracy: 0.80055
----------------------------------------
val accuracy: 0.78349
----------------------------------------
difference: 0.0170666
----------------------------------------


In [11]:
from sklearn.svm import SVC

svc = SVC()

param_grid = {'kernel':['linear','rbf'],
             'C':[0.01,1,10]}
grid = GridSearchCV(svc,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
svc_best = grid.best_estimator_


train accuracy: 0.8003
----------------------------------------
val accuracy: 0.78425
----------------------------------------
difference: 0.016048
----------------------------------------


In [12]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

param_grid = {'n_neighbors':[5],
             'weights':['uniform'],
             'algorithm':['kd_tree'],
             'leaf_size':[15,30],
             'p':[1,2,3]}
grid = GridSearchCV(knn,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
knn_best = grid.best_estimator_


train accuracy: 0.76142
----------------------------------------
val accuracy: 0.74109
----------------------------------------
difference: 0.020327
----------------------------------------


In [13]:
# from sklearn.gaussian_process import GaussianProcessClassifier

# gp = GaussianProcessClassifier()

# param_grid = {'n_restarts_optimizer':[0]}
# grid = GridSearchCV(gp,param_grid,n_jobs=-1,scoring='accuracy')
# modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
# gp_best = grid.best_estimator_


In [14]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

param_grid = {'var_smoothing':[0.000000001]}

grid = GridSearchCV(nb,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
nb_best = grid.best_estimator_


train accuracy: 0.75839
----------------------------------------
val accuracy: 0.7578
----------------------------------------
difference: 0.0005983
----------------------------------------


In [15]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()

param_grid = {'criterion':['gini'],
             'splitter':['best'],
             'max_depth':[3],
             'max_features':['auto']}

grid = GridSearchCV(tree,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
tree_best = grid.best_estimator_


train accuracy: 0.76899
----------------------------------------
val accuracy: 0.77442
----------------------------------------
difference: 0.0054298
----------------------------------------


In [33]:
from sklearn.ensemble import RandomForestClassifier

rfr = RandomForestClassifier()

param_grid = {'n_estimators':[200],
             'criterion':['gini'],
             'max_depth':[3],
             'max_features':['sqrt']}

grid = GridSearchCV(rfr,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
rfr_best = grid.best_estimator_


train accuracy: 0.79197
----------------------------------------
val accuracy: 0.77668
----------------------------------------
difference: 0.0152815
----------------------------------------


In [48]:
import xgboost as xgb
boost_model = xgb.XGBClassifier(max_depth=2,n_estimators=200, eta=0.01, gamma=1,n_jobs=4)
modeler(boost_model,X_train_sc,y_train_enc,X_val_sc,y_val_enc)


train accuracy: 0.79954
----------------------------------------
val accuracy: 0.78805
----------------------------------------
difference: 0.011491
----------------------------------------


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.01, gamma=1,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.00999999978, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [69]:
from sklearn.ensemble import AdaBoostClassifier

adatree = DecisionTreeClassifier(max_depth=1,criterion='gini',max_features='sqrt',random_state=1)

ada = AdaBoostClassifier(base_estimator=adatree,n_estimators=500,learning_rate=0.1,random_state=1)

modeler(ada,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

train accuracy: 0.8051
----------------------------------------
val accuracy: 0.80166
----------------------------------------
difference: 0.0034365
----------------------------------------


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
                                                         max_features='sqrt',
                                                         random_state=1),
                   learning_rate=0.1, n_estimators=500, random_state=1)

In [70]:
from sklearn.ensemble import StackingClassifier

best_estimators = [('ridge',ridge_best),('lr',lr),('sgd',sgd_best),('svc',svc_best),('knn',knn_best),('nb',nb_best),('tree',tree_best),('rfr',rfr_best),('xgb',boost_model),('ada',ada)]

vote = StackingClassifier(estimators=best_estimators,n_jobs=4)

modeler(vote,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

train accuracy: 0.79929
----------------------------------------
val accuracy: 0.79411
----------------------------------------
difference: 0.0051815
----------------------------------------


StackingClassifier(estimators=[('ridge',
                                RidgeClassifier(alpha=0.001, solver='lsqr')),
                               ('lr',
                                LogisticRegression(C=1, solver='liblinear',
                                                   tol=0.001)),
                               ('sgd',
                                SGDClassifier(alpha=0.001, eta0=0.5,
                                              learning_rate='invscaling')),
                               ('svc', SVC(C=10, kernel='linear')),
                               ('knn',
                                KNeighborsClassifier(algorithm='kd_tree',
                                                     leaf_size=15, p=1)),
                               ('nb', GaussianNB()),
                               ('tree',
                                Decisi...
                                              n_estimators=200, n_jobs=4,
                                              num_paral

In [None]:
#save and load each model with pickle 
#add griffin's xgboost
#adaboost?