In [23]:
import pandas as pd
import numpy as np


In [24]:
X_train_sc = pd.read_csv('X_train_sc.csv')
X_val_sc = pd.read_csv('X_val_sc.csv')
y_train_enc = np.ravel(pd.read_csv('y_train_enc.csv'))
y_val_enc = np.ravel(pd.read_csv('y_val_enc.csv'))
from sklearn.model_selection import GridSearchCV


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

lr = LogisticRegression(solver='liblinear',C=1,tol=0.001)


In [26]:
def modeler(est,X_train,y_train,X_val,y_val):
    
    est.fit(X_train,y_train)
    
    
    train_score = np.mean(cross_val_score(est,X_train,y_train,scoring='accuracy',cv=7,n_jobs=-1))
    val_score = np.mean(cross_val_score(est,X_val,y_val,scoring='accuracy',cv=7,n_jobs=4))
    diff = abs(train_score - val_score)
    print(f'train accuracy: {round(train_score,5)}')
    print('-'*40)
    print(f'val accuracy: {round(val_score,5)}')
    print('-'*40)
    print(f'difference: {round(diff,7)}')
    print('-'*40)
    return est

In [27]:
modeler(lr,X_train_sc,y_train_enc,X_val_sc,y_val_enc)


train accuracy: 0.80939
----------------------------------------
val accuracy: 0.80091
----------------------------------------
difference: 0.0084814
----------------------------------------


LogisticRegression(C=1, solver='liblinear', tol=0.001)

This one is less accurate, but scores are still close. Might be good for bagging.

In [28]:
lr2 = LogisticRegression(solver='liblinear',C=1,tol=5)
modeler(lr2,X_train_sc,y_train_enc,X_val_sc,y_val_enc)


train accuracy: 0.7334
----------------------------------------
val accuracy: 0.73505
----------------------------------------
difference: 0.0016467
----------------------------------------


LogisticRegression(C=1, solver='liblinear', tol=5)

In [29]:
from sklearn.linear_model import RidgeClassifier

ridge = RidgeClassifier()


In [32]:
param_grid = {'alpha':[1], 'solver':['sparse_cg']}
grid = GridSearchCV(ridge,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

ridge_best = grid.best_estimator_


train accuracy: 0.80914
----------------------------------------
val accuracy: 0.80093
----------------------------------------
difference: 0.0082084
----------------------------------------


In [33]:
grid.best_params_

{'alpha': 1, 'solver': 'sparse_cg'}

In [34]:
lr3 = LogisticRegression()
param_grid = {'C':[1], 'solver':['liblinear']}
grid = GridSearchCV(lr3,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
lr3_best = grid.best_estimator_


train accuracy: 0.80863
----------------------------------------
val accuracy: 0.79712
----------------------------------------
difference: 0.0115115
----------------------------------------


In [35]:
grid.best_params_

{'C': 1, 'solver': 'liblinear'}

In [39]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()

param_grid = {'loss':['modified_huber'],
             'penalty':['l2'],
             'alpha':[0.001],
             'learning_rate':['adaptive'],
             'eta0':[1]}
grid = GridSearchCV(sgd,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
sgd_best = grid.best_estimator_


train accuracy: 0.80939
----------------------------------------
val accuracy: 0.78881
----------------------------------------
difference: 0.020584
----------------------------------------


In [40]:
grid.best_params_

{'alpha': 0.001,
 'eta0': 1,
 'learning_rate': 'adaptive',
 'loss': 'modified_huber',
 'penalty': 'l2'}

In [41]:
from sklearn.svm import SVC

svc = SVC()

param_grid = {'kernel':['linear'],
             'C':[1]}
grid = GridSearchCV(svc,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
svc_best = grid.best_estimator_


train accuracy: 0.80434
----------------------------------------
val accuracy: 0.79562
----------------------------------------
difference: 0.0087141
----------------------------------------


In [42]:
grid.best_params_

{'C': 1, 'kernel': 'linear'}

In [43]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

param_grid = {'n_neighbors':[5],
             'weights':['uniform'],
             'algorithm':['kd_tree'],
             'leaf_size':[15],
             'p':[1]}
grid = GridSearchCV(knn,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
knn_best = grid.best_estimator_


train accuracy: 0.76546
----------------------------------------
val accuracy: 0.7517
----------------------------------------
difference: 0.0137633
----------------------------------------


In [44]:
grid.best_params_

{'algorithm': 'kd_tree',
 'leaf_size': 15,
 'n_neighbors': 5,
 'p': 1,
 'weights': 'uniform'}

In [13]:
# from sklearn.gaussian_process import GaussianProcessClassifier

# gp = GaussianProcessClassifier()

# param_grid = {'n_restarts_optimizer':[0]}
# grid = GridSearchCV(gp,param_grid,n_jobs=-1,scoring='accuracy')
# modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
# gp_best = grid.best_estimator_


In [45]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

param_grid = {'var_smoothing':[0.0001]}

grid = GridSearchCV(nb,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
nb_best = grid.best_estimator_


train accuracy: 0.76218
----------------------------------------
val accuracy: 0.76461
----------------------------------------
difference: 0.002422
----------------------------------------


In [46]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()

param_grid = {'criterion':['gini'],
             'splitter':['best'],
             'max_depth':[3],
             'max_features':['auto']}

grid = GridSearchCV(tree,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
tree_best = grid.best_estimator_


train accuracy: 0.76976
----------------------------------------
val accuracy: 0.75931
----------------------------------------
difference: 0.0104476
----------------------------------------


In [47]:
from sklearn.ensemble import RandomForestClassifier

rfr = RandomForestClassifier()

param_grid = {'n_estimators':[200],
             'criterion':['gini'],
             'max_depth':[3],
             'max_features':['sqrt']}

grid = GridSearchCV(rfr,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
rfr_best = grid.best_estimator_


train accuracy: 0.79702
----------------------------------------
val accuracy: 0.77974
----------------------------------------
difference: 0.0172833
----------------------------------------


In [54]:
from sklearn.ensemble import ExtraTreesClassifier

etree = ExtraTreesClassifier()
param_grid = {'n_estimators':[100],
             'criterion':['gini'],
             'max_depth':[2,3],
             'max_features':['sqrt']}

grid = GridSearchCV(etree,param_grid,n_jobs=4,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
etree_best = grid.best_estimator_


train accuracy: 0.77354
----------------------------------------
val accuracy: 0.76835
----------------------------------------
difference: 0.0051848
----------------------------------------


In [55]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'sqrt',
 'n_estimators': 100}

In [48]:
import xgboost as xgb
boost_model = xgb.XGBClassifier(max_depth=2,n_estimators=200, eta=0.01, gamma=1,n_jobs=4)
modeler(boost_model,X_train_sc,y_train_enc,X_val_sc,y_val_enc)


train accuracy: 0.79853
----------------------------------------
val accuracy: 0.77746
----------------------------------------
difference: 0.0210679
----------------------------------------


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.01, gamma=1,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.00999999978, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [49]:
from sklearn.ensemble import AdaBoostClassifier

adatree = DecisionTreeClassifier(max_depth=1,criterion='gini',max_features='sqrt',random_state=1)

ada = AdaBoostClassifier(base_estimator=adatree,n_estimators=500,learning_rate=0.1,random_state=1)

modeler(ada,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

train accuracy: 0.80888
----------------------------------------
val accuracy: 0.80166
----------------------------------------
difference: 0.0072243
----------------------------------------


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
                                                         max_features='sqrt',
                                                         random_state=1),
                   learning_rate=0.1, n_estimators=500, random_state=1)

In [62]:
from sklearn.ensemble import StackingClassifier

best_estimators = [('ridge',ridge_best),
                   ('lr',lr),
                   ('sgd',sgd_best),
                   ('svc',svc_best),
                   ('knn',knn_best),
                   ('nb',nb_best),
                   ('tree',tree_best),
                   ('rfr',rfr_best),
                   ('xgb',boost_model),
                   ('ada',ada),
                  ('etree',etree)]

vote = StackingClassifier(estimators=best_estimators,n_jobs=4)

modeler(vote,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

train accuracy: 0.81141
----------------------------------------
val accuracy: 0.80243
----------------------------------------
difference: 0.0089777
----------------------------------------


StackingClassifier(estimators=[('ridge',
                                RidgeClassifier(alpha=1, solver='sparse_cg')),
                               ('lr',
                                LogisticRegression(C=1, solver='liblinear',
                                                   tol=0.001)),
                               ('sgd',
                                SGDClassifier(alpha=0.001, eta0=1,
                                              learning_rate='adaptive',
                                              loss='modified_huber')),
                               ('svc', SVC(C=1, kernel='linear')),
                               ('knn',
                                KNeighborsClassifier(algorithm='kd_tree',
                                                     leaf_size=15, p=1)),
                               ('nb', GaussianNB...
                                              num_parallel_tree=1,
                                              random_state=0, reg_alpha=0,
     