In [1]:
import pandas as pd
import numpy as np

In [2]:
X_train_sc = pd.read_csv('X_train_sc.csv')
X_val_sc = pd.read_csv('X_val_sc.csv')
y_train_enc = np.ravel(pd.read_csv('y_train_enc.csv'))
y_val_enc = np.ravel(pd.read_csv('y_val_enc.csv'))
from sklearn.model_selection import GridSearchCV


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

lr = LogisticRegression(solver='liblinear',C=1,tol=0.001)


In [4]:
def modeler(est,X_train,y_train,X_val,y_val):
    
    est.fit(X_train,y_train)
    
    train_score = np.mean(cross_val_score(est,X_train,y_train,scoring='accuracy',cv=7,n_jobs=-1))
    val_score = np.mean(cross_val_score(est,X_val,y_val,scoring='accuracy',cv=7,n_jobs=-1))
    diff = abs(train_score - val_score)
    print(f'train accuracy: {round(train_score,5)}')
    print('-'*40)
    print(f'val accuracy: {round(val_score,5)}')
    print('-'*40)
    print(f'difference: {round(diff,7)}')
    print('-'*40)
    return est

In [5]:
modeler(lr,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

train accuracy: 0.80207
----------------------------------------
val accuracy: 0.79258
----------------------------------------
difference: 0.0094874
----------------------------------------


LogisticRegression(C=1, solver='liblinear', tol=0.001)

This one is less accurate, but scores are still close. Might be good for bagging.

In [6]:
lr2 = LogisticRegression(solver='liblinear',C=1,tol=5)
modeler(lr2,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

train accuracy: 0.7334
----------------------------------------
val accuracy: 0.73505
----------------------------------------
difference: 0.0016467
----------------------------------------


LogisticRegression(C=1, solver='liblinear', tol=5)

In [7]:
from sklearn.linear_model import RidgeClassifier

ridge = RidgeClassifier()


In [8]:

param_grid = {'alpha':[0.001,1,10,100], 'solver':['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}
grid = GridSearchCV(ridge,param_grid,n_jobs=-1,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

ridge_best = grid.best_estimator_

train accuracy: 0.80081
----------------------------------------
val accuracy: 0.7888
----------------------------------------
difference: 0.0120092
----------------------------------------


In [9]:
lr3 = LogisticRegression()
param_grid = {'C':[0.001,1,10], 'solver':['liblinear','lbfgs','newton-cg','sag','saga']}
grid = GridSearchCV(lr3,param_grid,n_jobs=-1,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
lr_best = grid.best_estimator_


train accuracy: 0.80283
----------------------------------------
val accuracy: 0.79409
----------------------------------------
difference: 0.0087343
----------------------------------------


In [10]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()

param_grid = {'loss':['hinge','modified_huber'],
             'penalty':['l2','l1','elasticnet'],
             'alpha':[0.001,1,10],
             'learning_rate':['constant','invscaling','adaptive'],
             'eta0':[0.01,0.5,1,10]}
grid = GridSearchCV(sgd,param_grid,n_jobs=-1,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
sgd_best = grid.best_estimator_


train accuracy: 0.80283
----------------------------------------
val accuracy: 0.78425
----------------------------------------
difference: 0.0185711
----------------------------------------


In [11]:
from sklearn.svm import SVC

svc = SVC()

param_grid = {'kernel':['linear','rbf'],
             'C':[0.01,1,10]}
grid = GridSearchCV(svc,param_grid,n_jobs=-1,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
svc_best = grid.best_estimator_


train accuracy: 0.79575
----------------------------------------
val accuracy: 0.79182
----------------------------------------
difference: 0.0039374
----------------------------------------


In [12]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

param_grid = {'n_neighbors':[5],
             'weights':['uniform'],
             'algorithm':['kd_tree'],
             'leaf_size':[15,30],
             'p':[1,2,3]}
grid = GridSearchCV(knn,param_grid,n_jobs=-1,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
knn_best = grid.best_estimator_


train accuracy: 0.75713
----------------------------------------
val accuracy: 0.74111
----------------------------------------
difference: 0.0160157
----------------------------------------


In [13]:
# from sklearn.gaussian_process import GaussianProcessClassifier

# gp = GaussianProcessClassifier()

# param_grid = {'n_restarts_optimizer':[0]}
# grid = GridSearchCV(gp,param_grid,n_jobs=-1,scoring='accuracy')
# modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
# gp_best = grid.best_estimator_


In [14]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

param_grid = {'var_smoothing':[0.000000001]}

grid = GridSearchCV(nb,param_grid,n_jobs=-1,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
nb_best = grid.best_estimator_


train accuracy: 0.75966
----------------------------------------
val accuracy: 0.75248
----------------------------------------
difference: 0.0071728
----------------------------------------


In [15]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()

param_grid = {'criterion':['gini'],
             'splitter':['best'],
             'max_depth':[3],
             'max_features':['auto']}

grid = GridSearchCV(tree,param_grid,n_jobs=-1,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
tree_best = grid.best_estimator_


train accuracy: 0.76395
----------------------------------------
val accuracy: 0.75396
----------------------------------------
difference: 0.0099947
----------------------------------------


In [16]:
from sklearn.ensemble import RandomForestClassifier

rfr = RandomForestClassifier()

param_grid = {'criterion':['gini'],
             'max_depth':[3],
             'max_features':['log2']}

grid = GridSearchCV(rfr,param_grid,n_jobs=-1,scoring='accuracy')
modeler(grid,X_train_sc,y_train_enc,X_val_sc,y_val_enc)
rfr_best = grid.best_estimator_


train accuracy: 0.79727
----------------------------------------
val accuracy: 0.77972
----------------------------------------
difference: 0.0175549
----------------------------------------


In [17]:
from sklearn.ensemble import VotingClassifier

best_estimators = [('ridge',ridge_best),('lr',lr_best),('sgd',sgd_best),('svc',svc_best),('knn',knn_best),('nb',nb_best),('tree',tree_best),('rfr',rfr_best)]

vote = VotingClassifier(estimators=best_estimators)

modeler(vote,X_train_sc,y_train_enc,X_val_sc,y_val_enc)

train accuracy: 0.80182
----------------------------------------
val accuracy: 0.78577
----------------------------------------
difference: 0.0160467
----------------------------------------


VotingClassifier(estimators=[('ridge',
                              RidgeClassifier(alpha=0.001, solver='sag')),
                             ('lr', LogisticRegression(C=1)),
                             ('sgd',
                              SGDClassifier(alpha=0.001, eta0=1,
                                            learning_rate='adaptive',
                                            loss='modified_huber',
                                            penalty='l1')),
                             ('svc', SVC(C=1, kernel='linear')),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   leaf_size=15)),
                             ('nb', GaussianNB()),
                             ('tree',
                              DecisionTreeClassifier(max_depth=3,
                                                     max_features='auto')),
                             ('rfr',
                

In [18]:
#save and load each model with pickle 
#add griffin's xgboost
#adaboost?