# Imports

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

# Get data locations (paths)

In [2]:
# Relative paths of every entry in the two feature directories.
# os.listdir() returns entries in arbitrary order, so the listings are sorted
# to make the processing order (and therefore the logs) reproducible.
path_to_features_list = ['features/' + name for name in sorted(os.listdir('features'))]
path_to_extra_features_list = ['extra_features/' + name for name in sorted(os.listdir('extra_features'))]

# Build classifier grids

In [3]:
# Worker count passed as n_jobs to every GridSearchCV built in this cell.
number_of_parallel_models = 45

# SVM: the search covers the kernel type only.
classifier_svc = SVC()
parameters_svc = dict(kernel=('linear', 'rbf'))
grid_search_svc = GridSearchCV(
    estimator=classifier_svc,
    param_grid=parameters_svc,
    n_jobs=number_of_parallel_models,
)

# SGDClassifier: sweep the loss function, the regularisation penalty, the
# learning-rate schedule and the initial learning rate (eta0 = 1e-2 .. 1e-4).
# NOTE(review): the loss names 'log' and 'squared_loss' were renamed in later
# scikit-learn releases -- confirm they are valid for the installed version.
classifier_sgd = SGDClassifier()
parameters_sgd = dict(
    loss=(
        'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
        'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive',
    ),
    penalty=('l1', 'l2'),
    learning_rate=('constant', 'optimal', 'invscaling'),
    eta0=tuple(10 ** (-exponent) for exponent in range(2, 5)),
)
grid_search_sgd = GridSearchCV(
    estimator=classifier_sgd,
    param_grid=parameters_sgd,
    n_jobs=number_of_parallel_models,
)

# MultinomialNB: empty grid, so only the default hyper-parameters are tried.
classifier_mnb = MultinomialNB()
parameters_mnb = dict()
grid_search_mnb = GridSearchCV(
    estimator=classifier_mnb,
    param_grid=parameters_mnb,
    n_jobs=number_of_parallel_models,
)

# GaussianNB: empty grid, so only the default hyper-parameters are tried.
classifier_gnb = GaussianNB()
parameters_gnb = dict()
grid_search_gnb = GridSearchCV(
    estimator=classifier_gnb,
    param_grid=parameters_gnb,
    n_jobs=number_of_parallel_models,
)


# GradientBoostingClassifier: sweep the loss, the learning rate (1e-2 .. 1e-4),
# the number of boosting stages (100, 1000) and the tree depth (2 .. 6).
# NOTE(review): the loss name 'deviance' was renamed in later scikit-learn
# releases -- confirm it is valid for the installed version.
classifier_gbc = GradientBoostingClassifier()
parameters_gbc = dict(
    loss=('deviance', 'exponential'),
    learning_rate=tuple(10 ** (-exponent) for exponent in range(2, 5)),
    n_estimators=tuple(10 ** exponent for exponent in range(2, 4)),
    max_depth=tuple(range(2, 7)),
)
grid_search_gbc = GridSearchCV(
    estimator=classifier_gbc,
    param_grid=parameters_gbc,
    n_jobs=number_of_parallel_models,
)


# AdaBoostClassifier: sweep the ensemble size (10, 20, ..., 100) and the
# learning rate (1e-1 .. 1e-4).
classifier_abc = AdaBoostClassifier()
parameters_abc = dict(
    n_estimators=tuple(range(10, 101, 10)),
    learning_rate=tuple(10 ** (-exponent) for exponent in range(1, 5)),
)
grid_search_abc = GridSearchCV(
    estimator=classifier_abc,
    param_grid=parameters_abc,
    n_jobs=number_of_parallel_models,
)


# RandomForestClassifier: sweep the forest size (10, 20, ..., 100) and the
# split criterion. The single-valued 'n_jobs' entry makes every candidate
# forest use n_jobs=5, in addition to the outer search's own n_jobs workers.
classifier_rfc = RandomForestClassifier()
parameters_rfc = dict(
    n_estimators=tuple(range(10, 101, 10)),
    criterion=('gini', 'entropy'),
    n_jobs=(5,),
)
grid_search_rfc = GridSearchCV(
    estimator=classifier_rfc,
    param_grid=parameters_rfc,
    n_jobs=number_of_parallel_models,
)


# DecisionTreeClassifier: the search covers the split criterion only.
classifier_dtc = DecisionTreeClassifier()
parameters_dtc = dict(criterion=('gini', 'entropy'))
grid_search_dtc = GridSearchCV(
    estimator=classifier_dtc,
    param_grid=parameters_dtc,
    n_jobs=number_of_parallel_models,
)


# GaussianProcessClassifier: empty grid, so only the default hyper-parameters
# are tried.
classifier_gpc = GaussianProcessClassifier()
parameters_gpc = dict()
grid_search_gpc = GridSearchCV(
    estimator=classifier_gpc,
    param_grid=parameters_gpc,
    n_jobs=number_of_parallel_models,
)


# KNeighborsClassifier: try every neighbourhood size from 1 to 8.
classifier_knc = KNeighborsClassifier()
parameters_knc = dict(n_neighbors=tuple(range(1, 9)))
grid_search_knc = GridSearchCV(
    estimator=classifier_knc,
    param_grid=parameters_knc,
    n_jobs=number_of_parallel_models,
)


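# MLPClassifier (disabled). The grid below references `max_num`, which must be defined before this block is re-enabled.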
# classifier_mlp = MLPClassifier()
# parameters_mlp = {
#     'hidden_layer_sizes' : [(max_num//2,), (max_num,), (max_num*2,), (max_num*4,), 
#                      (max_num//2,max_num//2), (max_num,max_num//2), 
#                       (max_num*2, max_num//2), (max_num*4, max_num//2)],
# 	'activation' : ('identity', 'logistic', 'tanh', 'relu'),
# 	'solver' : ['sgd', 'adam'],
# 	'alpha' : [10**(-i) for i in range(1, 5, 2)],
# 	'batch_size' : [50, 150, 250, 350],
# 	'learning_rate' : ['constant', 'invscaling', 'adaptive'],
# 	'learning_rate_init' : [10**(-i) for i in range(1, 5, 2)],
# 	'max_iter' : [500, 1000, 2000]
# }
# grid_search_mlp = GridSearchCV(classifier_mlp, parameters_mlp, n_jobs=number_of_parallel_models)

# Estimators that the evaluation cell fits and scores, keyed by the display
# name under which their results are stored. Only the linear SVM is active;
# uncomment an entry below to include it in the run.
grid_search_dict = {
    'SVM_linear': SVC(kernel='linear', C=1),
#     'SVM_rbf': SVC(C=25, kernel='rbf'),
#     'SVM': grid_search_svc,
#     'Ada Boost': grid_search_abc,
#     'Decision Tree': grid_search_dtc,
#     'Gradient Boost': grid_search_gbc,
#     'Gausian Naive Bayes': grid_search_gnb,
#     'Gaussian Process': grid_search_gpc,
#     'K Nearest Neighbours': grid_search_knc,
#     'Multinomial Naive Bayes': grid_search_mnb,
#     'Random Forest': grid_search_rfc,
#     'Stochastic Gradient Descent': grid_search_sgd,
}

# Run models and save evaluation metrics

In [None]:
%%time

force = True

# path_to_extra_features_list = []

for path in list(path_to_extra_features_list+path_to_features_list):
    #load data
    path_to_write = 'results/SVM_updated/'+path.split('/')[1]
    if os.path.isfile(path_to_write) and not force:
        print(path, "already exists. Moving on........\n\n")
        continue
    data = pickle.load(open(path, 'rb'))
    X_train = data['X_train'].toarray()
    y_train = data['y_train']
    X_test = data['X_test'].toarray()
    y_test = data['y_test']
    print("Loaded", path)
    evaluation_dict = {'Criteria': ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 score', 'Confusion Matrix']}
    for grid in grid_search_dict:
        model = grid_search_dict[grid]
        print("\n\tAttempting", grid)
        %time model.fit(X_train, y_train)
        print("\t\tFinished fitting")
        y_pred = model.predict(X_test)
        print("\t\tFinished predicting")
        evaluation_dict[grid] = [
                                model,
                                accuracy_score(y_test, y_pred),
                                precision_score(y_test, y_pred, average='weighted'),
                                recall_score(y_test, y_pred, average='weighted'),
                                f1_score(y_test, y_pred, average='weighted'),
                                confusion_matrix(y_test, y_pred)
                                ]
        print("\t\tFinished evaluating!")
    pickle.dump(evaluation_dict, open(path_to_write, 'wb'))
    print("Dumping successful!!!!!!!!\n\n")
