In [83]:
import tensorflow as tf 
%matplotlib inline
import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils 
from keras import optimizers
from keras import backend
from sklearn.utils import class_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier


In [138]:
# Load every visit's training table next to its "_select" counterpart.
# (The "_select" files are presumably the feature-selected variants; run_data
# flags them as 'Selective' -- confirm against the data preparation step.)
m6, m6_s = pd.read_csv("Training data/m06.csv"), pd.read_csv("Training data/m06_select.csv")
m12, m12_s = pd.read_csv("Training data/m12.csv"), pd.read_csv("Training data/m12_select.csv")
m18, m18_s = pd.read_csv("Training data/m18.csv"), pd.read_csv("Training data/m18_select.csv")
m24, m24_s = pd.read_csv("Training data/m24.csv"), pd.read_csv("Training data/m24_select.csv")
m36, m36_s = pd.read_csv("Training data/m36.csv"), pd.read_csv("Training data/m36_select.csv")
m48, m48_s = pd.read_csv("Training data/m48.csv"), pd.read_csv("Training data/m48_select.csv")

# labels[i] names months[i]: the six full tables come first, followed by the
# six "_select" tables in the same month order (hence the repeated names).
labels = ["Month_6", "Month_12", "Month_18", "Month_24", "Month_36", "Month_48"] * 2
months = [m6, m12, m18, m24, m36, m48] + [m6_s, m12_s, m18_s, m24_s, m36_s, m48_s]


In [129]:
# Candidate hyperparameter values swept for each model family.

max_depths = list(range(2, 11))   # tree depth limits 2..10
k_values = list(range(3, 14))     # neighbour counts 3..13
c_values = [
    0.01, 0.02,
    0.1, 0.2, 0.3,
    0.5, 0.75, 0.9,
    1,
]                                 # SVM regularisation strengths
estimators = [
    25, 50, 75,
    100, 150, 200,
]                                 # ensemble sizes

In [130]:
#BUILD QUICK ACCESS FUNCTIONS TO INPUT A VARIETY OF HYPERPARAMETERS
def neural_network_model():
    """Build and compile a small fully connected softmax classifier.

    The network stacks three 64-unit Dense layers, each activated by
    ``backend.relu`` with ``threshold=.5``, and ends in a 4-unit softmax layer.
    It is compiled with categorical cross-entropy loss, the Adam optimizer and
    accuracy as the reported metric.

    Returns:
        The compiled ``Sequential`` model (no input shape is fixed here).
    """
    hidden_units = (64, 64, 64)

    net = Sequential()
    for width in hidden_units:
        # A fresh lambda per layer, exactly as if each layer were written out by hand.
        net.add(Dense(units=width, activation=lambda x: backend.relu(x, threshold=.5)))
    net.add(Dense(units=4, activation='softmax'))

    net.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return net

def run_decision_tree_model(mx,train_x,train_y,test_x,test_y,random_state=0):
    """Fit a depth-limited decision tree and score it on both splits.

    Args:
        mx: value passed as ``max_depth`` to ``DecisionTreeClassifier``.
        train_x, train_y: features and targets the tree is fitted on.
        test_x, test_y: held-out features and targets.
        random_state: seed forwarded to the tree. Defaults to 0, the same fixed
            seed ``run_random_forest_model`` uses, so repeated runs on the same
            split build the same tree. Pass ``None`` for an unseeded tree.

    Returns:
        ``(train_score, test_score)`` as returned by ``classifier.score``.
    """
    # Without a seed the tree's internal feature shuffling can break ties
    # differently from run to run, making the sweep non-reproducible.
    classifier = DecisionTreeClassifier(max_depth=mx, random_state=random_state)
    classifier.fit(train_x,train_y)
    return classifier.score(train_x,train_y),classifier.score(test_x,test_y)

def run_knn_model(k, train_x, train_y, test_x, test_y):
    """Fit a k-nearest-neighbours classifier and score it on both splits.

    Args:
        k: value passed as ``n_neighbors`` to ``KNeighborsClassifier``.
        train_x, train_y: features and targets the model is fitted on.
        test_x, test_y: held-out features and targets.

    Returns:
        ``(train_score, test_score)`` as returned by the classifier's ``score``.
    """
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(train_x, train_y)
    # Score the training split first, then the test split.
    splits = ((train_x, train_y), (test_x, test_y))
    return tuple(model.score(features, targets) for features, targets in splits)

def run_svm_model(c, train_x, train_y, test_x, test_y):
    """Fit a linear-kernel support vector classifier and score it on both splits.

    Args:
        c: value passed as ``C`` to ``SVC``.
        train_x, train_y: features and targets the model is fitted on.
        test_x, test_y: held-out features and targets.

    Returns:
        ``(train_score, test_score)`` as returned by the classifier's ``score``.
    """
    settings = {"kernel": "linear", "C": c}
    machine = SVC(**settings)
    machine.fit(train_x, train_y)

    train_score = machine.score(train_x, train_y)
    test_score = machine.score(test_x, test_y)
    return train_score, test_score

def run_random_forest_model(mx, n_est, train_x, train_y, test_x, test_y):
    """Fit a random forest (fixed seed 0) and score it on both splits.

    Args:
        mx: value passed as ``max_depth`` to ``RandomForestClassifier``.
        n_est: value passed as ``n_estimators``.
        train_x, train_y: features and targets the forest is fitted on.
        test_x, test_y: held-out features and targets.

    Returns:
        ``(train_score, test_score)`` as returned by the forest's ``score``.
    """
    forest = RandomForestClassifier(
        n_estimators=n_est,
        max_depth=mx,
        random_state=0,  # fixed seed: the same split always yields the same forest
    )
    forest.fit(train_x, train_y)

    train_score = forest.score(train_x, train_y)
    test_score = forest.score(test_x, test_y)
    return train_score, test_score

def run_ada_boost_model(mx,n_est,train_x,train_y,test_x,test_y,random_state=0):
    """Fit AdaBoost (SAMME) over depth-limited decision trees and score both splits.

    Args:
        mx: ``max_depth`` of the ``DecisionTreeClassifier`` used as base estimator.
        n_est: value passed as ``n_estimators`` to ``AdaBoostClassifier``.
        train_x, train_y: features and targets the ensemble is fitted on.
        test_x, test_y: held-out features and targets.
        random_state: seed forwarded to ``AdaBoostClassifier``. Defaults to 0,
            the same fixed seed ``run_random_forest_model`` uses, so repeated
            runs on the same split are reproducible. Pass ``None`` to leave the
            ensemble unseeded.

    Returns:
        ``(train_score, test_score)`` as returned by the ensemble's ``score``.
    """
    # Seeding the ensemble is enough: per the scikit-learn docs, AdaBoost's
    # random_state controls the seed given to each base estimator.
    bdt = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=mx),
        algorithm="SAMME",
        n_estimators=n_est,
        random_state=random_state,
    )
    bdt.fit(train_x,train_y)
    return bdt.score(train_x,train_y),bdt.score(test_x,test_y)


In [131]:
def split_data(df, print_shape=False, train_frac=0.7, random_state=None):
    """Randomly split ``df`` into train/test features and 'MMSCORE' targets.

    Args:
        df: DataFrame containing an 'MMSCORE' column (the target); every other
            column is treated as a feature.
        print_shape: when True, print the shapes of the four returned arrays.
        train_frac: fraction of rows sampled into the training split
            (default 0.7).
        random_state: seed forwarded to ``DataFrame.sample``; ``None`` (the
            default) keeps the split unseeded.

    Returns:
        ``(train_x, train_y, test_x, test_y)`` as NumPy arrays (``.values``).
    """
    # Work on a fresh 0..n-1 index. Dropping the sampled rows by label removes
    # *every* row sharing that label, so a non-unique index on `df` would
    # silently shrink (or even empty) the test split. The original index is
    # irrelevant to the result because only `.values` are returned.
    rows = df.reset_index(drop=True)
    train   = rows.sample(frac=train_frac, random_state=random_state)
    test    = rows.drop(train.index)
    train_y = train['MMSCORE'].values
    train_x = train.drop('MMSCORE',axis=1).values
    test_y  = test['MMSCORE'].values
    test_x  = test.drop('MMSCORE',axis=1).values
    if print_shape:
        print(train_x.shape,train_y.shape,test_x.shape,test_y.shape)
    return train_x,train_y,test_x,test_y

In [141]:
def _new_table(columns):
    """Return an empty results table: one list per column, in CSV column order."""
    return {column: [] for column in columns}


def _add_row(table, values):
    """Append one value to every column of `table`, pairing values with columns in order."""
    for column, value in zip(table, values):
        table[column].append(value)


def run_data(data):
    """Sweep every model family over each dataset and write the results as CSV files.

    For each DataFrame in ``data`` one train/test split is drawn with
    ``split_data`` and shared by all models. Position ``i`` is named by the
    module-level ``labels[i]``; positions 6 and above are flagged
    ``Selective=True``. Each model family is run for every value of its
    module-level hyperparameter grid (``max_depths``, ``k_values``,
    ``c_values``, ``estimators``).

    Files written under ``Model_Results/`` (the folder is created if missing):
        decisiontree.csv, knn.csv, svm.csv, randomforest.csv, adaboost.csv:
            one row per recorded run with the hyperparameters and the
            training/testing accuracy.
        Best.csv: per dataset and model family, the highest recorded testing
            accuracy (0 if no run was recorded).

    Runs whose testing accuracy is not below 1.0 are left out of every table
    except the decision tree's, and are ignored when picking the best score.

    NOTE(review): the "best" score is chosen on the same test split it is
    reported on, so it is an optimistic estimate.

    Args:
        data: sequence of DataFrames accepted by ``split_data``.
    """
    import os  # function-scope import: no new top-level import is required

    # Create the output folder before the (long) sweep so the final to_csv
    # calls cannot fail on a missing directory and throw every result away.
    os.makedirs('Model_Results', exist_ok=True)

    depth_and_size = [(mx, ne) for mx in max_depths for ne in estimators]
    # One entry per model family:
    # (name used in Best.csv, output file, hyperparameter column names,
    #  grid of leading runner arguments, runner, record perfect test scores?)
    searches = [
        ('Decision Tree', 'Model_Results/decisiontree.csv', ['Max Depth'],
         [(mx,) for mx in max_depths], run_decision_tree_model, True),
        ('KNN', 'Model_Results/knn.csv', ['K Value'],
         [(k,) for k in k_values], run_knn_model, False),
        ('SVM', 'Model_Results/svm.csv', ['C Value'],
         [(c,) for c in c_values], run_svm_model, False),
        ('Random Forest', 'Model_Results/randomforest.csv', ['Max Depth', 'Estimators'],
         depth_and_size, run_random_forest_model, False),
        ('ADA Boost', 'Model_Results/adaboost.csv', ['Max Depth', 'Estimators'],
         depth_and_size, run_ada_boost_model, False),
    ]
    tables = [
        _new_table(['Dataset', 'Selective'] + param_cols + ['Training Accuracy', 'Testing Accuracy'])
        for _, _, param_cols, _, _, _ in searches
    ]
    best_data = _new_table(['Dataset', 'Selective', 'Model', 'Testing Accuracy'])

    for index, frame in enumerate(data):
        dataset = labels[index]
        selective = index > 5  # the second half of the dataset list holds the "_select" tables
        split = split_data(frame)  # (train_x, train_y, test_x, test_y), reused by every model

        for (model_name, _, _, grid, runner, keep_perfect), table in zip(searches, tables):
            best_acc = 0
            for params in grid:
                train_acc, test_acc = runner(*params, *split)
                if keep_perfect or test_acc < 1.0:
                    _add_row(table, (dataset, selective) + params + (train_acc, test_acc))
                    if test_acc > best_acc:
                        best_acc = test_acc
            _add_row(best_data, (dataset, selective, model_name, best_acc))

    pd.DataFrame(data=best_data).to_csv('Model_Results/Best.csv')
    for (_, csv_path, _, _, _, _), table in zip(searches, tables):
        pd.DataFrame(data=table).to_csv(csv_path)
    

In [142]:
# Run the full hyperparameter sweep over every loaded dataset; run_data writes
# its result tables as CSV files under Model_Results/.
run_data(months)
