In [1]:
# importing needed packages
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import GridSearchCV
from warnings import simplefilter
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
# list of datasets
df_adult_income = pd.read_csv('adult.data', sep=",", names = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','class'])
df_krk = pd.read_csv('krkopt.data', sep=",", names=['wkf','wkr','wrf','wrr','bkf','bkr','outcome'])
df_connect_4 = pd.read_csv('connect-4.data', sep=",")
df_HTRU_2 = pd.read_csv('HTRU_2.csv')

In [3]:
# data cleaning for census income data
df_adult_income_enc = pd.DataFrame(df_adult_income['age'])

enc = OneHotEncoder(handle_unknown='ignore')
enc_df = pd.DataFrame(enc.fit_transform(df_adult_income[['workclass']]).toarray())
df_adult_income_enc = df_adult_income_enc.join(enc_df)
df_adult_income_enc = df_adult_income_enc.join(df_adult_income['fnlwgt'])

enc_df = pd.DataFrame(enc.fit_transform(df_adult_income[['education']]).toarray())
df_adult_income_enc = pd.concat([df_adult_income_enc, enc_df], axis = 1)
df_adult_income_enc = df_adult_income_enc.join(df_adult_income['education-num'])

enc_df = pd.DataFrame(enc.fit_transform(df_adult_income[['marital-status']]).toarray())
df_adult_income_enc = pd.concat([df_adult_income_enc, enc_df], axis = 1)

enc_df = pd.DataFrame(enc.fit_transform(df_adult_income[['occupation','relationship','sex','race']]).toarray())
df_adult_income_enc = pd.concat([df_adult_income_enc, enc_df], axis = 1)

df_adult_income_enc = pd.concat([df_adult_income_enc,df_adult_income['capital-gain']], axis = 1)
df_adult_income_enc = pd.concat([df_adult_income_enc,df_adult_income['capital-loss']], axis = 1)
df_adult_income_enc = pd.concat([df_adult_income_enc,df_adult_income['hours-per-week']], axis = 1)

enc_df = pd.DataFrame(enc.fit_transform(df_adult_income[['native-country']]).toarray())
df_adult_income_enc = pd.concat([df_adult_income_enc, enc_df], axis = 1)

df_adult_income_enc = df_adult_income_enc.join(df_adult_income['class'])

le = LabelEncoder() 
df_adult_income_enc['class']= le.fit_transform(df_adult_income['class']) 

df_adult_income = df_adult_income_enc

In [4]:
# data cleaning for king-rook versus king
# we did not use one-hot encoding because of the inherent structure to the board layout
df_krk = df_krk.replace('a',1)
df_krk = df_krk.replace('b',2)
df_krk = df_krk.replace('c',3)
df_krk = df_krk.replace('d',4)
df_krk = df_krk.replace('e',5)
df_krk = df_krk.replace('f',6)
df_krk = df_krk.replace('g',7)
df_krk = df_krk.replace('h',8)
df_krk = df_krk.replace('draw',-1)
df_krk['outcome'] = df_krk['outcome'].replace(['zero','one','two','three','four','five','six','seven','eight','nine','ten','eleven','twelve','thirteen','fourteen','fifteen','sixteen'],1)

In [5]:
# data cleaning for connect-4
df_connect_4 = df_connect_4.replace('b',0)
df_connect_4 = df_connect_4.replace('x',1)
df_connect_4 = df_connect_4.replace('o',-1)
df_connect_4 = df_connect_4.replace('draw',-1)
df_connect_4 = df_connect_4.replace('loss',-1)
df_connect_4 = df_connect_4.replace('win',1)

We will first deal with the HTRU_2 DataSet

In [6]:
# parameter search for Neural Network for HTRU_2
param = {'hidden_layer_sizes':[1,2,4,8,32,128],'momentum':[0,0.2,0.5,0.9]}
model = MLPClassifier()

accu_score = [] 
roc_score = [] 
f1_score = [] 

for i in range(5):

    data_sample = df_HTRU_2.sample(5000)
    
    X = data_sample.iloc[:,:len(data_sample.columns)-1]
    y = data_sample.iloc[:,len(data_sample.columns)-1:len(data_sample.columns)]
    y = np.ravel(y)

    clf = GridSearchCV(model, param, scoring = ['accuracy','roc_auc_ovr','f1_micro'], refit = False, n_jobs = -1)
    best_model = clf.fit(X,y)   
    
    best_accu = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
    best_roc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])]
    best_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])]
    
    X = df_HTRU_2.iloc[:,:len(df_HTRU_2.columns)-1]
    y = df_HTRU_2.iloc[:,len(df_HTRU_2.columns)-1:len(df_HTRU_2.columns)]
    y = np.ravel(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = 5000)
    
    predicteds_accu = [] 
    predicteds_roc = [] 
    predicteds_f1 = [] 
    trueys =[]
    
    model_accu = MLPClassifier(hidden_layer_sizes = best_accu['hidden_layer_sizes'], momentum = best_accu['momentum'])
    model_roc = MLPClassifier(hidden_layer_sizes = best_roc['hidden_layer_sizes'], momentum = best_roc['momentum'])
    model_f1 = MLPClassifier(hidden_layer_sizes = best_f1['hidden_layer_sizes'], momentum = best_f1['momentum'])
    model_accu.fit(X_train,y_train)
    model_roc.fit(X_train,y_train)
    model_f1.fit(X_train,y_train)
    
    print(model_accu.score(X_train,y_train))
    
    predicteds_accu.append(model_accu.predict(X_test))
    predicteds_roc.append(model_roc.predict(X_test))
    predicteds_f1.append(model_f1.predict(X_test))
    trueys.append(y_test)

    predicteds_accu = np.concatenate(predicteds_accu)
    predicteds_roc = np.concatenate(predicteds_roc)
    predicteds_f1 = np.concatenate(predicteds_f1)
    trueys = np.concatenate(trueys)
    
    accu_score.append(accuracy_score(trueys,predicteds_accu))
    roc_score.append(accuracy_score(trueys,predicteds_roc))
    f1_score.append(accuracy_score(trueys,predicteds_f1))

0.979
0.9714
0.975
0.9746
0.9758


In [7]:
print(accu_score)
print(roc_score)
print(f1_score)

[0.9751104908118167, 0.9746452663410096, 0.9755757152826239, 0.9759634023416298, 0.9748003411646119]
[0.9744901915174071, 0.9758083275180275, 0.9734822051639916, 0.9759634023416298, 0.9763510894006358]
[0.976428626812437, 0.9763510894006358, 0.9729394432813833, 0.9757307901062262, 0.9755757152826239]


In [8]:
# parameter search for Logistic Regression for HTRU_2
param = {'penalty':('l2','none'), 'C':[0.000001,0.000001,0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000],'solver': ['newton-cg']}
model = model = LogisticRegression()

accu_score = [] 
roc_score = [] 
f1_score = [] 

for i in range(5):

    data_sample = df_HTRU_2.sample(5000)
    simplefilter(action='ignore', category=UserWarning)
    
    X = data_sample.iloc[:,:len(data_sample.columns)-1]
    y = data_sample.iloc[:,len(data_sample.columns)-1:len(data_sample.columns)]
    y = np.ravel(y)

    clf = GridSearchCV(model, param, scoring = ['accuracy','roc_auc_ovr','f1_micro'], refit = False, n_jobs = -1)
    best_model = clf.fit(X,y)   
    
    best_accu = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
    best_roc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])]
    best_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])]
    
    X = df_HTRU_2.iloc[:,:len(df_HTRU_2.columns)-1]
    y = df_HTRU_2.iloc[:,len(df_HTRU_2.columns)-1:len(df_HTRU_2.columns)]
    y = np.ravel(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = 5000)
    
    predicteds_accu = [] 
    predicteds_roc = [] 
    predicteds_f1 = [] 
    trueys =[]
    
    model_accu = LogisticRegression(penalty = best_accu['penalty'], C = best_accu['C'], solver = 'newton-cg')
    model_roc = LogisticRegression(penalty = best_roc['penalty'], C = best_roc['C'], solver = 'newton-cg')
    model_f1 = LogisticRegression(penalty = best_f1['penalty'], C = best_f1['C'], solver = 'newton-cg')
    model_accu.fit(X_train,y_train)
    model_roc.fit(X_train,y_train)
    model_f1.fit(X_train,y_train)
    
    print(model_accu.score(X_train,y_train))
    
    predicteds_accu.append(model_accu.predict(X_test))
    predicteds_roc.append(model_roc.predict(X_test))
    predicteds_f1.append(model_f1.predict(X_test))
    trueys.append(y_test)

    predicteds_accu = np.concatenate(predicteds_accu)
    predicteds_roc = np.concatenate(predicteds_roc)
    predicteds_f1 = np.concatenate(predicteds_f1)
    trueys = np.concatenate(trueys)
    
    accu_score.append(accuracy_score(trueys,predicteds_accu))
    roc_score.append(accuracy_score(trueys,predicteds_roc))
    f1_score.append(accuracy_score(trueys,predicteds_f1))

0.9792
0.978
0.981
0.978
0.9816


In [9]:
print(accu_score)
print(roc_score)
print(f1_score)

[0.9796851981080872, 0.9807707218733038, 0.9775141505776537, 0.9790648988136776, 0.9785996743428704]
[0.9782119872838645, 0.979995347755292, 0.9772040009304489, 0.9786772117546716, 0.9774366131658525]
[0.9796851981080872, 0.9807707218733038, 0.9775141505776537, 0.9790648988136776, 0.9785996743428704]


In [10]:
# parameter search for Random Forest for HTRU_2
param = {'n_estimators':[1024], 'max_features':[1,2,4,6,8], 'min_samples_split':[2,5,10], 'min_samples_leaf':[1,2,4]}
model = RandomForestClassifier()

accu_score = [] 
roc_score = [] 
f1_score = [] 

for i in range(5):

    data_sample = df_HTRU_2.sample(5000)
    simplefilter(action='ignore', category=UserWarning)
    
    X = data_sample.iloc[:,:len(data_sample.columns)-1]
    y = data_sample.iloc[:,len(data_sample.columns)-1:len(data_sample.columns)]
    y = np.ravel(y)

    clf = GridSearchCV(model, param, scoring = ['accuracy','roc_auc_ovr','f1_micro'], refit = False, n_jobs = -1)
    best_model = clf.fit(X,y)   
    
    best_accu = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
    best_roc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])]
    best_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])]
    
    X = df_HTRU_2.iloc[:,:len(df_HTRU_2.columns)-1]
    y = df_HTRU_2.iloc[:,len(df_HTRU_2.columns)-1:len(df_HTRU_2.columns)]
    y = np.ravel(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = 5000)
    
    predicteds_accu = [] 
    predicteds_roc = [] 
    predicteds_f1 = [] 
    trueys =[]
    
    model_accu = RandomForestClassifier(n_estimators = best_accu['n_estimators'], max_features = best_accu['max_features'], min_samples_split = best_accu['min_samples_split'], min_samples_leaf = best_accu['min_samples_leaf'])
    model_roc = RandomForestClassifier(n_estimators = best_roc['n_estimators'], max_features = best_roc['max_features'], min_samples_split = best_roc['min_samples_split'], min_samples_leaf = best_roc['min_samples_leaf'])
    model_f1 = RandomForestClassifier(n_estimators = best_f1['n_estimators'], max_features = best_f1['max_features'], min_samples_split = best_f1['min_samples_split'], min_samples_leaf = best_f1['min_samples_leaf'])
    model_accu.fit(X_train,y_train)
    model_roc.fit(X_train,y_train)
    model_f1.fit(X_train,y_train)
    
    print(model_accu.score(X_train,y_train))
    
    predicteds_accu.append(model_accu.predict(X_test))
    predicteds_roc.append(model_roc.predict(X_test))
    predicteds_f1.append(model_f1.predict(X_test))
    trueys.append(y_test)

    predicteds_accu = np.concatenate(predicteds_accu)
    predicteds_roc = np.concatenate(predicteds_roc)
    predicteds_f1 = np.concatenate(predicteds_f1)
    trueys = np.concatenate(trueys)
    
    accu_score.append(accuracy_score(trueys,predicteds_accu))
    roc_score.append(accuracy_score(trueys,predicteds_roc))
    f1_score.append(accuracy_score(trueys,predicteds_f1))

0.99
0.9854
0.9926
0.9864
0.9862


In [11]:
print(accu_score)
print(roc_score)
print(f1_score)

[0.9786772117546716, 0.97921997363728, 0.9792975110490811, 0.9804605722260991, 0.9800728851670931]
[0.9791424362254788, 0.9804605722260991, 0.978056912460262, 0.9802279599906956, 0.9803054974024967]
[0.9787547491664729, 0.97921997363728, 0.9789873614018764, 0.9806156470497015, 0.9800728851670931]


Now we proceed to find the best parameters for the connect-4 dataset

In [12]:
# parameter search for Neural Network for Connect 4
param = {'hidden_layer_sizes':[1,2,4,8,32,128],'momentum':[0,0.2,0.5,0.9]}
model = MLPClassifier()

accu_score = [] 
roc_score = [] 
f1_score = [] 

for i in range(5):

    data_sample = df_connect_4.sample(5000)
    
    X = data_sample.iloc[:,:len(data_sample.columns)-1]
    y = data_sample.iloc[:,len(data_sample.columns)-1:len(data_sample.columns)]
    y = np.ravel(y)

    clf = GridSearchCV(model, param, scoring = ['accuracy','roc_auc_ovr','f1_micro'], refit = False, n_jobs = -1)
    best_model = clf.fit(X,y)   
    
    best_accu = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
    best_roc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])]
    best_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])]
    
    X = df_connect_4.iloc[:,:len(df_connect_4.columns)-1]
    y = df_connect_4.iloc[:,len(df_connect_4.columns)-1:len(df_connect_4.columns)]
    y = np.ravel(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = 5000)
    
    predicteds_accu = [] 
    predicteds_roc = [] 
    predicteds_f1 = [] 
    trueys =[]
    
    model_accu = MLPClassifier(hidden_layer_sizes = best_accu['hidden_layer_sizes'], momentum = best_accu['momentum'])
    model_roc = MLPClassifier(hidden_layer_sizes = best_roc['hidden_layer_sizes'], momentum = best_roc['momentum'])
    model_f1 = MLPClassifier(hidden_layer_sizes = best_f1['hidden_layer_sizes'], momentum = best_f1['momentum'])
    model_accu.fit(X_train,y_train)
    model_roc.fit(X_train,y_train)
    model_f1.fit(X_train,y_train)
    
    print(model_accu.score(X_train,y_train))
    
    predicteds_accu.append(model_accu.predict(X_test))
    predicteds_roc.append(model_roc.predict(X_test))
    predicteds_f1.append(model_f1.predict(X_test))
    trueys.append(y_test)

    predicteds_accu = np.concatenate(predicteds_accu)
    predicteds_roc = np.concatenate(predicteds_roc)
    predicteds_f1 = np.concatenate(predicteds_f1)
    trueys = np.concatenate(trueys)
    
    accu_score.append(accuracy_score(trueys,predicteds_accu))
    roc_score.append(accuracy_score(trueys,predicteds_roc))
    f1_score.append(accuracy_score(trueys,predicteds_f1))

0.964
0.9556
0.9692
0.964
0.9586


In [13]:
print(accu_score)
print(roc_score)
print(f1_score)

[0.8218716030436729, 0.8153014898650809, 0.8266033633864058, 0.8239337553552017, 0.8180829976341198]
[0.8212481616471642, 0.8185945392927937, 0.8248928959652152, 0.8243973399833749, 0.8202730353603171]
[0.8227827866231856, 0.8180510262804527, 0.8235500991111964, 0.8244612826907092, 0.822335187671846]


In [14]:
# parameter search for Logistic Regression for Connect 4
param = {'penalty':('l2','none'), 'C':[0.000001,0.000001,0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000],'solver': ['newton-cg']}
model = model = LogisticRegression()

accu_score = [] 
roc_score = [] 
f1_score = [] 

for i in range(5):

    data_sample = df_connect_4.sample(5000)
    simplefilter(action='ignore', category=UserWarning)
    
    X = data_sample.iloc[:,:len(data_sample.columns)-1]
    y = data_sample.iloc[:,len(data_sample.columns)-1:len(data_sample.columns)]
    y = np.ravel(y)

    clf = GridSearchCV(model, param, scoring = ['accuracy','roc_auc_ovr','f1_micro'], refit = False, n_jobs = -1)
    best_model = clf.fit(X,y)   
    
    best_accu = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
    best_roc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])]
    best_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])]
    
    X = df_connect_4.iloc[:,:len(df_connect_4.columns)-1]
    y = df_connect_4.iloc[:,len(df_connect_4.columns)-1:len(df_connect_4.columns)]
    y = np.ravel(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = 5000)
    
    predicteds_accu = [] 
    predicteds_roc = [] 
    predicteds_f1 = [] 
    trueys =[]
    
    model_accu = LogisticRegression(penalty = best_accu['penalty'], C = best_accu['C'], solver = 'newton-cg')
    model_roc = LogisticRegression(penalty = best_roc['penalty'], C = best_roc['C'], solver = 'newton-cg')
    model_f1 = LogisticRegression(penalty = best_f1['penalty'], C = best_f1['C'], solver = 'newton-cg')
    model_accu.fit(X_train,y_train)
    model_roc.fit(X_train,y_train)
    model_f1.fit(X_train,y_train)
    
    print(model_accu.score(X_train,y_train))
    
    predicteds_accu.append(model_accu.predict(X_test))
    predicteds_roc.append(model_roc.predict(X_test))
    predicteds_f1.append(model_f1.predict(X_test))
    trueys.append(y_test)

    predicteds_accu = np.concatenate(predicteds_accu)
    predicteds_roc = np.concatenate(predicteds_roc)
    predicteds_f1 = np.concatenate(predicteds_f1)
    trueys = np.concatenate(trueys)
    
    accu_score.append(accuracy_score(trueys,predicteds_accu))
    roc_score.append(accuracy_score(trueys,predicteds_roc))
    f1_score.append(accuracy_score(trueys,predicteds_f1))

0.7876
0.7904
0.791
0.7988
0.7934


In [15]:
print(accu_score)
print(roc_score)
print(f1_score)

[0.7843052624848136, 0.785264403094827, 0.7840494916554767, 0.7836338640578042, 0.7865752285951787]
[0.7835379499968028, 0.785120532003325, 0.7840494916554767, 0.7833461218748001, 0.7865752285951787]
[0.7843052624848136, 0.785264403094827, 0.7840494916554767, 0.7836338640578042, 0.7865752285951787]


In [16]:
# # parameter search for Random Forest for Connect 4
param = {'n_estimators':[1024], 'max_features':[1,2,4,6,8], 'min_samples_split':[2,5,10], 'min_samples_leaf':[1,2,4]}
model = RandomForestClassifier()

accu_score = [] 
roc_score = [] 
f1_score = [] 

for i in range(5):

    data_sample = df_connect_4.sample(5000)
    simplefilter(action='ignore', category=UserWarning)
    
    X = data_sample.iloc[:,:len(data_sample.columns)-1]
    y = data_sample.iloc[:,len(data_sample.columns)-1:len(data_sample.columns)]
    y = np.ravel(y)

    clf = GridSearchCV(model, param, scoring = ['accuracy','roc_auc_ovr','f1_micro'], refit = False, n_jobs = -1)
    best_model = clf.fit(X,y)   
    
    best_accu = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
    best_roc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])]
    best_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])]
    
    X = df_connect_4.iloc[:,:len(df_connect_4.columns)-1]
    y = df_connect_4.iloc[:,len(df_connect_4.columns)-1:len(df_connect_4.columns)]
    y = np.ravel(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = 5000)
    
    predicteds_accu = [] 
    predicteds_roc = [] 
    predicteds_f1 = [] 
    trueys =[]
    
    model_accu = RandomForestClassifier(n_estimators = best_accu['n_estimators'], max_features = best_accu['max_features'], min_samples_split = best_accu['min_samples_split'], min_samples_leaf = best_accu['min_samples_leaf'])
    model_roc = RandomForestClassifier(n_estimators = best_roc['n_estimators'], max_features = best_roc['max_features'], min_samples_split = best_roc['min_samples_split'], min_samples_leaf = best_roc['min_samples_leaf'])
    model_f1 = RandomForestClassifier(n_estimators = best_f1['n_estimators'], max_features = best_f1['max_features'], min_samples_split = best_f1['min_samples_split'], min_samples_leaf = best_f1['min_samples_leaf'])
    model_accu.fit(X_train,y_train)
    model_roc.fit(X_train,y_train)
    model_f1.fit(X_train,y_train)
    
    print(model_accu.score(X_train,y_train))
    
    predicteds_accu.append(model_accu.predict(X_test))
    predicteds_roc.append(model_roc.predict(X_test))
    predicteds_f1.append(model_f1.predict(X_test))
    trueys.append(y_test)

    predicteds_accu = np.concatenate(predicteds_accu)
    predicteds_roc = np.concatenate(predicteds_roc)
    predicteds_f1 = np.concatenate(predicteds_f1)
    trueys = np.concatenate(trueys)
    
    accu_score.append(accuracy_score(trueys,predicteds_accu))
    roc_score.append(accuracy_score(trueys,predicteds_roc))
    f1_score.append(accuracy_score(trueys,predicteds_f1))

1.0
1.0
0.9864
0.9884
1.0


In [17]:
print(accu_score)
print(roc_score)
print(f1_score)

[0.8231504571903574, 0.8233582709891937, 0.8207366199884903, 0.8249888100262165, 0.820384935098152]
[0.8216318178911695, 0.8186105249696272, 0.8209444337873265, 0.8139746786878956, 0.8176034273291131]
[0.8231184858366903, 0.8222392736108447, 0.8201291642688151, 0.8246211394590447, 0.8207206343116568]


Now we move on to the kr-k dataset

In [18]:
# parameter search for Neural Network for King-Rook versus King

param = {'hidden_layer_sizes':[1,2,4,8,32,128],'momentum':[0,0.2,0.5,0.9]}
model = MLPClassifier()

accu_score = [] 
roc_score = [] 
f1_score = [] 

for i in range(5):

    data_sample = df_krk.sample(5000)
    
    X = data_sample.iloc[:,:len(data_sample.columns)-1]
    y = data_sample.iloc[:,len(data_sample.columns)-1:len(data_sample.columns)]
    y = np.ravel(y)

    clf = GridSearchCV(model, param, scoring = ['accuracy','roc_auc_ovr','f1_micro'], refit = False, n_jobs = -1)
    best_model = clf.fit(X,y)   
    
    best_accu = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
    best_roc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])]
    best_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])]
    
    X = df_krk.iloc[:,:len(df_krk.columns)-1]
    y = df_krk.iloc[:,len(df_krk.columns)-1:len(df_krk.columns)]
    y = np.ravel(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = 5000)
    
    predicteds_accu = [] 
    predicteds_roc = [] 
    predicteds_f1 = [] 
    trueys =[]
    
    model_accu = MLPClassifier(hidden_layer_sizes = best_accu['hidden_layer_sizes'], momentum = best_accu['momentum'])
    model_roc = MLPClassifier(hidden_layer_sizes = best_roc['hidden_layer_sizes'], momentum = best_roc['momentum'])
    model_f1 = MLPClassifier(hidden_layer_sizes = best_f1['hidden_layer_sizes'], momentum = best_f1['momentum'])
    model_accu.fit(X_train,y_train)
    model_roc.fit(X_train,y_train)
    model_f1.fit(X_train,y_train)
    
    print(model_accu.score(X_train,y_train))
    
    predicteds_accu.append(model_accu.predict(X_test))
    predicteds_roc.append(model_roc.predict(X_test))
    predicteds_f1.append(model_f1.predict(X_test))
    trueys.append(y_test)

    predicteds_accu = np.concatenate(predicteds_accu)
    predicteds_roc = np.concatenate(predicteds_roc)
    predicteds_f1 = np.concatenate(predicteds_f1)
    trueys = np.concatenate(trueys)
    
    accu_score.append(accuracy_score(trueys,predicteds_accu))
    roc_score.append(accuracy_score(trueys,predicteds_roc))
    f1_score.append(accuracy_score(trueys,predicteds_f1))

0.9964
0.9964
0.996
0.9968
0.9964


In [19]:
print(accu_score)
print(roc_score)
print(f1_score)

[0.9937977099236641, 0.9950988896599584, 0.9949253990284525, 0.9953591256072172, 0.9949687716863289]
[0.9936242192921583, 0.9947085357390701, 0.9947085357390701, 0.9961398334489937, 0.9941880638445524]
[0.9942748091603053, 0.9953591256072172, 0.9954024982650936, 0.9946651630811936, 0.994882026370576]


In [20]:
# # parameter search for Logistic Regression for King-Rook versus King
param = {'penalty':('l2','none'), 'C':[0.000001,0.000001,0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000],'solver': ['newton-cg']}
model = model = LogisticRegression()

accu_score = [] 
roc_score = [] 
f1_score = [] 

for i in range(5):

    data_sample = df_krk.sample(5000)
    simplefilter(action='ignore', category=UserWarning)
    
    X = data_sample.iloc[:,:len(data_sample.columns)-1]
    y = data_sample.iloc[:,len(data_sample.columns)-1:len(data_sample.columns)]
    y = np.ravel(y)

    clf = GridSearchCV(model, param, scoring = ['accuracy','roc_auc_ovr','f1_micro'], refit = False, n_jobs = -1)
    best_model = clf.fit(X,y)   
    
    best_accu = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
    best_roc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])]
    best_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])]
    
    X = df_krk.iloc[:,:len(df_krk.columns)-1]
    y = df_krk.iloc[:,len(df_krk.columns)-1:len(df_krk.columns)]
    y = np.ravel(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = 5000)
    
    predicteds_accu = [] 
    predicteds_roc = [] 
    predicteds_f1 = [] 
    trueys =[]
    
    model_accu = LogisticRegression(penalty = best_accu['penalty'], C = best_accu['C'], solver = 'newton-cg')
    model_roc = LogisticRegression(penalty = best_roc['penalty'], C = best_roc['C'], solver = 'newton-cg')
    model_f1 = LogisticRegression(penalty = best_f1['penalty'], C = best_f1['C'], solver = 'newton-cg')
    model_accu.fit(X_train,y_train)
    model_roc.fit(X_train,y_train)
    model_f1.fit(X_train,y_train)
    
    print(model_accu.score(X_train,y_train))
    
    predicteds_accu.append(model_accu.predict(X_test))
    predicteds_roc.append(model_roc.predict(X_test))
    predicteds_f1.append(model_f1.predict(X_test))
    trueys.append(y_test)

    predicteds_accu = np.concatenate(predicteds_accu)
    predicteds_roc = np.concatenate(predicteds_roc)
    predicteds_f1 = np.concatenate(predicteds_f1)
    trueys = np.concatenate(trueys)
    
    accu_score.append(accuracy_score(trueys,predicteds_accu))
    roc_score.append(accuracy_score(trueys,predicteds_roc))
    f1_score.append(accuracy_score(trueys,predicteds_f1))

0.8978
0.8996
0.9072
0.902
0.898


In [21]:
print(accu_score)
print(roc_score)
print(f1_score)

[0.9008934767522554, 0.9005031228313671, 0.898854961832061, 0.8999826509368494, 0.9008501040943789]
[0.9008934767522554, 0.9005031228313671, 0.898854961832061, 0.8999826509368494, 0.9008501040943789]
[0.9008934767522554, 0.9005031228313671, 0.898854961832061, 0.8999826509368494, 0.9008501040943789]


In [22]:
# parameter search for Randon Forest for King-Rook versus King
param = {'n_estimators':[1024], 'max_features':[1,2,4,6,8], 'min_samples_split':[2,5,10], 'min_samples_leaf':[1,2,4]}
model = RandomForestClassifier()

accu_score = [] 
roc_score = [] 
f1_score = [] 

for i in range(5):

    data_sample = df_krk.sample(5000)
    simplefilter(action='ignore', category=UserWarning)
    
    X = data_sample.iloc[:,:len(data_sample.columns)-1]
    y = data_sample.iloc[:,len(data_sample.columns)-1:len(data_sample.columns)]
    y = np.ravel(y)

    clf = GridSearchCV(model, param, scoring = ['accuracy','roc_auc_ovr','f1_micro'], refit = False, n_jobs = -1)
    best_model = clf.fit(X,y)   
    
    best_accu = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
    best_roc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])]
    best_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])]
    
    X = df_krk.iloc[:,:len(df_krk.columns)-1]
    y = df_krk.iloc[:,len(df_krk.columns)-1:len(df_krk.columns)]
    y = np.ravel(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = 5000)
    
    predicteds_accu = [] 
    predicteds_roc = [] 
    predicteds_f1 = [] 
    trueys =[]
    
    model_accu = RandomForestClassifier(n_estimators = best_accu['n_estimators'], max_features = best_accu['max_features'], min_samples_split = best_accu['min_samples_split'], min_samples_leaf = best_accu['min_samples_leaf'])
    model_roc = RandomForestClassifier(n_estimators = best_roc['n_estimators'], max_features = best_roc['max_features'], min_samples_split = best_roc['min_samples_split'], min_samples_leaf = best_roc['min_samples_leaf'])
    model_f1 = RandomForestClassifier(n_estimators = best_f1['n_estimators'], max_features = best_f1['max_features'], min_samples_split = best_f1['min_samples_split'], min_samples_leaf = best_f1['min_samples_leaf'])
    model_accu.fit(X_train,y_train)
    model_roc.fit(X_train,y_train)
    model_f1.fit(X_train,y_train)
    
    print(model_accu.score(X_train,y_train))
    
    predicteds_accu.append(model_accu.predict(X_test))
    predicteds_roc.append(model_roc.predict(X_test))
    predicteds_f1.append(model_f1.predict(X_test))
    trueys.append(y_test)

    predicteds_accu = np.concatenate(predicteds_accu)
    predicteds_roc = np.concatenate(predicteds_roc)
    predicteds_f1 = np.concatenate(predicteds_f1)
    trueys = np.concatenate(trueys)
    
    accu_score.append(accuracy_score(trueys,predicteds_accu))
    roc_score.append(accuracy_score(trueys,predicteds_roc))
    f1_score.append(accuracy_score(trueys,predicteds_f1))

1.0
1.0
1.0
1.0
1.0


In [23]:
print(accu_score)
print(roc_score)
print(f1_score)

[0.9882893823733518, 0.9901977793199167, 0.9858605135322692, 0.9876387925052047, 0.9773160999306038]
[0.9874219292158224, 0.9899375433726578, 0.9857737682165163, 0.9875954198473282, 0.976969118667592]
[0.9884628730048577, 0.9901977793199167, 0.9857303955586398, 0.9874219292158224, 0.9767956280360861]


We then focus on the adult census income dataset

In [24]:
# parameter search for Neural Network for Adult Income
param = {'hidden_layer_sizes':[1,2,4,8,32,128],'momentum':[0,0.2,0.5,0.9]}
model = MLPClassifier()

accu_score = [] 
roc_score = [] 
f1_score = [] 

for i in range(5):

    data_sample = df_adult_income.sample(5000)
    
    X = data_sample.iloc[:,:len(data_sample.columns)-1]
    y = data_sample.iloc[:,len(data_sample.columns)-1:len(data_sample.columns)]
    y = np.ravel(y)

    clf = GridSearchCV(model, param, scoring = ['accuracy','roc_auc_ovr','f1_micro'], refit = False, n_jobs = -1)
    best_model = clf.fit(X,y)   
    
    best_accu = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
    best_roc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])]
    best_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])]
    
    X = df_adult_income.iloc[:,:len(df_adult_income.columns)-1]
    y = df_adult_income.iloc[:,len(df_adult_income.columns)-1:len(df_adult_income.columns)]
    y = np.ravel(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = 5000)
    
    predicteds_accu = [] 
    predicteds_roc = [] 
    predicteds_f1 = [] 
    trueys =[]
    
    model_accu = MLPClassifier(hidden_layer_sizes = best_accu['hidden_layer_sizes'], momentum = best_accu['momentum'])
    model_roc = MLPClassifier(hidden_layer_sizes = best_roc['hidden_layer_sizes'], momentum = best_roc['momentum'])
    model_f1 = MLPClassifier(hidden_layer_sizes = best_f1['hidden_layer_sizes'], momentum = best_f1['momentum'])
    model_accu.fit(X_train,y_train)
    model_roc.fit(X_train,y_train)
    model_f1.fit(X_train,y_train)
    
    print(model_accu.score(X_train,y_train))
    
    predicteds_accu.append(model_accu.predict(X_test))
    predicteds_roc.append(model_roc.predict(X_test))
    predicteds_f1.append(model_f1.predict(X_test))
    trueys.append(y_test)

    predicteds_accu = np.concatenate(predicteds_accu)
    predicteds_roc = np.concatenate(predicteds_roc)
    predicteds_f1 = np.concatenate(predicteds_f1)
    trueys = np.concatenate(trueys)
    
    accu_score.append(accuracy_score(trueys,predicteds_accu))
    roc_score.append(accuracy_score(trueys,predicteds_roc))
    f1_score.append(accuracy_score(trueys,predicteds_f1))

0.7898
0.7834
0.7474
0.7918
0.7896


In [25]:
print(accu_score)
print(roc_score)
print(f1_score)

[0.7879249664380829, 0.7882515148216683, 0.7389789920539893, 0.7875984180544973, 0.7859656761365698]
[0.23975907985922137, 0.7942382351874024, 0.7807409019992018, 0.7829904575305685, 0.7897028409709372]
[0.7903559377381082, 0.792206378578426, 0.7933311563441094, 0.7895939915097421, 0.7934037226515729]


In [26]:
# parameter search for Logistic Regression for Adult Income
param = {'penalty':('l2','none'), 'C':[0.000001,0.000001,0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000],'solver': ['newton-cg']}
model = model = LogisticRegression()

accu_score = [] 
roc_score = [] 
f1_score = [] 

for i in range(5):

    data_sample = df_adult_income.sample(5000)
    simplefilter(action='ignore', category=UserWarning)
    
    X = data_sample.iloc[:,:len(data_sample.columns)-1]
    y = data_sample.iloc[:,len(data_sample.columns)-1:len(data_sample.columns)]
    y = np.ravel(y)

    clf = GridSearchCV(model, param, scoring = ['accuracy','roc_auc_ovr','f1_micro'], refit = False, n_jobs = -1)
    best_model = clf.fit(X,y)   
    
    best_accu = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
    best_roc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])]
    best_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])]
    
    X = df_adult_income.iloc[:,:len(df_adult_income.columns)-1]
    y = df_adult_income.iloc[:,len(df_adult_income.columns)-1:len(df_adult_income.columns)]
    y = np.ravel(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = 5000)
    
    predicteds_accu = [] 
    predicteds_roc = [] 
    predicteds_f1 = [] 
    trueys =[]
    
    model_accu = LogisticRegression(penalty = best_accu['penalty'], C = best_accu['C'], solver = 'newton-cg')
    model_roc = LogisticRegression(penalty = best_roc['penalty'], C = best_roc['C'], solver = 'newton-cg')
    model_f1 = LogisticRegression(penalty = best_f1['penalty'], C = best_f1['C'], solver = 'newton-cg')
    model_accu.fit(X_train,y_train)
    model_roc.fit(X_train,y_train)
    model_f1.fit(X_train,y_train)
    
    print(model_accu.score(X_train,y_train))
    
    predicteds_accu.append(model_accu.predict(X_test))
    predicteds_roc.append(model_roc.predict(X_test))
    predicteds_f1.append(model_f1.predict(X_test))
    trueys.append(y_test)

    predicteds_accu = np.concatenate(predicteds_accu)
    predicteds_roc = np.concatenate(predicteds_roc)
    predicteds_f1 = np.concatenate(predicteds_f1)
    trueys = np.concatenate(trueys)
    
    accu_score.append(accuracy_score(trueys,predicteds_accu))
    roc_score.append(accuracy_score(trueys,predicteds_roc))
    f1_score.append(accuracy_score(trueys,predicteds_f1))



0.8518
0.8496
0.861




0.854
0.861


In [27]:
print(accu_score)
print(roc_score)
print(f1_score)

[0.8500780087805232, 0.849969159319328, 0.8506948223939624, 0.8494974783208157, 0.8500054424730598]
[0.8498240267044012, 0.848916947861108, 0.8506948223939624, 0.8499328761655963, 0.8494249120133522]
[0.8500780087805232, 0.849969159319328, 0.8506948223939624, 0.8494974783208157, 0.8500054424730598]


In [28]:
# parameter search for Random Forest for Adult Income
param = {'n_estimators':[1024], 'max_features':[1,2,4,6,8], 'min_samples_split':[2,5,10], 'min_samples_leaf':[1,2,4]}
model = RandomForestClassifier()

accu_score = [] 
roc_score = [] 
f1_score = [] 

for i in range(5):

    data_sample = df_adult_income.sample(5000)
    simplefilter(action='ignore', category=UserWarning)
    
    X = data_sample.iloc[:,:len(data_sample.columns)-1]
    y = data_sample.iloc[:,len(data_sample.columns)-1:len(data_sample.columns)]
    y = np.ravel(y)

    clf = GridSearchCV(model, param, scoring = ['accuracy','roc_auc_ovr','f1_micro'], refit = False, n_jobs = -1)
    best_model = clf.fit(X,y)   
    
    best_accu = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
    best_roc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])]
    best_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])]
    
    X = df_adult_income.iloc[:,:len(df_adult_income.columns)-1]
    y = df_adult_income.iloc[:,len(df_adult_income.columns)-1:len(df_adult_income.columns)]
    y = np.ravel(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = 5000)
    
    predicteds_accu = [] 
    predicteds_roc = [] 
    predicteds_f1 = [] 
    trueys =[]
    
    model_accu = RandomForestClassifier(n_estimators = best_accu['n_estimators'], max_features = best_accu['max_features'], min_samples_split = best_accu['min_samples_split'], min_samples_leaf = best_accu['min_samples_leaf'])
    model_roc = RandomForestClassifier(n_estimators = best_roc['n_estimators'], max_features = best_roc['max_features'], min_samples_split = best_roc['min_samples_split'], min_samples_leaf = best_roc['min_samples_leaf'])
    model_f1 = RandomForestClassifier(n_estimators = best_f1['n_estimators'], max_features = best_f1['max_features'], min_samples_split = best_f1['min_samples_split'], min_samples_leaf = best_f1['min_samples_leaf'])
    model_accu.fit(X_train,y_train)
    model_roc.fit(X_train,y_train)
    model_f1.fit(X_train,y_train)
    
    print(model_accu.score(X_train,y_train))
    
    predicteds_accu.append(model_accu.predict(X_test))
    predicteds_roc.append(model_roc.predict(X_test))
    predicteds_f1.append(model_f1.predict(X_test))
    trueys.append(y_test)

    predicteds_accu = np.concatenate(predicteds_accu)
    predicteds_roc = np.concatenate(predicteds_roc)
    predicteds_f1 = np.concatenate(predicteds_f1)
    trueys = np.concatenate(trueys)
    
    accu_score.append(accuracy_score(trueys,predicteds_accu))
    roc_score.append(accuracy_score(trueys,predicteds_roc))
    f1_score.append(accuracy_score(trueys,predicteds_f1))

0.9002
0.9028
0.9032
0.8932
0.8966


In [29]:
print(accu_score)
print(roc_score)
print(f1_score)

[0.8562824280686477, 0.8594753455970393, 0.8592213635209173, 0.8570080911432821, 0.8533797757701099]
[0.8570443742970139, 0.8588948151373318, 0.8598381771343565, 0.8567541090671601, 0.851674467544719]
[0.8563549943761112, 0.8589673814447952, 0.8594027792895759, 0.8565364101447698, 0.8534160589238416]
