# Data pre-processing/data clean-up

In [3]:
import pandas as pd
from fancyimpute import KNN, MICE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_selection import chi2
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import precision_score, make_scorer, f1_score, classification_report

# Load the raw bouts data.
df = pd.read_csv("bouts_out_new.csv")

# Remove draws so only win_A / win_B outcomes remain.
df = df[df.result != 'draw']
print("drawless shape")
print(df.shape)

# Random under-sampling - reduce to 82,000 records (roughly).
# Count each class by name: unpacking value_counts() positionally silently
# swaps the two counts if win_B ever becomes the more frequent class.
winAcount = int((df['result'] == "win_A").sum())
winBcount = int((df['result'] == "win_B").sum())
df_winA = df[df['result'] == "win_A"]
df_winB = df[df['result'] == "win_B"]
# Seeded so the same rows are kept on every run (the split below is seeded too).
df_winA_reduced = df_winA.sample(winBcount, random_state=0)
df_winB_reduced = df_winB
df = pd.concat([df_winA_reduced, df_winB_reduced], axis=0)

# Encode the label as integers.
le = preprocessing.LabelEncoder().fit(df['result'])
encoded = le.transform(df['result'])
df['result'] = encoded
# Reset the index so target lines up with clean_df, which receives a fresh
# 0..n-1 index when it is rebuilt after imputation below.
target = df['result'].reset_index(drop=True)
clean_df = df.drop(['result'], axis=1)  # trial
print("Clean df shape " + str(clean_df.shape))

# Models can only handle numeric features so the non-numeric features are
# converted to dummy columns.
clean_df = pd.get_dummies(clean_df)

# Impute with MICE. The imputer returns a bare array, so the column names are
# passed back in explicitly; otherwise every feature is renamed 0..n-1 and
# colnames_selected below ends up holding integers instead of feature names.
# NOTE(review): imputation and scaling are both fitted on the full dataset
# before the train/test split, so the test rows influence them - consider
# fitting on X_train only.
feature_names = clean_df.columns
clean_df = pd.DataFrame(MICE().complete(clean_df), columns=feature_names)

# SCALING - squash every feature into [0, 1] (chi2 below needs non-negative input).
scaler = preprocessing.MinMaxScaler()
scaled_df = scaler.fit_transform(clean_df)
clean_df = pd.DataFrame(scaled_df, columns=feature_names)

# Split the dataset 90/10%, shuffling the rows (see the book).
X_train, X_test, y_train, y_test = train_test_split(
    clean_df,
    target, test_size=0.1, random_state=0)

# Select the 20 best features to reduce dimensionality.
import sklearn.feature_selection
selection = sklearn.feature_selection.SelectKBest(chi2, k=20)
selected_features = selection.fit(X_train, y_train)
indices_selected = selected_features.get_support(indices=True)
colnames_selected = list(clean_df.columns[indices_selected])








  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


drawless shape
(362655, 26)
Clean df shape (81988, 25)
[MICE] Completing matrix with shape (81988, 36)
[MICE] Starting imputation round 1/110, elapsed time 0.031
[MICE] Starting imputation round 2/110, elapsed time 0.919
[MICE] Starting imputation round 3/110, elapsed time 1.622
[MICE] Starting imputation round 4/110, elapsed time 2.323
[MICE] Starting imputation round 5/110, elapsed time 3.031
[MICE] Starting imputation round 6/110, elapsed time 3.734
[MICE] Starting imputation round 7/110, elapsed time 4.441
[MICE] Starting imputation round 8/110, elapsed time 5.141
[MICE] Starting imputation round 9/110, elapsed time 5.849
[MICE] Starting imputation round 10/110, elapsed time 6.550
[MICE] Starting imputation round 11/110, elapsed time 7.222
[MICE] Starting imputation round 12/110, elapsed time 7.972
[MICE] Starting imputation round 13/110, elapsed time 8.713
[MICE] Starting imputation round 14/110, elapsed time 9.493
[MICE] Starting imputation round 15/110, elapsed time 10.257
[MICE

# Test dataset

In [95]:
# Do not need balanced data, purely for testing code.
# Draw ONE set of row positions and apply it to both objects: sampling
# clean_df and target independently pairs each feature row with the label of
# some other, unrelated row.
sample_positions = np.random.choice(len(clean_df), size=2500, replace=False)
test_df = clean_df.iloc[sample_positions]
test_target = target.iloc[sample_positions]

x_train_test, x_test_test, y_train_test, y_test_test = train_test_split(
    test_df,
    test_target, random_state=0)


0.7255208079231107


# Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


def logreg_gridsearch(X, x, Y, y, param_grid):
    """Grid-search a LogisticRegression on (X, Y) and report on (x, y).

    Runs GridSearchCV over ``param_grid`` with 10 unshuffled stratified folds,
    prints the classification report for the predictions on ``x``, then prints
    the best estimator's score on (X, Y) followed by its score on (x, y).

    Returns the best parameter dict found by the search.
    """
    folds = StratifiedKFold(n_splits=10, shuffle=False)
    search = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid=param_grid,
        cv=folds,
        n_jobs=-1,
        verbose=3,
    )
    search.fit(X, Y)

    print(classification_report(y, search.predict(x)))

    best_model = search.best_estimator_
    # Fitting-set score first, then held-out score.
    for features, labels in ((X, Y), (x, y)):
        print(best_model.score(features, labels))
    return search.best_params_


# Do a gridsearch on powers of ten from 0.001 to 1000 for both sets of features
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'C': Cs}

# Full range of features
print("Full range of features best parameters and results: ")
full_range_initial = logreg_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(full_range_initial)

# K20 range of features
print("K20 range of features best parameters: ")
# logreg_gridsearch takes five arguments; the K20 test features and the test
# labels must be passed as well or the call raises TypeError.
k20_range_initial = logreg_gridsearch(X_train[colnames_selected], X_test[colnames_selected],
                                      y_train, y_test, param_grid)
print(k20_range_initial)

# Refined the param grid for full range
Cs = list(range(50, 151))
param_grid = {'C': Cs}
full_range_refine = logreg_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print("Refined parameters results: ")
print(full_range_refine)

# Refined the param grid for K20 range
Cs = list(range(750, 1250))
param_grid = {'C': Cs}
k20_range_refine = logreg_gridsearch(X_train[colnames_selected], X_test[colnames_selected],
                                     y_train, y_test, param_grid)
print("Refined parameters results - k20: ")
print(k20_range_refine)

Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Iterate through C levels - full range of features:
print("Full range of features - growth of C")
for c in Cs:
    logreg_clf = LogisticRegression(C=c)
    logreg_clf.fit(X_train, y_train)
    print("When the C is " + str(c) + " the score for the training set is " + str(logreg_clf.score(X_train, y_train)))
    print("When the C is " + str(c) + " the score for the test set is " + str(logreg_clf.score(X_test, y_test)))

# Same sweep restricted to the 20 selected features.
print("K20 range of features - growth of C")
for c in Cs:
    logreg_clf = LogisticRegression(C=c)
    logreg_clf.fit(X_train[colnames_selected], y_train)
    print("When the C is " + str(c) + " the score for the training set is " + str(logreg_clf.score(X_train[colnames_selected], y_train)))
    print("When the C is " + str(c) + " the score for the test set is " + str(logreg_clf.score(X_test[colnames_selected], y_test)))



Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    2.5s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.9s finished


             precision    recall  f1-score   support

          0       0.64      0.71      0.67      4038
          1       0.68      0.62      0.65      4161

avg / total       0.66      0.66      0.66      8199

0.6623978533967557
Refined parameters results - k20: 
{'C': 773}


# KNN

In [55]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbour counts 1..30 for the KNN grid search.
Ks = list(range(1, 31))
param_grid = dict(n_neighbors=Ks)


def knn_gridsearch(X, x, Y, y, param_grid):
    """Grid-search a KNeighborsClassifier on (X, Y) and report on (x, y).

    Uses GridSearchCV with 10 unshuffled stratified folds over ``param_grid``.
    Prints the classification report for the predictions on ``x`` and the best
    estimator's score on both (X, Y) and (x, y).

    Returns the best parameter dict found by the search.
    """
    splitter = StratifiedKFold(n_splits=10, shuffle=False)
    search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid,
                          cv=splitter, n_jobs=-1, verbose=3)
    search.fit(X, Y)

    held_out_prediction = search.predict(x)
    print(classification_report(y, held_out_prediction))

    winner = search.best_estimator_
    for label, features, labels in (("Train set: ", X, Y), ("Test set: ", x, y)):
        print(label)
        print(winner.score(features, labels))
    return search.best_params_

# Grid search over K = 1-30 on the full feature set
print("Full range results:")
knn_initial = knn_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(knn_initial)

# Grid search over K = 1-30 on the 20 selected features
print("K20 range results:")
knn_initial_k20 = knn_gridsearch(
    X_train[colnames_selected], X_test[colnames_selected], y_train, y_test, param_grid
)
print(knn_initial_k20)

Ks = [1, 5, 10, 15, 20, 25, 30]

# Show how train/test score moves as K grows, first on every feature and
# then on the 20 selected features.
for heading, train_features, test_features in (
    ("Full range of features - growth of K", X_train, X_test),
    ("K20 range of features - growth of K",
     X_train[colnames_selected], X_test[colnames_selected]),
):
    print(heading)
    for k in Ks:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(train_features, y_train)
        train_score = knn.score(train_features, y_train)
        print("When K is " + str(k) + " the train score is " + str(train_score))
        test_score = knn.score(test_features, y_test)
        print("When K is " + str(k) + " the test score is " + str(test_score))
    









K20 range of features - growth of K
When K is 1 the train score is 0.9651574082857879
When K is 1 the test score is 0.6431272106354433
When K is 5 the train score is 0.7675669815284121
When K is 5 the test score is 0.6515428710818393
When K is 10 the train score is 0.7299190936318422
When K is 10 the test score is 0.654835955604342
When K is 15 the train score is 0.720147989537736
When K is 15 the test score is 0.6625198194901817
When K is 20 the train score is 0.7123555001422976
When K is 20 the test score is 0.6650811074521283
When K is 25 the train score is 0.7085202401441949
When K is 25 the test score is 0.6702036833760214
When K is 30 the train score is 0.702977408556831
When K is 30 the test score is 0.6722771069642639


# RDF

In [101]:
from sklearn.ensemble import RandomForestClassifier

def rdf_gridsearch(X, x, Y, y, param_grid):
    """Grid-search a RandomForestClassifier on (X, Y) and report on (x, y).

    Uses GridSearchCV with 5 unshuffled stratified folds over ``param_grid``.
    Prints the classification report for the predictions on ``x`` and the best
    estimator's score on both (X, Y) and (x, y).

    Returns the best parameter dict found by the search.
    """
    cv_folds = StratifiedKFold(n_splits=5, shuffle=False)
    forest_search = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=param_grid,
        cv=cv_folds,
        n_jobs=-1,
        verbose=51,
    )
    forest_search.fit(X, Y)

    print(classification_report(y, forest_search.predict(x)))

    best_forest = forest_search.best_estimator_
    print("Train set: ")
    train_score = best_forest.score(X, Y)
    print(train_score)
    print("Test set: ")
    test_score = best_forest.score(x, y)
    print(test_score)
    return forest_search.best_params_

# Coarse grid for the initial random forest search.
n_estimators = [1, 20, 40, 60, 80, 100, 120, 128]
max_depth = [1, 10, 20, 30, 40, 50]
min_samples_leaf = [10, 30, 50, 70, 90, 110, 130, 150, 170, 190, 200]

param_grid = {'n_estimators': n_estimators,
              'max_depth': max_depth,
              'min_samples_leaf': min_samples_leaf}

# Initial test - full range
rdf_full_initial = rdf_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(rdf_full_initial)

# K20 range
rdf_k20_initial = rdf_gridsearch(X_train[colnames_selected], X_test[colnames_selected],
                                 y_train, y_test, param_grid)
print(rdf_k20_initial)

# Refined test - full range: fix the forest size and depth, sweep the leaf size.
n_estimators = [128]
max_depth = [57]  # edit
min_samples_leaf = list(range(1, 100))
param_grid = {'n_estimators': n_estimators,
              'max_depth': max_depth,
              'min_samples_leaf': min_samples_leaf}

rdf_full_refined = rdf_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(rdf_full_refined)

# Refined K20 test
n_estimators = [120]
min_samples_leaf = list(range(5, 15))
max_depth = list(range(30, 50))
param_grid = {'n_estimators': n_estimators,
              'max_depth': max_depth,
              'min_samples_leaf': min_samples_leaf}

rdf_k20_refined = rdf_gridsearch(X_train[colnames_selected], X_test[colnames_selected],
                                 y_train, y_test, param_grid)
print(rdf_k20_refined)
min_samples_leaf = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]


print("Full range of features, change in min samples")
for mini in min_samples_leaf:
    rdf_full = RandomForestClassifier(min_samples_leaf=mini)
    rdf_full.fit(X_train, y_train)
    print("When min is " + str(mini) + "Training set score: ")
    print(rdf_full.score(X_train, y_train))
    print("When min is " + str(mini) + "Ttest set score: ")
    print(rdf_full.score(X_test, y_test))

# # print("K20 range of features, change in min samples")
# The loop variable is deliberately not called `min`: binding that name at
# module level hides the builtin min() for every cell run afterwards.
for mini in min_samples_leaf:
    rdf_k20 = RandomForestClassifier(min_samples_leaf=mini)
    rdf_k20.fit(X_train[colnames_selected], y_train)
    print("When min is " + str(mini) + "Training set score: ")
    print(rdf_k20.score(X_train[colnames_selected], y_train))
    print("When min is " + str(mini) + "Ttest set score: ")
    print(rdf_k20.score(X_test[colnames_selected], y_test))
    


When min is 5Training set score: 
0.8033582241255471
When min is 5Ttest set score: 
0.689840224417612
When min is 10Training set score: 
0.7575654907913103
When min is 10Ttest set score: 
0.6922795462861324
When min is 15Training set score: 
0.7362208459255445
When min is 15Ttest set score: 
0.6904500548847421
When min is 20Training set score: 
0.7289433384379785
When min is 20Ttest set score: 
0.6884985973899256
When min is 25Training set score: 
0.7200937809158546
When min is 25Ttest set score: 
0.6897182583241859
When min is 30Training set score: 
0.7151879006355961
When min is 30Ttest set score: 
0.6871569703622393
When min is 35Training set score: 
0.7114746100367263
When min is 35Ttest set score: 
0.6909379192584462
When min is 40Training set score: 
0.7078019759042675
When min is 40Ttest set score: 
0.685693377241127
When min is 45Training set score: 
0.7062976866470612
When min is 45Ttest set score: 
0.6887425295767776
When min is 50Training set score: 
0.7040751331499275
When 

# Naive Bayes

In [129]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB



def bn_gridsearch(X, x, Y, y, param_grid):
    """Grid-search a BernoulliNB on (X, Y) and report on (x, y).

    Uses GridSearchCV with 10 unshuffled stratified folds over ``param_grid``.
    Prints the classification report for the predictions on ``x`` and the best
    estimator's score on both (X, Y) and (x, y).

    Returns the best parameter dict found by the search.
    """
    stratified_folds = StratifiedKFold(n_splits=10, shuffle=False)
    nb_search = GridSearchCV(estimator=BernoulliNB(), param_grid=param_grid,
                             cv=stratified_folds, n_jobs=-1, verbose=51)
    nb_search.fit(X, Y)

    predicted = nb_search.predict(x)
    print(classification_report(y, predicted))

    best_nb = nb_search.best_estimator_
    print("Train set: ")
    print(best_nb.score(X, Y))
    print("Test set: ")
    print(best_nb.score(x, y))
    return nb_search.best_params_

# Coarse smoothing grid: powers of ten from 0.001 to 100.
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = dict(alpha=alphas)

print("Initial search - full range")
bn_initial = bn_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(bn_initial)

# Finer grid: np.linspace's default number of evenly spaced values in [0.0001, 0.1].
alphas = list(np.linspace(0.0001, 0.1))
param_grid = dict(alpha=alphas)

print("Refined search - full range")
bn_initial = bn_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(bn_initial)

# Back to the coarse grid for the 20 selected features.
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = dict(alpha=alphas)

print("Initial search - K20 range")
bn_initial = bn_gridsearch(
    X_train[colnames_selected], X_test[colnames_selected], y_train, y_test, param_grid
)
print(bn_initial)

# Train/test score for each alpha on the 20 selected features.
for a in alphas:
    bn = BernoulliNB(alpha=a)
    bn.fit(X_train[colnames_selected], y_train)
    train_score = bn.score(X_train[colnames_selected], y_train)
    print("When the alpha is " + str(a) + " the train score is " + str(train_score))
    test_score = bn.score(X_test[colnames_selected], y_test)
    print("When the alpha is " + str(a) + " the test score is " + str(test_score))

    





When the alpha is 0.001 the train score is 0.6454891650517015
When the alpha is 0.001 the test score is 0.6408098548603488
When the alpha is 0.01 the train score is 0.6454891650517015
When the alpha is 0.01 the test score is 0.6408098548603488
When the alpha is 0.1 the train score is 0.6454891650517015
When the alpha is 0.1 the test score is 0.6408098548603488
When the alpha is 1 the train score is 0.6454891650517015
When the alpha is 1 the test score is 0.6408098548603488
When the alpha is 10 the train score is 0.6453671956524685
When the alpha is 10 the test score is 0.6399560922063666
When the alpha is 100 the train score is 0.6447844529672444
When the alpha is 100 the test score is 0.6397121600195146


# MLP 

In [88]:
from sklearn.neural_network import MLPClassifier
import itertools 

# Do gridsearch on parameters - both sets of features 
# Print most important parameter change variation - both sets of features 

def mlp_gridsearch(X, x, Y, y, param_grid):
    """Grid-search an MLPClassifier on (X, Y) and report on (x, y).

    Uses GridSearchCV with 10 unshuffled stratified folds over ``param_grid``.
    Prints the classification report for the predictions on ``x`` and the best
    estimator's score on both (X, Y) and (x, y).

    Returns the best parameter dict found by the search.
    """
    fold_plan = StratifiedKFold(n_splits=10, shuffle=False)
    net_search = GridSearchCV(
        estimator=MLPClassifier(),
        param_grid=param_grid,
        cv=fold_plan,
        n_jobs=-1,
        verbose=51,
    )
    net_search.fit(X, Y)

    print(classification_report(y, net_search.predict(x)))

    best_net = net_search.best_estimator_
    for caption, features, labels in (("Train set: ", X, Y), ("Test set: ", x, y)):
        print(caption)
        print(best_net.score(features, labels))
    return net_search.best_params_


# Initial grid: every one-layer width plus every two-layer combination of widths.
layer_widths = (10, 50, 53, 73, 75, 125, 200)
hidden_layer_sizes = (list(itertools.product(layer_widths, repeat=1))
                      + list(itertools.product(layer_widths, repeat=2)))

alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
activation = ['relu', 'logistic', 'tanh']
solver = ['adam']

param_grid = {'hidden_layer_sizes': hidden_layer_sizes,
              'alpha': alpha,
              'activation': activation,
              'solver': solver}

# MLP initial search
print("MLP full range initial search")
full_mlp_initial = mlp_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(full_mlp_initial)

# K20 initial search
print("MLP K20 range initial search")
k20_mlp_initial = mlp_gridsearch(X_train[colnames_selected], X_test[colnames_selected],
                                 y_train, y_test, param_grid)
print(k20_mlp_initial)

# Refined search for full range: single hidden layer of 190..210 units.
hidden_layer_sizes = [(width,) for width in range(190, 211)]
alpha = [0.00001]
activation = ['relu', 'logistic', 'tanh']
solver = ['adam']

param_grid = {'hidden_layer_sizes': hidden_layer_sizes,
              'alpha': alpha,
              'activation': activation,
              'solver': solver}

print("MLP full range refined search")
full_mlp_refined = mlp_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(full_mlp_refined)

# Refined search for K20 range: first layer 5..15 units, second layer 190..210.
list1 = list(range(190, 211))
print(list1)

list2 = list(range(5, 16))
print(list2)

hidden_layer_sizes = list(itertools.product(list2, list1))

alpha = [0.00001]
activation = ['relu', 'logistic', 'tanh']
solver = ['adam']

param_grid = {'hidden_layer_sizes': hidden_layer_sizes,
              'alpha': alpha,
              'activation': activation,
              'solver': solver}

print("MLP K20 range refined search")
k20_mlp_refined = mlp_gridsearch(X_train[colnames_selected], X_test[colnames_selected],
                                 y_train, y_test, param_grid)
print(k20_mlp_refined)

# Train/test score as the layer width grows, for 1-layer then 2-layer nets,
# each on the full feature set and then on the 20 selected features. The
# classifiers here use MLPClassifier defaults apart from hidden_layer_sizes.
growth_widths = (10, 25, 50, 75, 100, 125, 150, 175, 200)
for depth_label, hidden_layer_sizes in (
    ("1 layer", [(width,) for width in growth_widths]),
    ("2 layer", [(width, width) for width in growth_widths]),
):
    for range_label, train_features, test_features in (
        ("Full range", X_train, X_test),
        ("K20 range", X_train[colnames_selected], X_test[colnames_selected]),
    ):
        print(range_label + " of features - " + depth_label + ": ")
        for layer in hidden_layer_sizes:
            mlp_full = MLPClassifier(hidden_layer_sizes=layer)
            mlp_full.fit(train_features, y_train)
            print("For layer = " + str(layer) + "train = " + str(mlp_full.score(train_features, y_train)))
            print("For layer = " + str(layer) + "test = " + str(mlp_full.score(test_features, y_test)))

# Full grid again, run against the small smoke-test subset.
layer_widths = (10, 25, 50, 53, 73, 75, 100, 125, 200)
hidden_layer_sizes = (list(itertools.product(layer_widths, repeat=1))
                      + list(itertools.product(layer_widths, repeat=2)))

alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
activation = ['relu', 'logistic', 'tanh']
solver = ['adam']

# Rebuild the grid from the lists just defined; without this the search below
# silently reuses the K20 refined grid left over from earlier in the cell.
param_grid = {'hidden_layer_sizes': hidden_layer_sizes,
              'alpha': alpha,
              'activation': activation,
              'solver': solver}

full_mlp_refined = mlp_gridsearch(x_train_test, x_test_test, y_train_test, y_test_test, param_grid)
print(full_mlp_refined)

# 10-fold cross-validated score of a single 100-unit hidden layer on all data.
mlp = MLPClassifier(hidden_layer_sizes=(100,))

scores = cross_val_score(mlp, clean_df, target, cv=10, n_jobs=-1)
print(scores.mean())

Full range of features - 2 layer: 
For layer = (10, 10)train = 0.7634200219544919
For layer = (10, 10)test = 0.760946456884986
For layer = (25, 25)train = 0.7703180690888886
For layer = (25, 25)test = 0.7632638126600805
For layer = (50, 50)train = 0.7705349035764139
For layer = (50, 50)test = 0.7600926942310038
For layer = (75, 75)train = 0.7711311984171082
For layer = (75, 75)test = 0.761190389071838
For layer = (100, 100)train = 0.7659407228719728
For layer = (100, 100)test = 0.7637516770337847
For layer = (125, 125)train = 0.7720662971445609
For layer = (125, 125)test = 0.7613123551652641
For layer = (150, 150)train = 0.7775413679545732
For layer = (150, 150)test = 0.7641175753140627
For layer = (175, 175)train = 0.7612516770792395
For layer = (175, 175)test = 0.7581412367361874
For layer = (200, 200)train = 0.7784087059046741
For layer = (200, 200)test = 0.7685083546773996
K20 range of features - 2 layer: 
For layer = (10, 10)train = 0.6927048747103227
For layer = (10, 10)test = 0.



For layer = (200, 200)train = 0.7523208066242936
For layer = (200, 200)test = 0.6850835467739969


# SVCS

In [None]:
from sklearn.svm import SVC, LinearSVC # remove l9inear

# Do gridsearch on parameters - both sets of features 
# Retrieve best parameters, refine parameters from selection
# Print most important parameter change variation - both sets of features 

def svc_gridsearch(X, x, Y, y, param_grid):
    """Grid-search an SVC on the training data and report on the test data.

    Args:
        X: training features.
        x: held-out (test) features.
        Y: training labels.
        y: held-out (test) labels.
        param_grid: parameter grid handed to GridSearchCV.

    Prints the classification report for the predictions on ``x`` and the best
    estimator's score on (X, Y) and on (x, y).

    Returns:
        The best parameter dict found by the search.
    """
    k = StratifiedKFold(n_splits=10, shuffle=False)
    svc = SVC()
    svc_grid = GridSearchCV(estimator=svc, param_grid=param_grid, cv=k, n_jobs=-1,
                            verbose=51)
    # Fit on the TRAINING data. Fitting on (x, y) tunes and trains the model on
    # the test set, which makes the "Test set" score below meaningless.
    svc_grid.fit(X, Y)
    prediction = svc_grid.predict(x)
    print(classification_report(y, prediction))
    print("Train set: ")
    print(svc_grid.best_estimator_.score(X, Y))
    print("Test set: ")
    print(svc_grid.best_estimator_.score(x, y))
    return svc_grid.best_params_

# Initial SVC grid: both kernels over powers of ten for C and gamma.
kernel = ['linear', 'rbf']
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
gammas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'C': Cs, 'gamma': gammas, 'kernel': kernel}

# Full range of features initial test
# (the split variables are X_test / y_train, and the grid must be passed in)
print("Full range of features:")
full_initial_svc = svc_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(full_initial_svc)

# K20 range of features initial test
print("K20 range of features:")
k20_initial_svc = svc_gridsearch(X_train[colnames_selected], X_test[colnames_selected],
                                 y_train, y_test, param_grid)
print(k20_initial_svc)

def linearsvc_gridsearch(X, x, Y, y, param_grid):
    """Grid-search a LinearSVC on the training data and report on the test data.

    Args:
        X: training features.
        x: held-out (test) features.
        Y: training labels.
        y: held-out (test) labels.
        param_grid: parameter grid handed to GridSearchCV.

    Prints the classification report for the predictions on ``x`` and the best
    estimator's score on (X, Y) and on (x, y).

    Returns:
        The best parameter dict found by the search.
    """
    k = StratifiedKFold(n_splits=10, shuffle=False)
    svc = LinearSVC()
    svc_grid = GridSearchCV(estimator=svc, param_grid=param_grid, cv=k, n_jobs=-1,
                            verbose=51)
    # Fit on the TRAINING data. Fitting on (x, y) tunes and trains the model on
    # the test set, which makes the "Test set" score below meaningless.
    svc_grid.fit(X, Y)
    prediction = svc_grid.predict(x)
    print(classification_report(y, prediction))
    print("Train set: ")
    print(svc_grid.best_estimator_.score(X, Y))
    print("Test set: ")
    print(svc_grid.best_estimator_.score(x, y))
    return svc_grid.best_params_

# Refined linear search: every integer C from 30 to 124.
Cs = list(range(30, 125))
param_grid = {'C': Cs}
# Full range of features refined linear test
print("Full range of features:")
full_refined_svc = linearsvc_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(full_refined_svc)

# K20 range of features refined linear test
print("K20 range of features:")
k20_refined_svc = linearsvc_gridsearch(X_train[colnames_selected], X_test[colnames_selected],
                                       y_train, y_test, param_grid)
print(k20_refined_svc)

gammas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

print("Analysing gamma change with the RBF kernel - full features")
for g in gammas:
    svc = SVC(kernel='rbf', gamma=g)
    svc.fit(X_train, y_train)
    print("When gamma = " + str(g) + "the train score = " + str(svc.score(X_train, y_train)))
    print("When gamma = " + str(g) + "the test score = " + str(svc.score(X_test, y_test)))

print("Analysing gamma change with the RBF kernel - K20 features")
for g in gammas:
    svc = SVC(kernel='rbf', gamma=g)
    svc.fit(X_train[colnames_selected], y_train)
    print("When gamma = " + str(g) + "the train score = " + str(svc.score(X_train[colnames_selected], y_train)))
    print("When gamma = " + str(g) + "the test score = " + str(svc.score(X_test[colnames_selected], y_test)))

# Stand-alone re-run of the largest gamma on the K20 features. Bind g
# explicitly so the printed value is always the gamma the model was built
# with, rather than whatever the loop above happened to leave behind.
g = 1000
svc = SVC(kernel='rbf', gamma=g)
svc.fit(X_train[colnames_selected], y_train)
print("When gamma = " + str(g) + "the train score = " + str(svc.score(X_train[colnames_selected], y_train)))
print("When gamma = " + str(g) + "the test score = " + str(svc.score(X_test[colnames_selected], y_test)))
        
                                         
                                

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Neighbour counts 1..30, matching the range kgrowth reports on.
Ks = list(range(1, 31))


def kgrowth(xtrain, ytrain, xtest, ytest):
    """Print the KNN score on (xtest, ytest) for every K from 1 to 30.

    A fresh KNeighborsClassifier is fitted on (xtrain, ytrain) for each K.
    """
    for k in range(1, 31):
        # Pass k through: without n_neighbors=k every iteration fits the same
        # default classifier and prints the same score under a different label.
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(xtrain, ytrain)
        print("Number of K = " + str(k) + " the score = " + str(knn.score(xtest, ytest)))


print("Observing the accuracy of K from 1-30 - full range of features")
kgrowth(X_train, y_train, X_test, y_test)
print("Observing the accuracy of K from 1-30 - K20 range of features")
kgrowth(X_train[colnames_selected], y_train, X_test[colnames_selected], y_test)
    


# param_grid = {'n_neighbors':Ks}
# prec_metric = make_scorer(precision_score)
# k = StratifiedKFold(n_splits=10, shuffle=False)
# knn = KNeighborsClassifier()

# knn_grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring=prec_metric,
#                        cv=k, n_jobs=-1, verbose=51)

# knn_grid.fit(X_train, y_train)

# print("Full: Best K found for K-NN" + str(knn_grid.best_params_))

# pd.DataFrame(knn_grid.cv_results_).to_csv('knn_results_full.csv')

# param_grid = {'n_neighbors':Ks}
# prec_metric2 = make_scorer(precision_score)
# k2 = StratifiedKFold(n_splits=10, shuffle=False)
# knn2 = KNeighborsClassifier()

# knn_grid2 = GridSearchCV(estimator=knn2, param_grid=param_grid, scoring=prec_metric2,
#                        cv=k2, n_jobs=-1, verbose=51)

# knn_grid2.fit(X_train[colnames_selected], y_train)

# print("K20: Best K found for K-NN" + str(knn_grid2.best_params_))

# pd.DataFrame(knn_grid2.cv_results_).to_csv('knn_results_k20.csv')

# print("Optimised parameter best score, full range: " + str(knn_grid.best_estimator_.score(X_test, y_test)))
# print("Optimised parameter best score, K20 range: " + str(knn_grid2.best_estimator_.score(X_test[colnames_selected], y_test)))

# pd.DataFrame(knn_grid.cv_results_).to_csv('knn_results_full.csv') # didnt do this for k20

# # Full range of features - refinement
# logreg_grid = GridSearchCV(estimator = logreg, param_grid=param_grid, scoring = prec_metric,
#                            cv=k, n_jobs=-1, verbose=3)

# logreg_grid.fit(X_train, y_train)
#print("Best parameters found for Logistic Regression in grid search - phase 2" + str(logreg_grid.best_params_))



# for i in range(1, 11):
# knn = knn = KNeighborsClassifier(n_neighbors=9)
# #     knn.fit(X_train_selected, y_train)
# #     print("Normal:" + str(knn.score(X_test_selected, y_test)))

# scores = cross_val_score(knn, clean_df, target, cv=10)
# print(scores.mean())
    
# knn = knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(X_train_selected, y_train)
# print("Actual reduced:" + str(knn.score(X_test_selected, y_test)))

#scores = cross_val_score(knn, X_test_selected, y_test, cv=10)
#print(scores.mean())

# knn = knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(clean_df2, target)
# print("Assumed reduced:" + str(knn.score(clean_df2, target)))


# for i in range(1, 11):
#     knn = KNeighborsClassifier(n_neighbors=i)
#     scores = cross_val_score(knn, X_test, y_test, cv=10)
#     print("Number of neighbors: " + str(i) + "\nDataset with 36 features scores: {}".format(scores))
#     print("Mean of the RDS scores: {:.2f}".format(scores.mean()))
#     # SelectKBest Results applied
#     scores = cross_val_score(knn, X_test, target, cv=10)
#     print("Number of neighbors: " + str(i) + "\nDataset with 20 features scores: {}".format(scores))
#     print("Mean of the K20 scores: {:.2f}".format(scores.mean()))


# Benefit of using cross-validation:
# -	Train test split performs a random split, we could get lucky with the data split. 
# -	With k-fold cross-validation each example will be in the test set exactly once (and in the training set for the other k-1 folds).
# -	We get a best case and a worst case scenario with the multiple folds as opposed to the one accuracy. 
# Another benefit of cross-validation as compared to using a single split of the data is
# that we use our data more effectively. When using train_test_split, we usually use
# 75% of the data for training and 25% of the data for evaluation. When using five-fold
# cross-validation, in each iteration we can use four-fifths of the data (80%) to fit the
# model. When using 10-fold cross-validation, we can use nine-tenths of the data
# (90%) to fit the model. More data will usually result in more accurate models.

# As the simple k-fold strategy fails here, scikit-learn does not use it for classification,
# but rather uses stratified k-fold cross-validation. In stratified cross-validation, we
# split the data such that the proportions between classes are the same in each fold as
# they are in the whole dataset, as illustrated in Figure 5-2:
# For example, if 90% of your samples belong to class A and 10% of your samples
# belong to class B, then stratified cross-validation ensures that in each fold, 90% of
# samples belong to class A and 10% of samples belong to class B.

# Talk about benefits of cross validation etc 


# Logistic Regression

In [None]:




# logreg = LogisticRegression(random_state=0)
# Cs = [x for x in np.linspace(start=1, stop=100, num=10)] 
# param_grid = { 'C':Cs }
# prec_metric = make_scorer(precision_score)
# k = StratifiedKFold(n_splits=10, shuffle=False)
# k2 = StratifiedKFold(n_splits=10, shuffle=False)
# grid = RandomizedSearchCV(estimator = logreg, param_distributions=param_grid, 
#                     scoring=prec_metric,cv=k, 
#                     n_jobs=-1, verbose=3)
# grid.fit(X_train, y_train)
# print("Grid search 1 best params " + str(grid.best_params_))

# # Get score of X_train, y_train (TEST FOLD) from GridSearchCV metric
# print("Mean test score from GridSearchCV: " + (str(grid.cv_results_['mean_test_score'].mean())))

# # Feed result of GridSearchCV to cross_val_score for scoring on the test set, is it similar to above score?
# scores = cross_val_score(grid.best_estimator_, X_test, 
#                          y_test,cv=k)

# print("Scores on hold out set with cross_val_score: " + str(scores.mean()))


# logreg2 = LogisticRegression()

# prec_metric = make_scorer(precision_score)

# grid2 = RandomizedSearchCV(estimator = logreg2, param_distributions=param_grid, cv=k2, 
#                     n_jobs=-1, scoring=prec_metric, verbose=3)
# grid2.fit(clean_df, target)
# print("Grid search 1 best params " + str(grid2.best_params_))
# # Score of the whole dataset on the test fold with GridSearchCV 
# print("Mean test score from GridSearchCV full dataset: " + (str(grid2.cv_results_['mean_test_score'].mean())))




# scores2 = cross_val_score(grid.best_estimator_, clean_df, 
#                          target,cv=10)

# NOTE: disabled because `scores2` is only assigned by the commented-out
# cross_val_score call directly above; executed on its own this line raises
# NameError. Re-enable it together with that call.
# print("Scores on hold out set with cross_val_score: " + str(scores2.mean()))









# scores2 = cross_val_score(grid.best_estimators_, X_test2, y_test)

# print("Dataset of 36: Mean of the scores: {:.2f}".format(scores.mean()))



# logreg = LogisticRegression()

# grid_search = GridSearchCV(estimator = logreg, param_grid=param_grid, cv=3, n_jobs=-1, verbose=3)
# grid_search.fit(clean_df, target)

# print(grid_search.cv_results_)


# Cs = [0.01, 0.1, 1, 10, 100]

# for c in Cs:
#     logreg = LogisticRegression(C=c)
#     logreg.fit(X_train, y_train)
#     scores = cross_val_score(logreg, X_test, y_test, cv=10)
#     print("Dataset with 36 features scores: {}".format(scores))
#     print("Mean of the scores: {:.2f}".format(scores.mean()))

#     scores = cross_val_score(logreg, X_test, y_test, cv=10)
#     print("Dataset with 20 features scores: {}".format(scores))
#     print("Mean of the scores: {:.2f}".format(scores.mean()))




# logreg = LogisticRegression()

# Cs = [x for x in np.linspace(start=0.01, stop=100)]
# param_grid = { 'C':Cs }

# grid = GridSearchCV(estimator = logreg, param_grid=param_grid, cv=3, n_jobs=-3, verbose=3)
# grid.fit(X_train2, y_train2)

# scores = cross_val_score(grid.best_estimator_, X_test2, y_test2, cv=10)
# print("Dataset of 20: Mean of the scores: {:.2f}".format(scores.mean()))











#

# logreg = LogisticRegression(C=75)
# logreg.fit(X_train, y_train)





# logreg = LogisticRegression()



# # print(grid_search.best_params_)

# logreg = LogisticRegression(C=84)

# print(logreg.score(X_test, y_test))

# logreg2 = LogisticRegression(C=10)
# scores = cross_val_score(logreg2, clean_df, target, cv=10)
# print(scores.mean())
# print(logreg.score(clean_df, target))







# Grid search CV 

# 

# param_grid = {
#     'C': Cs
# }

# logreg = LogisticRegression()
# grid_search = GridSearchCV(estimator = logreg, param_grid = param_grid, cv = 3, n_jobs=-1, verbose=3)
# grid_search.fit(X_train, y_train)
# print(grid_search.best_params_)

# Second manual grid search

# Cs = [0.01, 0.1, 1, 10, 100]

# for c in Cs:
#     logreg = LogisticRegression(C=c)
#     scores = cross_val_score(logreg, clean_df, target, cv=10)
#     print("Full features: When C = " + str(c) + ". Mean of the scores: {:.2f}".format(scores.mean()))
#     scores = cross_val_score(logreg, clean_df2, target, cv=10)
#     print("Reduced features: When C = " + str(c) + ". Mean of the scores: {:.2f}".format(scores.mean()))


# X_train, X_test, y_train, y_test = train_test_split(
#       clean_df, 
#       target, random_state=0)

# logreg = LogisticRegression().fit(X_train, y_train)
# print(logreg.score(X_train, y_train))
# print(logreg.score(X_test, y_test))


# High training set accuracy combined with low test set accuracy means overfitting.
# When training and test accuracy are similar but both low, the model is likely underfitting.

#Random Grid Search
# Cs = [x for x in np.linspace(start=0.001, stop=1000, num=10 )]

# random_grid = {'C': Cs}

# logreg = LogisticRegression()

# lr_random = RandomizedSearchCV(estimator = logreg, param_distributions = random_grid, n_iter = 10, cv = 3, n_jobs=-1, verbose=3, random_state=42)
# lr_random.fit(X_train, y_train)
# print(lr_random.best_params_)

# logreg = LogisticRegression(C=1).fit(X_train, y_train)
# print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
# print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))


# C = [0.01, 0.1, 1, 10, 100, 1000]

# for i in C:
#     logreg = LogisticRegression(C=i)
#     scores = cross_val_score(logreg, clean_df, target, cv=10)
#     print("Full features: When C = " + str(i) + ". Mean of the scores: {:.2f}".format(scores.mean()))
#     scores = cross_val_score(logreg, clean_df2, target, cv=10)
#     print("Reduced features: When C = " + str(i) + ". Mean of the scores: {:.2f}".format(scores.mean()))

# for i in range(1, 100):
#     logreg = LogisticRegression(C=i)
#     scores = cross_val_score(logreg, clean_df, target, cv=10)
#     print("Full features: When C = " + str(i) + ". Mean of the scores: {:.2f}".format(scores.mean()))
#     scores = cross_val_score(logreg, clean_df2, target, cv=10)
#     print("Reduced features: When C = " + str(i) + ". Mean of the scores: {:.2f}".format(scores.mean()))
    
# for i in range(1, 101):
#     logreg = LogisticRegression(C=i).fit(X_train, y_train)
#     print("When C is equal to " + str(i) + " training set result : " + str(logreg.score(X_train, y_train)))
#     print("When C is equal to " + str(i) + " test set result : " + str(logreg.score(X_test, y_test)))






# logreg = LogisticRegression(C=84)#.fit(X_train, y_train)
# # logreg.fit(X_test, y_test)

# scores = cross_val_score(logreg, X_test, y_test, cv=10)
# print("Full features: mean of the scores: {:.2f}".format(scores.mean()))

# scores = cross_val_score(logreg, clean_df, target, cv=10)
# print("Dataset with 36 features scores: {}".format(scores))
# print("Mean of the scores: {:.2f}".format(scores.mean()))

# scores = cross_val_score(logreg, clean_df2, target, cv=10)
# print("Dataset with 20 features scores: {}".format(scores))
# print("Mean of the scores: {:.2f}".format(scores.mean()))



# If we're overfitting 
# When the test and training set score are close it means I am likely underfitting
# print("Training set score " + str(logreg.score(X_train, y_train)))
#print("Test set score " + str(logreg.score(X_test, y_test)))
# You will need to explain alpha and regularization in this section, not the lit review

# C is changed, this relates to regularization I think, talk about this, this means
# less regularization
# logreg100 = LogisticRegression(C=100).fit(X_train, y_train)
# print("Training set score: {:.3f}".format(logreg100.score(X_train, y_train)))
# print("Test set score: {:.3f}".format(logreg100.score(X_test, y_test)))

# C is set to 0.01, this means even more regularization
# logreg001 = LogisticRegression(C=0.01).fit(X_train, y_train)
# print("Training set score: {:.3f}".format(logreg001.score(X_train, y_train)))
# print("Test set score: {:.3f}".format(logreg001.score(X_test, y_test)))



# Random Forest classifier

In [None]:
# As mentioned in the literature review, a Random Forest is composed of decision
# trees. Growing a decision tree until all of its leaves are pure leads to models
# that are very complex and highly overfit to the training data. The presence of pure
# leaves means that a tree is 100% accurate on the training set.
# To stop the trees overfitting we can either pre-prune or post-prune them.
# To pre-prune we can limit the maximum depth of the tree.
# The deeper a tree becomes, the more complex it becomes; limiting the depth
# prevents overfitting.
# Try a standard decision tree as well as this one.
# Note that scaling does improve the performance for this algorithm.

from sklearn.ensemble import RandomForestClassifier


# n estimators equal the number of trees
# Iterate through several tree counts to gauge which works best
# A heavy tuning of parameters is not really needed
# max depth is set to default

    #{'max_depth': 23, 'max_features': 'sqrt', 'min_samples_leaf': 9, 'n_estimators': 392}

#forest = RandomForestClassifier(n_estimators=500, n_jobs=-1, max_features = 'auto', min_samples_leaf=1)

# Hyper-parameter grid for the random forest. The numeric ranges are generated
# with np.linspace and truncated to int.
n_estimators = [int(x) for x in np.linspace(start=10, stop=500, num=10)]
# For RandomForestClassifier 'auto' means exactly the same as 'sqrt', so
# listing both made the search fit every candidate twice; 'auto' has also been
# removed from recent scikit-learn releases. Keep the one portable spelling.
max_features = ['sqrt']
max_depth = [int(x) for x in np.linspace(1, 50, num = 10)]
min_samples_leaf = [int(x) for x in np.linspace(start=1, stop=250, num=10)]

param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf' : min_samples_leaf
              }

forest = RandomForestClassifier()

# Exhaustive search over param_grid with 3-fold cross-validation on the
# training split only.
grid = GridSearchCV(estimator = forest, param_grid=param_grid, cv=3,
                    n_jobs=-3, verbose=3)
grid.fit(X_train, y_train)
print("Random forest best parameters: " + str(grid.best_params_))

# cross_val_score clones the estimator and re-fits it on folds of the test
# split, so this figure describes the chosen hyper-parameters trained on
# (parts of) X_test, not the model that was fitted on X_train.
scores = cross_val_score(grid.best_estimator_, X_test,
                         y_test, cv=10)

print("Dataset of 36: Mean of the scores: {:.2f}".format(scores.mean()))

# Score of the forest that was actually tuned and refit on X_train, evaluated
# once on the held-out split.
print("Hold-out score of the tuned forest: {:.2f}".format(
    grid.best_estimator_.score(X_test, y_test)))

# # forest = RandomForestClassifier()



# scores = cross_val_score(forest, X_test, y_test, cv=10)
# print("Full features: mean of the scores: {:.2f}".format(scores.mean()))

# scores = cross_val_score(forest, clean_df2, target, cv=10)
# print("Reduced features: Mean of the scores: {:.2f}".format(scores.mean()))

# omit samples/leafs and just reflect on it

# forest = RandomForestClassifier()

# scores = cross_val_score(forest, clean_df, target, cv=10)
# print("Full features: mean of the scores: {:.2f}".format(scores.mean()))

# scores = cross_val_score(forest, clean_df2, target, cv=10)
# print("Reduced features: Mean of the scores: {:.2f}".format(scores.mean()))

# forest.fit(X_train, y_train)
# print(forest.score(X_test, y_test))

#Random Grid Search
# n_estimators = [int(x) for x in np.linspace(start=10, stop=500, num=10)]
# max_features = ['auto', 'sqrt']
# max_depth = [int(x) for x in np.linspace(1, 50, num = 10)]
# min_samples_leaf = [int(x) for x in np.linspace(start=1, stop=250, num=10)]

# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_leaf' : min_samples_leaf
#               }

# forest = RandomForestClassifier()

# rf_random = RandomizedSearchCV(estimator = forest, param_distributions = random_grid, n_iter = 100, cv = 3, n_jobs=-1, verbose=3, random_state=42)
# rf_random.fit(X_train, y_train)
# print(rf_random.best_params_)

# Grid search CV 

# param_grid = {
#     'max_depth' : [None, 1, 2, 3, 4, 5],
#     'max_features': ['auto'],
#     'n_estimators' : [498, 499, 500, 501, 502],
#     'min_samples_leaf': [9, 10, 11]
# }

# forest = RandomForestClassifier()

# grid_search = GridSearchCV(estimator = forest, param_grid = param_grid, cv = 3, n_jobs=-1, verbose=3)
# grid_search.fit(X_train, y_train)

# print(grid_search.best_params_)



# grid_search.fit(X_train, y_train)
# print(grid_search.best_params_)

#print(str(forest.score(X_test, y_test)))

# Also use AUC from April chen's video
# Assess with cross validation
# Test on both the 20 features as well as all features

# Naive Bayes

# Neural Networks

In [8]:
#https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw

from sklearn.neural_network import MLPClassifier
import itertools




# Candidate architectures: every one-layer and every two-layer combination of
# the widths below, one-layer tuples first.
hidden_layer_sizes = [
    layers
    for n_layers in (1, 2)
    for layers in itertools.product(
        (10, 30, 50, 53, 70, 73, 90, 120, 140, 150), repeat=n_layers)
]
# Disabled extra candidate: + [((len(clean_df.columns))+1,)]

# Regularisation strengths to try.
alpha = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
# Wider activation search, currently disabled: ['relu', 'logistic']
activation = ['relu']
solver = ['adam']

# Search space handed to the grid search.
param_grid = dict(
    hidden_layer_sizes=hidden_layer_sizes,
    alpha=alpha,
    activation=activation,
    solver=solver,
)



def mlp_gridsearch(X, y, param_grid, *, n_splits=10, n_jobs=-1, verbose=51):
    """Grid-search MLPClassifier hyper-parameters and return the best setting.

    Every combination in ``param_grid`` is evaluated with unshuffled
    stratified k-fold cross-validation and ranked by precision.

    Parameters
    ----------
    X : feature matrix passed straight to ``GridSearchCV.fit``.
    y : target labels passed straight to ``GridSearchCV.fit``.
    param_grid : dict (or list of dicts) of MLPClassifier parameters to try.
    n_splits : number of stratified folds (default 10, the previously
        hard-coded value).
    n_jobs : parallel workers forwarded to GridSearchCV (default -1).
    verbose : verbosity forwarded to GridSearchCV (default 51).

    Returns
    -------
    dict
        ``best_params_`` of the fitted search. The fitted search object itself
        is discarded, so ``cv_results_`` / ``best_estimator_`` are not
        available to the caller.
    """
    search = GridSearchCV(
        estimator=MLPClassifier(),
        param_grid=param_grid,
        scoring=make_scorer(precision_score),
        cv=StratifiedKFold(n_splits=n_splits, shuffle=False),
        n_jobs=n_jobs,
        verbose=verbose,
    )
    search.fit(X, y)
    return search.best_params_
    
    
# Run 1: search using every column of the training split. The heading is
# printed before the (long-running) search starts.
print("Full range of features grid search, best parameters: ")
fullrange_testparams = mlp_gridsearch(
    X=X_train,
    y=y_train,
    param_grid=param_grid,
)
print(fullrange_testparams)

# Run 2: same search restricted to the columns listed in colnames_selected.
print("K20 range of features grid search, best parameters: ")
k20_testparams = mlp_gridsearch(
    X=X_train[colnames_selected],
    y=y_train,
    param_grid=param_grid,
)
print(k20_testparams)

# Test 
# Get best estimator scores and then .score when trained on training sets
# Do iterations 




#Random Grid Search
# n_estimators = [int(x) for x in np.linspace(start=10, stop=500, num=10)]
# max_features = ['auto', 'sqrt']
# max_depth = [int(x) for x in np.linspace(1, 50, num = 10)]
# min_samples_leaf = [int(x) for x in np.linspace(start=1, stop=250, num=10)]

# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_leaf' : min_samples_leaf
#               }

# forest = RandomForestClassifier()

# rf_random = RandomizedSearchCV(estimator = forest, param_distributions = random_grid, n_iter = 100, cv = 3, n_jobs=-1, verbose=3, random_state=42)
# rf_random.fit(X_train, y_train)
# print(rf_random.best_params_)

# 
# k2 = StratifiedKFold(n_splits=10, shuffle=False)
# grid = RandomizedSearchCV(estimator = logreg, param_distributions=param_grid, 
#                     scoring=prec_metric,cv=k, 
#                     n_jobs=-1, verbose=3)
# grid.fit(X_train, y_train)
# print("Grid search 1 best params " + str(grid.best_params_))

# # Get score of X_train, y_train (TEST FOLD) from GridSearchCV metric
# print("Mean test score from GridSearchCV: " + (str(grid.cv_results_['mean_test_score'].mean())))



# Parameters

    


# random_grid = {#'hidden_layer_sizes': hidden_layer_sizes,
#                 'alpha': alpha,
#                'activation' : activation,
#                'solver': solver}


# # prec_metric = make_scorer(precision_score)
# # k = StratifiedKFold(n_splits=10, shuffle=False)
# # mlp = MLPClassifier()

# mlp_random = RandomizedSearchCV(estimator = mlp, param_distributions=random_grid, scoring=prec_metric,
# #                                 n_iter=100, cv=k, n_jobs=-1, verbose=3)

# # mlp_random.fit(X_train, y_train)
# print("Best parameters found: " + str(mlp_random.best_params_))                      
# print("Best score from search" + str(mlp_random.best_score_))

# # # Evaluate on test set 
# print(mlp_random.best_estimator_.score(X_test, y_test))

# pd.DataFrame(mlp_random.cv_results_).to_csv('mlp_results_full.csv')

# clean_df2 = clean_df[colnames_selected]

# X_train2, X_test2, y_train2, y_test2 = train_test_split(
#     clean_df2, 
#     target, test_size=0.1, random_state=0)



#print(clean_df2.shape)

# MLP 2 ON K20 FEATURES 
# mlp2 = MLPClassifier()
# prec_metric2 = make_scorer(precision_score)
# k2 = StratifiedKFold(n_splits=10, shuffle=False)
# mlp_random2 = RandomizedSearchCV(estimator=mlp2, param_distributions=random_grid, scoring=prec_metric2,
#                                n_iter=100, cv=k2, n_jobs=-1, verbose=3)



# # mlp_random2.fit(X_train[colnames_selected], y_train)
# print("Best parameters found for K20: " + str(mlp_random2.best_params_))
# print("Best test score from Randomized Search CV for K20: " + str(mlp_random2.best_score_))

# # # Evaluate on test set 
# print(mlp_random2.best_estimator_.score(X_test[colnames_selected], y_test))

# pd.DataFrame(mlp_random2.cv_results_).to_csv('mlp_results_26.csv')





# mlp = MLPClassifier(hidden_layer_sizes=(13, 1), random_state=42)
# mlp.fit(X_train, y_train)

# #print("Accuracy " + str(mlp.score(X_test, y_test)))

# scores = cross_val_score(mlp, X_test, y_test, cv=10)
# print("Full features: mean of the scores: {:.2f}".format(scores.mean()))

# scores = cross_val_score(mlp, X_test, y_test, cv=10)
# print("Full features: mean of the scores: {:.2f}".format(scores.mean()))

#mlp.fit(X_train, y_train)

#Random Grid Search
# n_estimators = [int(x) for x in np.linspace(start=200, stop=500, num=10)]
# max_features = ['auto', 'sqrt']
# max_depth = [int(x) for x in np.linspace(5, 100, num = 5)]
# max_depth.append(None)

# hidden_layer_sizes = [(7, 7), (128,), (128, 7), (5, 2)]

# param_grid = {
#     'hidden_layer_sizes': hidden_layer_sizes
# }

# cv_search = GridSearchCV(estimator = mlp, param_grid = param_grid, n_jobs=-1, verbose = 3)
# cv_search.fit(X_train, y_train)
# print(cv_search.best_params_)



# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth
#               }

# forest = RandomForestClassifier()

# rf_random = RandomizedSearchCV(estimator = forest, param_distributions = random_grid, n_iter = 100, cv = 3, n_jobs=-1, verbose=3, random_state=42)
# rf_random.fit(X_train, y_train)
# print(rf_random.best_params_)


# scores = cross_val_score(mlp, clean_df, target, cv=10)
# print("Full features: mean of the scores: {:.2f}".format(scores.mean()))

# print("Training Accuracy " + str(mlp.score(X_train, y_train)))
# print("Testing Accuracy " + str(mlp.score(X_test, y_test)))

# A common way to adjust parameters in a neural network is to first create a network
# that is large enough to overfit, making sure that the task can actually be learned by
# the network. Then, once you know the training data can be learned, either shrink the
# network or increase alpha to add regularization, which will improve generalization
# performance.

# Algorithms part - http://scikit-learn.org/stable/modules/neural_networks_supervised.html

# # Poor accuracy could be down to poor scaling; scale with the min-max scaler and see
# # if there's an improvement in accuracy
# # Either use minmax scaler or scale from cristi vlad video, standardscaler
# # neural networks 3 
# # Decent accuracy 
# # By default the MLP uses 100 hidden nodes
#print("Accuracy " + str(mlp.score(X_test, y_test)))

# Reduced the number of hidden nodes - 10 hidden units
# mlp = MLPClassifier(hidden_layer_sizes=[10], random_state=42)
# mlp.fit(X_train, y_train)
# print("Accuracy " + str(mlp.score(X_test, y_test)))

# Two hidden layers now with 10 nodes each
# mlp = MLPClassifier(hidden_layer_sizes=[10, 10], random_state=42)
# mlp.fit(X_train, y_train)
# print("Accuracy " + str(mlp.score(X_test, y_test)))

# Experiment with the alpha some more 
# mlp = MLPClassifier(hidden_layer_sizes=[10, 10], alpha=1, random_state=42)
# mlp.fit(X_train, y_train)
# print("Accuracy " + str(mlp.score(X_test, y_test)))

# Also use AUC from April chen's video
# Assess with cross validation
# Test on both the 20 features as well as all features

Full range of features grid search, best parameters: 
Fitting 3 folds for each of 770 candidates, totalling 2310 fits
Memmaping (shape=(36, 73789), dtype=float64) to new file C:\Users\Josh\AppData\Local\Temp\joblib_memmaping_pool_2196_68540607736\2196-66640797200-ada1cc5f90801e4f056bb8185da05b87.pkl
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   42.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   51.9s
[Parallel(n_jobs=

[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 131 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:  9.5min
[Paralle

[Parallel(n_jobs=-1)]: Done 262 tasks      | elapsed: 18.6min
[Parallel(n_jobs=-1)]: Done 263 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed: 18.8min
[Parallel(n_jobs=-1)]: Done 265 tasks      | elapsed: 19.0min
[Parallel(n_jobs=-1)]: Done 266 tasks      | elapsed: 19.0min
[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed: 19.2min
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed: 19.4min
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed: 19.5min
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed: 19.5min
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed: 19.6min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 275 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done 277 tasks      | elapsed: 20.1min
[Paralle

[Parallel(n_jobs=-1)]: Done 395 tasks      | elapsed: 31.6min
[Parallel(n_jobs=-1)]: Done 396 tasks      | elapsed: 31.7min
[Parallel(n_jobs=-1)]: Done 397 tasks      | elapsed: 31.8min
[Parallel(n_jobs=-1)]: Done 398 tasks      | elapsed: 31.8min
[Parallel(n_jobs=-1)]: Done 399 tasks      | elapsed: 31.8min
[Parallel(n_jobs=-1)]: Done 400 tasks      | elapsed: 31.9min
[Parallel(n_jobs=-1)]: Done 401 tasks      | elapsed: 32.0min
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed: 32.0min
[Parallel(n_jobs=-1)]: Done 403 tasks      | elapsed: 32.1min
[Parallel(n_jobs=-1)]: Done 404 tasks      | elapsed: 32.2min
[Parallel(n_jobs=-1)]: Done 405 tasks      | elapsed: 32.2min
[Parallel(n_jobs=-1)]: Done 406 tasks      | elapsed: 32.3min
[Parallel(n_jobs=-1)]: Done 407 tasks      | elapsed: 32.3min
[Parallel(n_jobs=-1)]: Done 408 tasks      | elapsed: 32.4min
[Parallel(n_jobs=-1)]: Done 409 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 410 tasks      | elapsed: 32.5min
[Paralle

[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed: 41.1min
[Parallel(n_jobs=-1)]: Done 529 tasks      | elapsed: 41.1min
[Parallel(n_jobs=-1)]: Done 530 tasks      | elapsed: 41.1min
[Parallel(n_jobs=-1)]: Done 531 tasks      | elapsed: 41.2min
[Parallel(n_jobs=-1)]: Done 532 tasks      | elapsed: 41.3min
[Parallel(n_jobs=-1)]: Done 533 tasks      | elapsed: 41.5min
[Parallel(n_jobs=-1)]: Done 534 tasks      | elapsed: 41.5min
[Parallel(n_jobs=-1)]: Done 535 tasks      | elapsed: 41.5min
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed: 41.7min
[Parallel(n_jobs=-1)]: Done 537 tasks      | elapsed: 41.9min
[Parallel(n_jobs=-1)]: Done 538 tasks      | elapsed: 42.0min
[Parallel(n_jobs=-1)]: Done 539 tasks      | elapsed: 42.0min
[Parallel(n_jobs=-1)]: Done 540 tasks      | elapsed: 42.1min
[Parallel(n_jobs=-1)]: Done 541 tasks      | elapsed: 42.1min
[Parallel(n_jobs=-1)]: Done 542 tasks      | elapsed: 42.2min
[Parallel(n_jobs=-1)]: Done 543 tasks      | elapsed: 42.2min
[Paralle

[Parallel(n_jobs=-1)]: Done 661 tasks      | elapsed: 54.1min
[Parallel(n_jobs=-1)]: Done 662 tasks      | elapsed: 54.2min
[Parallel(n_jobs=-1)]: Done 663 tasks      | elapsed: 54.2min
[Parallel(n_jobs=-1)]: Done 664 tasks      | elapsed: 54.3min
[Parallel(n_jobs=-1)]: Done 665 tasks      | elapsed: 54.4min
[Parallel(n_jobs=-1)]: Done 666 tasks      | elapsed: 54.5min
[Parallel(n_jobs=-1)]: Done 667 tasks      | elapsed: 54.6min
[Parallel(n_jobs=-1)]: Done 668 tasks      | elapsed: 54.7min
[Parallel(n_jobs=-1)]: Done 669 tasks      | elapsed: 54.8min
[Parallel(n_jobs=-1)]: Done 670 tasks      | elapsed: 55.0min
[Parallel(n_jobs=-1)]: Done 671 tasks      | elapsed: 55.0min
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed: 55.3min
[Parallel(n_jobs=-1)]: Done 673 tasks      | elapsed: 55.3min
[Parallel(n_jobs=-1)]: Done 674 tasks      | elapsed: 55.4min
[Parallel(n_jobs=-1)]: Done 675 tasks      | elapsed: 55.4min
[Parallel(n_jobs=-1)]: Done 676 tasks      | elapsed: 55.7min
[Paralle

[Parallel(n_jobs=-1)]: Done 794 tasks      | elapsed: 64.6min
[Parallel(n_jobs=-1)]: Done 795 tasks      | elapsed: 64.6min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 64.6min
[Parallel(n_jobs=-1)]: Done 797 tasks      | elapsed: 64.8min
[Parallel(n_jobs=-1)]: Done 798 tasks      | elapsed: 64.9min
[Parallel(n_jobs=-1)]: Done 799 tasks      | elapsed: 64.9min
[Parallel(n_jobs=-1)]: Done 800 tasks      | elapsed: 64.9min
[Parallel(n_jobs=-1)]: Done 801 tasks      | elapsed: 65.1min
[Parallel(n_jobs=-1)]: Done 802 tasks      | elapsed: 65.2min
[Parallel(n_jobs=-1)]: Done 803 tasks      | elapsed: 65.2min
[Parallel(n_jobs=-1)]: Done 804 tasks      | elapsed: 65.3min
[Parallel(n_jobs=-1)]: Done 805 tasks      | elapsed: 65.5min
[Parallel(n_jobs=-1)]: Done 806 tasks      | elapsed: 65.5min
[Parallel(n_jobs=-1)]: Done 807 tasks      | elapsed: 65.6min
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed: 65.8min
[Parallel(n_jobs=-1)]: Done 809 tasks      | elapsed: 65.8min
[Paralle

[Parallel(n_jobs=-1)]: Done 927 tasks      | elapsed: 75.4min
[Parallel(n_jobs=-1)]: Done 928 tasks      | elapsed: 75.6min
[Parallel(n_jobs=-1)]: Done 929 tasks      | elapsed: 75.6min
[Parallel(n_jobs=-1)]: Done 930 tasks      | elapsed: 75.8min
[Parallel(n_jobs=-1)]: Done 931 tasks      | elapsed: 75.8min
[Parallel(n_jobs=-1)]: Done 932 tasks      | elapsed: 75.9min
[Parallel(n_jobs=-1)]: Done 933 tasks      | elapsed: 76.0min
[Parallel(n_jobs=-1)]: Done 934 tasks      | elapsed: 76.1min
[Parallel(n_jobs=-1)]: Done 935 tasks      | elapsed: 76.1min
[Parallel(n_jobs=-1)]: Done 936 tasks      | elapsed: 76.3min
[Parallel(n_jobs=-1)]: Done 937 tasks      | elapsed: 76.4min
[Parallel(n_jobs=-1)]: Done 938 tasks      | elapsed: 76.5min
[Parallel(n_jobs=-1)]: Done 939 tasks      | elapsed: 76.6min
[Parallel(n_jobs=-1)]: Done 940 tasks      | elapsed: 76.6min
[Parallel(n_jobs=-1)]: Done 941 tasks      | elapsed: 76.7min
[Parallel(n_jobs=-1)]: Done 942 tasks      | elapsed: 76.8min
[Paralle

[Parallel(n_jobs=-1)]: Done 1059 tasks      | elapsed: 86.7min
[Parallel(n_jobs=-1)]: Done 1060 tasks      | elapsed: 86.7min
[Parallel(n_jobs=-1)]: Done 1061 tasks      | elapsed: 86.8min
[Parallel(n_jobs=-1)]: Done 1062 tasks      | elapsed: 86.8min
[Parallel(n_jobs=-1)]: Done 1063 tasks      | elapsed: 86.9min
[Parallel(n_jobs=-1)]: Done 1064 tasks      | elapsed: 87.0min
[Parallel(n_jobs=-1)]: Done 1065 tasks      | elapsed: 87.0min
[Parallel(n_jobs=-1)]: Done 1066 tasks      | elapsed: 87.0min
[Parallel(n_jobs=-1)]: Done 1067 tasks      | elapsed: 87.1min
[Parallel(n_jobs=-1)]: Done 1068 tasks      | elapsed: 87.2min
[Parallel(n_jobs=-1)]: Done 1069 tasks      | elapsed: 87.3min
[Parallel(n_jobs=-1)]: Done 1070 tasks      | elapsed: 87.3min
[Parallel(n_jobs=-1)]: Done 1071 tasks      | elapsed: 87.3min
[Parallel(n_jobs=-1)]: Done 1072 tasks      | elapsed: 87.5min
[Parallel(n_jobs=-1)]: Done 1073 tasks      | elapsed: 87.5min
[Parallel(n_jobs=-1)]: Done 1074 tasks      | elapsed: 

[Parallel(n_jobs=-1)]: Done 1190 tasks      | elapsed: 95.2min
[Parallel(n_jobs=-1)]: Done 1191 tasks      | elapsed: 95.3min
[Parallel(n_jobs=-1)]: Done 1192 tasks      | elapsed: 95.3min
[Parallel(n_jobs=-1)]: Done 1193 tasks      | elapsed: 95.4min
[Parallel(n_jobs=-1)]: Done 1194 tasks      | elapsed: 95.5min
[Parallel(n_jobs=-1)]: Done 1195 tasks      | elapsed: 95.6min
[Parallel(n_jobs=-1)]: Done 1196 tasks      | elapsed: 95.7min
[Parallel(n_jobs=-1)]: Done 1197 tasks      | elapsed: 95.8min
[Parallel(n_jobs=-1)]: Done 1198 tasks      | elapsed: 95.9min
[Parallel(n_jobs=-1)]: Done 1199 tasks      | elapsed: 96.0min
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed: 96.0min
[Parallel(n_jobs=-1)]: Done 1201 tasks      | elapsed: 96.1min
[Parallel(n_jobs=-1)]: Done 1202 tasks      | elapsed: 96.2min
[Parallel(n_jobs=-1)]: Done 1203 tasks      | elapsed: 96.2min
[Parallel(n_jobs=-1)]: Done 1204 tasks      | elapsed: 96.3min
[Parallel(n_jobs=-1)]: Done 1205 tasks      | elapsed: 

[Parallel(n_jobs=-1)]: Done 1319 tasks      | elapsed: 107.1min
[Parallel(n_jobs=-1)]: Done 1320 tasks      | elapsed: 107.1min
[Parallel(n_jobs=-1)]: Done 1321 tasks      | elapsed: 107.2min
[Parallel(n_jobs=-1)]: Done 1322 tasks      | elapsed: 107.2min
[Parallel(n_jobs=-1)]: Done 1323 tasks      | elapsed: 107.2min
[Parallel(n_jobs=-1)]: Done 1324 tasks      | elapsed: 107.4min
[Parallel(n_jobs=-1)]: Done 1325 tasks      | elapsed: 107.4min
[Parallel(n_jobs=-1)]: Done 1326 tasks      | elapsed: 107.4min
[Parallel(n_jobs=-1)]: Done 1327 tasks      | elapsed: 107.4min
[Parallel(n_jobs=-1)]: Done 1328 tasks      | elapsed: 107.5min
[Parallel(n_jobs=-1)]: Done 1329 tasks      | elapsed: 107.5min
[Parallel(n_jobs=-1)]: Done 1330 tasks      | elapsed: 107.5min
[Parallel(n_jobs=-1)]: Done 1331 tasks      | elapsed: 107.5min
[Parallel(n_jobs=-1)]: Done 1332 tasks      | elapsed: 107.6min
[Parallel(n_jobs=-1)]: Done 1333 tasks      | elapsed: 107.6min
[Parallel(n_jobs=-1)]: Done 1334 tasks  

KeyboardInterrupt: 

# Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Exhaustive grid search over SVC hyper-parameters on the training split,
# ranking candidates by precision under 3-fold stratified cross-validation.

# Kernels to search. Add 'rbf' / 'sigmoid' here to widen the search later;
# the grid below adapts automatically.
kernel = ['linear']
Cs = [0.001, 0.1, 1, 10, 100]
gammas = [0.001, 0.1, 1, 10, 100]

# SVC only uses `gamma` for the 'rbf', 'poly' and 'sigmoid' kernels; the
# linear kernel ignores it. Crossing gamma with 'linear' would refit the same
# model once per gamma value, so gamma is searched only for the kernels that
# actually use it.
gamma_free_kernels = [name for name in kernel if name == 'linear']
gamma_kernels = [name for name in kernel if name != 'linear']
random_grid_svc = []
if gamma_free_kernels:
    random_grid_svc.append({'kernel': gamma_free_kernels, 'C': Cs})
if gamma_kernels:
    random_grid_svc.append({'kernel': gamma_kernels, 'C': Cs, 'gamma': gammas})

svc_prec_metric = make_scorer(precision_score)
svc_k = StratifiedKFold(n_splits=3, shuffle=False)

svc = SVC()

svc_random = GridSearchCV(
    estimator=svc,
    param_grid=random_grid_svc,
    scoring=svc_prec_metric,
    cv=svc_k,
    n_jobs=-1,
    verbose=51,
)

svc_random.fit(X_train, y_train)
print("Best parameters found full range: " + str(svc_random.best_params_))
print("Best score: " + str(svc_random.best_score_))

# Evaluate on test set. `score` on the fitted SVC is mean accuracy, whereas
# the search above ranks candidates by precision, so the test precision is
# reported as well to make the two numbers comparable.
print(svc_random.best_estimator_.score(X_test, y_test))
print("Test precision: " + str(precision_score(y_test, svc_random.predict(X_test))))


#pd.DataFrame(svc_random.cv_results_).to_csv('svc_results_full.csv')

# K20 
# svc_prec_metric2 = make_scorer(precision_score)
# svc_k2 = StratifiedKFold(n_splits=10, shuffle=False)

# svc2 = SVC()

# svc_random2 = RandomizedSearchCV(estimator = svc2, param_distributions=random_grid_svc, scoring=svc_prec_metric2,
#                                n_iter=100, cv=svc_k2, n_jobs=-1, verbose=51)

# svc_random2.fit(X_train[colnames_selected], y_train)
# print("Best parameters found 20 features: " + str(svc_random2.best_params_))                      
# print("Best score: " + str(svc_random2.best_score_)) 

# # Evaluate on test set
# print(svc_random.best_estimator_.score(X_test[colnames_selected], y_test))

# pd.DataFrame(svc_random2.cv_results_).to_csv('svc_results_26.csv')

# Grid search CV 

# param_grid = {
#     'max_depth' : [20, 25, 28, 30, 35],
#     'max_features': ['sqrt'],
#     'n_estimators' : [450, 460, 466, 470, 475]
# }

# forest = RandomForestClassifier()

# grid_search = GridSearchCV(estimator = forest, param_grid = param_grid, cv = 3, n_jobs=-1, verbose=3)
# grid_search.fit(X_train, y_train)

# print(grid_search.best_params_)


# grid_search = GridSearchCV(SVC(), param_grid, cv=3, n_jobs= -1, verbose=2)
# grid_search.fit(X_train, y_train)

# print(grid_search.best_params_)

# svc = SVC()
# clf = 
# svc.fit(X_train, y_train)
# print("Score " + str(svc.score(X_test, y_test)))
# Assess with cross validation
# Test on both the 20 features as well as all features

In [None]:
# Number of feature columns in the pre-processed frame.
print(clean_df.shape[1])

In [None]:
    # Tune LogisticRegression's C with a 3-fold grid search on the training
    # split, then cross-validate a model that uses the selected C.
    import pandas as pd

    # NOTE(review): `df` is reloaded here but nothing below reads it; the
    # search runs on the already pre-processed `clean_df` / `target`.
    df = pd.read_csv("bouts_out_new.csv")

    # Re-split using train_test_split's default test size (no test_size given).
    X_train, X_test, y_train, y_test = train_test_split(
         clean_df, 
         target, random_state=0)

    # Evenly spaced candidate values for C between 0.01 and 100
    # (np.linspace's default number of samples).
    Cs = list(np.linspace(start=0.01, stop=100))
    param_grid = {
        'C': Cs
    }

    logreg = LogisticRegression()
    grid_search = GridSearchCV(estimator = logreg, param_grid = param_grid, cv = 3, n_jobs=-1, verbose=3)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)

    # Use the C the search actually selected instead of a hard-coded value, so
    # the reported score always corresponds to the tuned model.
    logreg = LogisticRegression(C=grid_search.best_params_['C'])
    # NOTE(review): this cross-validates on the held-out split (the model is
    # refit on folds of X_test) rather than scoring the estimator fitted
    # above on X_test — confirm that this is the intended evaluation.
    scores = cross_val_score(logreg, X_test, y_test, cv=10)
    print("Full features: mean of the scores: {:.2f}".format(scores.mean()))


In [None]:
# Grid search over the MLP's regularisation strength (alpha), ranking
# candidates by precision under 10-fold stratified cross-validation.
alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
activation = ['relu']  # add 'logistic' here to compare activations
solver = ['adam']
# MLPClassifier does not accept None for hidden_layer_sizes (it must be a
# sequence of positive layer widths), so every candidate fit would fail.
# (100,) is the library default, i.e. a single hidden layer of 100 units.
hidden_layer_sizes = [(100,)]

random_grid = {
    'hidden_layer_sizes': hidden_layer_sizes,
    'alpha': alpha,
    'activation': activation,
    'solver': solver,
}

prec_metric = make_scorer(precision_score)
k = StratifiedKFold(n_splits=10, shuffle=False)
mlp = MLPClassifier()

mlp_random = GridSearchCV(
    estimator=mlp,
    param_grid=random_grid,
    scoring=prec_metric,
    cv=k,
    n_jobs=-1,
    verbose=3,
)

mlp_random.fit(X_train, y_train)
print("Best parameters found: " + str(mlp_random.best_params_))
print("Best score from search" + str(mlp_random.best_score_))

# Evaluate on test set. Note that `score` on the fitted classifier is mean
# accuracy, not the precision used to rank candidates in the search.
print(mlp_random.best_estimator_.score(X_test, y_test))
                                
                                


In [None]:
# Re-run of the SVC grid search with a wider C / gamma range.
# Reuses `svc`, `svc_prec_metric` and `svc_k` defined in the earlier SVC
# cell, so that cell has to be run first.

# Kernels to search. Edit in 'rbf' and 'sigmoid', run it again later and
# ensure the results are the same; the grid below adapts automatically.
kernel = ['linear']
Cs = [0.01, 0.1, 1, 10, 100]
gammas = [0.001, 0.01, 0.1, 1, 10, 100]

# SVC only uses `gamma` for the 'rbf', 'poly' and 'sigmoid' kernels; the
# linear kernel ignores it. Crossing gamma with 'linear' would refit the same
# model once per gamma value, so gamma is searched only for the kernels that
# actually use it.
gamma_free_kernels = [name for name in kernel if name == 'linear']
gamma_kernels = [name for name in kernel if name != 'linear']
random_grid = []
if gamma_free_kernels:
    random_grid.append({'kernel': gamma_free_kernels, 'C': Cs})
if gamma_kernels:
    random_grid.append({'kernel': gamma_kernels, 'C': Cs, 'gamma': gammas})

svc_random = GridSearchCV(
    estimator=svc,
    param_grid=random_grid,
    scoring=svc_prec_metric,
    cv=svc_k,
    n_jobs=-1,
    verbose=51,
)
svc_random.fit(X_train, y_train)
print("Best parameters found full range: " + str(svc_random.best_params_))
print("Best score: " + str(svc_random.best_score_))

# Evaluate on test set. Note that `score` on the fitted SVC is mean accuracy,
# not the precision used to rank candidates in the search.
print(svc_random.best_estimator_.score(X_test, y_test))

# Save the results of the search that was just fitted. The previous code read
# from `svc_random2`, which is not defined by any live code in this notebook.
pd.DataFrame(svc_random.cv_results_).to_csv('new_svc_results_full.csv')

#svc = SVC(kernel='linear',C=100, gamma = 0.001)


# svc.fit(X_train, y_train)
# print(svc.score(X_test, y_test))
# #svc_random = GridSearchCV(estimator = svc, param_grid=random_grid_svc, scoring=svc_prec_metric,
#                                cv=svc_k, n_jobs=-1, verbose=51)

# gammas = [0.001, 0.01, 0.1, 1, 10, 100]

# for gamma in gammas:
#     svc = SVC(kernel='linear', C=)




# svc_random.fit(X_train, y_train)
# print("Best parameters found full range: " + str(svc_random.best_params_))   
# print("Best score: " + str(svc_random.best_score_))  

# # Evaluate on test set
# print(svc_random.best_estimator_.score(X_test, y_test))