# Data pre-processing/data clean-up

In [3]:
import pandas as pd
from fancyimpute import KNN, MICE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_selection import chi2
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import precision_score, make_scorer, f1_score, classification_report

# Load the raw bout records.
df = pd.read_csv("bouts_out_new.csv")

# Remove draws so only the win_A / win_B outcomes remain.
df = df[df.result != 'draw']
print("drawless shape")
print(df.shape)

# Random under-sampling - reduce to 82,000 records (roughly).
# The larger class is cut down to the size of the smaller one. Class sizes are
# measured per label (not taken from the ordering of value_counts()), and the
# sample is seeded so that repeated runs select the same rows.
df_winA = df[df['result'] == "win_A"]
df_winB = df[df['result'] == "win_B"]
winAcount, winBcount = len(df_winA), len(df_winB)
minority_size = min(winAcount, winBcount)
if winAcount > minority_size:
    df_winA_reduced = df_winA.sample(minority_size, random_state=0)
else:
    df_winA_reduced = df_winA
if winBcount > minority_size:
    df_winB_reduced = df_winB.sample(minority_size, random_state=0)
else:
    df_winB_reduced = df_winB
df = pd.concat([df_winA_reduced, df_winB_reduced], axis=0)

# Encode the label as integers and separate it from the features.
le = preprocessing.LabelEncoder().fit(df['result'])
encoded = le.transform(df['result'])
df['result'] = encoded
target = df['result']
clean_df = df.drop(['result'], axis=1)
print("Clean df shape " + str(clean_df.shape))

# Models can only handle numeric features, so the non-numeric features are
# converted to dummy (one-hot) columns.
clean_df = pd.get_dummies(clean_df)

# Impute missing values with MICE. The imputer's result is wrapped back into a
# DataFrame with the original column names and row index, so that the selected
# feature names below are real column names and rows stay aligned with
# `target`.
# NOTE(review): imputation and scaling are both fitted on the full data set
# before the train/test split below, so the test rows influence them.
clean_df = pd.DataFrame(MICE().complete(clean_df),
                        columns=clean_df.columns,
                        index=clean_df.index)


# SCALING - rescale every feature with MinMaxScaler.
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
scaled_df = scaler.fit_transform(clean_df)
clean_df = pd.DataFrame(scaled_df, columns=clean_df.columns, index=clean_df.index)

# Split the dataset 90/10 into train and test sets (train_test_split shuffles
# the rows; the seed is fixed).
X_train, X_test, y_train, y_test = train_test_split(
     clean_df,
     target, test_size=0.1, random_state=0)


# Select the 20 best features (chi-squared score, fitted on the training
# split only) to reduce dimensionality.
import sklearn.feature_selection
selection = sklearn.feature_selection.SelectKBest(chi2, k=20)
selected_features = selection.fit(X_train, y_train)
indices_selected = selected_features.get_support(indices=True)
colnames_selected = [clean_df.columns[i] for i in indices_selected]








  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


drawless shape
(362655, 26)
Clean df shape (81988, 25)
[MICE] Completing matrix with shape (81988, 36)
[MICE] Starting imputation round 1/110, elapsed time 0.031
[MICE] Starting imputation round 2/110, elapsed time 0.919
[MICE] Starting imputation round 3/110, elapsed time 1.622
[MICE] Starting imputation round 4/110, elapsed time 2.323
[MICE] Starting imputation round 5/110, elapsed time 3.031
[MICE] Starting imputation round 6/110, elapsed time 3.734
[MICE] Starting imputation round 7/110, elapsed time 4.441
[MICE] Starting imputation round 8/110, elapsed time 5.141
[MICE] Starting imputation round 9/110, elapsed time 5.849
[MICE] Starting imputation round 10/110, elapsed time 6.550
[MICE] Starting imputation round 11/110, elapsed time 7.222
[MICE] Starting imputation round 12/110, elapsed time 7.972
[MICE] Starting imputation round 13/110, elapsed time 8.713
[MICE] Starting imputation round 14/110, elapsed time 9.493
[MICE] Starting imputation round 15/110, elapsed time 10.257
[MICE

# Test dataset

In [95]:
# Do not need balanced data, purely for testing code.
# One set of row positions is drawn and applied to BOTH the features and the
# labels; sampling each of them independently would pair every feature row
# with the label of some unrelated row.
sample_positions = np.random.choice(len(clean_df), size=2500, replace=False)
test_df = clean_df.iloc[sample_positions]
test_target = target.iloc[sample_positions]

x_train_test, x_test_test, y_train_test, y_test_test = train_test_split(
     test_df,
     test_target, random_state=0)


0.7255208079231107


# Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


def logreg_gridsearch(X, x, Y, y, param_grid):
    """Grid-search a LogisticRegression and report train/test performance.

    Parameters
    ----------
    X, Y : training features and labels; the grid search is fitted on these
        using 10-fold stratified cross-validation.
    x, y : held-out features and labels, used only for reporting.
    param_grid : dict handed to GridSearchCV as ``param_grid``.

    Returns
    -------
    The ``best_params_`` dict of the fitted search.
    """
    k = StratifiedKFold(n_splits=10, shuffle=False)
    logreg = LogisticRegression()
    logreg_grid = GridSearchCV(estimator=logreg, param_grid=param_grid,
                               cv=k, n_jobs=-1, verbose=3)
    logreg_grid.fit(X, Y)
    prediction = logreg_grid.predict(x)
    print(classification_report(y, prediction))
    # Label the two accuracies so they cannot be confused in the output,
    # matching the other *_gridsearch helpers in this notebook.
    print("Train set: ")
    print(logreg_grid.best_estimator_.score(X, Y))
    print("Test set: ")
    print(logreg_grid.best_estimator_.score(x, y))
    return logreg_grid.best_params_


# Coarse grid search over powers of ten from 0.001 to 1000, for both sets of
# features.
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'C': Cs}

# Full range of features
print("Full range of features best parameters and results: ")
full_range_initial = logreg_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(full_range_initial)

# K20 range of features. The helper takes (train X, test X, train y, test y,
# grid); the held-out K20 features and labels must be passed as well.
print("K20 range of features best parameters: ")
k20_range_initial = logreg_gridsearch(X_train[colnames_selected], X_test[colnames_selected],
                                      y_train, y_test, param_grid)
print(k20_range_initial)

# Refined param grid for the full range: every integer C from 50 to 150.
Cs = list(range(50, 151))
param_grid = {'C': Cs}
full_range_refine = logreg_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print("Refined parameters results: ")
print(full_range_refine)

# Refined param grid for the K20 range: every integer C from 750 to 1249.
Cs = list(range(750, 1250))
param_grid = {'C': Cs}
k20_range_refine = logreg_gridsearch(X_train[colnames_selected], X_test[colnames_selected], y_train, y_test, param_grid)
print("Refined parameters results - k20: ")
print(k20_range_refine)

Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Iterate through C levels - full range of features: one model per C, with
# the train and test accuracy printed side by side.
print("Full range of features - growth of C")
for c in Cs:
    logreg_clf = LogisticRegression(C=c)
    logreg_clf.fit(X_train, y_train)
    print("When the C is " + str(c) + " the score for the training set is " + str(logreg_clf.score(X_train, y_train)))
    print("When the C is " + str(c) + " the score for the test set is " + str(logreg_clf.score(X_test, y_test)))

# Same sweep restricted to the 20 selected features.
print("K20 range of features - growth of C")
for c in Cs:
    logreg_clf = LogisticRegression(C=c)
    logreg_clf.fit(X_train[colnames_selected], y_train)
    print("When the C is " + str(c) + " the score for the training set is " + str(logreg_clf.score(X_train[colnames_selected], y_train)))
    print("When the C is " + str(c) + " the score for the test set is " + str(logreg_clf.score(X_test[colnames_selected], y_test)))



Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    2.5s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.9s finished


             precision    recall  f1-score   support

          0       0.64      0.71      0.67      4038
          1       0.68      0.62      0.65      4161

avg / total       0.66      0.66      0.66      8199

0.6623978533967557
Refined parameters results - k20: 
{'C': 773}


# KNN

In [55]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes 1..30 for the KNN grid search.
Ks = list(range(1, 31))
param_grid = dict(n_neighbors=Ks)


def knn_gridsearch(X, x, Y, y, param_grid):
    """Run a 10-fold stratified grid search over a KNeighborsClassifier.

    The search is fitted on (X, Y). A classification report for the held-out
    (x, y) is printed, followed by the best estimator's accuracy on the
    training and held-out data. Returns the search's ``best_params_``.
    """
    folds = StratifiedKFold(n_splits=10, shuffle=False)
    search = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=param_grid,
        cv=folds,
        n_jobs=-1,
        verbose=3,
    )
    search.fit(X, Y)

    # Report on the held-out data using the refitted best model.
    print(classification_report(y, search.predict(x)))

    best_model = search.best_estimator_
    print("Train set: ")
    print(best_model.score(X, Y))
    print("Test set: ")
    print(best_model.score(x, y))
    return search.best_params_

# Grid search over n_neighbors 1-30 using every feature.
print("Full range results:")
knn_initial = knn_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(knn_initial)

# Same grid search restricted to the 20 selected features.
print("K20 range results:")
knn_initial_k20 = knn_gridsearch(
    X_train[colnames_selected],
    X_test[colnames_selected],
    y_train,
    y_test,
    param_grid,
)
print(knn_initial_k20)

Ks = [1, 5, 10, 15, 20, 25, 30]

# Show how train/test accuracy moves as K grows - all features.
print("Full range of features - growth of K")
for k in Ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print("When K is ", k, " the train score is ", knn.score(X_train, y_train), sep="")
    print("When K is ", k, " the test score is ", knn.score(X_test, y_test), sep="")

# Show how train/test accuracy moves as K grows - 20 selected features.
print("K20 range of features - growth of K")
for k in Ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train[colnames_selected], y_train)
    print("When K is ", k, " the train score is ",
          knn.score(X_train[colnames_selected], y_train), sep="")
    print("When K is ", k, " the test score is ",
          knn.score(X_test[colnames_selected], y_test), sep="")
    









K20 range of features - growth of K
When K is 1 the train score is 0.9651574082857879
When K is 1 the test score is 0.6431272106354433
When K is 5 the train score is 0.7675669815284121
When K is 5 the test score is 0.6515428710818393
When K is 10 the train score is 0.7299190936318422
When K is 10 the test score is 0.654835955604342
When K is 15 the train score is 0.720147989537736
When K is 15 the test score is 0.6625198194901817
When K is 20 the train score is 0.7123555001422976
When K is 20 the test score is 0.6650811074521283
When K is 25 the train score is 0.7085202401441949
When K is 25 the test score is 0.6702036833760214
When K is 30 the train score is 0.702977408556831
When K is 30 the test score is 0.6722771069642639


# Random Forest (RDF)

In [101]:
from sklearn.ensemble import RandomForestClassifier

def rdf_gridsearch(X, x, Y, y, param_grid):
    """Run a 5-fold stratified grid search over a RandomForestClassifier.

    Fits the search on (X, Y), prints a classification report for (x, y),
    then prints the best estimator's accuracy on the training and held-out
    data. Returns the search's ``best_params_``.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=False)
    search = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=param_grid,
        cv=folds,
        n_jobs=-1,
        verbose=51,
    )
    search.fit(X, Y)

    # Report on the held-out data using the refitted best model.
    print(classification_report(y, search.predict(x)))

    best_model = search.best_estimator_
    print("Train set: ")
    print(best_model.score(X, Y))
    print("Test set: ")
    print(best_model.score(x, y))
    return search.best_params_

# Coarse grid over forest size, tree depth and minimum leaf size.
n_estimators = [1, 20, 40, 60, 80, 100, 120, 128]
max_depth = [1, 10, 20, 30, 40, 50]
min_samples_leaf = [10, 30, 50, 70, 90, 110, 130, 150, 170, 190, 200]

param_grid = {'n_estimators': n_estimators,
              'max_depth': max_depth,
              'min_samples_leaf': min_samples_leaf}

# Initial test - full range
rdf_full_initial = rdf_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(rdf_full_initial)

# K20 range
rdf_k20_initial = rdf_gridsearch(X_train[colnames_selected], X_test[colnames_selected], y_train, y_test, param_grid)
print(rdf_k20_initial)

# Refined test - full range: forest size and depth are pinned, and every
# minimum leaf size from 1 to 99 is tried.
n_estimators = [128]
max_depth = [57]
min_samples_leaf = list(range(1, 100))
param_grid = {'n_estimators': n_estimators,
              'max_depth': max_depth,
              'min_samples_leaf': min_samples_leaf}

rdf_full_refined = rdf_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(rdf_full_refined)

# Refined K20 test
n_estimators = [120]
min_samples_leaf = list(range(5, 15))
max_depth = list(range(30, 50))
param_grid = {'n_estimators': n_estimators,
              'max_depth': max_depth,
              'min_samples_leaf': min_samples_leaf}

rdf_k20_refined = rdf_gridsearch(X_train[colnames_selected], X_test[colnames_selected], y_train, y_test, param_grid)
print(rdf_k20_refined)
min_samples_leaf = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]


# Sweep min_samples_leaf (all other forest settings left at their defaults)
# and print train/test accuracy for each value - all features.
print("Full range of features, change in min samples")
for mini in min_samples_leaf:
    rdf_full = RandomForestClassifier(min_samples_leaf=mini)
    rdf_full.fit(X_train, y_train)
    print("When min is " + str(mini) + "Training set score: ")
    print(rdf_full.score(X_train, y_train))
    print("When min is " + str(mini) + "Ttest set score: ")
    print(rdf_full.score(X_test, y_test))

# Same sweep on the 20 selected features. The loop variable is named `mini`
# (not `min`) so the builtin min() is not overwritten for later cells.
print("K20 range of features, change in min samples")
for mini in min_samples_leaf:
    rdf_k20 = RandomForestClassifier(min_samples_leaf=mini)
    rdf_k20.fit(X_train[colnames_selected], y_train)
    print("When min is " + str(mini) + "Training set score: ")
    print(rdf_k20.score(X_train[colnames_selected], y_train))
    print("When min is " + str(mini) + "Ttest set score: ")
    print(rdf_k20.score(X_test[colnames_selected], y_test))
    


When min is 5Training set score: 
0.8033582241255471
When min is 5Ttest set score: 
0.689840224417612
When min is 10Training set score: 
0.7575654907913103
When min is 10Ttest set score: 
0.6922795462861324
When min is 15Training set score: 
0.7362208459255445
When min is 15Ttest set score: 
0.6904500548847421
When min is 20Training set score: 
0.7289433384379785
When min is 20Ttest set score: 
0.6884985973899256
When min is 25Training set score: 
0.7200937809158546
When min is 25Ttest set score: 
0.6897182583241859
When min is 30Training set score: 
0.7151879006355961
When min is 30Ttest set score: 
0.6871569703622393
When min is 35Training set score: 
0.7114746100367263
When min is 35Ttest set score: 
0.6909379192584462
When min is 40Training set score: 
0.7078019759042675
When min is 40Ttest set score: 
0.685693377241127
When min is 45Training set score: 
0.7062976866470612
When min is 45Ttest set score: 
0.6887425295767776
When min is 50Training set score: 
0.7040751331499275
When 

# Naive Bayes

In [129]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB



def bn_gridsearch(X, x, Y, y, param_grid):
    """Run a 10-fold stratified grid search over a BernoulliNB classifier.

    Fits the search on (X, Y), prints a classification report for (x, y),
    then prints the best estimator's accuracy on the training and held-out
    data. Returns the search's ``best_params_``.
    """
    folds = StratifiedKFold(n_splits=10, shuffle=False)
    search = GridSearchCV(
        estimator=BernoulliNB(),
        param_grid=param_grid,
        cv=folds,
        n_jobs=-1,
        verbose=51,
    )
    search.fit(X, Y)

    # Report on the held-out data using the refitted best model.
    print(classification_report(y, search.predict(x)))

    best_model = search.best_estimator_
    print("Train set: ")
    print(best_model.score(X, Y))
    print("Test set: ")
    print(best_model.score(x, y))
    return search.best_params_

# Coarse smoothing grid: powers of ten from 0.001 to 100.
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = dict(alpha=alphas)

print("Initial search - full range")
bn_initial = bn_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(bn_initial)

# Finer grid: np.linspace's default number of evenly spaced values between
# 0.0001 and 0.1.
alphas = list(np.linspace(0.0001, 0.1))
param_grid = dict(alpha=alphas)

print("Refined search - full range")
bn_initial = bn_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(bn_initial)

# Back to the coarse grid for the 20 selected features.
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = dict(alpha=alphas)

print("Initial search - K20 range")
bn_initial = bn_gridsearch(
    X_train[colnames_selected],
    X_test[colnames_selected],
    y_train,
    y_test,
    param_grid,
)
print(bn_initial)

alphas = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = dict(alpha=alphas)

# Train/test accuracy for each alpha on the 20 selected features.
for a in alphas:
    bn = BernoulliNB(alpha=a)
    bn.fit(X_train[colnames_selected], y_train)
    print("When the alpha is ", a, " the train score is ",
          bn.score(X_train[colnames_selected], y_train), sep="")
    print("When the alpha is ", a, " the test score is ",
          bn.score(X_test[colnames_selected], y_test), sep="")

    





When the alpha is 0.001 the train score is 0.6454891650517015
When the alpha is 0.001 the test score is 0.6408098548603488
When the alpha is 0.01 the train score is 0.6454891650517015
When the alpha is 0.01 the test score is 0.6408098548603488
When the alpha is 0.1 the train score is 0.6454891650517015
When the alpha is 0.1 the test score is 0.6408098548603488
When the alpha is 1 the train score is 0.6454891650517015
When the alpha is 1 the test score is 0.6408098548603488
When the alpha is 10 the train score is 0.6453671956524685
When the alpha is 10 the test score is 0.6399560922063666
When the alpha is 100 the train score is 0.6447844529672444
When the alpha is 100 the test score is 0.6397121600195146


# MLP 

In [88]:
from sklearn.neural_network import MLPClassifier
import itertools 

# Do gridsearch on parameters - both sets of features 
# Print most important parameter change variation - both sets of features 

def mlp_gridsearch(X, x, Y, y, param_grid):
    """Run a 10-fold stratified grid search over an MLPClassifier.

    Fits the search on (X, Y), prints a classification report for (x, y),
    then prints the best estimator's accuracy on the training and held-out
    data. Returns the search's ``best_params_``.
    """
    folds = StratifiedKFold(n_splits=10, shuffle=False)
    search = GridSearchCV(
        estimator=MLPClassifier(),
        param_grid=param_grid,
        cv=folds,
        n_jobs=-1,
        verbose=51,
    )
    search.fit(X, Y)

    # Report on the held-out data using the refitted best model.
    print(classification_report(y, search.predict(x)))

    best_model = search.best_estimator_
    print("Train set: ")
    print(best_model.score(X, Y))
    print("Test set: ")
    print(best_model.score(x, y))
    return search.best_params_


# Candidate layer widths. An earlier, larger grid also included 25, 100, 150
# and 175.
layer_widths = (10, 50, 53, 73, 75, 125, 200)

# Every one-layer architecture plus every ordered two-layer combination.
hidden_layer_sizes = (list(itertools.product(layer_widths, repeat=1)) +
                      list(itertools.product(layer_widths, repeat=2)))

alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
activation = ['relu', 'logistic', 'tanh']
solver = ['adam']

param_grid = {'hidden_layer_sizes': hidden_layer_sizes,
              'alpha': alpha,
              'activation': activation,
              'solver': solver}

# MLP initial search
print("MLP full range initial search")
full_mlp_initial = mlp_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(full_mlp_initial)

# K20 initial search
print("MLP K20 range initial search")
k20_mlp_initial = mlp_gridsearch(X_train[colnames_selected], X_test[colnames_selected], y_train, y_test, param_grid)
print(k20_mlp_initial)

# Refined search for full range: single hidden layer of 190..210 units.
hidden_layer_sizes = [(width,) for width in range(190, 211)]
alpha = [0.00001]
activation = ['relu', 'logistic', 'tanh']
solver = ['adam']

param_grid = {'hidden_layer_sizes': hidden_layer_sizes,
              'alpha': alpha,
              'activation': activation,
              'solver': solver}

print("MLP full range refined search")
full_mlp_refined = mlp_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(full_mlp_refined)

# Refined search for K20 range: two hidden layers, the first with 5..15 units
# and the second with 190..210 units.
list1 = list(range(190, 211))
print(list1)

list2 = list(range(5, 16))
print(list2)

# Each architecture is a (first layer, second layer) pair.
hidden_layer_sizes = list(itertools.product(list2, list1))

alpha = [0.00001]
activation = ['relu', 'logistic', 'tanh']
solver = ['adam']

param_grid = dict(
    hidden_layer_sizes=hidden_layer_sizes,
    alpha=alpha,
    activation=activation,
    solver=solver,
)

print("MLP K20 range refined search")
k20_mlp_refined = mlp_gridsearch(
    X_train[colnames_selected],
    X_test[colnames_selected],
    y_train,
    y_test,
    param_grid,
)
print(k20_mlp_refined)

# One hidden layer - train and test accuracy for the full and K20 feature sets.
# NOTE: alpha / activation / solver are assigned here but the loops below
# build each MLPClassifier from hidden_layer_sizes only.
hidden_layer_sizes = [(10,), (25,), (50,), (75,), (100,), (125,), (150,), (175,), (200,)]
alpha = [0.00001]
activation = ['relu', 'logistic', 'tanh']
solver = ['adam']

print("Full range of features - 1 layer: ")
for layer in hidden_layer_sizes:
    mlp_full = MLPClassifier(hidden_layer_sizes=layer)
    mlp_full.fit(X_train, y_train)
    print("For layer = ", layer, "train = ", mlp_full.score(X_train, y_train), sep="")
    print("For layer = ", layer, "test = ", mlp_full.score(X_test, y_test), sep="")

print("K20 range of features - 1 layer: ")
for layer in hidden_layer_sizes:
    mlp_full = MLPClassifier(hidden_layer_sizes=layer)
    mlp_full.fit(X_train[colnames_selected], y_train)
    print("For layer = ", layer, "train = ",
          mlp_full.score(X_train[colnames_selected], y_train), sep="")
    print("For layer = ", layer, "test = ",
          mlp_full.score(X_test[colnames_selected], y_test), sep="")

# Two equal-width hidden layers - same report for both feature sets.
hidden_layer_sizes = [(10,10), (25,25), (50,50), (75,75), (100,100), (125,125), (150,150), (175,175), (200,200)]
alpha = [0.00001]
activation = ['relu', 'logistic', 'tanh']
solver = ['adam']

print("Full range of features - 2 layer: ")
for layer in hidden_layer_sizes:
    mlp_full = MLPClassifier(hidden_layer_sizes=layer)
    mlp_full.fit(X_train, y_train)
    print("For layer = ", layer, "train = ", mlp_full.score(X_train, y_train), sep="")
    print("For layer = ", layer, "test = ", mlp_full.score(X_test, y_test), sep="")

print("K20 range of features - 2 layer: ")
for layer in hidden_layer_sizes:
    mlp_full = MLPClassifier(hidden_layer_sizes=layer)
    mlp_full.fit(X_train[colnames_selected], y_train)
    print("For layer = ", layer, "train = ",
          mlp_full.score(X_train[colnames_selected], y_train), sep="")
    print("For layer = ", layer, "test = ",
          mlp_full.score(X_test[colnames_selected], y_test), sep="")
    

# Grid for a quick run on the small test sample: every one-layer architecture
# plus every ordered two-layer combination of the widths below.
layer_widths = (10, 25, 50, 53, 73, 75, 100, 125, 200)
hidden_layer_sizes = (list(itertools.product(layer_widths, repeat=1)) +
                      list(itertools.product(layer_widths, repeat=2)))

alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
activation = ['relu', 'logistic', 'tanh']
solver = ['adam']

# Build the grid from the values just defined; without this the search would
# silently reuse whatever param_grid an earlier cell left behind.
param_grid = {'hidden_layer_sizes': hidden_layer_sizes,
              'alpha': alpha,
              'activation': activation,
              'solver': solver}

full_mlp_refined = mlp_gridsearch(x_train_test, x_test_test, y_train_test, y_test_test, param_grid)
print(full_mlp_refined)

# 10-fold cross-validated accuracy of a single 100-unit hidden layer on the
# whole data set.
mlp = MLPClassifier(hidden_layer_sizes=(100,))

scores = cross_val_score(mlp, clean_df, target, cv=10, n_jobs=-1)
print(scores.mean())

Full range of features - 2 layer: 
For layer = (10, 10)train = 0.7634200219544919
For layer = (10, 10)test = 0.760946456884986
For layer = (25, 25)train = 0.7703180690888886
For layer = (25, 25)test = 0.7632638126600805
For layer = (50, 50)train = 0.7705349035764139
For layer = (50, 50)test = 0.7600926942310038
For layer = (75, 75)train = 0.7711311984171082
For layer = (75, 75)test = 0.761190389071838
For layer = (100, 100)train = 0.7659407228719728
For layer = (100, 100)test = 0.7637516770337847
For layer = (125, 125)train = 0.7720662971445609
For layer = (125, 125)test = 0.7613123551652641
For layer = (150, 150)train = 0.7775413679545732
For layer = (150, 150)test = 0.7641175753140627
For layer = (175, 175)train = 0.7612516770792395
For layer = (175, 175)test = 0.7581412367361874
For layer = (200, 200)train = 0.7784087059046741
For layer = (200, 200)test = 0.7685083546773996
K20 range of features - 2 layer: 
For layer = (10, 10)train = 0.6927048747103227
For layer = (10, 10)test = 0.



For layer = (200, 200)train = 0.7523208066242936
For layer = (200, 200)test = 0.6850835467739969


# SVCs

In [None]:
from sklearn.svm import SVC, LinearSVC # remove l9inear

# Do gridsearch on parameters - both sets of features 
# Retrieve best parameters, refine parameters from selection
# Print most important parameter change variation - both sets of features 

def svc_gridsearch(X, x, Y, y, param_grid):
    """Grid-search an SVC and report train/test performance.

    Parameters
    ----------
    X, Y : training features and labels; the grid search is fitted on these
        using 10-fold stratified cross-validation.
    x, y : held-out features and labels, used only for reporting.
    param_grid : dict handed to GridSearchCV as ``param_grid``.

    Returns
    -------
    The ``best_params_`` dict of the fitted search.
    """
    k = StratifiedKFold(n_splits=10, shuffle=False)
    svc = SVC()
    svc_grid = GridSearchCV(estimator=svc, param_grid=param_grid, cv=k, n_jobs=-1,
                            verbose=51)
    # Fit on the TRAINING split. Fitting on (x, y) would tune and train the
    # model on the held-out data it is evaluated on below.
    svc_grid.fit(X, Y)
    prediction = svc_grid.predict(x)
    print(classification_report(y, prediction))
    print("Train set: ")
    print(svc_grid.best_estimator_.score(X, Y))
    print("Test set: ")
    print(svc_grid.best_estimator_.score(x, y))
    return svc_grid.best_params_

# Coarse grid: both kernels, with C and gamma as powers of ten from 0.001 to
# 1000.
kernel = ['linear', 'rbf']
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
gammas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'C': Cs, 'gamma': gammas, 'kernel': kernel}

# Full range of features initial test. The split variables are X_train,
# X_test, y_train and y_test, and the grid must be passed explicitly.
print("Full range of features:")
full_initial_svc = svc_gridsearch(X_train, X_test, y_train, y_test, param_grid)

# K20 range of features initial test
print("K20 range of features:")
k20_initial_svc = svc_gridsearch(X_train[colnames_selected], X_test[colnames_selected],
                                 y_train, y_test, param_grid)

def linearsvc_gridsearch(X, x, Y, y, param_grid):
    """Grid-search a LinearSVC and report train/test performance.

    Parameters
    ----------
    X, Y : training features and labels; the grid search is fitted on these
        using 10-fold stratified cross-validation.
    x, y : held-out features and labels, used only for reporting.
    param_grid : dict handed to GridSearchCV as ``param_grid``.

    Returns
    -------
    The ``best_params_`` dict of the fitted search.
    """
    k = StratifiedKFold(n_splits=10, shuffle=False)
    svc = LinearSVC()
    svc_grid = GridSearchCV(estimator=svc, param_grid=param_grid, cv=k, n_jobs=-1,
                            verbose=51)
    # Fit on the TRAINING split. Fitting on (x, y) would tune and train the
    # model on the held-out data it is evaluated on below.
    svc_grid.fit(X, Y)
    prediction = svc_grid.predict(x)
    print(classification_report(y, prediction))
    print("Train set: ")
    print(svc_grid.best_estimator_.score(X, Y))
    print("Test set: ")
    print(svc_grid.best_estimator_.score(x, y))
    return svc_grid.best_params_

# Refined linear search: every integer C from 30 to 124.
Cs = list(range(30, 125))
param_grid = {'C': Cs}
# Full range of features refined linear test
print("Full range of features:")
full_refined_svc = linearsvc_gridsearch(X_train, X_test, y_train, y_test, param_grid)
print(full_refined_svc)

# K20 range of features refined linear test
print("K20 range of features:")
k20_refined_svc = linearsvc_gridsearch(X_train[colnames_selected], X_test[colnames_selected], y_train, y_test, param_grid)
print(k20_refined_svc)

gammas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Train/test accuracy of an RBF SVC for each gamma - all features.
print("Analysing gamma change with the RBF kernel - full features")
for g in gammas:
    svc = SVC(kernel='rbf', gamma=g)
    svc.fit(X_train, y_train)
    print("When gamma = " + str(g) + "the train score = " + str(svc.score(X_train, y_train)))
    print("When gamma = " + str(g) + "the test score = " + str(svc.score(X_test, y_test)))

# Same sweep on the 20 selected features.
print("Analysing gamma change with the RBF kernel - K20 features")
for g in gammas:
    svc = SVC(kernel='rbf', gamma=g)
    svc.fit(X_train[colnames_selected], y_train)
    print("When gamma = " + str(g) + "the train score = " + str(svc.score(X_train[colnames_selected], y_train)))
    print("When gamma = " + str(g) + "the test score = " + str(svc.score(X_test[colnames_selected], y_test)))

# Stand-alone run at gamma=1000 on the K20 features. The reported gamma is the
# value actually given to the model, not whatever the loop variable above
# happened to hold last.
final_gamma = 1000
svc = SVC(kernel='rbf', gamma=final_gamma)
svc.fit(X_train[colnames_selected], y_train)
print("When gamma = " + str(final_gamma) + "the train score = " + str(svc.score(X_train[colnames_selected], y_train)))
print("When gamma = " + str(final_gamma) + "the test score = " + str(svc.score(X_test[colnames_selected], y_test)))
        
                                         
                                