# Data pre-processing/data clean-up

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import chi2
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

df = pd.read_csv("bouts_out_new.csv")

# INITIAL INSPECTION

# This is the shape of the data
print(df.shape)
print(df['result'].value_counts())

# Remove draws
df = df[df.result != 'draw']
print("drawless shape")
print(df.shape)

# Random under sample
winAcount, winBcount = df.result.value_counts()
df_winA = df[df['result'] == "win_A"]
df_winB = df[df['result'] == "win_B"]
df_winA_reduced = df_winA.sample(winBcount)

df_winB_reduced = df_winB
df_winA_reduced = df_winA.sample(2500)
df_winB_reduced = df_winB.sample(2500)

df = pd.concat([df_winA_reduced, df_winB_reduced], axis=0)
print(df.result.value_counts())
print(df.shape)


# These are the features I am working with
#print(df.columns)

# Shows that reach_b is missing 349K times, a lot of the score cards are 
# missing aswell as the physical features 
#print(df.isnull().sum().sort_values(ascending=False))

# The data shows that fighter A is recorded as winning the most 


# If I create a new dataframe with purely complete records I only get 2800 records
# removed_df = df.dropna(how='any')
# print("Inconsistent records removed shape")
# print(removed_df.shape)

# There's 3 unique values for a result meaning it is multiclass classification
#print(df['result'].value_counts()) 

# PRE-PROCESSING AND CLEAN-UP

# Encode the label 
le = preprocessing.LabelEncoder().fit(df['result'])
encoded = le.transform(df['result'])
df['result'] = encoded
target = df['result']
clean_df = df.drop(['result'], axis=1)


#print(clean_df.head)

# Models can only handle numeric features so I convert the non-numeric features
# into numeric using dummy features
clean_df = pd.get_dummies(clean_df)

# This results in more features 
#print("Clean dataframe columns")
#print(clean_df.columns)
#print(clean_df.shape)

# Convert result to numeric data 
#result_conversion = {'win_A': 0, 'win_B': 1, 'draw': 2}
#target = target.replace({'result': result_conversion}).infer_objects()
#print(type(target['result']))

#target = clean_df[['result_win_A', 'result_win_B', 'result_draw']]
#print("Target dummied")
#print(target.isnull().sum().sort_values(ascending=False))

# Imputes the mean for missing values - link to paper
the_imputer = Imputer(missing_values= 'NaN', strategy='mean', axis=0)
the_imputer.fit(clean_df)
clean_df = pd.DataFrame(data=the_imputer.transform(clean_df), columns=clean_df.columns)


#clean_df = clean_df[clean_df.result != ]

# All records now have no missing features 
#print(clean_df.isnull().sum().sort_values(ascending=False))

# Test both imputed values aswell as a completely clean dataset

# SCALING 
# Use MinMaxScaler to scale all values
# USe for KNN algorithm 
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
scaled_df = scaler.fit_transform(clean_df)
clean_df = pd.DataFrame(scaled_df, columns=clean_df.columns)
#print("Type of scaled df " + str(type(clean_df)))
#print("Shape of scaled df " + str(clean_df.shape))


# Split the dataset, splits the dataset 75%/25%, shuffles the dataset (see the book)
X_train, X_test, y_train, y_test = train_test_split(
     clean_df, 
     target, random_state=0)



#print(y_test.shape)
#print(X_train.head)

# Split the reduced 2800 dataset, splits the dataset 75%/25%, shuffles the dataset
X_train, X_test, y_train, y_test = train_test_split(
    clean_df, 
    target, random_state=0)


# Select the 20 best features to reduce dimensionality 
import sklearn.feature_selection

selection = sklearn.feature_selection.SelectKBest(chi2, k=20)
selected_features = selection.fit(clean_df, target) # on x_train and y_train but save cleandf_2 as usual 
indices_selected = selected_features.get_support(indices=True)
colnames_selected = [clean_df.columns[i] for i in indices_selected]

clean_df2 = clean_df[colnames_selected]

selection = sklearn.feature_selection.SelectKBest(chi2, k=20)
selected_features = selection.fit(X_train, y_train)
indices_selected = selected_features.get_support(indices=True)
colnames_selected = [clean_df.columns[i] for i in indices_selected]

X = clean_df[colnames_selected]

#print(clean_df2.shape)

# print(X_train_selected.columns)
# print(X_train_selected.head())






(387427, 26)
win_A    321661
win_B     40994
draw      24772
Name: result, dtype: int64
drawless shape
(362655, 26)
win_A    2500
win_B    2500
Name: result, dtype: int64
(5000, 26)


# KNN

In [49]:
from sklearn.neighbors import KNeighborsClassifier

# for i in range(1, 11):
knn = knn = KNeighborsClassifier(n_neighbors=5)
#     knn.fit(X_train_selected, y_train)
#     print("Normal:" + str(knn.score(X_test_selected, y_test)))

scores = cross_val_score(knn, X_test_selected, y_test, cv=10)
print(scores.mean())
    
# knn = knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(X_train_selected, y_train)
# print("Actual reduced:" + str(knn.score(X_test_selected, y_test)))

#scores = cross_val_score(knn, X_test_selected, y_test, cv=10)
#print(scores.mean())

# knn = knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(clean_df2, target)
# print("Assumed reduced:" + str(knn.score(clean_df2, target)))


# for i in range(1, 11):
#     knn = KNeighborsClassifier(n_neighbors=i)
#     scores = cross_val_score(knn, clean_df, target, cv=10)
#     print("Number of neighbors: " + str(i) + "\nDataset with 36 features scores: {}".format(scores))
#     print("Mean of the scores: {:.2f}".format(scores.mean()))
#     # SelectKBest Results applied
#     scores = cross_val_score(knn, clean_df2, target, cv=10)
#     print("Number of neighbors: " + str(i) + "\nDataset with 20 features scores: {}".format(scores))
#     print("Mean of the scores: {:.2f}".format(scores.mean()))


# Benefit of using cross-validation:
# -	Train test split performs a random split, we could get lucky with the data split. 
# -	With cross validation each example will be in the training set exactly once. 
# -	We get a best case and a worst case scenario with the multiple folds as opposed to the one accuracy. 
# Another benefit of cross-validation as compared to using a single split of the data is
# that we use our data more effectively. When using train_test_split, we usually use
# 75% of the data for training and 25% of the data for evaluation. When using five-fold
# cross-validation, in each iteration we can use four-fifths of the data (80%) to fit the
# model. When using 10-fold cross-validation, we can use nine-tenths of the data
# (90%) to fit the model. More data will usually result in more accurate models.

# As the simple k-fold strategy fails here, scikit-learn does not use it for classification,
# but rather uses stratified k-fold cross-validation. In stratified cross-validation, we
# split the data such that the proportions between classes are the same in each fold as
# they are in the whole dataset, as illustrated in Figure 5-2:
# For example, if 90% of your samples belong to class A and 10% of your samples
# belong to class B, then stratified cross-validation ensures that in each fold, 90% of
# samples belong to class A and 10% of samples belong to class B.

# Talk about benefits of cross validation etc 


0.6497034306195596


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression


# X_train, X_test, y_train, y_test = train_test_split(
#       clean_df, 
#       target, random_state=0)

# logreg = LogisticRegression().fit(X_train, y_train)
# print(logreg.score(X_train, y_train))
# print(logreg.score(X_test, y_test))


# High training set accuracy/low test set accuracy means overfitting 
# When both train/test is similar, means underfitting 

# C = [0.01, 0.1, 1, 10, 100]

# for i in C:
#     logreg = LogisticRegression(C=i)
#     scores = cross_val_score(logreg, clean_df, target, cv=10)
#     print("Full features: When C = " + str(i) + ". Mean of the scores: {:.2f}".format(scores.mean()))
#     scores = cross_val_score(logreg, clean_df2, target, cv=10)
#     print("Reduced features: When C = " + str(i) + ". Mean of the scores: {:.2f}".format(scores.mean()))

# for i in range(1, 100):
#     logreg = LogisticRegression(C=i)
#     scores = cross_val_score(logreg, clean_df, target, cv=10)
#     print("Full features: When C = " + str(i) + ". Mean of the scores: {:.2f}".format(scores.mean()))
#     scores = cross_val_score(logreg, clean_df2, target, cv=10)
#     print("Reduced features: When C = " + str(i) + ". Mean of the scores: {:.2f}".format(scores.mean()))
    
# for i in range(1, 101):
#     logreg = LogisticRegression(C=i).fit(X_train, y_train)
#     print("When C is equal to " + str(i) + " training set result : " + str(logreg.score(X_train, y_train)))
#     print("When C is equal to " + str(i) + " test set result : " + str(logreg.score(X_test, y_test)))






# logreg = LogisticRegression(C=100)#.fit(X_train, y_train)

# scores = cross_val_score(logreg, clean_df, target, cv=10)
# print("Dataset with 36 features scores: {}".format(scores))
# print("Mean of the scores: {:.2f}".format(scores.mean()))

# scores = cross_val_score(logreg, clean_df2, target, cv=10)
# print("Dataset with 20 features scores: {}".format(scores))
# print("Mean of the scores: {:.2f}".format(scores.mean()))

# If we're overfitting 
# When the test and training set score are close it means I am likely underfitting
# print("Training set score " + str(logreg.score(X_train, y_train)))
#print("Test set score " + str(logreg.score(X_test, y_test)))
# You will need to explain alpha and regularization in this section, not the lit review

# C is changed, this relates to regularization I think, talk about this, this means
# less regularization
# logreg100 = LogisticRegression(C=100).fit(X_train, y_train)
# print("Training set score: {:.3f}".format(logreg100.score(X_train, y_train)))
# print("Test set score: {:.3f}".format(logreg100.score(X_test, y_test)))

# C is set to 0.01, this means even more regularization
# logreg001 = LogisticRegression(C=0.01).fit(X_train, y_train)
# print("Training set score: {:.3f}".format(logreg001.score(X_train, y_train)))
# print("Test set score: {:.3f}".format(logreg001.score(X_test, y_test)))



# Random Forest classifier

In [3]:
# # As mentioned in the literature review, a Random Forest comprises of decision
# trees. When a decision tree is built that continues until all leaves are pure leads to models
# that are very complex and highly overfit to the training data. The presence of pure
# leaves means that a tree is 100% accurate on the training set. 
# To stop the overfitting of trees we can pre-prune the tree or post-prune the tree
# To pre-prune we can limit the maximum depth of the tree 
# The deeper a tree becomes the more complex it becomes. Limiting the depth
# prevents overfitting
# try a standard decision tree aswell as this one 

from sklearn.ensemble import RandomForestClassifier


# n estimators equal the number of trees
# Iterate thropugh a number of numbers of trees to gauge the best
# A heavy tuning of parameters is not really needed
# max depth is set to default

    

# forest = RandomForestClassifier(n_estimators=460, n_jobs=-1, max_features = 'sqrt', max_depth= 28)

# scores = cross_val_score(forest, clean_df, target, cv=10)
# print("Full features: mean of the scores: {:.2f}".format(scores.mean()))

# scores = cross_val_score(forest, clean_df2, target, cv=10)
# print("Reduced features: Mean of the scores: {:.2f}".format(scores.mean()))

# omit samples/leafs and just reflect on it

# forest = RandomForestClassifier()

# scores = cross_val_score(forest, clean_df, target, cv=10)
# print("Full features: mean of the scores: {:.2f}".format(scores.mean()))

# scores = cross_val_score(forest, clean_df2, target, cv=10)
# print("Reduced features: Mean of the scores: {:.2f}".format(scores.mean()))

# forest.fit(X_train, y_train)
# print(forest.score(X_test, y_test))

#Random Grid Search
# n_estimators = [int(x) for x in np.linspace(start=200, stop=500, num=10)]
# max_features = ['auto', 'sqrt']
# max_depth = [int(x) for x in np.linspace(5, 100, num = 5)]
# max_depth.append(None)

# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth
#               }

# forest = RandomForestClassifier()

# rf_random = RandomizedSearchCV(estimator = forest, param_distributions = random_grid, n_iter = 100, cv = 3, n_jobs=-1, verbose=3, random_state=42)
# rf_random.fit(X_train, y_train)
# print(rf_random.best_params_)

# Grid search CV 

# param_grid = {
#     'max_depth' : [20, 25, 28, 30, 35],
#     'max_features': ['sqrt'],
#     'n_estimators' : [450, 460, 466, 470, 475]
# }

# forest = RandomForestClassifier()

# grid_search = GridSearchCV(estimator = forest, param_grid = param_grid, cv = 3, n_jobs=-1, verbose=3)
# grid_search.fit(X_train, y_train)

# print(grid_search.best_params_)



#grid_search.fit(X_train, y_train)
#print(grid_search.best_params_)

#print(str(forest.score(X_test, y_test)))

# Also use AUC from April chen's video
# Assess with cross validation
# Test on both the 20 features aswell as all features

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

# Need to debug this, it returns 100% accuracy??? - fixed 
gnb_clf = MultinomialNB()

scores = cross_val_score(gnb_clf, clean_df, target, cv=10)
print("Full features: mean of the scores: {:.2f}".format(scores.mean()))

scores = cross_val_score(gnb_clf, clean_df2, target, cv=10)
print("Reduced features: Mean of the scores: {:.2f}".format(scores.mean()))

# Also use AUC from April chen's video
# Assess with cross validation
# Test on both the 20 features aswell as all features

# Neural Networks

In [15]:
#https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw

from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(random_state=42)
mlp.fit(X_train, y_train)

print("Training Accuracy " + str(mlp.score(X_train, y_train)))
print("Testing Accuracy " + str(mlp.score(X_test, y_test)))

# A common way to adjust parameters in a neural network is to first create a network
# that is large enough to overfit, making sure that the task can actually be learned by
# the network. Then, once you know the training data can be learned, either shrink the
# network or increase alpha to add regularization, which will improve generalization
# performance.

# Algorithms part - http://scikit-learn.org/stable/modules/neural_networks_supervised.html

# # Poor accuracy could be down to poor scaling, scale with minmax scaler and see
# # if there';s an improvement in accuracy 
# # Either use minmax scaler or scale from cristi vlad video, standardscaler
# # neural networks 3 
# # Decent accuracy 
# # By default the MLP uses 100 hidden nodes
#print("Accuracy " + str(mlp.score(X_test, y_test)))

# Reduced the number of hidden nodes - 10 hidden units
# mlp = MLPClassifier(hidden_layer_sizes=[10], random_state=42)
# mlp.fit(X_train, y_train)
# print("Accuracy " + str(mlp.score(X_test, y_test)))

# Two hidden layers now with 10 nodes each
# mlp = MLPClassifier(hidden_layer_sizes=[10, 10], random_state=42)
# mlp.fit(X_train, y_train)
# print("Accuracy " + str(mlp.score(X_test, y_test)))

# Experiment with the alpha some more 
# mlp = MLPClassifier(hidden_layer_sizes=[10, 10], alpha=1, random_state=42)
# mlp.fit(X_train, y_train)
# print("Accuracy " + str(mlp.score(X_test, y_test)))

# Also use AUC from April chen's video
# Assess with cross validation
# Test on both the 20 features aswell as all features



Training Accuracy 0.7048348538810557
Testing Accuracy 0.7093721032346197


# Support Vector Machine

In [8]:
from sklearn.svm import SVC

svc = SVC(kernel='linear', gamma=33, C=100)

scores = cross_val_score(svc, clean_df, target, cv=10, n_jobs=-1)
print("Full features: mean of the scores: {:.2f}".format(scores.mean()))

scores = cross_val_score(svc, clean_df2, target, cv=10)
print("Reduced features: Mean of the scores: {:.2f}".format(scores.mean()))


#Random Grid Search
# kernel = ['rbf', 'sigmoid', 'linear']
# Cs = [x for x in np.linspace(start=0.001, stop=100, num=10)]
# gammas = [x for x in np.linspace(start=0.001, stop=100, num=10)]
# random_grid = {'kernel' : kernel, 'C' : Cs, 'gamma' : gammas}
# svc = SVC()
# svc_random = RandomizedSearchCV(estimator = svc, param_distributions = random_grid, n_iter = 100, cv = 3, n_jobs=-1, verbose=5, random_state=42)
# svc_random.fit(X_train, y_train)
# print(svc_random.best_params_)

# test each kernel, take best, then perform search 



# Grid search CV 

# param_grid = {
#     'max_depth' : [20, 25, 28, 30, 35],
#     'max_features': ['sqrt'],
#     'n_estimators' : [450, 460, 466, 470, 475]
# }

# forest = RandomForestClassifier()

# grid_search = GridSearchCV(estimator = forest, param_grid = param_grid, cv = 3, n_jobs=-1, verbose=3)
# grid_search.fit(X_train, y_train)

# print(grid_search.best_params_)


# grid_search = GridSearchCV(SVC(), param_grid, cv=3, n_jobs= -1, verbose=2)
# grid_search.fit(X_train, y_train)

# print(grid_search.best_params_)

# svc = SVC()
# clf = 
# svc.fit(X_train, y_train)
# print("Score " + str(svc.score(X_test, y_test)))
# Assess with cross validation
# Test on both the 20 features aswell as all features

Full features: mean of the scores: 0.75
Reduced features: Mean of the scores: 0.67


In [4]:
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import cross_val_score

    # Import dataset, make seperate dataset for the target
    df = pd.read_csv("bouts_out_new.csv")
    target = df['result']
    clean_df = df.drop(['result'], axis=1)

    forest = RandomForestClassifier(n_estimators=460, n_jobs=-1, max_features = 'sqrt', max_depth= 28)

    selection = sklearn.feature_selection.SelectKBest(chi2, k=20)
    selected_features = selection.fit(X_train, y_train)
    indices_selected = selected_features.get_support(indices=True)
    colnames_selected = [clean_df.columns[i] for i in indices_selected]

    X_train_selected = X_train[colnames_selected]
    X_test_selected = X_test[colnames_selected]

    scores = cross_val_score(forest, X_train_selected, X_test_selected, cv=10)
    print("Reduced features: mean of the scores: {:.2f}".format(scores.mean()))

IndexError: index 25 is out of bounds for axis 0 with size 25