In [1]:
import time

import matplotlib as plt  # NOTE(review): `plt` conventionally aliases matplotlib.pyplot - confirm intent
import numpy as np
import pandas as pd
from tqdm import tqdm
%matplotlib inline
plt.style.use('ggplot')

from sklearn.metrics import recall_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [2]:
# Load the pre-processed CSV, then separate the target column ('Won') from the feature columns.
df = pd.read_csv('dota2_cleaned.csv')
labels = df[['Won']]            # kept as a one-column DataFrame
df = df.drop(columns=['Won'])   # features only; rebinding instead of mutating in place

In [3]:
def print_metrics(labels, predictions, print_score=None):
    ''' Score a set of predictions against the true labels.

        Recall and accuracy are computed with scikit-learn's recall_score and
        accuracy_score, converted to percentages and rounded to two decimals.

        labels       -- the true target values
        predictions  -- the predicted target values, aligned with `labels`
        print_score  -- when truthy, both scores are also printed

        Returns the tuple (recall, accuracy), both expressed in percent.'''

    # Apply the same percentage/rounding treatment to both metrics.
    recall, acc = (
        round(metric(labels, predictions) * 100, 2)
        for metric in (recall_score, accuracy_score)
    )

    if print_score:
        print(f"Recall Score: {recall}")
        print(f"Accuracy Score: {acc}")

    return recall, acc

#   KNN Model:

In [4]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline KNN using scikit-learn's default neighbourhood size (k=5).
# The 70/30 split is seeded: without a fixed random_state every execution draws
# a different split, so the reported scores cannot be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    df, labels, test_size=0.3, random_state=42
)
knn = KNeighborsClassifier()
# ravel() flattens the one-column label frame into the 1-D array fit() expects
knn.fit(x_train, y_train.values.ravel())
test_predict = knn.predict(x_test)

recall, acc = print_metrics(y_test, test_predict, print_score=True)

Recall Score: 52.18
Accuracy Score: 51.79


## The initial KNN model performed only slightly better than random guessing
Accuracy ≈ 51–52% (51.79% in the output above; the exact figure depends on the train/test split)

In [None]:
# Re-run KNN for several neighbourhood sizes (k = 4, 7, 8, 9, 10) on one shared split.
def multiple_knn(df, labels, ks=(5,), test_size=0.3, random_state=None):
    ''' Fit one KNN classifier per value in `ks` and report the most accurate one.

        All classifiers are trained and scored on the same train/test split so
        that their scores are directly comparable.

        df            -- feature DataFrame
        labels        -- target values as a pandas object (its .values are flattened to 1-D)
        ks            -- iterable of n_neighbors values to try
        test_size     -- fraction of the rows held out for scoring
        random_state  -- seed for the split; None draws a different split on each call

        Returns (best_acc, best_k, scores), where `scores` holds one
        (recall, accuracy) tuple per k, in the order of `ks`. If `ks` is empty,
        or no classifier scores above 0, best_acc and best_k are both 0.'''
    x_train, x_test, y_train, y_test = train_test_split(
        df, labels, test_size=test_size, random_state=random_state
    )
    # Flatten once, outside the loop: the training labels are the same for every k.
    y_train_flat = y_train.values.ravel()

    best_acc = 0
    best_k = 0
    scores = []

    for k in tqdm(ks):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_train, y_train_flat)
        score = print_metrics(y_test, knn.predict(x_test))
        scores.append(score)

        # score is (recall, accuracy); the winner is chosen on accuracy only
        if best_acc < score[1]:
            best_acc, best_k = score[1], k

    return best_acc, best_k, scores


# Seeded so the comparison between the k values can be reproduced.
acc, k, scores = multiple_knn(df, labels, ks=[4, 7, 8, 9, 10], random_state=42)

In [None]:
# One (recall, accuracy) tuple per k, in the order the ks were passed to multiple_knn
print(f"scores: {scores}")

## Running the model again with K = 9

Recall = 53.25%

Accuracy = 52.73%

Still only slightly better than random guessing

#  Ensemble methods:

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [None]:
# Baseline: mean 3-fold cross-validated accuracy (in percent) of an untuned random forest.
# random_state pins the forest's internal sampling so the baseline is reproducible.
forest = RandomForestClassifier(random_state=42)
# The target is flattened to 1-D, as in the KNN cells; passing the one-column
# `labels` frame directly makes scikit-learn warn about a column-vector y.
forest_cv_scores = cross_val_score(forest, df, labels.values.ravel(), cv=3)
mean_forest_score = forest_cv_scores.mean() * 100

print(f"{mean_forest_score}%")

In [None]:
# Search space explored by the random forest grid search.
forest_param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=list(range(2, 7)),            # 2 through 6
    min_samples_split=[2, 5, 7, 10],
    min_samples_leaf=list(range(1, 7)),     # 1 through 6
)

In [None]:
# Use a 3-fold grid search over forest_param_grid to find the best parameters
# for the random forest. The search is timed so readers can see what a full
# re-run of this cell costs.
start = time.time()
forest_search = GridSearchCV(forest, forest_param_grid, cv=3)
# The target is flattened to 1-D, as in the KNN cells, instead of passing a column vector.
forest_search.fit(df, labels.values.ravel())
elapsed = time.time() - start

print(f"Grid search finished in {elapsed:.1f}s")

In [None]:
# Best mean cross-validated accuracy found by the grid search (as a percentage)
# and the parameter combination that achieved it.
forest_accuracy = forest_search.best_score_ * 100
best_forest_params = forest_search.best_params_

# Show the results so the figures quoted in the notebook text can be traced to a run.
print(f"{forest_accuracy}% accuracy")
print(f"Optimal parameters: {best_forest_params}")

In [None]:
# Baseline: mean 3-fold cross-validated accuracy (in percent) of an untuned AdaBoost model.
# random_state makes the baseline reproducible.
adaboost = AdaBoostClassifier(random_state=42)
# The target is flattened to 1-D, as in the KNN cells, instead of passing a column vector.
adaboost_cv_scores = cross_val_score(adaboost, df, labels.values.ravel(), cv=3)
# Scaled to a percentage: the value is printed with a '%' suffix, and the random
# forest baseline (mean_forest_score) is reported on the same 0-100 scale.
adaboost_mean_score = adaboost_cv_scores.mean() * 100

print(f"{adaboost_mean_score}%")

## Running the random forest model using GridSearchCV

Slightly better than initial random forest: Accuracy = 55.81%

#### Optimal Parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 6, 'min_samples_split': 7}


In [None]:
# Search space explored by the AdaBoost grid search.
adaboost_param_grid = dict(
    n_estimators=[50, 70, 100, 175, 250],
    learning_rate=[0.7, 0.6, 0.5, 0.2, 0.1],
)

In [None]:
# 3-fold grid search over adaboost_param_grid to tune AdaBoost.
ada_grid_search = GridSearchCV(adaboost, adaboost_param_grid, cv=3)
# fit() is called directly. Wrapping its return value in a progress bar cannot
# report progress, because the search has already finished by the time the
# wrapper receives it (GridSearchCV's own `verbose` argument can log progress).
# The target is flattened to 1-D, as in the KNN cells.
ada_grid_search.fit(df, labels.values.ravel())

In [None]:
# Best mean cross-validated accuracy found by the AdaBoost grid search, scaled
# to a percentage so it matches the '%' suffix printed below and the scale used
# for forest_accuracy.
# NOTE(review): the variable name is misspelled ("acccuracy"); it is kept in
# case later cells refer to it.
ada_acccuracy = ada_grid_search.best_score_ * 100
ada_best_params = ada_grid_search.best_params_

print(f"{ada_acccuracy}% accuracy")

In [None]:
import xgboost as xgb

# 70/30 hold-out split. It is seeded so that the accuracies printed below can
# be reproduced; an unseeded split changes on every execution.
x_train, x_test, y_train, y_test = train_test_split(
    df, labels, test_size=0.3, random_state=42
)

# check baseline accuracy for xgboost (default hyper-parameters)
xgbooster = xgb.XGBClassifier()
xgbooster.fit(x_train, y_train)

train_pred = xgbooster.predict(x_train)
test_pred = xgbooster.predict(x_test)

# Comparing training and validation accuracy shows how much the model overfits.
training_accuracy = accuracy_score(y_train, train_pred)
val_accuracy = accuracy_score(y_test, test_pred)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

In [None]:
# Search space explored by the XGBoost grid search.
xg_param_grid = dict(
    learning_rate=[0.4, 0.7],
    max_depth=[5],
    min_child_weight=[7],
    subsample=[0.7],
    n_estimators=[100, 200],
)

In [None]:
# Tune XGBoost with a 3-fold grid search over xg_param_grid.
# The search must actually run in this cell: everything below reads `xg_grid`,
# which is otherwise undefined on a fresh kernel.
# It is fitted on the training split only. Fitting on the full data set would
# let the tuned model see the x_test rows and inflate the validation accuracy.
xg_grid = GridSearchCV(xgbooster, xg_param_grid, scoring='accuracy', cv=3, n_jobs=1)
xg_grid.fit(x_train, y_train.values.ravel())

best_parameters = xg_grid.best_params_

print("Grid Search found the following optimal parameters: ")
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

# predict() on the fitted search delegates to the estimator refitted with the
# best parameters (GridSearchCV's default refit behaviour).
training_preds = xg_grid.predict(x_train)
val_preds = xg_grid.predict(x_test)
training_accuracy = accuracy_score(y_train, training_preds)
val_accuracy = accuracy_score(y_test, val_preds)

In [None]:
# Display the hold-out accuracy of the tuned XGBoost model, exactly as returned by accuracy_score
val_accuracy