In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

from sklearn.metrics import recall_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [2]:
# Load the pre-processed CSV, then separate the target column ('Won') from the
# remaining feature columns.
df = pd.read_csv('dota2_cleaned.csv')
labels = df.loc[:, ['Won']]
df = df.drop(columns=['Won'])

In [3]:
def print_metrics(labels, predictions, print_score=None):
    '''Compute recall and accuracy, as percentages, for a set of predictions.

    Parameters
    ----------
    labels : array-like
        Ground-truth target values.
    predictions : array-like
        Predicted target values, aligned with `labels`.
    print_score : bool, optional
        When truthy, both scores are also printed. Defaults to None (silent).

    Returns
    -------
    tuple
        `(recall, accuracy)`; each score is multiplied by 100 and rounded to
        two decimal places. Precision and F1 are not computed.
    '''
    recall = round(recall_score(labels, predictions) * 100, 2)
    acc = round(accuracy_score(labels, predictions) * 100, 2)

    if print_score:
        print(f"Recall Score: {recall}")
        print(f"Accuracy Score: {acc}")

    return recall, acc

In [5]:
# Create separate training and testing sets (75% / 25%).
# A fixed random_state keeps the split, and every score derived from it,
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(df, labels, test_size=0.25, random_state=42)

#   KNN Model:

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: KNN with its default neighbour count (k=5).
# fit() returns the estimator itself, so construction and fitting can be chained.
knn = KNeighborsClassifier().fit(x_train, y_train.to_numpy().ravel())
test_predict = knn.predict(x_test)

# Score the held-out predictions and print recall / accuracy.
recall, acc = print_metrics(y_test, test_predict, print_score=True)

## Initial KNN model performed slightly better than random guess 
Recall Score: 51.76

Accuracy Score: 51.47

In [None]:
# Re-fit KNN for several neighbour counts and keep the most accurate one.
def multiple_knn(df, labels, ks=(5,), random_state=None):
    '''Fit one KNN classifier per value of k and report the most accurate.

    The data is split once (70% train / 30% test) and every k is evaluated
    against that same split.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns.
    labels : pandas.DataFrame
        Target column; it is flattened to 1-D before fitting.
    ks : iterable of int, optional
        Neighbour counts to try. Defaults to `(5,)`.
    random_state : int or None, optional
        Seed forwarded to `train_test_split`; pass an int for a reproducible
        split. Defaults to None (a different split on every call).

    Returns
    -------
    tuple
        `(best_acc, best_k, scores)`, where `scores` holds one
        `(recall, accuracy)` pair per k, in the order the ks were tried.
        If `ks` is empty the result is `(0, 0, [])`.
    '''
    x_train, x_test, y_train, y_test = train_test_split(
        df, labels, test_size=0.3, random_state=random_state
    )
    # The flattened target does not depend on k, so compute it once.
    y_train_flat = y_train.values.ravel()

    best_acc = 0
    best_k = 0
    scores = []

    for k in tqdm(ks):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_train, y_train_flat)
        score = print_metrics(y_test, knn.predict(x_test))
        scores.append(score)

        # score is (recall, accuracy); the best k is chosen on accuracy only.
        if best_acc < score[1]:
            best_acc, best_k = score[1], k

    return best_acc, best_k, scores


# Compare a handful of neighbour counts around the default k=5.
# Note: this rebinds `acc`, which the baseline KNN cell had set to the k=5 accuracy.
acc, k, scores = multiple_knn(df, labels, ks=[4, 6, 7, 8, 9])

In [None]:
# Each entry of `scores` is a (recall, accuracy) pair, in the order the ks were tried;
# `k` is the neighbour count with the highest accuracy.
print(f"scores: {scores}, k: {k}")

## KNN with k = 7


Recall = 53.93%

Accuracy = 52.77%

Still only slightly better than both a random guess and the k = 5 baseline.

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [None]:
# Baseline: 3-fold cross-validated score of a random forest with default settings.
forest = RandomForestClassifier()
# `labels` is a single-column DataFrame; flatten it to the 1-D target that
# scikit-learn expects (avoids the column-vector conversion warning and matches
# how the other cells pass the target).
forest_scores = cross_val_score(forest, df, labels.values.ravel(), cv=3)
means = np.round(forest_scores.mean()*100, 2)

print(f"{means}%")

### Random Forest Using Gridsearch

In [None]:
# Hyper-parameter values the random forest grid search will try.
forest_param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 4, 6, 7],
    min_samples_split=[2, 3, 5, 6, 7],
    min_samples_leaf=[2, 3, 4, 5, 6],
)

In [None]:
# Search the grid with 3-fold cross-validation to find the best parameters
# for the random forest.
forest_search = GridSearchCV(estimator=forest, param_grid=forest_param_grid, cv=3)
forest_search.fit(df, labels.to_numpy().ravel())

In [None]:
# Keep the winning parameter combination and its score (scaled to a percentage).
best_forest_params = forest_search.best_params_
forest_accuracy = 100 * forest_search.best_score_

In [None]:
# Best grid-search score for the random forest, already scaled by 100.
forest_accuracy

In [None]:
# Parameter combination reported as best by the random forest grid search.
best_forest_params

#### Optimal Parameters: {'criterion': 'gini','max_depth': 6,'min_samples_leaf': 5,'min_samples_split': 5}

Accuracy = 55.76%

# AdaBoost

In [None]:
# Baseline: mean 3-fold cross-validated score of AdaBoost with default settings.
adaboost = AdaBoostClassifier()
# Flatten the single-column `labels` DataFrame to the 1-D target scikit-learn expects.
adaboost_mean_score = np.mean(cross_val_score(adaboost, df, labels.values.ravel(), cv=3))

# The score is a fraction in [0, 1]; scale it by 100 so the '%' suffix is truthful.
print(f"{round(adaboost_mean_score * 100, 2)}%")

### AdaBoost Using Gridsearch

In [None]:
# Hyper-parameter values the AdaBoost grid search will try.
adaboost_param_grid = dict(
    n_estimators=[150, 200, 250, 270, 300],
    learning_rate=[0.6, 0.3, 0.2, 0.1],
)

In [None]:
# Search the AdaBoost grid with 3-fold cross-validation.
ada_grid_search = GridSearchCV(adaboost, adaboost_param_grid, cv=3)
# Pass the target as a flat 1-D array, as the random forest and XGBoost
# searches do, rather than as a single-column DataFrame.
ada_grid_search.fit(df, labels.values.ravel())

In [None]:
# Variable name (with its original spelling) is kept so any later cell that
# reads it keeps working; it holds the score as a fraction in [0, 1].
ada_acccuracy = ada_grid_search.best_score_
ada_best_params = ada_grid_search.best_params_

# Scale the fraction to a percentage before appending '%'.
print(f"Accuracy: {round(ada_acccuracy * 100, 2)}%")
print(f"{ada_best_params}")

#### Optimal Parameters: {'learning_rate': 0.6, 'n_estimators': 270}
Score = 57%

# XGBoost

In [None]:
import xgboost as xgb

# New 80/20 split for the XGBoost experiments. This rebinds x_train / x_test /
# y_train / y_test from the earlier split; the fixed random_state keeps the
# split, and the accuracies below, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(df, labels, test_size=0.2, random_state=42)

# check baseline accuracy for xgboost with its default hyper-parameters
xgbooster = xgb.XGBClassifier()

xgbooster.fit(x_train, y_train.values.ravel())

train_pred = xgbooster.predict(x_train)
test_pred = xgbooster.predict(x_test)

# Comparing train and held-out accuracy shows how much the model overfits.
training_accuracy = accuracy_score(y_train, train_pred)
val_accuracy = accuracy_score(y_test, test_pred)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

In [None]:
# Horizontal bar chart of the fitted XGBoost model's per-feature importances.
fig, ax = plt.subplots(figsize=(20, 20))
positions = np.arange(x_train.shape[1])  # one bar per feature column

ax.barh(positions, xgbooster.feature_importances_, align='center')
ax.set_yticks(positions)
ax.set_yticklabels(x_train.columns.values)
ax.set_xlabel("Feature importance")
ax.set_ylabel("Feature")
ax.set_title("XGBoost feature importances")

# Render explicitly so the cell does not echo the repr of the last call.
plt.show()

### XGBoost Using Gridsearch

In [None]:
# Hyper-parameter values the XGBoost grid search will try.
xg_param_grid = dict(
    learning_rate=[0.1, 0.3],
    max_depth=[6],
    min_child_weight=[5, 6],
    subsample=[0.7],
    n_estimators=[100, 120, 150],
)

In [None]:
# Grid-search the XGBoost hyper-parameters on the training split only,
# scoring each candidate by 3-fold cross-validated accuracy.
xg_grid = GridSearchCV(
    estimator=xgbooster,
    param_grid=xg_param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=1,
)
xg_grid.fit(x_train, y_train.to_numpy().ravel())

best_parameters = xg_grid.best_params_

# Accuracy of the fitted search on the data it was tuned on ...
training_preds = xg_grid.predict(x_train)
training_accuracy = accuracy_score(y_train, training_preds)

# ... and on the held-out split.
test_preds = xg_grid.predict(x_test)
test_accuracy = accuracy_score(y_test, test_preds)

# One value per line: best parameters, training accuracy, test accuracy.
print(best_parameters, training_accuracy, test_accuracy, sep="\n")

In [None]:
# Show the best score found by the XGBoost grid search
# (a bare `xg_grid.` is a SyntaxError and stops Restart & Run All).
xg_grid.best_score_

In [None]:
# Results recorded from an earlier grid-search run, kept as comments so this cell
# is valid Python. The learning_rate and max_depth values below are not in the
# current xg_param_grid, so that run used a different grid.
# Labels follow the print order of the grid-search cell — TODO confirm.
#
# best parameters:   {'learning_rate': 0.2, 'max_depth': 8, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.7}
# training accuracy: 0.708952380952381
# test accuracy:     0.7056666666666667