In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

dir_path = os.path.join(os.getcwd(), "data")

data_5 = pd.read_csv(os.path.join(dir_path, "match_summary_5min.csv"), dtype=int).drop("Unnamed: 0", axis=1)
data_10 = pd.read_csv(os.path.join(dir_path, "match_summary_10min.csv"), dtype=int).drop("Unnamed: 0", axis=1)
data_15 = pd.read_csv(os.path.join(dir_path, "match_summary_15min.csv"), dtype=int).drop("Unnamed: 0", axis=1)
data_20 = pd.read_csv(os.path.join(dir_path, "match_summary_20min.csv"), dtype=int).drop("Unnamed: 0", axis=1)
data_25 = pd.read_csv(os.path.join(dir_path, "match_summary_25min.csv"), dtype=int).drop("Unnamed: 0", axis=1)
data_30 = pd.read_csv(os.path.join(dir_path, "match_summary_30min.csv"), dtype=int).drop("Unnamed: 0", axis=1)

We start by only fitting the data at 15 minutes to determine which is the best classifier for our specific use. Only then we will fit a classifier for each time stamp.

In [10]:
attribs = list(data_15.keys())[1:]

y = np.array(data_15["blueWin"])
X = data_15.drop("blueWin", axis=1).values

def train_val_test_split_random(X, y, test_ratio=0.1, val_ratio=0.1, random_seed=42):
    np.random.seed(seed=random_seed)
    shuffled_indices = np.random.permutation(len(X))
    test_num = int(np.round(test_ratio*len(X)))
    val_num = int(np.round(val_ratio*len(X)))
    X_test = X[shuffled_indices[:test_num]]
    y_test = y[shuffled_indices[:test_num]]
    X_val = X[shuffled_indices[test_num:test_num+val_num]]
    y_val = y[shuffled_indices[test_num:test_num+val_num]]
    X_train = X[shuffled_indices[test_num+val_num:]]
    y_train = y[shuffled_indices[test_num+val_num:]]
    
    return X_test, y_test, X_val, y_val, X_train, y_train

X_test, y_test, X_val, y_val, X_train, y_train = train_val_test_split_random(X, y)

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

clf_list = [
    KNeighborsClassifier(),
    SVC(gamma="scale"),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=100),
    SGDClassifier()
]

for clf in clf_list:
    clf.fit(X_train, y_train)
    print(clf.__class__.__name__, np.round(np.mean(clf.predict(X_val)==y_val), 4)*100, "%")

KNeighborsClassifier 70.41 %
SVC 68.37 %
DecisionTreeClassifier 57.14 %
RandomForestClassifier 70.41 %
SGDClassifier 71.43 %


In [19]:
from sklearn.model_selection import GridSearchCV

knc=KNeighborsClassifier()
rfc=RandomForestClassifier()
sgdc=SGDClassifier()

param_grid_knc = [
    {'n_neighbors': [1, 5, 10, 20, 40, 80], 'weights': ["uniform", "distance"]}
]

param_grid_rfc = [
    {'n_estimators': [10, 50, 100, 150], 'max_depth': [10, 30, 50], "max_leaf_nodes": [10, 20, 30]}
]

param_grid_sgdc = [
    {'penalty': ["l1", "l2", "elasticnet"]}
]

grid_search_sgdc = GridSearchCV(sgdc, param_grid_sgdc, cv=5, n_jobs=-1, scoring="accuracy")
grid_search_sgdc.fit(X_train, y_train)

grid_search_knc = GridSearchCV(knc, param_grid_knc, cv=5, n_jobs=-1, scoring="accuracy")
grid_search_knc.fit(X_train, y_train)

grid_search_rfc = GridSearchCV(rfc, param_grid_rfc, cv=5,n_jobs=-1, scoring="accuracy")
grid_search_rfc.fit(X_train, y_train)

cvres_knc = grid_search_knc.cv_results_
cvres_rfc = grid_search_rfc.cv_results_
cvres_sgdc = grid_search_sgdc.cv_results_

print(grid_search_knc.best_estimator_.__class__.__name__, np.mean(grid_search_knc.best_estimator_.predict(X_val)==y_val))
print(grid_search_rfc.best_estimator_.__class__.__name__, np.mean(grid_search_rfc.best_estimator_.predict(X_val)==y_val))
print(grid_search_sgdc.best_estimator_.__class__.__name__, np.mean(grid_search_sgdc.best_estimator_.predict(X_val)==y_val))



KNeighborsClassifier 0.673469387755102
RandomForestClassifier 0.7142857142857143
SGDClassifier 0.6122448979591837


You can see that the RandomForestClassifier seems to perform the best. Therefore we will focus on it more.

In [25]:
rfc=RandomForestClassifier()

param_grid_rfc = [
    {'n_estimators': [10, 20, 30, 40, 50, 60],'max_depth': [60, 70, 80, 90, 100], "max_leaf_nodes": [20, 25, 30, 35, 40, 45, 50, 55]}
]

grid_search_rfc = GridSearchCV(rfc, param_grid_rfc, cv=3, verbose=3, n_jobs=-1, scoring="accuracy")
grid_search_rfc.fit(X_train, y_train)

cvres_rfc = grid_search_rfc.cv_results_

print(rfc.__class__.__name__)
for mean_score, params in zip(cvres_rfc["mean_test_score"], cvres_rfc["params"]):
    print(mean_score, params)
    
best_estimator = grid_search_rfc.best_estimator_

Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 515 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 713 out of 720 | elapsed:   28.0s remaining:    0.2s


RandomForestClassifier
0.7315521628498728 {'max_depth': 60, 'max_leaf_nodes': 20, 'n_estimators': 10}
0.7290076335877863 {'max_depth': 60, 'max_leaf_nodes': 20, 'n_estimators': 20}
0.7251908396946565 {'max_depth': 60, 'max_leaf_nodes': 20, 'n_estimators': 30}
0.7366412213740458 {'max_depth': 60, 'max_leaf_nodes': 20, 'n_estimators': 40}
0.7239185750636132 {'max_depth': 60, 'max_leaf_nodes': 20, 'n_estimators': 50}
0.7366412213740458 {'max_depth': 60, 'max_leaf_nodes': 20, 'n_estimators': 60}
0.7061068702290076 {'max_depth': 60, 'max_leaf_nodes': 25, 'n_estimators': 10}
0.7201017811704835 {'max_depth': 60, 'max_leaf_nodes': 25, 'n_estimators': 20}
0.7150127226463104 {'max_depth': 60, 'max_leaf_nodes': 25, 'n_estimators': 30}
0.7353689567430025 {'max_depth': 60, 'max_leaf_nodes': 25, 'n_estimators': 40}
0.727735368956743 {'max_depth': 60, 'max_leaf_nodes': 25, 'n_estimators': 50}
0.7302798982188295 {'max_depth': 60, 'max_leaf_nodes': 25, 'n_estimators': 60}
0.7290076335877863 {'max_depth

[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:   28.4s finished


In [27]:
best_estimator

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=80, max_features='auto', max_leaf_nodes=30,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

Now let´s try to train the best classifier on all of the data.

In [28]:
y_5 = np.array(data_5["blueWin"])
X_5 = data_5.drop("blueWin", axis=1).values
y_10 = np.array(data_10["blueWin"])
X_10 = data_10.drop("blueWin", axis=1).values
y_15 = np.array(data_15["blueWin"])
X_15 = data_15.drop("blueWin", axis=1).values
y_20 = np.array(data_20["blueWin"])
X_20 = data_20.drop("blueWin", axis=1).values
y_25 = np.array(data_25["blueWin"])
X_25 = data_25.drop("blueWin", axis=1).values
y_30 = np.array(data_30["blueWin"])
X_30 = data_30.drop("blueWin", axis=1).values

X_test_5, y_test_5, X_val_5, y_val_5, X_train_5, y_train_5 = train_val_test_split_random(X_5, y_5, val_ratio=0)
X_test_10, y_test_10, X_val_10, y_val_10, X_train_10, y_train_10 = train_val_test_split_random(X_10, y_10, val_ratio=0)
X_test_15, y_test_15, X_val_15, y_val_15, X_train_15, y_train_15 = train_val_test_split_random(X_15, y_15, val_ratio=0)
X_test_20, y_test_20, X_val_20, y_val_20, X_train_20, y_train_20 = train_val_test_split_random(X_20, y_20, val_ratio=0)
X_test_25, y_test_25, X_val_25, y_val_25, X_train_25, y_train_25 = train_val_test_split_random(X_25, y_25, val_ratio=0)
X_test_30, y_test_30, X_val_30, y_val_30, X_train_30, y_train_30 = train_val_test_split_random(X_30, y_30, val_ratio=0)

In [33]:
clf_5 = best_estimator.fit(X_train_5, y_train_5)
clf_10 = best_estimator.fit(X_train_10, y_train_10)
clf_15 = best_estimator.fit(X_train_15, y_train_15)
clf_20 = best_estimator.fit(X_train_20, y_train_20)
clf_25 = best_estimator.fit(X_train_25, y_train_25)
clf_30 = best_estimator.fit(X_train_30, y_train_30)

print("Classifier for 5 minutes: " + str(np.round(np.mean(clf_5.predict(X_test_5) == y_test_5)*100, 2)) + "%.")
print("Classifier for 10 minutes: " + str(np.round(np.mean(clf_10.predict(X_test_10) == y_test_10)*100, 2)) + "%.")
print("Classifier for 15 minutes: " + str(np.round(np.mean(clf_15.predict(X_test_15) == y_test_15)*100, 2)) + "%.")
print("Classifier for 20 minutes: " + str(np.round(np.mean(clf_20.predict(X_test_20) == y_test_20)*100, 2)) + "%.")
print("Classifier for 25 minutes: " + str(np.round(np.mean(clf_25.predict(X_test_25) == y_test_25)*100, 2)) + "%.")
print("Classifier for 30 minutes: " + str(np.round(np.mean(clf_30.predict(X_test_30) == y_test_30)*100, 2)) + "%.")

Classifier for 5 minutes: 55.0%.
Classifier for 10 minutes: 40.82%.
Classifier for 15 minutes: 55.1%.
Classifier for 20 minutes: 69.79%.
Classifier for 25 minutes: 77.11%.
Classifier for 30 minutes: 81.36%.
