In [12]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

dir_path = os.path.join(os.getcwd(), "data")

max_min = 40 #min
min_min = 5 #min
n_files = max_min-min_min+1
data = {}

for i in range(n_files):
    data["data_"+str(5+i)+"min"] = pd.read_csv(os.path.join(dir_path, "match_summary_"+str(5+i)+"min.csv"), 
                                               dtype=int, 
                                               index_col=0)

We start by only fitting the data at 15 minutes to determine which is the best classifier for our specific use. Only then we will fit a classifier for each time stamp.

In [13]:
data_15 = data["data_15min"]

attribs = list(data_15.keys())[1:]

y = np.array(data_15["blueWin"])
X = data_15.drop("blueWin", axis=1).values

def train_val_test_split_random(X, y, test_ratio=0.1, val_ratio=0.1, random_seed=42):
    np.random.seed(seed=random_seed)
    shuffled_indices = np.random.permutation(len(X))
    test_num = int(np.round(test_ratio*len(X)))
    val_num = int(np.round(val_ratio*len(X)))
    X_test = X[shuffled_indices[:test_num]]
    y_test = y[shuffled_indices[:test_num]]
    X_val = X[shuffled_indices[test_num:test_num+val_num]]
    y_val = y[shuffled_indices[test_num:test_num+val_num]]
    X_train = X[shuffled_indices[test_num+val_num:]]
    y_train = y[shuffled_indices[test_num+val_num:]]
    
    return X_test, y_test, X_val, y_val, X_train, y_train

X_test, y_test, X_val, y_val, X_train, y_train = train_val_test_split_random(X, y)

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

clf_list = [
    KNeighborsClassifier(),
    SVC(gamma="scale"),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=100),
    SGDClassifier()
]

for clf in clf_list:
    clf.fit(X_train, y_train)
    print(clf.__class__.__name__, np.round(np.mean(clf.predict(X_val)==y_val), 4)*100, "%")

KNeighborsClassifier 70.41 %
SVC 77.55 %
DecisionTreeClassifier 59.18 %
RandomForestClassifier 76.53 %
SGDClassifier 74.49 %


You can see that the RandomForestClassifier seems to perform the best

In [18]:
from sklearn.model_selection import GridSearchCV

rfc=RandomForestClassifier()

param_grid_rfc = [
    {'n_estimators': [5, 10, 20, 30, 50, 100], 'max_depth': [10, 30, 50, 70, 90], "max_leaf_nodes": [10, 20, 30]}
]

grid_search_rfc = GridSearchCV(rfc, param_grid_rfc, cv=3, n_jobs=-1, verbose=2, scoring="accuracy")
grid_search_rfc.fit(X_train, y_train)

cvres = grid_search_rfc.cv_results_

print(rfc.__class__.__name__)
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
    
best_estimator = grid_search_rfc.best_estimator_

Fitting 3 folds for each of 90 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 203 tasks      | elapsed:   19.9s


RandomForestClassifier
0.7225063938618926 {'max_depth': 10, 'max_leaf_nodes': 10, 'n_estimators': 5}
0.7263427109974424 {'max_depth': 10, 'max_leaf_nodes': 10, 'n_estimators': 10}
0.7327365728900256 {'max_depth': 10, 'max_leaf_nodes': 10, 'n_estimators': 20}
0.7161125319693095 {'max_depth': 10, 'max_leaf_nodes': 10, 'n_estimators': 30}
0.7289002557544757 {'max_depth': 10, 'max_leaf_nodes': 10, 'n_estimators': 50}
0.7225063938618926 {'max_depth': 10, 'max_leaf_nodes': 10, 'n_estimators': 100}
0.7058823529411765 {'max_depth': 10, 'max_leaf_nodes': 20, 'n_estimators': 5}
0.710997442455243 {'max_depth': 10, 'max_leaf_nodes': 20, 'n_estimators': 10}
0.7416879795396419 {'max_depth': 10, 'max_leaf_nodes': 20, 'n_estimators': 20}
0.7250639386189258 {'max_depth': 10, 'max_leaf_nodes': 20, 'n_estimators': 30}
0.7263427109974424 {'max_depth': 10, 'max_leaf_nodes': 20, 'n_estimators': 50}
0.7212276214833759 {'max_depth': 10, 'max_leaf_nodes': 20, 'n_estimators': 100}
0.7122762148337596 {'max_depth

[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:   26.8s finished


In [17]:
best_estimator

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=50, max_features='auto', max_leaf_nodes=20,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

Now that we have determined the best classifier we can train it on the data for each minute. This will be done in another ***jupyter notebook***.