In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

# Directory holding the per-minute match summaries, resolved relative to the
# directory the notebook is run from.
dir_path = os.path.join(os.getcwd(), "data")

max_min = 40 #min
min_min = 5 #min
n_files = max_min-min_min+1
data = {}

# Load one CSV per game minute, keyed as "data_<minute>min". The minute range
# is driven by min_min/max_min so that changing the configuration above cannot
# get out of sync with the file names being read.
for minute in range(min_min, max_min + 1):
    data["data_"+str(minute)+"min"] = pd.read_csv(
        os.path.join(dir_path, "match_summary_"+str(minute)+"min.csv"),
        dtype=int,      # every column is parsed as an integer
        index_col=0)    # the first CSV column becomes the row index

In another notebook we have determined that a support vector machine classifier seems to work best.

In [2]:
def train_test_split_random(X, y, test_ratio=0.1, random_seed=42):
    """Randomly split paired samples into a test part and a training part.

    The rows are shuffled with a permutation drawn from a private random
    generator seeded with ``random_seed``; the first
    ``round(test_ratio * len(X))`` shuffled rows form the test set and the
    remainder the training set. The global NumPy random state is left
    untouched, so calling this helper does not affect other random draws.

    Parameters
    ----------
    X : indexable supporting integer-array indexing (e.g. ``numpy.ndarray``)
        Samples, one per entry along the first axis.
    y : indexable supporting integer-array indexing
        Targets; must have the same length as ``X``.
    test_ratio : float, default 0.1
        Fraction of the samples placed in the test set.
    random_seed : int or None, default 42
        Seed for the shuffle. The same seed always yields the same split.

    Returns
    -------
    tuple
        ``(X_test, y_test, X_train, y_train)`` -- note that the test part
        comes first.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same length.
    """
    if len(X) != len(y):
        raise ValueError(
            "X and y must have the same length, got "
            + str(len(X)) + " and " + str(len(y)) + "."
        )

    # A local legacy RandomState reproduces exactly the permutation that
    # np.random.seed(random_seed) + np.random.permutation would give, without
    # reseeding the process-wide generator.
    rng = np.random.RandomState(random_seed)
    shuffled_indices = rng.permutation(len(X))

    test_num = int(np.round(test_ratio*len(X)))
    test_idx = shuffled_indices[:test_num]
    train_idx = shuffled_indices[test_num:]

    return X[test_idx], y[test_idx], X[train_idx], y[train_idx]

In [None]:
from sklearn.svm import SVC
from sklearn.base import clone
import sys

# Template estimator; a fresh unfitted copy is cloned for every game minute so
# the fitted models do not share state.
svc=SVC(gamma="scale", kernel="linear", C=10.0)

# Both lists hold (minute, value) tuples, one entry per dataset.
classifiers = []
accuracy = []

for (i, key) in enumerate(data):
    # `data` was filled in ascending minute order starting at min_min, so the
    # enumeration index maps directly onto the game minute. Using the
    # configured bounds keeps the labels correct if the range is changed.
    minute = min_min + i
    sys.stdout.write("\rProcessing: " + str(minute) + " of " + str(max_min) + " min.")
    sys.stdout.flush()

    dat = data[key]
    y = np.array(dat["blueWin"])               # target column
    X = dat.drop("blueWin", axis=1).values     # all remaining columns are features
    X_test, y_test, X_train, y_train = train_test_split_random(X, y)

    svc_clone = clone(svc)
    svc_clone.fit(X_train, y_train)
    y_pred = svc_clone.predict(X_test)

    classifiers.append((minute, svc_clone))
    # Hold-out accuracy in percent, rounded to two decimals.
    accuracy.append((minute, np.round(np.mean(y_pred==y_test)*100, 2)))

Processing: 16 of 40 min.

Let's check the accuracy of the models at different minutes.

In [5]:
# Hold-out accuracy of the per-minute classifiers as a function of game minute.
fig, ax = plt.subplots(figsize=(10,7))
ax.set_title("Classifier accuracy by game length", fontsize =20)

ax.plot([time for (time, acc) in accuracy], 
        [acc for (time, acc) in accuracy], 
        marker='o', linestyle='dashed', linewidth=2, markersize=8)

ax.set_xlim(0, 45)
ax.set_ylim(0, 110)
# Fix the tick positions explicitly. Setting only tick *labels* assigns them by
# index to whatever ticks the automatic locator picks, which can mislabel the
# axis; with explicit positions the default formatter shows the true values.
ax.set_xticks(np.arange(0, 50, 5))
ax.set_yticks(np.arange(0, 101, 20))
ax.tick_params(axis="both", labelsize=14)
ax.set_xlabel("Game length [min]", fontsize=18)
ax.set_ylabel("Accuracy [%]", fontsize=18)
plt.show()

[(5, 50.0), (6, 63.27), (7, 62.24)]

In [None]:
# Feature names in the same column order as the training matrix, which was
# built as dat.drop("blueWin", axis=1).values.
features = list(data["data_"+str(min_min)+"min"].drop("blueWin", axis=1).columns)
coeffs = [clf.coef_ for (time, clf) in classifiers]

# coefficients[feature] -> that feature's weight in each per-minute classifier,
# in the order of `classifiers`.
coefficients = {key: [] for key in features}
for array in coeffs:
    # coef_ is 2-D; flatten it so it can be indexed by feature position.
    # NOTE(review): assumes a binary target, i.e. coef_ has shape
    # (1, n_features) -- confirm that blueWin only takes two values.
    weights = np.ravel(array)
    for (i, key) in enumerate(features):
        coefficients[key].append(weights[i])

n_rows = 8
n_columns = 4
# Take the x values from the fitted classifiers so they always match the
# number of coefficient entries per feature.
x_axis = [time for (time, clf) in classifiers]
# corr_index[k, l] is the position in `features` shown in grid cell (k, l).
corr_index = np.arange(n_rows*n_columns).reshape((n_rows, -1))

fig, ax = plt.subplots(n_rows, n_columns, figsize=(20,35))

for k in range(n_rows):
    for col in range(n_columns):
        panel = ax[k, col]
        feature_pos = corr_index[k, col]
        if feature_pos >= len(features):
            # Fewer features than grid cells: hide the unused panel instead of
            # relying on a swallowed exception.
            panel.set_visible(False)
            continue

        feature = features[feature_pos]
        panel.set_title(feature, fontsize=18)
        panel.plot(x_axis, coefficients[feature], linestyle='-', linewidth=2)
        panel.set_xlim(0, 42.5)
        # Explicit tick positions keep the labels aligned with the data.
        panel.set_xticks(np.arange(0, 45, 5))
        panel.tick_params(axis="x", labelsize=14)
        panel.set_ylim(-1, 1)
plt.tight_layout()
plt.show()