In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.dummy import DummyClassifier
import joblib

In [17]:
# Load the dataset
data = pd.read_csv(r"revisedLoL_matchdata.csv")

# Drop an 'index' column that is there by my data collection mistake
data = data.drop(['index'],axis=1)

# Default red team Id = 200. Change the red team ID to 0 and blue to 1. 
data['winning_team'] = data['winning_team'].replace([200,100],[0,1])
data.head(5)

Unnamed: 0,game_id,winning_team,in_game_time,ally_top_level,ally_jgl_level,ally_mid_level,ally_bot_level,ally_sup_level,enemy_top_level,enemy_jgl_level,...,dragon_killed,dragon_lost,voidgrub_killed,voidgrub_lost,riftherald_killed,riftherald_lost,baron_killed,baron_lost,champion_killed,champion_death
0,7358388756,0.0,10.003517,7.0,7.0,7.0,8.0,5.0,8.0,6.0,...,0,1,0,1,0,0,0,0,1,2
1,7358388756,0.0,13.004633,9.0,8.0,8.0,9.0,7.0,9.0,9.0,...,0,1,0,3,0,1,0,0,1,7
2,7358388756,0.0,15.277683,10.0,9.0,9.0,10.0,7.0,11.0,10.0,...,0,1,0,6,0,1,0,0,5,17
3,7347700848,0.0,10.003133,8.0,7.0,9.0,7.0,5.0,7.0,7.0,...,0,0,0,3,0,0,0,0,2,2
4,7347700848,0.0,13.004267,11.0,8.0,10.0,8.0,6.0,8.0,8.0,...,0,1,0,3,1,0,0,0,3,4


In [18]:
# Feature and Response
X = data.drop(['game_id','winning_team'],axis=1)
Y = data['winning_team']

In [19]:
# K-Fold object
k_folds = KFold(n_splits=5, shuffle = True, random_state = 123)

In [20]:
# Split the training data into train/validation/test set. The split is 60/20/20
# Validation set used to calibrate the data
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, shuffle = True, random_state = 123)
x_train, x_val, y_train, y_val = train_test_split(x_train,y_train, test_size = 0.25, shuffle = True, random_state = 789)

In [21]:
scaler = StandardScaler().fit(x_train)

## Logistic regression

In [25]:
LR = GridSearchCV(LogisticRegression(), {'C': [1e-4,1e-3,1e-2,0.1,1,10,100]},
                  cv = k_folds, scoring = 'accuracy', n_jobs = -1).fit(scaler.transform(x_train),y_train)

In [27]:
LR.best_estimator_

LogisticRegression(C=0.01)

In [32]:
LR = GridSearchCV(LogisticRegression(), {'C': np.arange(1,10)/100},
                  cv = k_folds, scoring = 'accuracy', n_jobs = -1).fit(scaler.transform(x_train),y_train)
LR.best_estimator_

LogisticRegression(C=0.01)

In [39]:
LR = GridSearchCV(LogisticRegression(), {'C': np.arange(100,200)/10000},
                  cv = k_folds, scoring = 'accuracy', n_jobs = -1).fit(scaler.transform(x_train),y_train)
LR.best_estimator_

LogisticRegression(C=0.0144)

## SVM

In [40]:
SVM = GridSearchCV(LinearSVC(), {'C': [1e-4,1e-3,1e-2,0.1,1,10,100]},
                  cv = k_folds, scoring = 'accuracy', n_jobs = -1).fit(scaler.transform(x_train),y_train)

SVM.best_estimator_

LinearSVC(C=0.001)

In [43]:
SVM = GridSearchCV(LinearSVC(), {'C': np.arange(1,10)/1000},
                  cv = k_folds, scoring = 'accuracy', n_jobs = -1).fit(scaler.transform(x_train),y_train)

SVM.best_estimator_

LinearSVC(C=0.002)

In [46]:
SVM = GridSearchCV(LinearSVC(), {'C': np.arange(20,30)/10000},
                  cv = k_folds, scoring = 'accuracy', n_jobs = -1).fit(scaler.transform(x_train),y_train)

SVM.best_estimator_

LinearSVC(C=0.0021)

In [49]:
SVM = GridSearchCV(LinearSVC(), {'C': np.arange(210,220)/100000},
                  cv = k_folds, scoring = 'accuracy', n_jobs = -1).fit(scaler.transform(x_train),y_train)

SVM.best_estimator_

LinearSVC(C=0.00212)