# Predicting (Past) Professional League of Legends Matches with Sklearn

In [5]:
#Importing relevant packages.

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
%matplotlib inline

In [7]:
#Exploring the data, we can see there are 45,489 rows with 103 columns each.

In [8]:
data = pd.read_csv('2020_LoL_esports_match_data_from_OraclesElixir_20200714.csv')
data

Unnamed: 0,gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,...,csdiffat10,goldat15,xpat15,csat15,opp_goldat15,opp_xpat15,opp_csat15,golddiffat15,xpdiffat15,csdiffat15
0,ESPORTSTMNT03/1241318,complete,http://matchhistory.na.leagueoflegends.com/en/...,KeSPA,2020,,0,2020-01-03 07:33:26,1,9.24,...,23.0,4888.0,7368.0,131.0,4723.0,7202.0,118.0,165.0,166.0,13.0
1,ESPORTSTMNT03/1241318,complete,http://matchhistory.na.leagueoflegends.com/en/...,KeSPA,2020,,0,2020-01-03 07:33:26,1,9.24,...,-10.0,4385.0,4817.0,91.0,4784.0,4667.0,98.0,-399.0,150.0,-7.0
2,ESPORTSTMNT03/1241318,complete,http://matchhistory.na.leagueoflegends.com/en/...,KeSPA,2020,,0,2020-01-03 07:33:26,1,9.24,...,-9.0,4809.0,6275.0,129.0,5218.0,8112.0,140.0,-409.0,-1837.0,-11.0
3,ESPORTSTMNT03/1241318,complete,http://matchhistory.na.leagueoflegends.com/en/...,KeSPA,2020,,0,2020-01-03 07:33:26,1,9.24,...,3.0,4915.0,4959.0,136.0,4864.0,5360.0,135.0,51.0,-401.0,1.0
4,ESPORTSTMNT03/1241318,complete,http://matchhistory.na.leagueoflegends.com/en/...,KeSPA,2020,,0,2020-01-03 07:33:26,1,9.24,...,1.0,2956.0,3800.0,20.0,3189.0,3543.0,28.0,-233.0,257.0,-8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45484,ESPORTSTMNT03/1414873,complete,http://matchhistory.na.leagueoflegends.com/en/...,BRCC,2020,,0,2020-07-14 01:49:46,1,10.13,...,-6.0,5342.0,7496.0,130.0,4875.0,6756.0,138.0,467.0,740.0,-8.0
45485,ESPORTSTMNT03/1414873,complete,http://matchhistory.na.leagueoflegends.com/en/...,BRCC,2020,,0,2020-07-14 01:49:46,1,10.13,...,17.0,5455.0,4877.0,138.0,4555.0,4448.0,117.0,900.0,429.0,21.0
45486,ESPORTSTMNT03/1414873,complete,http://matchhistory.na.leagueoflegends.com/en/...,BRCC,2020,,0,2020-07-14 01:49:46,1,10.13,...,2.0,3234.0,3747.0,25.0,3071.0,3789.0,20.0,163.0,-42.0,5.0
45487,ESPORTSTMNT03/1414873,complete,http://matchhistory.na.leagueoflegends.com/en/...,BRCC,2020,,0,2020-07-14 01:49:46,1,10.13,...,5.0,22022.0,27605.0,498.0,23240.0,28287.0,512.0,-1218.0,-682.0,-14.0


In [9]:
#A quick view of some of the columns.

In [10]:
data.columns[0:40]

Index(['gameid', 'datacompleteness', 'url', 'league', 'year', 'split',
       'playoffs', 'date', 'game', 'patch', 'playerid', 'side', 'position',
       'player', 'team', 'champion', 'ban1', 'ban2', 'ban3', 'ban4', 'ban5',
       'gamelength', 'result', 'kills', 'deaths', 'assists', 'teamkills',
       'teamdeaths', 'doublekills', 'triplekills', 'quadrakills', 'pentakills',
       'firstblood', 'firstbloodkill', 'firstbloodassist', 'firstbloodvictim',
       'team kpm', 'ckpm', 'firstdragon', 'dragons'],
      dtype='object')

In [11]:
#Filtering the data to only include "complete" rows.

In [12]:
data_cleaned = data[data.get('datacompleteness') == 'complete']

In [13]:
#Dropping irrelevant columns to our prediction.

In [14]:
data_cleaned = data_cleaned.drop(['datacompleteness','url','league','year','split','playoffs','date','game','patch','playerid','position','player','team','gameid','side','ban1','ban2','ban3','ban4','ban5','champion'],axis=1)

In [15]:
#The data still has a decent amount of missing values, even after filtering for completeness.
#Dropping columns with over 20,000 missing values.

In [16]:
a = data_cleaned.isnull().sum().sort_values(ascending=False) < 20000
b = a[a == True].index
data_cleaned = data_cleaned[b]

In [17]:
#Eliminating final missing values by dropping rows which contain any NaN values.

In [18]:
data_cleaned = data_cleaned.dropna(axis=0)
data_cleaned

Unnamed: 0,firstbloodvictim,firstbloodassist,csdiffat15,opp_inhibitors,triplekills,doublekills,xpdiffat15,inhibitors,opp_xpat10,goldspent,...,wardskilled,earnedgold,totalgold,teamdeaths,teamkills,assists,deaths,kills,result,gamelength
0,0.0,0.0,13.0,0.0,0.0,0.0,166.0,1.0,4432.0,11350.0,...,3.0,8498.0,13302.0,11,17,7,3,0,1,2220
1,0.0,0.0,-7.0,0.0,0.0,1.0,150.0,0.0,3010.0,12175.0,...,26.0,8680.0,13484.0,11,17,9,0,4,1,2220
2,1.0,0.0,-11.0,0.0,0.0,0.0,-1837.0,0.0,4861.0,14225.0,...,32.0,10615.0,15419.0,11,17,5,2,5,1,2220
3,0.0,0.0,1.0,0.0,0.0,2.0,-401.0,0.0,3250.0,16500.0,...,30.0,14079.0,18883.0,11,17,9,1,7,1,2220
4,0.0,0.0,-8.0,0.0,0.0,0.0,257.0,0.0,2429.0,8025.0,...,6.0,3035.0,7839.0,11,17,3,5,1,1,2220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45482,0.0,0.0,-1.0,0.0,1.0,2.0,145.0,0.0,4523.0,11495.0,...,6.0,9085.0,13008.0,4,18,4,1,9,1,1788
45483,0.0,1.0,-3.0,0.0,0.0,0.0,-590.0,0.0,3650.0,8725.0,...,6.0,5528.0,9451.0,4,18,10,2,0,1,1788
45484,0.0,0.0,-8.0,0.0,0.0,0.0,740.0,2.0,4382.0,11025.0,...,5.0,7928.0,11851.0,4,18,7,0,1,1,1788
45485,0.0,0.0,21.0,0.0,0.0,1.0,429.0,0.0,2923.0,12200.0,...,4.0,8859.0,12782.0,4,18,5,0,6,1,1788


In [19]:
#Isolating the dependant variable (y) and removing it from the independant variables we will use for prediction (X).

In [20]:
X = data_cleaned.drop('result',axis=1)
y = data_cleaned[['result']]

In [21]:
#Seperating the data into testing and training sets to prepare it for the model.

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 0)

In [23]:
#Issues can arise within our model when there are big differences in the scale of respective columns.
#For example, the total gold is usually in the thousands, whereas the amount of inhibitors taken is one or two.

In [24]:
#To standardize the X data, we use StandardScaler.

In [25]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [26]:
#A quick look at the transformed, standardized training data.

In [27]:
X_train

array([[-3.31506789e-01, -4.20715520e-01,  1.89428411e-01, ...,
         1.32566803e+00, -1.01655661e+00,  1.50527000e+00],
       [-3.31506789e-01,  2.37690304e+00,  4.86930224e-01, ...,
        -2.88232220e-01, -1.01655661e+00, -2.51129030e-01],
       [-3.31506789e-01, -4.20715520e-01,  3.48977714e-03, ...,
        -8.26198971e-01,  1.78825437e-01,  2.27888889e-01],
       ...,
       [-3.31506789e-01,  2.37690304e+00,  5.98493404e-01, ...,
        -8.26198971e-01,  1.78825437e-01,  3.49697608e+00],
       [-3.31506789e-01,  2.37690304e+00, -2.19636583e-01, ...,
        -8.26198971e-01, -6.18095931e-01, -1.55652289e+00],
       [-3.31506789e-01, -4.20715520e-01,  1.63974975e+00, ...,
         2.49734531e-01, -2.19635247e-01,  2.81113102e-01]])

In [28]:
#Training the model using RandomForestClassifier.

In [29]:
rfc = RandomForestClassifier(n_estimators = 800)
rfc.fit(X_train,y_train)
pred_rfc = rfc.predict(X_test)

  


In [30]:
#A quick look at the accuracy of our predicted results.
#97% of wins and 96% of losses are correctly calculated.

In [31]:
print(classification_report(y_test,pred_rfc))

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      2774
           1       0.97      0.96      0.96      2736

    accuracy                           0.97      5510
   macro avg       0.97      0.97      0.97      5510
weighted avg       0.97      0.97      0.97      5510



In [32]:
#Looking at which features the model weighted strongly.
#It appears the model found teamdeaths, team kpm, and teamkills as the strongest predictors of the result of the match.

In [33]:
feature_list = X.columns
importances = np.array(rfc.feature_importances_)
pd.DataFrame().assign(Feature = feature_list,Importance = importances).set_index('Feature').sort_values('Importance',ascending=False)

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
teamdeaths,0.214261
teamkills,0.162198
team kpm,0.158863
assists,0.073336
deaths,0.065245
ckpm,0.034513
opp_inhibitors,0.031854
earned gpm,0.026867
inhibitors,0.026239
earnedgold,0.016813


In [34]:
#Final hyperparameter optimization to find the optimal amount of estimators.
#Using Random Forest Classifier is convenient here, because we only have to worry about tuning one parameter.

In [35]:
empty = ([])
for i in np.arange(100,1001,100):
    rfc = RandomForestClassifier(n_estimators = i)
    rfc.fit(X_train,y_train)
    pred_rfc = rfc.predict(X_test)
    accuracy = accuracy_score(y_test,pred_rfc)
    empty = np.append(accuracy,empty)
    
    

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [36]:
#Our final results are shown here, it appears that using 100 estimators found the highest accuracy for our model.
#Optimized final accuracy - 96.59%. 

In [41]:
results = pd.DataFrame().assign(Estimators=np.arange(100,1001,100),Accuracy=empty)
results = results.sort_values('Accuracy',ascending=False)
results

Unnamed: 0,Estimators,Accuracy
0,100,0.96588
2,300,0.965699
3,400,0.965699
5,600,0.965699
7,800,0.965517
8,900,0.965336
4,500,0.965154
1,200,0.964973
6,700,0.964791
9,1000,0.964247
