In [82]:
# Importing required libraries

import numpy as np
import pandas as pd

In [86]:
# reading in the win prediction data and giving the columns descriptive names
# (rng is a seeded legacy NumPy generator; it is not used within this cell)
rng = np.random.RandomState(1)
data = pd.read_excel("data_win_prediction.xlsx").set_axis(
    [
        "win",
        "map",
        "Team_A_avg_win_percentage",
        "Team_A_avg_KR",
        "Team_A_avg_elo",
        "Team_B_avg_win_percentage",
        "Team_B_avg_KR",
        "Team_B_avg_elo",
        "Match ID",
    ],
    axis = "columns",
)
data

Unnamed: 0,win,map,Team_A_avg_win_percentage,Team_A_avg_KR,Team_A_avg_elo,Team_B_avg_win_percentage,Team_B_avg_KR,Team_B_avg_elo,Match ID
0,team a,de_mirage,58.864865,0.726308,1720.4,47.289216,0.792615,1536,1-051c5a18-6a99-4e5e-bef7-ed1143474b33
1,team a,de_inferno,52.341629,0.764807,1989.6,54.823232,0.762641,2073,1-8f29e3ec-f49d-4d05-8eac-98993126c405
2,team b,de_vertigo,20.202020,0.773030,753.8,66.095238,0.722705,158,1-0e83d2d4-9aef-4b16-8142-7105abbb9022
3,team b,de_mirage,48.414652,0.733409,2565.0,62.631108,0.838246,2219,1-34674634-6bc9-45bc-b022-3f3161d5aede
4,team b,de_mirage,43.571196,0.709982,1484.4,55.513072,0.768946,1352,1-a9869c55-52a9-45a8-beb6-1fdffed4506d
...,...,...,...,...,...,...,...,...,...
1460,team a,de_nuke,56.962213,0.766837,1916.0,45.598650,0.692673,1967,1-7700da52-272c-499a-aee2-6dcd9c79a1d4
1461,team b,de_ancient,56.580859,0.758207,1580.0,69.394938,0.759019,2026,1-e9383e0d-385e-487f-80cf-99d092ef7b74
1462,team a,de_inferno,46.397195,0.742710,1497.2,54.778613,0.665482,1319,1-fe1779d4-6478-4bb7-9d97-be0008acf5a6
1463,team b,de_mirage,53.792449,0.762264,1533.4,53.216783,0.755543,450,1-cb15c093-881c-4ac2-ab54-7e7e03eba2ae


In [6]:
# counting missing values in every column
data.isnull().sum()

win                          0
map                          0
Team_A_avg_win_percentage    0
Team_A_avg_KR                0
Team_A_avg_elo               0
Team_B_avg_win_percentage    0
Team_B_avg_KR                0
Team_B_avg_elo               0
Match ID                     0
dtype: int64

In [7]:
# counting rows that are exact repeats of an earlier row
data.duplicated(keep = "first").sum()

20

In [8]:
# keeping only the first occurrence of each row, then confirming no repeats remain
data = data[~data.duplicated()]
data.duplicated().sum()

0

In [42]:
# splitting the data into the numeric feature columns and the response variable
features = data[[
    "Team_A_avg_win_percentage",
    "Team_A_avg_KR",
    "Team_A_avg_elo",
    "Team_B_avg_win_percentage",
    "Team_B_avg_KR",
    "Team_B_avg_elo",
]]
# response as a 1d array of the "win" labels
response = data["win"].to_numpy()

In [44]:
# inspecting the response labels (the 1d array taken from the "win" column)
response

array(['team a', 'team a', 'team b', ..., 'team a', 'team b', 'team b'],
      dtype=object)

In [76]:
# pairwise Pearson correlation between the feature columns
features.corr(method = "pearson")

Unnamed: 0,Team_A_avg_win_percentage,Team_A_avg_KR,Team_A_avg_elo,Team_B_avg_win_percentage,Team_B_avg_KR,Team_B_avg_elo
Team_A_avg_win_percentage,1.0,0.523704,0.066735,-0.343511,-0.064403,0.060668
Team_A_avg_KR,0.523704,1.0,0.160686,-0.224177,0.03401,0.113984
Team_A_avg_elo,0.066735,0.160686,1.0,-0.002986,0.134077,0.789506
Team_B_avg_win_percentage,-0.343511,-0.224177,-0.002986,1.0,0.434137,0.018566
Team_B_avg_KR,-0.064403,0.03401,0.134077,0.434137,1.0,0.118695
Team_B_avg_elo,0.060668,0.113984,0.789506,0.018566,0.118695,1.0


In [88]:
# Importing required models for logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold

# 10 shuffled folds with a fixed seed so the split is repeatable
folds = KFold(n_splits = 10, shuffle = True, random_state = 50)

# logistic regression estimator (only created here; the fitting happens inside cross_val_score)
lr_model = LogisticRegression(max_iter = 200)

# accuracy on each held-out fold, followed by the mean across the folds
scores = cross_val_score(
    estimator = lr_model,
    X = features,
    y = response,
    scoring = 'accuracy',
    cv = folds,
)
np.mean(scores)

0.7889415708812261

In [90]:
# importing required models for random forests
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score

# holding out 20% of the rows as a test set (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    response,
    test_size = 0.2,
    random_state = 50,
)

# building a 500-tree random forest and training it on the training split
rf_classifier = RandomForestClassifier(random_state = 50, n_estimators = 500).fit(X_train, y_train)

# predicting the labels of the test split
y_pred = rf_classifier.predict(X_test)

# share of test rows whose label was predicted correctly
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)
accuracy

0.7612456747404844

In [104]:
# Random Forest parameter tuning
from sklearn.model_selection import GridSearchCV

# candidate values for each hyperparameter; every combination is evaluated
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2'],
}

# 10-fold cross-validated search over the grid, scored by accuracy, on the training split only
grid_search = GridSearchCV(
    estimator = RandomForestClassifier(random_state = 50),
    param_grid = param_grid,
    scoring = 'accuracy',
    cv = 10,
)
grid_search.fit(X_train, y_train)

# reporting the winning combination and its mean cross-validated accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 100}
Best Accuracy: 0.769047976011994


In [106]:
# Refitting the random forest with the hyperparameters chosen by the grid search

# Taking the values from grid_search.best_params_ instead of retyping them keeps this
# cell in sync with the search if the grid or the data ever changes. For the recorded
# run this is max_depth = 10, max_features = "sqrt", n_estimators = 100.
rf_classifier = RandomForestClassifier(random_state = 50, **grid_search.best_params_)

# Training the model on the training split
rf_classifier.fit(X_train, y_train)

# making predictions for the held-out test split
y_pred = rf_classifier.predict(X_test)

# Evaluating accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7647058823529411

In [92]:
# Support Vector Machine

# importing required libraries
import matplotlib.lines as mlines
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.svm import SVC
from sklearn.datasets import make_blobs
from sklearn.inspection import DecisionBoundaryDisplay

# holding out 20% of the rows as a test set (same seed and proportions as the random forest split)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    response,
    test_size = 0.2,
    random_state = 50,
)

# linear-kernel SVM classifier, trained on the training split
svm_model = SVC(C = 1.0, kernel = 'linear').fit(X_train, y_train)

# predicting the labels of the test split
y_pred = svm_model.predict(X_test)

# reporting test accuracy rounded to two decimals
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Accuracy: 0.78


In [100]:
# tuning SVM hyperparameters
from sklearn.model_selection import GridSearchCV

# gamma is a coefficient of the rbf kernel and is ignored by the linear kernel, so the
# grid is split per kernel. Crossing gamma with 'linear' (as a single dict would) fits
# four identical linear models for every value of C.
# Note: when a linear model wins, best_params_ contains no 'gamma' entry.
param_grid = [
    {"kernel": ['linear'], "C": [0.1, 1, 5, 10]},
    {"kernel": ['rbf'], "C": [0.1, 1, 5, 10], "gamma": ['scale', 'auto', 0.1, 1]},
]

# performing a 5-fold cross-validated grid search on the training split
grid_cv = GridSearchCV(SVC(), param_grid, cv = 5)
grid_cv.fit(X_train, y_train)

# Best parameters
print("Best parameters:", grid_cv.best_params_)


Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


In [102]:
# refitting svm with the hyperparameters chosen by the grid search
# Reading them from grid_cv.best_params_ instead of retyping them keeps this cell in
# sync with the search. For the recorded run: kernel = 'linear', C = 0.1, gamma = 'scale'.
svm_model = SVC(**grid_cv.best_params_)
svm_model.fit(X_train, y_train)

# predicting the labels of the held-out test split
y_pred = svm_model.predict(X_test)

# Evaluate: test accuracy rounded to two decimals
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Accuracy: 0.77
