# **Prepare Data**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [4]:
# Load the raw Lichess games table from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/lichess.csv", sep = ",", encoding = "utf-8")

Descriptive statistics

In [None]:
# Summary of the raw frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of distinct values in each column of the raw frame.
df.nunique()

In [None]:
# Peek at the first rows of the raw frame.
df.head()

In [None]:
# Summary statistics of the raw frame, rounded to three decimals.
df.describe().round(3)

In [None]:
# Histogram of the raw whiteELO column (before imputation).
plt.figure(dpi=100)
plt.hist(df.loc[:, "whiteELO"])

In [None]:
# Histogram of the raw blackELO column (before imputation).
plt.figure(dpi=100)
plt.hist(df.loc[:, "blackELO"])

In [None]:
# whiteELO against blackELO in the raw data; tiny markers (s=1) keep dense regions readable.
plt.figure(dpi=100)
plt.scatter(x=df.loc[:, "whiteELO"], y=df.loc[:, "blackELO"], s=1)

In [None]:
# Histogram of the raw RatingDiff column (before imputation).
plt.figure(dpi=100)
plt.hist(df.loc[:, "RatingDiff"])

In [None]:
# Histogram of the raw totalMove column.
plt.figure(dpi=100)
plt.hist(df.loc[:, "totalMove"])

Handle Missing Values and Split Data into Features and Target

In [None]:
from sklearn.preprocessing import LabelEncoder

# Features are columns 9-14 and 16-18 of the raw frame; the target is column 3.
# .copy() makes x an independent frame: the fills below then modify x itself
# (not a temporary slice of df) and the raw df stays untouched, so this cell
# can be re-run safely.
x = df.iloc[:,np.r_[9:15,16:19]].copy()
y = df.iloc[:,3]

# Handle missing value
# Numeric columns get their rounded mean (median for RatingDiff); timeControl
# gets its most frequent value. A single fillna with a per-column mapping
# replaces the chained `x[col].fillna(..., inplace = True)` calls, which
# operate on a column copy and are not guaranteed to update x.
fill_values = {
    "whiteELO": round(x["whiteELO"].mean(), 0),
    "blackELO": round(x["blackELO"].mean(), 0),
    "RatingDiff": round(x["RatingDiff"].median(), 0),
    "whiteRatingDiff": round(x["whiteRatingDiff"].mean(), 0),
    "blackRatingDiff": round(x["blackRatingDiff"].mean(), 0),
    "timeControl": x["timeControl"].mode().iloc[0],
}
x = x.fillna(value = fill_values)

# LabelEncoder only accepts 1-D input, so it cannot be applied to the whole
# feature frame. Encode each non-numeric feature column on its own and keep x
# a DataFrame, because later cells index it by column name.
# NOTE(review): astype(str) turns any remaining missing value in a non-numeric
# column into its own category - confirm that is acceptable for this dataset.
for col in x.select_dtypes(exclude = ["number"]).columns:
    x[col] = LabelEncoder().fit_transform(x[col].astype(str))

# `le` stays fitted on the target, so le.classes_ maps the encoded labels back.
le = LabelEncoder()
y = le.fit_transform(y)

Descriptive statistics after handling missing values

In [None]:
# Histogram of whiteELO in the prepared feature frame.
plt.figure(dpi=100)
plt.hist(x.loc[:, "whiteELO"])

In [None]:
# Histogram of blackELO in the prepared feature frame.
plt.figure(dpi=100)
plt.hist(x.loc[:, "blackELO"])

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between the two players' ratings; element 0 of the
# result is the correlation coefficient.
corr = pearsonr(x.loc[:, "whiteELO"], x.loc[:, "blackELO"])
print("Correlation of ELO Rating of both side: ", corr[0], "\n")

# Same two columns as a scatter plot, with tiny markers (s=1).
plt.figure(dpi=100)
plt.scatter(x=x.loc[:, "whiteELO"], y=x.loc[:, "blackELO"], s=1)

In [None]:
# Histogram of RatingDiff in the prepared feature frame.
plt.figure(dpi=100)
plt.hist(x.loc[:, "RatingDiff"])

In [None]:
# Histogram of totalMove in the prepared feature frame.
plt.figure(dpi=100)
plt.hist(x.loc[:, "totalMove"])

# **Data Modelling**

Modelling

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from catboost import Pool, CatBoostClassifier
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions this import fails and ConfusionMatrixDisplay should be used instead.
# Kept unchanged because other cells may depend on it - confirm and migrate.
from sklearn.metrics import plot_confusion_matrix
from catboost.utils import get_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# Needed for the splitter below; it was used without being imported.
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import metrics

class_names = ["0-1","1-0","1/2-1/2"]
# Every model is scored with the weighted F1 score.
scorer = metrics.make_scorer(metrics.f1_score, average = "weighted")
# 10 stratified folds repeated 10 times: each cross_val_score call below
# returns an array of 100 per-fold scores.
k = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 10, random_state = 7848654)

LR = LogisticRegression(random_state = 1234, solver = "lbfgs", multi_class = "multinomial", max_iter = 100)
LR_score = cross_val_score(LR, x, y, cv = k, scoring = scorer)

# Seeded like the other models so repeated runs give the same scores.
DT = DecisionTreeClassifier(random_state = 1234)
DT_score = cross_val_score(DT, x, y, cv = k, scoring = scorer)

NN = MLPClassifier(solver = "lbfgs", alpha = 1e-5, hidden_layer_sizes = (100, 30), random_state = 1234)
# Evaluate the neural network itself (previously the decision tree was
# cross-validated a second time and reported as the network's score).
NN_score = cross_val_score(NN, x, y, cv = k, scoring = scorer)

catBoost = CatBoostClassifier(
    iterations = 50,
    learning_rate = 0.1,
    random_strength = 0.1,
    depth = 8,
    loss_function = "MultiClass",
    eval_metric = "Accuracy",
    leaf_estimation_method = "Newton"
)
CB_score = cross_val_score(catBoost, x, y, cv = k, scoring = scorer)

Comparison of model scores

In [None]:
# Each *_score holds one weighted-F1 value per cross-validation fold, so the
# models are compared on the mean across folds. The header names the metric
# actually used by the scorer (weighted F1, not accuracy), and the Decision
# Tree line now reports DT_score instead of repeating the logistic-regression
# result.
LR_mean = LR_score.mean()
DT_mean = DT_score.mean()
NN_mean = NN_score.mean()
CB_mean = CB_score.mean()

print("Comparison of model efficiency base on weighted F1 score")
print("Logistic Regression score  : ", LR_mean , " (", round(LR_mean * 100, 2), "% )")
print("Decision Tree score        : ", DT_mean , " (", round(DT_mean * 100, 2), "% )")
print("Neural Network score       : ", NN_mean , " (", round(NN_mean * 100, 2), "% )")
print("CatBoost score             : ", CB_mean , " (", round(CB_mean * 100, 2), "% )")