In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

In [3]:
## load the dataframe and preview its first row
## (head(1) displays a single row; the path is relative to the notebook's working directory)
df = pd.read_csv('data/chess_games.csv')
df.head(1)

Unnamed: 0,game_id,rated,turns,victory_status,winner,time_increment,white_id,white_rating,black_id,black_rating,moves,opening_code,opening_moves,opening_fullname,opening_shortname,opening_response,opening_variation
0,1,False,13,Out of Time,White,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,5,Slav Defense: Exchange Variation,Slav Defense,,Exchange Variation


In [4]:
## inspect column dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20058 entries, 0 to 20057
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   game_id            20058 non-null  int64 
 1   rated              20058 non-null  bool  
 2   turns              20058 non-null  int64 
 3   victory_status     20058 non-null  object
 4   winner             20058 non-null  object
 5   time_increment     20058 non-null  object
 6   white_id           20058 non-null  object
 7   white_rating       20058 non-null  int64 
 8   black_id           20058 non-null  object
 9   black_rating       20058 non-null  int64 
 10  moves              20058 non-null  object
 11  opening_code       20058 non-null  object
 12  opening_moves      20058 non-null  int64 
 13  opening_fullname   20058 non-null  object
 14  opening_shortname  20058 non-null  object
 15  opening_response   1207 non-null   object
 16  opening_variation  14398 non-null  object

In [5]:
## drop the columns that are not used in the rest of the notebook
## Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this
## cell idempotent: running it a second time is a no-op rather than a KeyError on the
## already-removed columns.
df = df.drop(
    columns=[
        'moves',
        'opening_code',
        'opening_fullname',
        'opening_response',
        'opening_variation',
        'time_increment',
    ],
    errors='ignore',
)

In [6]:
## preview the first rows of the frame after the column drop
df.head()

Unnamed: 0,game_id,rated,turns,victory_status,winner,white_id,white_rating,black_id,black_rating,opening_moves,opening_shortname
0,1,False,13,Out of Time,White,bourgris,1500,a-00,1191,5,Slav Defense
1,2,True,16,Resign,Black,a-00,1322,skinnerua,1261,4,Nimzowitsch Defense
2,3,True,61,Mate,White,ischia,1496,a-00,1500,3,King's Pawn Game
3,4,True,61,Mate,White,daniamurashov,1439,adivanov2009,1454,3,Queen's Pawn Game
4,5,True,95,Mate,White,nik221107,1523,adivanov2009,1469,5,Philidor Defense


In [7]:
## encode the categorical features
## one independent LabelEncoder per column, so each keeps its own classes_ mapping
rated_enc, victory_status_enc, winner_enc, opening_shortname_enc = (
    LabelEncoder() for _ in range(4)
)

In [8]:
## work on an independent (deep) copy so the un-encoded frame `df` stays intact
df_enc = df.copy(deep=True)

In [9]:
## fit each encoder on the original column of `df` and overwrite the matching
## column of `df_enc` with its integer codes (column order is unchanged)
df_enc = df_enc.assign(
    rated=rated_enc.fit_transform(df['rated']),
    victory_status=victory_status_enc.fit_transform(df['victory_status']),
    winner=winner_enc.fit_transform(df['winner']),
    opening_shortname=opening_shortname_enc.fit_transform(df['opening_shortname']),
)

In [10]:
## preview the first rows of the encoded frame
df_enc.head()

Unnamed: 0,game_id,rated,turns,victory_status,winner,white_id,white_rating,black_id,black_rating,opening_moves,opening_shortname
0,1,0,13,2,2,bourgris,1500,a-00,1191,5,110
1,2,1,16,3,0,a-00,1322,skinnerua,1261,4,74
2,3,1,61,1,2,ischia,1496,a-00,1500,3,61
3,4,1,61,1,2,daniamurashov,1439,adivanov2009,1454,3,94
4,5,1,95,1,2,nik221107,1523,adivanov2009,1469,5,83


In [11]:
## check the dtypes after encoding (white_id / black_id are not encoded and stay object)
df_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20058 entries, 0 to 20057
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   game_id            20058 non-null  int64 
 1   rated              20058 non-null  int64 
 2   turns              20058 non-null  int64 
 3   victory_status     20058 non-null  int32 
 4   winner             20058 non-null  int32 
 5   white_id           20058 non-null  object
 6   white_rating       20058 non-null  int64 
 7   black_id           20058 non-null  object
 8   black_rating       20058 non-null  int64 
 9   opening_moves      20058 non-null  int64 
 10  opening_shortname  20058 non-null  int32 
dtypes: int32(3), int64(6), object(2)
memory usage: 1.5+ MB


In [12]:
## show the winner classes; a class's position in this array is its encoded label
winner_enc.classes_


array(['Black', 'Draw', 'White'], dtype=object)

## Build the Decision Tree Classifier model

In [13]:
## Select the features and labels
## NOTE(review): victory_status and turns look like information that only exists once
## a game is over -- confirm they are meant to be predictors of the winner.
X = df_enc.loc[
    :,
    [
        'rated',
        'turns',
        'victory_status',
        'white_rating',
        'black_rating',
        'opening_moves',
        'opening_shortname',
    ],
].to_numpy()
## double brackets keep the label as a single-column 2-D array of shape (n_samples, 1)
Y = df_enc[['winner']].to_numpy()

In [14]:
##Verify the matrices shapes
## (rows must match between X and Y; X has one column per selected feature)
print(f"X dimension:  {X.shape}")
print(f"Y dimension:  {Y.shape}")

X dimension:  (20058, 7)
Y dimension:  (20058, 1)


In [15]:


##Train with cross validation (3-folds)
## Shuffled folds with a fixed seed, so every run evaluates on the same splits.
kf = KFold(n_splits=3, random_state=46, shuffle=True)

## Encoded label ids 0..n_classes-1. Passing them to classification_report keeps each
## report row aligned with winner_enc.classes_ even if a fold happens to miss a class
## (without `labels`, target_names would no longer match the classes found in the fold).
class_ids = list(range(len(winner_enc.classes_)))

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    print(f"  Train: ration={round(len(train_index)/X.shape[0], 2)}")
    print(f"  Test:  ration={round(len(test_index)/X.shape[0], 2)}")

    ##Init the model
    sc = StandardScaler()
    ## Seed the tree as well: scikit-learn permutes the candidate features at every
    ## split, so an unseeded tree can differ between runs even on identical folds.
    ## NOTE(review): the tree is unconstrained (no max_depth / min_samples_leaf); the
    ## saved output shows 1.00 train vs 0.63 test accuracy, i.e. it memorises the fold.
    dt = DecisionTreeClassifier(random_state=46)

    ##Train the model
    ## The scaler is fitted on the training fold only and merely applied to the
    ## evaluation data below, so no test-fold statistics reach the model.
    dt.fit(sc.fit_transform(X[train_index]), Y[train_index])

    ##Evaluate on training
    print("Training Evaluation:")
    y_pred = dt.predict(sc.transform(X[train_index]))
    print("Classifiaction report: \n",classification_report(Y[train_index], y_pred, labels=class_ids, target_names=winner_enc.classes_))

    ##Evaluate on testing
    print("Testing Evaluation:")
    y_pred = dt.predict(sc.transform(X[test_index]))
    print("Classifiaction report: \n",classification_report(Y[test_index], y_pred, labels=class_ids, target_names=winner_enc.classes_))

    print('#'*10)

Fold 0:
  Train: ration=0.67
  Test:  ration=0.33
Training Evaluation:
Classifiaction report: 
               precision    recall  f1-score   support

       Black       1.00      1.00      1.00      6089
        Draw       1.00      1.00      1.00       641
       White       1.00      1.00      1.00      6642

    accuracy                           1.00     13372
   macro avg       1.00      1.00      1.00     13372
weighted avg       1.00      1.00      1.00     13372

Testing Evaluation:
Classifiaction report: 
               precision    recall  f1-score   support

       Black       0.58      0.61      0.59      3018
        Draw       0.95      0.96      0.95       309
       White       0.63      0.61      0.62      3359

    accuracy                           0.63      6686
   macro avg       0.72      0.73      0.72      6686
weighted avg       0.63      0.63      0.63      6686

##########
Fold 1:
  Train: ration=0.67
  Test:  ration=0.33
Training Evaluation:
Classifiaction 