**Import packages**

In [1]:
import pandas as pd
import numpy as np
import re
import chess

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier

# import from chess winner package 
from chess_winner.board import Board64
from chess_winner.utils import transform_dataset

# Preprocess dataset

**Extract from the dataset**

In [2]:
# Load the raw club games, keeping only the two result columns and the PGN text.
columns_needed = ['white_result', 'black_result', 'pgn']
data = pd.read_csv('../raw_data/club_games_data.csv').loc[:, columns_needed]

**Keep only finished win/lose games, then shuffle**

In [3]:
# Result codes to exclude: a game is dropped when either side carries one of them.
status = ['timeout','repetition','timevsinsufficient','stalemate','insufficient','agreed','threecheck','kingofthehill','50move']
is_excluded = data['white_result'].isin(status) | data['black_result'].isin(status)
# Shuffle with a fixed seed so a kernel restart reproduces the same row order,
# then renumber the rows from 0.
sample = data[~is_excluded].sample(frac=1, random_state=42).reset_index(drop=True)

**Add game result / target**

In [4]:
# Binary target: 1 when white's result is 'win', 0 for anything else.
sample['result'] = sample['white_result'].eq('win').astype('int64')

**Define the features and target for preprocessing (the full dataset is used here; take a sample instead to reduce the preprocessing and modelling time)**

In [5]:
# Target first, then the inputs: every column of `sample` except the target.
y_train_game = sample['result']
X_train_game = sample.drop(columns=['result'])

**Transform the train dataset into a usable 64-column dataset**

In [21]:
# Run the project's transform_dataset helper on the game features and target.
# NOTE(review): the positional arguments 50 and 100 are passed straight through;
# their meaning is not visible in this notebook — confirm against chess_winner.utils.
df_preprocessed = transform_dataset(
    X_train_game,
    y_train_game,
    50,
    100,
)

**Store preprocessed data into csv**

In [22]:
# Cache the preprocessed table on disk (without the index) so the training
# section can start from the file instead of re-running the transform.
df_preprocessed.to_csv(
    '../raw_data/club_games_data_preprocessed.csv',
    index=False,
)

# Model Training part


**Get the preprocessed dataset**

In [24]:
# Reload the cached preprocessed table from disk.
preprocessed_path = '../raw_data/club_games_data_preprocessed.csv'
df_preprocessed = pd.read_csv(preprocessed_path)
# Relabel the 65 columns with the integers 0..64 so they can be addressed by number.
df_preprocessed.columns = range(65)

**Define features X and target y for the model training**

In [25]:
## shuffle the rows with a fixed seed: without it the row order changes on every
## run, so the seeded train/test split further down would not be reproducible
sample = df_preprocessed.sample(frac=1, random_state=42)
## define features (every column except 64) and target (column 64)
X = sample.drop(columns=64)
y = sample[64]

**Train and test split**

In [42]:
# Hold out 78% of the rows for testing; the remainder is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.78,
    random_state=42,
)

In [43]:
# Display the (rows, columns) of the training features as a sanity check on the split.
X_train.shape

(299861, 64)

**Grid search on gradient boosting classifier**

In [28]:
# Cross-validated search over gradient boosting hyper-parameters, scored on accuracy.
# The grid currently holds a single candidate, so this amounts to a 5-fold
# cross-validation of that one configuration.
grid_search = GridSearchCV(
    # Fixed seed so repeated runs build the same trees and report the same scores.
    GradientBoostingClassifier(random_state=42),
    param_grid={
        'n_estimators': [300],
        'max_depth': [14],
        'learning_rate': [0.05],
    },
    cv=5,
    verbose=2,  # print one line per fitted fold
    return_train_score=True,
    scoring='accuracy',
)

**Fit the search and get the best model**

In [29]:
# Run the cross-validated fit on the training split.
grid_search.fit(X_train, y_train)
# Display the selected parameter set together with its cross-validation score.
(grid_search.best_params_, grid_search.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .learning_rate=0.05, max_depth=14, n_estimators=300; total time= 6.6min
[CV] END .learning_rate=0.05, max_depth=14, n_estimators=300; total time= 6.2min
[CV] END .learning_rate=0.05, max_depth=14, n_estimators=300; total time= 6.2min
[CV] END .learning_rate=0.05, max_depth=14, n_estimators=300; total time= 6.3min
[CV] END .learning_rate=0.05, max_depth=14, n_estimators=300; total time= 6.2min


({'learning_rate': 0.05, 'max_depth': 14, 'n_estimators': 300},
 0.7269699192956713)

**Test the accuracy on test split**

In [30]:
# Keep a handle on the estimator selected by the grid search for the evaluation cells below.
best_model = grid_search.best_estimator_

In [31]:
# Score the selected model on the held-out test split.
best_model.score(X_test,y_test)

0.743464810202942

In [None]:
%%time
BM = GradientBoostingClassifier(max_depth=14,n_estimators=300,learning_rate=0.5).fit(X_train, y_train)

In [136]:
# Score the standalone model on the held-out test split.
BM.score(X_test,y_test)

0.7586885192446432

**Make some predictions on random sample from the test split**

In [112]:
# Predict class probabilities for up to 1000 randomly drawn test rows.
# Drawing the rows directly avoids shuffling a copy of the whole test set just
# to keep its first 1000 rows; the size is clamped so a shorter test set still
# works, and the fixed seed keeps the displayed probabilities reproducible.
best_model.predict_proba(
    X_test.sample(n=min(1000, len(X_test)), random_state=42)
)

array([[0.04656869, 0.95343131],
       [0.66873966, 0.33126034],
       [0.08090063, 0.91909937],
       ...,
       [0.67111829, 0.32888171],
       [0.68754917, 0.31245083],
       [0.34761353, 0.65238647]])

**Compare the shape of test vs train**

In [114]:
# Display the (rows, columns) of the test and training features side by side.
# NOTE(review): the saved output of this cell reports 109040 training rows, while the
# output saved right after the split cell reports 299861 — the cells were run against
# different splits; restart the kernel and run all cells to refresh the outputs.
X_test.shape,X_train.shape

((1253965, 64), (109040, 64))