**Import packages**

In [2]:
import bz2
import pickle
import re

import chess
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

# import from chess winner package
from chess_winner.board import Board64
from chess_winner.utils import transform_dataset

# Preprocess dataset

**Extract from the dataset**

In [10]:
# Load the raw club games export; the path is relative to the notebook's
# working directory.
data = pd.read_csv('../raw_data/club_games_data.csv')

In [9]:
# Preview the final position (FEN) and White's result for one random game.
# Only a single row is drawn instead of shuffling the whole frame, and the
# former bare `pd.options.display.large_repr` statement was dropped because
# it neither changed an option nor displayed anything.
data[['fen','white_result']].sample(n=1).iloc[0]

fen             r7/pp1Bkpp1/5q1p/4n3/3Q4/7P/PPP3P1/1K1RR3 b - -
white_result                                                win
Name: 49186, dtype: object

In [11]:
# Keep only the columns used from here on: both players' results and the PGN.
# NOTE: this rebinds `data`, so any cell that reads other columns (e.g. 'fen')
# must be run before this one.
data = data[['white_result','black_result','pgn']]

**Keep only finished win/lose games, then shuffle**

In [12]:
# Result codes to exclude. A game is dropped as soon as either side's result
# is one of these, so only games where neither result appears in the list
# are kept.
status = ['timeout','repetition','timevsinsufficient','stalemate','insufficient','agreed','threecheck','kingofthehill','50move']
is_excluded = data['white_result'].isin(status) | data['black_result'].isin(status)

# Shuffle with a fixed seed so the order of the games fed to preprocessing is
# reproducible across kernel restarts, then renumber the rows from 0.
sample = data[~is_excluded].sample(frac=1, random_state=42).reset_index(drop=True)

**Add game result / target**

In [13]:
# Binary target: 1 when White's result is exactly 'win', 0 for anything else.
white_won = sample['white_result'] == 'win'
sample['result'] = white_won.astype('int64')

**Get a data sample to reduce the preprocessing and modeling time**

In [14]:
# Separate the target column from the remaining game columns.
y_train_game = sample['result']
X_train_game = sample.drop('result', axis=1)

**Transform the train dataset into a usable dataset of 64 feature columns plus the target column**

In [17]:
# Build the preprocessed frame from the games and their targets with the
# project helper chess_winner.utils.transform_dataset.
# NOTE(review): the positional arguments 50 and 200 are not explained in this
# notebook — confirm their meaning against transform_dataset and consider
# naming them as constants.
df_preprocessed = transform_dataset(X_train_game,y_train_game,50,200)

**Store preprocessed data into csv**

In [18]:
# Persist the preprocessed frame without the row index so the training part
# can start from this file instead of re-running the preprocessing.
df_preprocessed.to_csv(
    '../raw_data/club_games_data_preprocessed.csv',
    index=False,
)

# Model Training part


**Get the preprocessed dataset**

In [19]:
# Reload the preprocessed data written by the preprocessing part.
df_preprocessed = pd.read_csv('../raw_data/club_games_data_preprocessed.csv')

# Relabel the 65 columns with the integers 0..64 so later cells can address
# the target as column 64 (raises if the file does not have exactly 65 columns).
df_preprocessed.columns = range(65)

In [20]:
# Display the preprocessed frame (pandas shows a truncated view of large frames).
df_preprocessed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,4,2,3,5,6,3,2,4,1,1,...,7,10,8,9,11,12,9,8,10,0
1,4,2,3,5,6,3,2,4,1,1,...,7,10,8,9,11,12,9,8,10,0
2,4,2,3,5,6,3,0,4,1,1,...,7,10,8,9,11,12,9,8,10,0
3,4,2,3,5,6,3,0,4,1,1,...,7,10,0,9,11,12,9,8,10,0
4,4,2,3,5,6,0,0,4,1,1,...,7,10,0,9,11,12,9,8,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1788243,0,0,0,0,0,4,0,0,1,0,...,7,0,0,0,0,0,10,12,0,1
1788244,0,0,0,0,0,4,0,0,1,0,...,7,0,0,0,0,0,10,12,0,1
1788245,0,0,0,0,0,4,0,0,1,0,...,7,0,0,0,0,0,10,12,0,1
1788246,0,0,0,0,0,4,0,0,1,0,...,7,0,0,0,0,0,10,12,0,1


**Define features X and target y for the model training**

In [21]:
## shuffle sample
# The shuffle is seeded: train_test_split below uses a fixed random_state, but
# its result is only reproducible if the row order it receives is fixed too.
sample = df_preprocessed.sample(frac = 1, random_state=42)
## define features and target
# Columns 0..63 are the features; column 64 is the target.
X = sample.drop(columns=64)
y = sample[64]

**Train and test split**

In [44]:
# Hold out half of the rows for testing; the fixed random_state makes the
# split repeatable for a given input order.
# NOTE(review): consecutive rows of the preprocessed frame look like successive
# positions with the same target (presumably from the same game). A row-wise
# random split may therefore put near-duplicate positions in both train and
# test and inflate the test accuracy — confirm, and consider splitting by game.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.50,
    random_state=42,
)

In [45]:
# Sanity check: (rows, feature columns) of the training split.
X_train.shape

(894124, 64)

**Grid search on gradient boosting classifier**

In [28]:
# Every hyper-parameter list holds a single value, so this "grid" is one
# candidate: the search amounts to a 5-fold cross-validated accuracy estimate
# for that configuration. Add more values to a list to actually search.
gb_param_grid = {
    'learning_rate': [0.05],
    'max_depth': [14],
    'n_estimators': [300],
}

grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=gb_param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    verbose=2,
)

**Fit the search and get the best model**

In [29]:
# Run the cross-validated search on the training split.
grid_search.fit(X_train, y_train)

# Show the best parameter set together with its mean cross-validated score.
(grid_search.best_params_, grid_search.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .learning_rate=0.05, max_depth=14, n_estimators=300; total time= 6.6min
[CV] END .learning_rate=0.05, max_depth=14, n_estimators=300; total time= 6.2min
[CV] END .learning_rate=0.05, max_depth=14, n_estimators=300; total time= 6.2min
[CV] END .learning_rate=0.05, max_depth=14, n_estimators=300; total time= 6.3min
[CV] END .learning_rate=0.05, max_depth=14, n_estimators=300; total time= 6.2min


({'learning_rate': 0.05, 'max_depth': 14, 'n_estimators': 300},
 0.7269699192956713)

**Test the accuracy on test split**

In [30]:
# Estimator selected by the search. `refit` is left at its default above, so
# this is the model refit on the whole training split with the best parameters.
best_model = grid_search.best_estimator_

In [31]:
# Mean accuracy of the grid-search model on the held-out test split.
best_model.score(X_test,y_test)

0.743464810202942

**Train the best model directly**

In [46]:
%%time
# Fit a gradient boosting classifier on the training split without going
# through the grid search (the cell magic above must stay on the first line).
# NOTE(review): learning_rate is 0.5 here, whereas the grid search above
# evaluated (and reported as best) learning_rate=0.05 with the same max_depth
# and n_estimators. Confirm which value is intended before relying on `BM` as
# "the best model"; it is left unchanged because the saved model depends on it.
BM = GradientBoostingClassifier(
    learning_rate=0.5,
    n_estimators=300,
    max_depth=14,
).fit(X_train, y_train)

CPU times: user 1h 15min 30s, sys: 108 ms, total: 1h 15min 31s
Wall time: 1h 15min 30s


In [47]:
# Mean accuracy of the directly trained model on the held-out test split.
BM.score(X_test,y_test)

0.9050836349320676

**Save the best model**

In [48]:
# Save the fitted model `BM` as a bz2-compressed pickle.
# `bz2` and `pickle` are imported in the notebook's import cell. The public
# `pickle` module is used instead of the private `_pickle` accelerator, which
# `pickle` already uses internally when it is available.
# Security note: only unpickle this file from a trusted location, since loading
# a pickle can execute arbitrary code.
model_pkl_file_compressed = "../backend/model/chess_winner_classifier_model.pbz2"

with bz2.BZ2File(model_pkl_file_compressed, 'wb') as file:
    pickle.dump(BM, file)

In [38]:
# Debug helper: print the kernel's working directory, against which the
# relative '../' paths used in this notebook are resolved.
!pwd

/home/farid/code/faruto33/chess_winner/notebook


**Make some predictions on random sample from the test split**

In [94]:
# Class probabilities for up to 1000 randomly ordered test rows; the two
# output columns follow the order of `best_model.classes_`.
# NOTE(review): this scores `best_model` (the grid-search estimator), not the
# separately trained `BM` that is saved to disk — confirm which one is meant.
best_model.predict_proba(
    X_test.sample(frac=1).head(1000)
)

array([[0.55512813, 0.44487187],
       [0.4826486 , 0.5173514 ],
       [0.72982659, 0.27017341],
       ...,
       [0.61838287, 0.38161713],
       [0.19915088, 0.80084912],
       [0.63667307, 0.36332693]])

**Compare the shape of test vs train**

In [95]:
# Displays (test shape, train shape).
# NOTE(review): with test_size=0.50 in the split cell, both splits should have
# (nearly) the same number of rows; if they differ, X_train / X_test came from
# a different run of the split and the notebook should be re-run top to bottom.
X_test.shape,X_train.shape

((863192, 64), (499813, 64))