**Import packages**

In [1]:
import pandas as pd
import numpy as np
import re
import chess
import os

from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from xgboost import XGBClassifier

# import from chess winner package 
from chess_winner.board import Board64
from chess_winner.utils import fen_to_input_columns,transform_dataset,symbols_to_dict,input_columns_to_fen,dict_to_symbols
import dotenv

# Preprocess dataset

**Extract from the dataset**

In [2]:
# Read the raw club-games CSV export into a DataFrame.
raw_games_csv = '../raw_data/club_games_data.csv'
data = pd.read_csv(raw_games_csv)

In [3]:
# Peek at one random game: its final position (FEN) and White's result.
# Drawing a single row with n=1 avoids shuffling the entire frame (frac=1)
# only to keep its first row; the displayed Series is of the same form.
data[['fen','white_result']].sample(n=1).iloc[0]

fen             rn3rk1/ppq1ppbp/5np1/8/1Q1N4/2PPP1P1/PP3P1P/RN...
white_result                                             resigned
Name: 7153, dtype: object

In [7]:
# Narrow the frame to the two per-side result columns and the PGN text.
kept_columns = ['white_result', 'black_result', 'pgn']
data = data[kept_columns]

**Keep only finished win/loss games, then shuffle**

In [8]:
# Result labels that exclude a game from the training sample
# (checked against both the white and the black result).
status = [
    'abandoned',
    'resigned',
    'timeout',
    'repetition',
    'timevsinsufficient',
    'stalemate',
    'insufficient',
    'agreed',
    'threecheck',
    'kingofthehill',
    '50move',
]

In [9]:
# Drop every game where either side's result is one of the excluded statuses,
# then shuffle the remaining rows and renumber the index from 0.
white_excluded = data['white_result'].isin(status)
black_excluded = data['black_result'].isin(status)
sample = (
    data[~(white_excluded | black_excluded)]
    .sample(frac=1)
    .reset_index(drop=True)
)

**Add game result / target**

In [10]:
# Binary target: 1 when White's result is 'win', 0 for any other value
# (including missing values, which compare unequal to 'win').
# A vectorised comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the column dtype independent of the platform default int.
sample['result'] = (sample['white_result'] == 'win').astype('int64')

**Split the games into features and target (a subset is taken later to reduce the preprocessing and modelling time)**

In [11]:
# Separate the target column from the remaining game columns.
target_column = 'result'
y_train_game = sample[target_column]
X_train_game = sample.drop(columns=[target_column])

**Define environment variables for the chess engine**

In [12]:
# Chess-engine settings exposed as environment variables (values are strings).
# NOTE(review): presumably read by the chess_winner package — confirm there.
engine_settings = {
    "ENGINE_DEPTH": "5",
    "ENGINE_TIMEOUT": "0.1",
    "ENGINE_MATE_SCORE": "10000",
}
for setting_name, setting_value in engine_settings.items():
    os.environ[setting_name] = setting_value
# Echo the three settings back as the cell output, in the order set above.
tuple(os.environ.get(setting_name) for setting_name in engine_settings)

('5', '0.1', '10000')

**Transform the train dataset into a usable dataset with 66 feature columns plus the target**

In [None]:
%%time
df_preprocessed = transform_dataset(X_train_game[0:100],y_train_game[0:100],0,1000)

**Store preprocessed data into csv**

In [20]:
# Write the preprocessed frame to CSV without the index column.
preprocessed_csv = '../raw_data/club_games_data_preprocessed_symbols.csv'
df_preprocessed.to_csv(preprocessed_csv, index=False)

In [13]:
# Display the preprocessed frame. It only exists once the transform cell has
# run in the current kernel; otherwise this raises NameError.
df_preprocessed

NameError: name 'df_preprocessed' is not defined

# Model Training part


**New dataset with user turn and symbols**

In [14]:
# Reload the preprocessed games from disk and relabel the columns with the
# integers 0..66 so they can be addressed by position-like names.
preprocessed_symbols_csv = '../raw_data/club_games_data_preprocessed_symbols.csv'
df_preprocessed_symbols = pd.read_csv(preprocessed_symbols_csv)
df_preprocessed_symbols.columns = range(67)

**Replace the symbols with categorical values**

In [15]:
# Convert the symbol-valued frame with the project helper symbols_to_dict.
# NOTE(review): presumably maps piece symbols to integer codes — confirm in chess_winner.utils.
df = symbols_to_dict(df_preprocessed_symbols)

In [16]:
# Show the first five converted rows as a quick sanity check.
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,66
0,4,2,3,5,6,3,2,4,1,1,...,12,13,15,16,13,12,14,1,54,1
1,4,2,3,5,6,3,2,4,1,1,...,12,13,15,16,13,12,14,0,68,1
2,4,2,3,5,6,3,2,4,1,1,...,12,13,15,16,13,12,14,1,226,1
3,4,2,3,5,6,3,0,4,1,1,...,12,13,15,16,13,12,14,0,48,1
4,4,2,3,5,6,3,0,4,1,1,...,0,13,15,16,13,12,14,1,69,1


**Define features X and target y for the model training**

In [17]:
## shuffle the rows with a fixed seed: an unseeded shuffle here changes the row
## order on every run, which makes the seeded train/test split that follows
## non-reproducible despite its random_state
sample = df.sample(frac=1, random_state=42)
## define features (every column except 66) and target (column 66)
X = sample.drop(columns=66)
y = sample[66]

**Train and test split**

In [18]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable for a given input row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [22]:
# Show the dimensions of the training features and the training target.
X_train.shape,y_train.shape

((890178, 66), (890178,))

**Grid search on the XGBoost classifier**

In [None]:
# Hyper-parameter search for the XGBoost classifier, scored by 5-fold
# cross-validated accuracy.
# GridSearchCV needs an estimator instance. The previous code passed the
# undefined name `XGBoost`, which raises NameError; XGBClassifier is the class
# this notebook imports from xgboost.
grid_search = GridSearchCV(
    XGBClassifier(),
    param_grid={
        'n_estimators':[300],
        'max_depth':[2,3,4,5,6],
        'learning_rate':[0.1,0.01],
        'enable_categorical': [True]
    },
    cv=5,
    verbose=2,
    return_train_score=True,
    scoring='accuracy')

In [None]:
# Run the search on the first 10,000 training rows only, then show the best
# parameter combination together with its cross-validated score.
X_search = X_train[0:10000]
y_search = y_train[0:10000]
grid_search.fit(X_search, y_search)
(grid_search.best_params_, grid_search.best_score_)

In [None]:
# Keep the best estimator found by the search and score it on the first
# 1,000 held-out rows.
best_model = grid_search.best_estimator_
X_eval = X_test[0:1000]
y_eval = y_test[0:1000]
best_model.score(X_eval, y_eval)

**Grid search on gradient boosting classifier**

In [None]:
# Single-candidate grid for scikit-learn's gradient boosting classifier,
# scored by 5-fold cross-validated accuracy. Rebinding grid_search replaces
# the search object created for XGBoost.
gboost_param_grid = {
    'n_estimators': [300],
    'max_depth': [12],
    'learning_rate': [0.1],
}
grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=gboost_param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    verbose=2,
)

In [None]:
# Run the search on the first 5,000 training rows only, then show the best
# parameter combination together with its cross-validated score.
X_search = X_train[0:5000]
y_search = y_train[0:5000]
grid_search.fit(X_search, y_search)
(grid_search.best_params_, grid_search.best_score_)

**Voting and stacking**

In [None]:
# Base learners for the ensemble.
gboost = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=12,
    learning_rate=0.1,
    verbose=1,
)
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=3,
    learning_rate=0.1,
    enable_categorical=True,
)

# Weighted vote: gboost's vote counts twice as much as xgboost's (20 vs 10).
base_estimators = [("gboost", gboost), ('xgboost', xgb)]
model = VotingClassifier(
    estimators=base_estimators,
    weights=[20, 10],
    n_jobs=-1,
    verbose=1,
)
# Stacking alternative, kept disabled:
# model = StackingClassifier(
#     estimators=[("gboost", gboost),('xgboost',xgb)],    
#     #final_estimator=LinearRegression(),
#     cv=5,
#     n_jobs=-1
# )


In [None]:
# Fit the voting ensemble on the first 100,000 training rows.
X_fit = X_train[0:100000]
y_fit = y_train[0:100000]
voted_model = model.fit(X_fit, y_fit)

**Test the accuracy on test split**

In [None]:
# Score the fitted voting ensemble on the full held-out test split.
voted_model.score(X_test,y_test)

**Train the best model directly**

In [None]:
%%time
GBC = GradientBoostingClassifier(max_depth=14,n_estimators=300,learning_rate=0.1,verbose=1).fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           1.3096           98.51m
         2           1.2455           96.58m
         3           1.1914           97.10m
         4           1.1442           97.01m
         5           1.1033           96.33m
         6           1.0665           96.50m
         7           1.0339           96.66m
         8           1.0050           97.25m
         9           0.9789           97.34m
        10           0.9553           97.23m
        20           0.7959           92.74m
        30           0.7094           89.06m
        40           0.6452           85.81m
        50           0.6034           81.12m
        60           0.5595           77.77m
        70           0.5257           73.69m
        80           0.4983           70.67m
        90           0.4739           68.61m
       100           0.4539           65.22m


In [67]:
# Score the final gradient boosting model on the held-out test split.
GBC.score(X_test,y_test)

0.8880855557303018

**Save the best model**

In [68]:
import pickle
import bz2
import _pickle as cPickle

# Destination of the bz2-compressed pickle holding the trained classifier.
model_pkl_file_compressed = "../backend/model/chess_winner_classifier_model.pbz2"

# Serialise GBC straight into the compressed stream; the context manager
# closes the file even if dumping fails.
with bz2.BZ2File(model_pkl_file_compressed, 'wb') as compressed_out:
    cPickle.dump(GBC, compressed_out)

**Make some predictions on random sample from the test split**

In [70]:
# Class probabilities for up to 1,000 random held-out rows.
# Sampling n rows directly avoids shuffling all of X_test only to slice off
# the first 1,000; the min() keeps the old behaviour of returning fewer rows
# when the test split is smaller than that.
GBC.predict_proba(X_test.sample(n=min(1000, len(X_test))))

array([[4.97173228e-01, 5.02826772e-01],
       [1.14909391e-04, 9.99885091e-01],
       [8.38767357e-01, 1.61232643e-01],
       ...,
       [1.33999046e-02, 9.86600095e-01],
       [1.52112517e-03, 9.98478875e-01],
       [6.80982348e-02, 9.31901765e-01]])

**Compare the shape of test vs train**

In [69]:
# Show the dimensions of the test and train feature frames side by side.
X_test.shape,X_train.shape

((222545, 66), (890178, 66))