**Import packages**

In [335]:
import pandas as pd
import numpy as np
import re
import chess

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier

**Load the dataset and keep the result and PGN columns**

In [351]:
# Read the raw club games, then narrow the frame to the two result columns and the PGN text.
data = pd.read_csv('../raw_data/club_games_data.csv').loc[
    :, ['white_result', 'black_result', 'pgn']
]

**Extract moves list from a PGN**

In [337]:
def get_moves(pgn):
    """Return the flat list of SAN moves contained in a PGN string.

    Blank lines and header lines (those starting with ``[``) are ignored.
    The remaining movetext is stripped of ``{[%clk ...]}`` clock comments,
    of black move-number markers such as `` 12...`` and of result tokens
    (``1-0``, ``0-1``, ``1/2-1/2``) before being split into moves.

    Parameters
    ----------
    pgn : str
        Full PGN text of a single game.

    Returns
    -------
    list of str
        Moves in playing order (white, black, white, ...). Empty when the
        PGN holds no movetext.
    """
    # keep only the movetext: drop blank lines and '[Tag "value"]' headers,
    # so the result does not depend on a trailing newline being present
    body = [line.strip() for line in pgn.split('\n')]
    turns = ' '.join(line for line in body if line and not line.startswith('['))
    # clean PGN (raw strings: '\{', '\[' ... are invalid escapes in plain literals)
    turns = re.sub(r'\{\[%clk [0-9:.]+\]\}', '', turns)
    turns = re.sub(r' [0-9]+\.\.\.', '', turns)
    turns = re.sub(r'[0-2/]+-[0-2/]+', '', turns)
    moves = []
    # every chunk after an "N. " marker holds white's move and, if played, black's reply
    for turn in re.split(r'[0-9]+\. ', turns):
        # split() without argument absorbs any run of spaces left by the removals,
        # so black's move is no longer lost when more than one space separates the moves
        moves.extend(turn.split()[:2])
    return moves

**Put pieces on squares and append target**

In [338]:
def get_squares(board,target,encode_color=False):
    """Flatten a board into 64 piece codes followed by the target value.

    Parameters
    ----------
    board : chess.Board
        Position to encode; only ``board.piece_at(square)`` is used.
    target : any
        Label appended as the 65th element of the returned list.
    encode_color : bool, default False
        With the default, a square holds ``piece.piece_type`` only, so a
        white and a black piece of the same type get the same code (this is
        the historical encoding). When True, pieces whose ``color`` is falsy
        (black in python-chess) are stored as the negated piece type, so
        the two sides can be told apart.

    Returns
    -------
    list
        65 items: one code per square index 0..63 (0 for an empty square),
        then ``target``.
    """
    squares = []
    for square in range(64):
        piece = board.piece_at(square)
        if piece is None:
            # empty square
            squares.append(0)
        elif encode_color and not piece.color:
            squares.append(-piece.piece_type)
        else:
            squares.append(piece.piece_type)
    squares.append(target)
    return squares

**Get all positions from a moves list and append target**

In [339]:
def get_all_positions(moves,target):
    """Replay a game and encode the board after every move that could be played.

    Parameters
    ----------
    moves : iterable of str
        SAN moves in playing order, starting from the initial position.
    target : any
        Label passed to ``get_squares`` for every encoded position.

    Returns
    -------
    list of list
        One ``get_squares(board, target)`` row per successfully pushed move.
    """
    all_positions = []
    board = chess.Board()
    for san in moves:
        try:
            board.push_san(san)
        except ValueError:
            # python-chess reports invalid, illegal or ambiguous SAN through
            # ValueError (sub)classes; such tokens are skipped as before, but
            # KeyboardInterrupt and genuine bugs are no longer swallowed.
            continue
        all_positions.append(get_squares(board, target))
    return all_positions
#chess.svg.board(board,size=200)

**Transform X and Y into a 64 squares dataset**

In [340]:
def transform(X,y,min_moves=5):
    """Expand a games dataset into one row per board position.

    Parameters
    ----------
    X : pandas.DataFrame
        Games; only the ``"pgn"`` column is read.
    y : sequence
        One target per game, in the same order as the rows of ``X``.
    min_moves : int, default 5
        Games whose extracted move list is shorter than this are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per position as produced by ``get_all_positions``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same length.
    """
    if len(X) != len(y):
        raise ValueError("X and y must have the same number of rows")
    positions = []
    # pair games and targets by position, so the result no longer depends on
    # X and y carrying a freshly reset 0..n-1 index
    for pgn, target in zip(X["pgn"], y):
        # get moves list from pgn
        moves = get_moves(pgn)
        # if enough moves
        if len(moves) >= min_moves:
            # extend in place: `positions = positions + ...` copied the whole
            # list for every game (quadratic in the number of positions)
            positions.extend(get_all_positions(moves, target))
    # return a new dataset
    return pd.DataFrame(positions)

**Keep only finished win/lose games, then shuffle**

In [342]:
# Result labels to exclude; a game is dropped when either side carries one of them.
# NOTE(review): label meanings come from the dataset — confirm the list is complete.
status = ['timeout','repetition','timevsinsufficient','stalemate','insufficient','agreed','threecheck','kingofthehill','50move']
# Seeded shuffle: without random_state every Restart & Run All produced a
# different row order, and therefore a different downstream train sample.
sample = (
    data[~(data['white_result'].isin(status) | data['black_result'].isin(status))]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

**Add game result / target**

In [343]:
# Binary target: 1 when white's result is 'win', 0 for any other value.
sample['result'] = (sample['white_result'] == 'win').astype('int64')

**Define features and target**

In [344]:
# Target first, then every remaining column as the game-level features.
y_game = sample['result']
X_game = sample.drop(columns=['result'])

**Take a small sample of the games to reduce preprocessing and modeling time**

In [345]:
# define the train and test split
X_train_game, X_test_game, y_train_game, y_test_game = train_test_split(X_game,y_game,test_size=0.90,random_state=42)
# reset index (necessary for png extraction)
X_train_game = X_train_game.reset_index(drop=True)
y_train_game = y_train_game.reset_index(drop=True)

**Transform the training games into a position dataset: 64 square columns plus the target**

In [346]:
# One row per board position reached in the sampled training games.
df_preprocessed = transform(X=X_train_game, y=y_train_game)

In [347]:
# Preview the first five encoded positions.
df_preprocessed.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,4,2,3,5,6,3,2,4,1,1,...,1,4,2,3,5,6,3,2,4,0
1,4,2,3,5,6,3,2,4,1,1,...,1,4,2,3,5,6,3,2,4,0
2,4,2,3,5,6,3,2,4,1,1,...,1,4,2,3,5,6,3,2,4,0
3,4,2,3,5,6,3,2,4,1,1,...,1,4,2,3,5,6,3,0,4,0
4,4,0,3,5,6,3,2,4,1,1,...,1,4,2,3,5,6,3,0,4,0


**Define features X and target y for the model training**

In [207]:
## define features and target
sample = df_preprocessed.sample(frac = 1)
X = sample.drop(columns=64)
y = sample[64]

**Train and test split**

In [217]:
# define the train and test split
X_train, X_test, y_train, y_test= train_test_split(X_encoded,y,test_size=0.70,random_state=42)

**Grid search on gradient boosting classifier**

In [316]:
# Single-candidate grid: effectively a 5-fold cross-validation of one configuration.
# NOTE(review): the X_train shape printed at the end of the notebook has 64
# columns, fewer than max_features=200 — confirm this value is intended.
grid_search = GridSearchCV(
    # fixed seed so the per-split feature sampling is repeatable across runs
    GradientBoostingClassifier(random_state=42),
    param_grid={
        'n_estimators':[300],
        'max_depth':[12],
        'learning_rate':[0.1],
        'max_features':[200],
    },
    cv=5,
    verbose=2,
    return_train_score=True,
    scoring='accuracy')

#grid_search.get_params()

**Fit the search and get the best model**

In [317]:
grid_search.fit(X_train, y_train)
# `best_model` is used by the following cells but was never assigned anywhere;
# with the default refit=True the search exposes the refitted winner here.
best_model = grid_search.best_estimator_
grid_search.best_params_, grid_search.best_score_

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.1, max_depth=12, max_features=200, n_estimators=300; total time=26.2min
[CV] END learning_rate=0.1, max_depth=12, max_features=200, n_estimators=300; total time= 2.5min
[CV] END learning_rate=0.1, max_depth=12, max_features=200, n_estimators=300; total time= 2.5min
[CV] END learning_rate=0.1, max_depth=12, max_features=200, n_estimators=300; total time= 2.5min
[CV] END learning_rate=0.1, max_depth=12, max_features=200, n_estimators=300; total time= 2.5min


({'learning_rate': 0.1,
  'max_depth': 12,
  'max_features': 200,
  'n_estimators': 300},
 0.8624126553887344)

**Test the accuracy on test split**

In [320]:
# Mean accuracy of the selected model on the held-out positions.
best_model.score(X=X_test, y=y_test)

0.8763305032097742

**Make some predictions on random sample from the test split**

In [353]:
# Class probabilities for (up to) 1000 test positions taken after shuffling the test rows.
best_model.predict_proba(X_test.sample(frac=1).head(1000))

array([[0.98806922, 0.01193078],
       [0.4171383 , 0.5828617 ],
       [0.4483136 , 0.5516864 ],
       ...,
       [0.97529378, 0.02470622],
       [0.13888465, 0.86111535],
       [0.00500847, 0.99499153]])

**Compare the shape of test vs train**

In [324]:
# (rows, columns) of the test split, then of the train split.
(X_test.shape, X_train.shape)

((169015, 64), (72434, 64))