In [1]:
import pandas as pd
import numpy as np
import re
import chess

from scipy.stats import randint, uniform

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from xgboost import XGBClassifier


# import from chess winner package
from chess_winner.board import Board64
from chess_winner.utils import transform_dataset

# Preprocess dataset

**Extract from the dataset**

In [2]:
data = pd.read_csv('../raw_data/club_games_data.csv')
data = data[['white_result','black_result','pgn']]

In [3]:
data.head(5)

Unnamed: 0,white_result,black_result,pgn
0,win,checkmated,"[Event ""Enjoyable games 2 - Round 1""]\n[Site ""..."
1,win,resigned,"[Event ""Rapid Rats - Board 5""]\n[Site ""Chess.c..."
2,win,resigned,"[Event ""CHESS BOARD CLASH - Round 1""]\n[Site ""..."
3,checkmated,win,"[Event ""CHESS BOARD CLASH - Round 1""]\n[Site ""..."
4,win,resigned,"[Event ""CHESS BOARD CLASH - Round 1""]\n[Site ""..."


**Keep only finished win/loose game, then shuffle**

In [4]:
status = ['timeout','repetition','timevsinsufficient','stalemate','insufficient','agreed','threecheck','kingofthehill','50move']
sample = data[(~data['white_result'].isin(status))&(~data['black_result'].isin(status))].sample(frac=1).reset_index(drop=True)

**Add game result / target**

In [5]:
sample['result'] = sample['white_result'].map(lambda X: 1 if X=='win' else 0)

**Get a data sample to reduce the preprocessing and modelizing time**

In [6]:
X_train_game = sample.drop(columns='result')
y_train_game = sample['result']

**Transform the train dataset into a usable 64 columns dataset**

In [7]:
df_preprocessed = transform_dataset(X_train_game,y_train_game,50,100)

**Store preprocessed data into csv**

In [8]:
df_preprocessed.to_csv('../raw_data/club_games_data_preprocessed.csv',index=False)

# Model Training part

**Get the preprocessed dataset**

In [9]:
df_preprocessed = pd.read_csv('../raw_data/club_games_data_preprocessed.csv')
df_preprocessed.columns=range(0,65)

**Define features X and target y for the model training**

In [10]:
## shuffle sample
sample = df_preprocessed.sample(frac = 1)
## define features and target
X = sample.drop(columns=64)
y = sample[64]

**Train and test split**

In [11]:
# define the train and test split
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.6333,random_state=42)

In [12]:
X_train.shape

(499813, 64)

**RandomizedSearchCV**

In [13]:
param_dist = {
    "n_estimators": randint(100, 500),
    "max_depth": randint(15, 20),
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "min_samples_split": randint(2, 11),
    "min_samples_leaf": randint(1, 11),
}

# continue training the model by increasing n_estimators and adjusting learning_rate
gbm = GradientBoostingClassifier(warm_start=True)

random_search = RandomizedSearchCV(
    gbm, param_distributions=param_dist,
    n_iter=2, cv=5, verbose=2, return_train_score=True,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy')

In [14]:
%%time
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] END learning_rate=0.1, max_depth=18, min_samples_leaf=8, min_samples_split=6, n_estimators=202; total time=126.9min
[CV] END learning_rate=0.1, max_depth=18, min_samples_leaf=8, min_samples_split=6, n_estimators=202; total time=126.9min
[CV] END learning_rate=0.1, max_depth=18, min_samples_leaf=8, min_samples_split=6, n_estimators=202; total time=127.0min
[CV] END learning_rate=0.1, max_depth=18, min_samples_leaf=8, min_samples_split=6, n_estimators=202; total time=127.3min
[CV] END learning_rate=0.1, max_depth=18, min_samples_leaf=8, min_samples_split=6, n_estimators=202; total time=127.4min
[CV] END learning_rate=0.05, max_depth=17, min_samples_leaf=7, min_samples_split=9, n_estimators=472; total time=187.4min
[CV] END learning_rate=0.05, max_depth=17, min_samples_leaf=7, min_samples_split=9, n_estimators=472; total time=187.5min
[CV] END learning_rate=0.05, max_depth=17, min_samples_leaf=7, min_samples_split=9, n_estim

In [15]:
best_parameters = random_search.best_params_
print("Best parm found: ", best_parameters)

best_score = random_search.best_score_
print("Best score : ", best_score)

Best parm found:  {'learning_rate': 0.05, 'max_depth': 17, 'min_samples_leaf': 7, 'min_samples_split': 9, 'n_estimators': 472}
Best score :  0.8712758586526036


In [16]:
best_model = random_search.best_estimator_

test_score = best_model.score(X_test, y_test)
print("Score on test set : ", test_score)

Score on test set :  0.8851379530857562


## Targeted research

In [None]:
param_dist = {
    "n_estimators": randint(190, 220),
    "max_depth": randint(16, 20),
    "learning_rate": [0.05, 0.1, 0.15],
    "min_samples_split": randint(4, 9),
    "min_samples_leaf": randint(6, 10),
}

gbm = GradientBoostingClassifier(random_state=42)

random_search_targeted = RandomizedSearchCV(
    gbm, param_distributions=param_dist,
    n_iter=5,  # Compromis entre exploration et temps de calcul
    cv=5,  # Moins de folds pour accélérer
    verbose=2, return_train_score=True,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy')

best_parameters_targeted = random_search_targeted.best_params_
print("Best parm found: ", best_parameters_targeted)

best_score_targeted = random_search_targeted.best_score_
print("Best score : ", best_score_targeted)

In [None]:
best_model_targeted = random_search_targeted.best_estimator_

test_score_targeted = best_model_targeted.score(X_test, y_test)
print("Score on test set : ", test_score_targeted)

# Essai autres model classification

In [17]:
xgb = XGBClassifier(
    n_estimators=350, max_depth=20, learning_rate=0.15,
    subsample=0.8,colsample_bytree=0.8,
    random_state=42, eval_metric='logloss')

xgb.fit(X_train, y_train)

In [18]:
scores_xgb = cross_val_score(xgb, X_train, y_train, cv=5, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_xgb.mean(), scores_xgb.std() * 2))
print("Accuracy: ", scores_xgb.mean())

Accuracy: 0.88 (+/- 0.00)
Accuracy:  0.8797730305768445


**Random_Search on xgb**

In [19]:
param_dist_xgb = {
    'n_estimators': randint(300, 500),
    'max_depth': randint(15, 30),
    'learning_rate': uniform(0.01, 0.2),
    'subsample': uniform(0.5, 0.8),
    'colsample_bytree': uniform(0.5, 0.8),
    'gamma': uniform(0, 0.5),
    'lambda': uniform(1, 10),
    'alpha': uniform(0, 10)
}

In [20]:
xgb_tuned = XGBClassifier( n_estimators=350, max_depth=20, learning_rate=0.15,
    subsample=0.8,colsample_bytree=0.8,
    random_state=42, eval_metric='logloss')

In [21]:
random_search_xgb_tuned = RandomizedSearchCV(
    estimator=xgb_tuned,
    param_distributions=param_dist_xgb,
    n_iter=5,
    scoring='accuracy',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [22]:
random_search_xgb_tuned.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END alpha=3.745401188473625, colsample_bytree=1.2605714451279328, gamma=0.36599697090570255, lambda=6.986584841970366, learning_rate=0.0412037280884873, max_depth=17, n_estimators=374, subsample=0.8673991135726937; total time=   2.7s
[CV] END alpha=3.745401188473625, colsample_bytree=1.2605714451279328, gamma=0.36599697090570255, lambda=6.986584841970366, learning_rate=0.0412037280884873, max_depth=17, n_estimators=374, subsample=0.8673991135726937; total time=   3.0s
[CV] END alpha=3.745401188473625, colsample_bytree=1.2605714451279328, gamma=0.36599697090570255, lambda=6.986584841970366, learning_rate=0.0412037280884873, max_depth=17, n_estimators=374, subsample=0.8673991135726937; total time=   3.1s
[CV] END alpha=3.745401188473625, colsample_bytree=1.2605714451279328, gamma=0.36599697090570255, lambda=6.986584841970366, learning_rate=0.0412037280884873, max_depth=17, n_estimators=374, subsample=0.8673991135726937; tot

15 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/habbarsofiane/.pyenv/versions/3.10.6/envs/chess_winner/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/habbarsofiane/.pyenv/versions/3.10.6/envs/chess_winner/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/Users/habbarsofiane/.pyenv/versions/3.10.6/envs/chess_winner/lib/python3.10/site-packages/xgboost/sklearn.py", line 1519, in fit
    self._Booster = train(
  File "/Users/habbarsofiane/.pyenv/vers

In [25]:
print("Meilleurs paramètres trouvés: ", random_search_xgb_tuned.best_params_)
print("Meilleur score: ", random_search_xgb_tuned.best_score_)

# Utilisation du meilleur modèle
best_model_xgb_tuned = random_search_xgb_tuned.best_estimator_

# Évaluation du meilleur modèle sur le jeu de test
accuracy_xgb_tuned = best_model_xgb_tuned.score(X_test, y_test)
print("Precision test set:  ", accuracy_xgb_tuned)

Meilleurs paramètres trouvés:  {'alpha': 3.337086111390218, 'colsample_bytree': 0.6142934543375527, 'gamma': 0.32544423647442644, 'lambda': 1.5641157902710026, 'learning_rate': 0.15439975445336496, 'max_depth': 28, 'n_estimators': 337, 'subsample': 0.5006230126728115}
Meilleur score:  0.8526809024298876
Précision sur le jeu de test:  0.87125575769933


# tentative Stacking

In [26]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

model_final = StackingClassifier(
    estimators=[
        ('XGB Tuned', best_model_xgb_tuned),
        ('Random search', best_model)
    ],
    final_estimator=LogisticRegression()
)

model_final.fit(X_train, y_train)

final_scoring = model_final.score(X_test, y_test)
print("Precision test set: ", final_scoring)


Précision sur le jeu de test:  0.8891845614880582


In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler



# Standardisation des features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Réduction de la dimensionnalité
pca = PCA(n_components=0.95)  # Conserver 95% de variance
X_pca = pca.fit_transform(X_scaled)

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Modèles pour stacking
estimators = [
    ('rf', best_model),
    ('xgb', best_model_xgb_tuned),
    ('lgbm', LGBMClassifier(n_estimators=100, random_state=42))
]

final_estimator = LogisticRegression()

# Modèle d'ensemble avec Stacking
model_final = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
    n_jobs=-1
)

In [None]:
model_final.fit(X_train, y_train)
final_scoring = model_final.score(X_test, y_test)

In [None]:
print("Precision test set : ", final_scoring)

Precision test set :  0.7993404279514749


In [None]:
import optuna
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.005, 0.05),
        'n_estimators': 1000,  # Fixons n_estimators et utilisons early_stopping
    }

    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    model = lgb.LGBMClassifier(**param)
    model.fit(
        X_train_split, y_train_split,
        eval_set=[(X_val_split, y_val_split)]
    )

    preds = model.predict(X_val_split)
    accuracy = accuracy_score(y_val_split, preds)

    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Meilleurs hyperparamètres :', study.best_trial.params)


In [None]:
best_params = study.best_trial.params
best_model_lgb = lgb.LGBMClassifier(**best_params)
best_model_lgb.fit(X_train, y_train)

# Évaluation sur l'ensemble de test
predictions = best_model_lgb.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
print("Précision sur le jeu de test : ", test_accuracy)
