In [735]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

In [736]:
username = 'Mr-Barros'
df = pd.read_csv('../../dados/base/chess_games_chesscom.csv')
df = df[df['player'] == username]

print(f'{username} games: {df.shape}')
df.head()

Mr-Barros games: (2149, 27)


Unnamed: 0,url,pgn,time_control,end_time,rated,accuracies,tcn,uuid,initial_setup,fen,...,opponent_rating,player_pieces,winner,win_method,move_list,move_evals,material_count,winrate_with_opening,opening_eval,midgame_eval
0,https://www.chess.com/game/live/13542785939,"[Event ""Live Chess""]\n[Site ""Chess.com""]\n[Dat...",600,2021-04-30 17:44:18,True,"{'white': 68.58, 'black': 64.14}",lB0KBK9zksz9nD5QjzWGgxGzszQziqzJdJZRJdRKd787DK...,c88d9027-a9d9-11eb-8f87-536974010001,,3k2R1/5R2/8/7B/1B5P/6P1/5K2/8 b - -,...,638,white,white,checkmated,"['d2d4', 'e7e5', 'd4e5', 'f8b4', 'c2c3', 'b4f8...","[{'type': 'cp', 'value': 25}, {'type': 'cp', '...","[(39, 39), (39, 39), (39, 38), (39, 38), (39, ...",0.479381,2.0,-4.0
1,https://www.chess.com/game/daily/330957418,"[Event ""Let's Play""]\n[Site ""Chess.com""]\n[Dat...",1/1209600,2021-04-30 17:50:33,True,,mC0Kgv,5c91e22a-a9dc-11eb-a4f6-c88b8701000b,,rnbqkbnr/pppp1ppp/8/4p3/4P3/5N2/PPPP1PPP/RNBQK...,...,800,black,black,resigned,"['e2e4', 'e7e5', 'g1f3']","[{'type': 'cp', 'value': 31}, {'type': 'cp', '...","[(39, 39), (39, 39), (39, 39)]",0.502762,0.0,0.0
2,https://www.chess.com/game/live/13557082425,"[Event ""Live Chess""]\n[Site ""Chess.com""]\n[Dat...",600,2021-04-30 21:37:31,True,,mC0KlBKBdB5QBs9zclzsls1TblZRec6EdeTLpFLClCRJCT...,92b893ca-a9fa-11eb-8f87-536974010001,,8/p7/R7/8/7P/k7/4NPP1/1R1K4 b - -,...,970,black,white,checkmated,"['e2e4', 'e7e5', 'd2d4', 'e5d4', 'd1d4', 'b8c6...","[{'type': 'cp', 'value': 31}, {'type': 'cp', '...","[(39, 39), (39, 39), (39, 39), (38, 39), (38, ...",0.446809,-1.0,-4.0
3,https://www.chess.com/game/live/13557721435,"[Event ""Live Chess""]\n[Site ""Chess.com""]\n[Dat...",600,2021-04-30 21:52:26,True,,mC0KlBZJgvJCcM90M080vK1TKA6SAu5QBJQKJS7dud0SnD...,4e3e9909-a9fc-11eb-8f87-536974010001,,1R6/6p1/R7/7p/5P2/k5P1/4r2P/6K1 b - -,...,856,black,white,checkmated,"['e2e4', 'e7e5', 'd2d4', 'd7d5', 'g1f3', 'd5e4...","[{'type': 'cp', 'value': 32}, {'type': 'cp', '...","[(39, 39), (39, 39), (39, 39), (39, 39), (39, ...",0.446809,2.0,3.0
4,https://www.chess.com/game/live/13558396887,"[Event ""Live Chess""]\n[Site ""Chess.com""]\n[Dat...",600,2021-04-30 22:04:11,True,"{'white': 40.22428648613115, 'black': 69.46623...",lBZJbs!Tnv6LoELUEMTNmCJCvC5QBJQKdB1Tpx7RfmNwhp...,6175a87a-a9fe-11eb-8f87-536974010001,,8/p5B1/1p2k2p/2b3p1/P1P1Kr1p/1RQ5/1P6/3r4 w - -,...,749,white,black,checkmated,"['d2d4', 'd7d5', 'b1c3', 'g8f6', 'f2f3', 'c8f5...","[{'type': 'cp', 'value': 25}, {'type': 'cp', '...","[(39, 39), (39, 39), (39, 39), (39, 39), (39, ...",0.5,-1.0,-4.0


In [737]:
def clean_base(data: pd.DataFrame):
    df = data.copy()
    # We only want to analyse normal chess games
    df = df[df['rules'] == 'chess']
    
    # Add the 'result' column
    def determine_result(row):
        if row['winner'] == 'draw':
            return 'draw'
        elif row['winner'] == row['player_pieces']:
            return 'win'
        else:
            return 'loss'
    
    df.loc[:, 'result'] = df.apply(determine_result, axis=1)

    # makes it so the value is in reference to the player advantage
    df.loc[df['player_pieces'] == 'black', 'opening_eval'] = (-1)*df.loc[df['player_pieces'] == 'black', 'opening_eval']
    df.loc[df['player_pieces'] == 'black', 'midgame_eval'] = (-1)*df.loc[df['player_pieces'] == 'black', 'midgame_eval']

    def treat_time_control(row):
        time_control = row['time_control']
        if '+' in row['time_control']:
            time, increment = time_control.split('+')
        elif '/' in row['time_control']:
            time = time_control.split('/')[1]
            if int(time) == 0:
                time = 9999999 # no time limit
            increment = 0
        else:
            time = row['time_control']
            increment = 0
        return pd.Series([int(time), int(increment)], index=['time_control', 'increment'])

    df.loc[:, ['time_control', 'increment']] = df.apply(treat_time_control, axis=1)

    df = df.drop([
        df.columns[0], 
        'url',
        'pgn', 
        'rated',
        'accuracies',
        'end_time',
        'rules', 
        'tcn', 
        'winner',
        'player',
        'opponent',
        'uuid', 
        'initial_setup', 
        'fen', 
        'start_time', 
        'move_list',
        'move_evals',
        'material_count'
        ], axis=1)

    df.dropna(inplace=True)
    return df

df = clean_base(df)


In [738]:
print(f"Shape: {df.shape}\n")
print(f"Columns: {df.columns}\n")
for column in ['time_control', 'increment', 'time_class', 'player_pieces', 'win_method', 'opening_eval', 'midgame_eval']:
    print(f'Unique values of {column}: {df[column].unique()}')

df.isna().sum()

Shape: (2136, 12)

Columns: Index(['time_control', 'time_class', 'eco', 'player_rating', 'opponent_rating',
       'player_pieces', 'win_method', 'winrate_with_opening', 'opening_eval',
       'midgame_eval', 'result', 'increment'],
      dtype='object')

Unique values of time_control: [600 1209600 1800 60 3600 180 300 7200 9999999]
Unique values of increment: [0 1 2]
Unique values of time_class: ['rapid' 'daily' 'bullet' 'blitz']
Unique values of player_pieces: ['white' 'black']
Unique values of win_method: ['checkmated' 'resigned' 'timeout' 'agreed' 'abandoned'
 'timevsinsufficient' 'stalemate' 'insufficient' 'repetition']
Unique values of opening_eval: [ 2. -0.  1. -2. -1. -3.  3. -4.  4.]
Unique values of midgame_eval: [-4. -0.  4. -3. -1.  3.  1.  2. -2.]


time_control            0
time_class              0
eco                     0
player_rating           0
opponent_rating         0
player_pieces           0
win_method              0
winrate_with_opening    0
opening_eval            0
midgame_eval            0
result                  0
increment               0
dtype: int64

In [739]:
df['time_control'].value_counts()
df['time_class'].value_counts()

time_class
blitz     1162
rapid      865
bullet     107
daily        2
Name: count, dtype: int64

In [740]:
df.head()

Unnamed: 0,time_control,time_class,eco,player_rating,opponent_rating,player_pieces,win_method,winrate_with_opening,opening_eval,midgame_eval,result,increment
0,600,rapid,Englund Gambit,962,638,white,checkmated,0.479381,2.0,-4.0,win,0
1,1209600,daily,Kings Pawn,800,800,black,resigned,0.502762,-0.0,-0.0,win,0
2,600,rapid,Center Game,820,970,black,checkmated,0.446809,1.0,4.0,loss,0
3,600,rapid,Center Game,721,856,black,checkmated,0.446809,-2.0,-3.0,loss,0
4,600,rapid,Queens Pawn,644,749,white,checkmated,0.5,-1.0,-4.0,loss,0


In [741]:
oe_time_class = OrdinalEncoder(categories=[['bullet', 'blitz', 'rapid', 'daily']])
ohe_eco = OneHotEncoder(sparse_output=False).set_output(transform='pandas')
le_player_pieces = LabelEncoder()
ohe_win_method = OneHotEncoder(sparse_output=False).set_output(transform='pandas')
oe_result = OrdinalEncoder(categories=[['loss', 'draw', 'win']])
scaler = MinMaxScaler()

def preprocessing(data: pd.DataFrame, remove_columns: list[str] = []) -> pd.DataFrame:
    df = data.copy()
    
    df.drop(columns=remove_columns, inplace=True)
    
    if 'time_class' in df.columns:
        if not hasattr(oe_time_class, 'categories_'):
            df['time_class'] = oe_time_class.fit_transform(df[['time_class']])
        else:
            df['time_class'] = oe_time_class.transform(df[['time_class']])
    
    if 'eco' in df.columns:
        if not hasattr(ohe_eco, 'categories_'):
            eco_encoded = ohe_eco.fit_transform(df[['eco']])
        else:
            eco_encoded = ohe_eco.transform(df[['eco']])
    
        df = pd.concat([df, eco_encoded], axis=1)
        df.drop(columns=['eco'], axis = 1, inplace = True)

    if 'player_pieces' in df.columns:
        if not hasattr(le_player_pieces, 'classes_'):
            df['player_pieces'] = le_player_pieces.fit_transform(df['player_pieces'])
        else:
            df['player_pieces'] = le_player_pieces.transform(df['player_pieces'])

    if 'win_method' in df.columns:
        if not hasattr(ohe_win_method, 'categories_'):
            win_method_encoded = ohe_win_method.fit_transform(df[['win_method']])
        else:
            win_method_encoded = ohe_win_method.transform(df[['win_method']])

        df = pd.concat([df, win_method_encoded], axis=1)
        df.drop(columns=['win_method'], axis = 1, inplace = True)

    if 'result' in df.columns:
        if not hasattr(oe_result, 'categories_'):
            df['result'] = oe_result.fit_transform(df[['result']])
        else:
            df['result'] = oe_result.transform(df[['result']])
    
    if not hasattr(scaler, 'scale_'):
        df[df.columns] = scaler.fit_transform(df[df.columns])
    else:
        df[df.columns] = scaler.transform(df[df.columns])

    return df

test = preprocessing(df)

print(test.shape)
test.head()

(2136, 79)


Unnamed: 0,time_control,time_class,player_rating,opponent_rating,player_pieces,winrate_with_opening,opening_eval,midgame_eval,result,increment,...,eco_Ware Opening,win_method_abandoned,win_method_agreed,win_method_checkmated,win_method_insufficient,win_method_repetition,win_method_resigned,win_method_stalemate,win_method_timeout,win_method_timevsinsufficient
0,5.4e-05,0.666667,0.587421,0.374388,1.0,0.479381,0.75,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.120955,1.0,0.383648,0.506525,0.0,0.502762,0.5,0.5,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,5.4e-05,0.666667,0.408805,0.645188,0.0,0.446809,0.625,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.4e-05,0.666667,0.284277,0.552202,0.0,0.446809,0.25,0.125,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.4e-05,0.666667,0.187421,0.464927,1.0,0.5,0.375,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [742]:
def remove_outliers(data: pd.DataFrame) -> pd.DataFrame:
    df = data
    for col in ['time_control', 'opponent_rating', 'winrate_with_opening']:
        if col not in df.columns:
            continue
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # for winrate_with_opening, substitute outliers by mean value
        if col == 'winrate_with_opening':
            col_mean = df[col].mean()
            out_of_bounds = (df[col] < lower_bound) | (df[col] > upper_bound)
            df.loc[out_of_bounds, col] = col_mean           
        else:
            df = df[(df[col] > lower_bound) & (df[col] < upper_bound)]
    
    return df

df = remove_outliers(df)
df.shape

(2101, 12)

In [743]:
# we will try combinations removing the following columns: 
# time_control, increment, win_method, winrate_with_opening
column_combinations = [
    ['win_method', 'winrate_with_opening'],
    ['time_control', 'increment', 'win_method', 'winrate_with_opening'],
    ['time_control', 'increment'],
    []
]

In [744]:
model_params = {
    'Árvore de Decisão': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'splitter': ['best', 'random'],
            'max_depth': [None, 7, 10],
            'min_samples_leaf': [3, 10, 25],
        }
    }
}

params = model_params['Árvore de Decisão']['params']

In [745]:
columns = pd.MultiIndex.from_tuples([
    ('Pré-Processamento', 'Outliers', 'manter'),
    ('Pré-Processamento', 'Outliers', 'remover'),

    ('Pré-Processamento', 'Remover Coluna', 'time_control'),
    ('Pré-Processamento', 'Remover Coluna', 'increment'),
    ('Pré-Processamento', 'Remover Coluna', 'win_method'),
    ('Pré-Processamento', 'Remover Coluna', 'winrate_with_opening'),

    ('Mineração de Dados', 'Árvore de Classificação', f'criterion={params["criterion"][0]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'criterion={params["criterion"][1]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'criterion={params["criterion"][2]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'splitter={params["splitter"][0]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'splitter={params["splitter"][1]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'max_depth={params["max_depth"][0]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'max_depth={params["max_depth"][1]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'max_depth={params["max_depth"][2]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'min_samples_leaf={params["min_samples_leaf"][0]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'min_samples_leaf={params["min_samples_leaf"][1]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'min_samples_leaf={params["min_samples_leaf"][2]}'),

    ('Pós-Processamento', 'Medidas', 'Acurácia'),
    ('Pós-Processamento', 'Medidas', 'Precisão'),
    ('Pós-Processamento', 'Medidas', 'Recall'),
    ('Pós-Processamento', 'Medidas', 'F-Measure')
])

experiment = pd.DataFrame(columns=columns)
experiment

Unnamed: 0_level_0,Pré-Processamento,Pré-Processamento,Pré-Processamento,Pré-Processamento,Pré-Processamento,Pré-Processamento,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Pós-Processamento,Pós-Processamento,Pós-Processamento,Pós-Processamento
Unnamed: 0_level_1,Outliers,Outliers,Remover Coluna,Remover Coluna,Remover Coluna,Remover Coluna,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Medidas,Medidas,Medidas,Medidas
Unnamed: 0_level_2,manter,remover,time_control,increment,win_method,winrate_with_opening,criterion=gini,criterion=entropy,criterion=log_loss,splitter=best,...,max_depth=None,max_depth=7,max_depth=10,min_samples_leaf=3,min_samples_leaf=10,min_samples_leaf=25,Acurácia,Precisão,Recall,F-Measure


In [746]:
# Define metrics
def make_scorers():
    return {
        'Accuracy': make_scorer(accuracy_score),
        'Precision': make_scorer(precision_score, average='weighted'),
        'Recall': make_scorer(recall_score, average='weighted'),
        'F1-Score': make_scorer(f1_score, average='weighted')
    }

In [747]:
# Split data
X = df.drop('result', axis=1)
y = df['result']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [748]:
# Perform grid search
results = []
scorers = make_scorers()

In [749]:
for model_name, mp in model_params.items():
    clf = GridSearchCV(
        mp['model'], 
        mp['params'], 
        cv=5, 
        scoring=scorers, 
        refit='Accuracy', 
        return_train_score=False
    )
    clf.fit(X_train, y_train)
    

ValueError: 
All the 270 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/davimartignonibarros/Developer/PUC/Data Science/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/davimartignonibarros/Developer/PUC/Data Science/venv/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/davimartignonibarros/Developer/PUC/Data Science/venv/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 1009, in fit
    super()._fit(
  File "/Users/davimartignonibarros/Developer/PUC/Data Science/venv/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 252, in _fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/Users/davimartignonibarros/Developer/PUC/Data Science/venv/lib/python3.11/site-packages/sklearn/base.py", line 645, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/davimartignonibarros/Developer/PUC/Data Science/venv/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/davimartignonibarros/Developer/PUC/Data Science/venv/lib/python3.11/site-packages/sklearn/utils/_array_api.py", line 751, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/davimartignonibarros/Developer/PUC/Data Science/venv/lib/python3.11/site-packages/pandas/core/generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'rapid'


In [None]:
# Collect results
results_df = pd.DataFrame(clf.cv_results_).filter(['params', 'mean_test_Accuracy', 'mean_test_Precision', 'mean_test_Recall', 'mean_test_F1-Score'])
results.append(results_df)

In [None]:
# Combine results
final_results = pd.concat(results, keys=model_params.keys(), names=['Model', 'Index'])
final_results.reset_index(level='Model', inplace=True)

In [None]:
# Save to Excel
final_results.to_excel('decision_tree_grid_search_results.xlsx', index=False)
print("Results saved to 'decision_tree_grid_search_results.xlsx'")

Results saved to 'decision_tree_grid_search_results.xlsx'
