In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

In [None]:
# Player whose games are analysed in this notebook.
username = 'Mr-Barros'

# Read the base CSV and keep only the rows recorded for that player.
df = (
    pd.read_csv('../../dados/base/chess_games_chesscom.csv')
    .loc[lambda games: games['player'] == username]
)

print(f'{username} games: {df.shape}')
df.head()

In [None]:
def clean_base(data: pd.DataFrame):
    """Prepare the raw games table for modelling.

    Steps, in order:
      1. keep only rows whose ``rules`` is ``'chess'``;
      2. add a ``result`` column (``'win'`` / ``'draw'`` / ``'loss'``) derived
         from ``winner`` and ``player_pieces``;
      3. flip the sign of ``opening_eval`` and ``midgame_eval`` for games
         played as black, so the value is relative to the player;
      4. split ``time_control`` into integer ``time_control`` and
         ``increment`` columns;
      5. drop the first column of the frame plus the columns not used as
         features;
      6. drop every row that still contains a missing value.

    Parameters
    ----------
    data : pd.DataFrame
        Raw games. Must contain ``rules``, ``winner``, ``player_pieces``,
        ``opening_eval``, ``midgame_eval``, ``time_control`` and every column
        listed in the drop list below. The input is not modified.
        NOTE(review): the first column is dropped unconditionally — presumably
        it is the index column written by ``to_csv``; confirm.

    Returns
    -------
    pd.DataFrame
        A new, cleaned frame (possibly empty).

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If a ``time_control`` value cannot be parsed as integers.
    """
    no_time_limit = 9999999  # sentinel used when the value after '/' is 0

    def parse_time_control(raw) -> tuple[int, int]:
        """Return ``(base_time, increment)`` for one time-control value."""
        text = str(raw)
        if '+' in text:
            # "<time>+<increment>"
            base, increment = text.split('+')
            return int(base), int(increment)
        if '/' in text:
            # "<something>/<time>": only the part after the slash is used
            base = int(text.split('/')[1])
            return (no_time_limit if base == 0 else base), 0
        return int(text), 0

    # Filter first and copy once: later column assignments then act on an
    # independent frame, leaving the caller's data untouched and avoiding
    # pandas' SettingWithCopyWarning.
    df = data[data['rules'] == 'chess'].copy()

    # Result from the player's point of view. 'draw' is checked first; any
    # winner that is neither 'draw' nor the player's colour counts as a loss.
    df['result'] = np.where(
        df['winner'] == 'draw',
        'draw',
        np.where(df['winner'] == df['player_pieces'], 'win', 'loss'),
    )

    # Make the evaluations relative to the player instead of to white.
    played_black = df['player_pieces'] == 'black'
    for eval_column in ('opening_eval', 'midgame_eval'):
        df.loc[played_black, eval_column] *= -1

    # Plain column assignment with an explicit dtype keeps both columns
    # integer-typed and also works when no rows are left.
    parsed = [parse_time_control(value) for value in df['time_control']]
    df['time_control'] = np.array([base for base, _ in parsed], dtype='int64')
    df['increment'] = np.array([inc for _, inc in parsed], dtype='int64')

    unused_columns = [
        df.columns[0],
        'url',
        'pgn',
        'rated',
        'accuracies',
        'end_time',
        'rules',
        'tcn',
        'winner',
        'player',
        'opponent',
        'uuid',
        'initial_setup',
        'fen',
        'start_time',
        'move_list',
        'move_evals',
        'material_count',
    ]
    return df.drop(columns=unused_columns).dropna()

# Replace the per-player frame with its cleaned version. This rebinds `df`, so
# re-running this cell on the already-cleaned frame fails: clean_base reads the
# 'rules' column, which it has dropped by then.
df = clean_base(df)


In [None]:
# Sanity check of the cleaned frame: its size, its column names and the
# distinct values of the columns that are encoded in the cells below.
print(f"Shape: {df.shape}\n")
print(f"Columns: {df.columns}\n")
for column in ['time_control', 'increment', 'time_class', 'player_pieces', 'win_method', 'opening_eval', 'midgame_eval']:
    print(f'Unique values of {column}: {df[column].unique()}')

# Remaining missing values per column (shown as the cell output).
df.isna().sum()

In [None]:
# Only the last expression of a cell is rendered, so the first table is printed
# explicitly; as a bare expression its result would be computed and discarded.
print(df['time_control'].value_counts())
df['time_class'].value_counts()

In [None]:
# Preview the first rows of the cleaned frame before encoding.
df.head()

In [None]:
# Encoders, one per column, and the scaler applied to every feature afterwards.
oe_time_class = OrdinalEncoder(categories=[['bullet', 'blitz', 'rapid', 'daily']])
ohe_eco = OneHotEncoder(sparse_output=False).set_output(transform='pandas')
le_player_pieces = LabelEncoder()
ohe_win_method = OneHotEncoder(sparse_output=False).set_output(transform='pandas')
# Not applied anywhere in this cell: 'result' keeps its string labels.
oe_result = OrdinalEncoder(categories=[['loss', 'draw', 'win']])
scaler = MinMaxScaler()

# Columns encoded in place.
df['time_class'] = oe_time_class.fit_transform(df[['time_class']])
df['player_pieces'] = le_player_pieces.fit_transform(df['player_pieces'])

# One-hot columns are appended at the end, replacing the originals.
eco_encoded = ohe_eco.fit_transform(df[['eco']])
win_method_encoded = ohe_win_method.fit_transform(df[['win_method']])
df = pd.concat(
    [df.drop(columns=['eco', 'win_method']), eco_encoded, win_method_encoded],
    axis=1,
)

# Scale every column except the target to [0, 1].
# NOTE(review): the scaler and encoders are fit on the whole frame, i.e. before
# the train/test split done further down, so held-out rows influence the fit.
feature_columns = df.columns.drop('result')
df[feature_columns] = scaler.fit_transform(df[feature_columns])

df.head()

In [None]:
def remove_columns(data: pd.DataFrame, columns_to_remove: list[str]) -> pd.DataFrame:
    """Return a copy of ``data`` without the columns matching ``columns_to_remove``.

    A column is removed when any entry of ``columns_to_remove`` is contained in
    its name (substring match), so one entry can remove a whole family of
    columns sharing that fragment.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame; it is not modified.
    columns_to_remove : list[str]
        Name fragments to match. An empty list removes nothing.

    Returns
    -------
    pd.DataFrame
        A new frame with the remaining columns in their original order.
    """
    # Decide once per column and select in a single step. Selecting by mask,
    # rather than dropping by label inside a loop, also copes with duplicated
    # column labels.
    keep = [
        not any(fragment in name for fragment in columns_to_remove)
        for name in data.columns
    ]
    return data.loc[:, keep].copy()

In [None]:
def treat_outliers(data: pd.DataFrame) -> pd.DataFrame:
    """Handle IQR outliers in the columns this notebook treats specially.

    For each of ``time_control``, ``opponent_rating`` and
    ``winrate_with_opening`` that is present, the fences
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` are computed on the current frame:

    * ``winrate_with_opening``: values strictly outside the fences are replaced
      by the column mean (computed before the replacement);
    * the other columns: rows outside the fences are removed.

    A value lying exactly on a fence is not an outlier in either case. In
    particular, when IQR is 0 the rows equal to the quartiles are kept instead
    of the whole frame being emptied.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with outliers replaced or removed.
    """
    df = data.copy()
    for col in ('time_control', 'opponent_rating', 'winrate_with_opening'):
        if col not in df.columns:
            continue

        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        if col == 'winrate_with_opening':
            # Replace rather than drop, so no rows are lost for this column.
            col_mean = df[col].mean()
            out_of_bounds = (df[col] < lower_bound) | (df[col] > upper_bound)
            df.loc[out_of_bounds, col] = col_mean
        else:
            # Inclusive fences, matching the strict "< / >" outlier test above.
            # Missing values compare False and are dropped.
            # Copying keeps later assignments off a filtered slice.
            df = df[df[col].between(lower_bound, upper_bound)].copy()

    return df

# Preview how many rows remain after outlier treatment of the full encoded frame.
# NOTE: `test` is only a scratch name here; it is rebound to the hold-out split
# by the train_test_split cell further down.
test = treat_outliers(df)
test.shape

In [None]:
# Column sets to remove before training; each inner list is one experiment
# configuration. The columns involved are:
# time_control, increment, win_method, winrate_with_opening.
# Entries are matched as substrings of the column names by remove_columns, so
# 'win_method' presumably also removes the one-hot columns derived from it.
column_combinations = [
    ['win_method', 'winrate_with_opening'],
    ['time_control', 'increment', 'win_method', 'winrate_with_opening'],
    ['time_control', 'increment'],
    []
]

In [None]:
# Hyper-parameter grid for the decision tree. It is defined once and shared:
# the grid search reads it through `model_params`, and the result-table columns
# are built from `params`, so both always describe the same grid.
params = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 7, 10],
    'min_samples_leaf': [3, 10, 25],
}

# Models to evaluate, keyed by the display name used in the result table.
model_params = {
    'Árvore de Classificação': {
        'model': DecisionTreeClassifier(),
        'params': params,
    }
}

In [None]:
# Three-level header of the experiment table:
#   pre-processing choices | model hyper-parameters | evaluation measures.
# The hyper-parameter columns are generated from `params` (one column per
# "name=value" pair, in grid order), so changing the grid cannot leave the
# header out of sync with the labels written by the experiment loop.
columns = pd.MultiIndex.from_tuples(
    [('Pré-Processamento', 'Outliers', choice) for choice in ('manter', 'remover')]
    + [
        ('Pré-Processamento', 'Remover Coluna', name)
        for name in ('time_control', 'increment', 'win_method', 'winrate_with_opening')
    ]
    + [
        ('Mineração de Dados', 'Árvore de Classificação', f'{param_name}={param_value}')
        for param_name, param_values in params.items()
        for param_value in param_values
    ]
    + [
        ('Pós-Processamento', 'Medidas', measure)
        for measure in ('Acurácia', 'Precisão', 'Recall', 'F-Measure')
    ]
)

# Empty table; the experiment loop appends one row per evaluated configuration.
experiment = pd.DataFrame(columns=columns)
experiment.head()

In [None]:
# Scorers evaluated for every grid-search candidate, weighted by class support.
# zero_division=0 makes the undefined-metric case explicit: a class that is
# never predicted (or never present) in a fold scores 0, which is the value the
# default "warn" setting also uses, but without one warning per fold.
scorers = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score, average='weighted', zero_division=0),
    'Recall': make_scorer(recall_score, average='weighted', zero_division=0),
    'F1-Score': make_scorer(f1_score, average='weighted', zero_division=0)
}

In [None]:
# TODO: support the three-class problem (the precision metric is doing a
# division by 0). Until then draws are excluded, leaving only wins and losses.
df = df.loc[df['result'].ne('draw')]

# Hold out 20% of the games for the final evaluation; the seed is fixed so the
# split is reproducible.
train, test = train_test_split(df, train_size=0.8, random_state=42)

In [None]:
scores = []

# Run one grid search per pre-processing configuration (outlier handling x
# removed column set) and record every candidate as one row of `experiment`.
for remove_outliers in [True, False]:
    for combination in column_combinations:
        train_treated = remove_columns(train, combination)
        if remove_outliers:
            # Treat the frame that already had its columns removed; passing
            # `train` here would silently undo the column removal.
            train_treated = treat_outliers(train_treated)

        X_train = train_treated.drop('result', axis=1)
        y_train = train_treated['result']

        for model_name, mp in model_params.items():
            # 5-fold CV over the whole grid, scored with every entry of
            # `scorers`; the best model is refit by F1-Score.
            clf = GridSearchCV(mp['model'], mp['params'], cv=5, scoring=scorers, return_train_score=False, refit='F1-Score')
            clf.fit(X_train, y_train)

            results = pd.DataFrame(clf.cv_results_)[['params', 'mean_test_Accuracy', 'mean_test_Precision', 'mean_test_Recall', 'mean_test_F1-Score']]

            for _, metrics in results.iterrows():
                # Start from a blank row; an 'x' marks each option in effect.
                row = pd.Series(' ', index=columns, dtype=object)

                if remove_outliers:
                    row[('Pré-Processamento', 'Outliers', 'remover')] = 'x'
                else:
                    row[('Pré-Processamento', 'Outliers', 'manter')] = 'x'

                for column in combination:
                    row[('Pré-Processamento', 'Remover Coluna', column)] = 'x'

                # Label format must match the hyper-parameter columns of the table.
                for param_name, param_value in metrics['params'].items():
                    row[('Mineração de Dados', f'{model_name}', f'{param_name}={param_value}')] = 'x'

                row[('Pós-Processamento', 'Medidas', 'Acurácia')] = metrics['mean_test_Accuracy']
                row[('Pós-Processamento', 'Medidas', 'Precisão')] = metrics['mean_test_Precision']
                row[('Pós-Processamento', 'Medidas', 'Recall')] = metrics['mean_test_Recall']
                row[('Pós-Processamento', 'Medidas', 'F-Measure')] = metrics['mean_test_F1-Score']

                # Appends to the existing table, so re-running this cell
                # without recreating `experiment` adds the rows again.
                experiment.loc[len(experiment)] = row

print(experiment.shape)
experiment.head()

In [None]:
# Save the experiment table to an Excel file; the relative path resolves
# against the current working directory.
experiment.to_excel('decision_tree_grid_search_results.xlsx')
print("Results saved to 'decision_tree_grid_search_results.xlsx'")

In [None]:
# Final model: a fixed hyper-parameter configuration, trained with no columns
# removed and with outlier treatment applied.
# NOTE(review): presumably the best row of the experiment table — confirm.
best_model = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=10, min_samples_leaf=10)
columns_to_remove = []

train_treated = remove_columns(train, columns_to_remove)
train_treated = treat_outliers(train_treated)

X_train = train_treated.drop('result', axis=1)
y_train = train_treated['result']

best_model.fit(X_train, y_train)

# Apply the same pre-processing steps to the hold-out set.
# NOTE(review): treat_outliers recomputes its bounds on the test rows and can
# drop test rows; confirm this is intended for the final evaluation.
test_treated = remove_columns(test, columns_to_remove)
test_treated = treat_outliers(test_treated)

# Evaluate on the hold-out set; building these from `train_treated` would
# report training performance instead.
X_test = test_treated.drop('result', axis=1)
y_test = test_treated['result']

pred = best_model.predict(X_test)

# Binary metrics with 'win' as the positive class.
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, pos_label='win')
recall = recall_score(y_test, pred, pos_label='win')
f1 = f1_score(y_test, pred, pos_label='win')

print(f"Accuracy: {accuracy:.4%}")
print(f"Precision: {precision:.4%}")
print(f"Recall: {recall:.4%}")
print(f"F1 Score: {f1:.4%}\n")