In [206]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

In [207]:
username = 'Mr-Barros'
df = pd.read_csv('../../dados/base/chess_games_chesscom.csv')
df = df[df['player'] == username]

print(f'{username} games: {df.shape}')
df.head()

Mr-Barros games: (2149, 27)


Unnamed: 0,url,pgn,time_control,end_time,rated,accuracies,tcn,uuid,initial_setup,fen,...,opponent_rating,player_pieces,winner,win_method,move_list,move_evals,material_count,winrate_with_opening,opening_eval,midgame_eval
0,https://www.chess.com/game/live/13542785939,"[Event ""Live Chess""]\n[Site ""Chess.com""]\n[Dat...",600,2021-04-30 17:44:18,True,"{'white': 68.58, 'black': 64.14}",lB0KBK9zksz9nD5QjzWGgxGzszQziqzJdJZRJdRKd787DK...,c88d9027-a9d9-11eb-8f87-536974010001,,3k2R1/5R2/8/7B/1B5P/6P1/5K2/8 b - -,...,638,white,white,checkmated,"['d2d4', 'e7e5', 'd4e5', 'f8b4', 'c2c3', 'b4f8...","[{'type': 'cp', 'value': 25}, {'type': 'cp', '...","[(39, 39), (39, 39), (39, 38), (39, 38), (39, ...",0.479381,2.0,-4.0
1,https://www.chess.com/game/daily/330957418,"[Event ""Let's Play""]\n[Site ""Chess.com""]\n[Dat...",1/1209600,2021-04-30 17:50:33,True,,mC0Kgv,5c91e22a-a9dc-11eb-a4f6-c88b8701000b,,rnbqkbnr/pppp1ppp/8/4p3/4P3/5N2/PPPP1PPP/RNBQK...,...,800,black,black,resigned,"['e2e4', 'e7e5', 'g1f3']","[{'type': 'cp', 'value': 31}, {'type': 'cp', '...","[(39, 39), (39, 39), (39, 39)]",0.502762,0.0,0.0
2,https://www.chess.com/game/live/13557082425,"[Event ""Live Chess""]\n[Site ""Chess.com""]\n[Dat...",600,2021-04-30 21:37:31,True,,mC0KlBKBdB5QBs9zclzsls1TblZRec6EdeTLpFLClCRJCT...,92b893ca-a9fa-11eb-8f87-536974010001,,8/p7/R7/8/7P/k7/4NPP1/1R1K4 b - -,...,970,black,white,checkmated,"['e2e4', 'e7e5', 'd2d4', 'e5d4', 'd1d4', 'b8c6...","[{'type': 'cp', 'value': 31}, {'type': 'cp', '...","[(39, 39), (39, 39), (39, 39), (38, 39), (38, ...",0.446809,-1.0,-4.0
3,https://www.chess.com/game/live/13557721435,"[Event ""Live Chess""]\n[Site ""Chess.com""]\n[Dat...",600,2021-04-30 21:52:26,True,,mC0KlBZJgvJCcM90M080vK1TKA6SAu5QBJQKJS7dud0SnD...,4e3e9909-a9fc-11eb-8f87-536974010001,,1R6/6p1/R7/7p/5P2/k5P1/4r2P/6K1 b - -,...,856,black,white,checkmated,"['e2e4', 'e7e5', 'd2d4', 'd7d5', 'g1f3', 'd5e4...","[{'type': 'cp', 'value': 32}, {'type': 'cp', '...","[(39, 39), (39, 39), (39, 39), (39, 39), (39, ...",0.446809,2.0,3.0
4,https://www.chess.com/game/live/13558396887,"[Event ""Live Chess""]\n[Site ""Chess.com""]\n[Dat...",600,2021-04-30 22:04:11,True,"{'white': 40.22428648613115, 'black': 69.46623...",lBZJbs!Tnv6LoELUEMTNmCJCvC5QBJQKdB1Tpx7RfmNwhp...,6175a87a-a9fe-11eb-8f87-536974010001,,8/p5B1/1p2k2p/2b3p1/P1P1Kr1p/1RQ5/1P6/3r4 w - -,...,749,white,black,checkmated,"['d2d4', 'd7d5', 'b1c3', 'g8f6', 'f2f3', 'c8f5...","[{'type': 'cp', 'value': 25}, {'type': 'cp', '...","[(39, 39), (39, 39), (39, 39), (39, 39), (39, ...",0.5,-1.0,-4.0


In [None]:
def clean_base(data: pd.DataFrame):
    df = data
    # We only want to analyse normal chess games
    df = df[df['rules'] == 'chess']
    
    # Add the 'result' column
    def determine_result(row):
        if row['winner'] == 'draw':
            return 'draw'
        elif row['winner'] == row['player_pieces']:
            return 'win'
        else:
            return 'loss'
    
    df.loc[:, 'result'] = df.apply(determine_result, axis=1)

    # makes it so the value is in reference to the player advantage
    df.loc[df['player_pieces'] == 'black', 'opening_eval'] = (-1)*df.loc[df['player_pieces'] == 'black', 'opening_eval']
    df.loc[df['player_pieces'] == 'black', 'midgame_eval'] = (-1)*df.loc[df['player_pieces'] == 'black', 'midgame_eval']

    df = df.drop([
        df.columns[0], 
        'url',
        'pgn', 
        'rated',
        'accuracies',
        'end_time',
        'rules', 
        'tcn', 
        'winner',
        'player',
        'opponent',
        'uuid', 
        'initial_setup', 
        'fen', 
        'start_time', 
        'move_list',
        'move_evals',
        'material_count',
        ], axis=1)

    df.dropna(inplace=True)
    return df

df = clean_base(df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'result'] = df.apply(determine_result, axis=1)


In [209]:
print(f"Shape: {df.shape}\n")
print(f"Columns: {df.columns}\n")
for column in ['time_control', 'time_class', 'player_pieces', 'win_method', 'opening_eval', 'midgame_eval']:
    print(f'Unique values of {column}: {df[column].unique()}')

df.isna().sum()

Shape: (2136, 11)

Columns: Index(['time_control', 'time_class', 'eco', 'player_rating', 'opponent_rating',
       'player_pieces', 'win_method', 'winrate_with_opening', 'opening_eval',
       'midgame_eval', 'result'],
      dtype='object')

Unique values of time_control: ['600' '1/1209600' '1800' '60' '3600' '180' '300' '60+1' '180+2' '7200'
 '1/0']
Unique values of time_class: ['rapid' 'daily' 'bullet' 'blitz']
Unique values of player_pieces: ['white' 'black']
Unique values of win_method: ['checkmated' 'resigned' 'timeout' 'agreed' 'abandoned'
 'timevsinsufficient' 'stalemate' 'insufficient' 'repetition']
Unique values of opening_eval: [ 2. -0.  1. -2. -1. -3.  3. -4.  4.]
Unique values of midgame_eval: [-4. -0.  4. -3. -1.  3.  1.  2. -2.]


time_control            0
time_class              0
eco                     0
player_rating           0
opponent_rating         0
player_pieces           0
win_method              0
winrate_with_opening    0
opening_eval            0
midgame_eval            0
result                  0
dtype: int64

In [210]:
df.head()

Unnamed: 0,time_control,time_class,eco,player_rating,opponent_rating,player_pieces,win_method,winrate_with_opening,opening_eval,midgame_eval,result
0,600,rapid,Englund Gambit,962,638,white,checkmated,0.479381,2.0,-4.0,win
1,1/1209600,daily,Kings Pawn,800,800,black,resigned,0.502762,-0.0,-0.0,win
2,600,rapid,Center Game,820,970,black,checkmated,0.446809,1.0,4.0,loss
3,600,rapid,Center Game,721,856,black,checkmated,0.446809,-2.0,-3.0,loss
4,600,rapid,Queens Pawn,644,749,white,checkmated,0.5,-1.0,-4.0,loss


In [None]:
def preprocessing(data: pd.DataFrame, remove_columns: list[str] = []):
    df = data
    
    df.drop(columns=remove_columns, inplace=True)




SyntaxError: incomplete input (2880314112.py, line 2)

In [None]:
def remove_outliers(data: pd.DataFrame) -> pd.DataFrame:
    df = data
    
    for col in ['Sales', 'CompPrice', 'Price']:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] > lower_bound) & (df[col] < upper_bound)]
    
    return df

In [None]:
model_params = {
    'Árvore de Decisão': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'splitter': ['best', 'random'],
            'max_depth': [None, 7, 10],
            'min_samples_leaf': [3, 10, 25],
        }
    }
}

params = model_params['Árvore de Decisão']['params']

In [None]:
columns = pd.MultiIndex.from_tuples([
    ('Pré-Processamento', 'Outliers', 'manter'),
    ('Pré-Processamento', 'Outliers', 'remover'),

    ('Pré-Processamento', 'Remover Coluna', 'Opening'),

    ('Mineração de Dados', 'Árvore de Classificação', f'criterion={params["criterion"][0]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'criterion={params["criterion"][1]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'criterion={params["criterion"][2]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'splitter={params["splitter"][0]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'splitter={params["splitter"][1]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'max_depth={params["max_depth"][0]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'max_depth={params["max_depth"][1]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'max_depth={params["max_depth"][2]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'min_samples_leaf={params["min_samples_leaf"][0]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'min_samples_leaf={params["min_samples_leaf"][1]}'),
    ('Mineração de Dados', 'Árvore de Classificação', f'min_samples_leaf={params["min_samples_leaf"][2]}'),

    ('Pós-Processamento', 'Medidas', 'Acurácia'),
    ('Pós-Processamento', 'Medidas', 'Precisão'),
    ('Pós-Processamento', 'Medidas', 'Recall'),
    ('Pós-Processamento', 'Medidas', 'F-Measure')
])

experiment = pd.DataFrame(columns=columns)
experiment

Unnamed: 0_level_0,Pré-Processamento,Pré-Processamento,Pré-Processamento,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Mineração de Dados,Pós-Processamento,Pós-Processamento,Pós-Processamento,Pós-Processamento
Unnamed: 0_level_1,Outliers,Outliers,Remover Coluna,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Árvore de Classificação,Medidas,Medidas,Medidas,Medidas
Unnamed: 0_level_2,manter,remover,Opening,criterion=gini,criterion=entropy,criterion=log_loss,splitter=best,splitter=random,max_depth=None,max_depth=7,max_depth=10,min_samples_leaf=3,min_samples_leaf=10,min_samples_leaf=25,Acurácia,Precisão,Recall,F-Measure
