In [137]:
import os
import pandas as pd
import numpy as np

import plotly.graph_objs as go
import plotly.subplots as sp
import plotly.offline as pyo
import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.tsa.arima_model as arima
from statsmodels.tsa.arima.model import ARIMA
from pmdarima.arima import auto_arima

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_absolute_percentage_error
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, chi2, f_classif
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score

import warnings

In [138]:
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',500)

In [139]:
ano = range(2022,2023,1)
df = pd.DataFrame()

for i in ano:
    try:
        url = f'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{i}.csv'
        df2 = pd.read_csv(url)
        df = concat([df, df2])
    except Exception as e:
        print(f"Erro no ano {i}: {e}")
        continue
df


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2022-8888,Atp Cup,Hard,16,A,20220103,300,200000,,,Felix Auger Aliassime,R,193.0,CAN,21.4,105138,,,Roberto Bautista Agut,R,183.0,ESP,33.7,7-6(3) 6-3,3,F,129.0,15.0,6.0,78.0,51.0,38.0,14.0,11.0,10.0,11.0,0.0,2.0,70.0,50.0,32.0,7.0,10.0,3.0,5.0,11.0,3308.0,19.0,2260.0
1,2022-8888,Atp Cup,Hard,16,A,20220103,299,133430,,,Denis Shapovalov,L,185.0,CAN,22.7,105807,,,Pablo Carreno Busta,R,188.0,ESP,30.4,6-4 6-3,3,F,98.0,7.0,2.0,78.0,49.0,34.0,16.0,10.0,8.0,9.0,1.0,0.0,50.0,33.0,21.0,8.0,9.0,3.0,6.0,14.0,2475.0,20.0,2230.0
2,2022-8888,Atp Cup,Hard,16,A,20220103,298,105138,,,Roberto Bautista Agut,R,183.0,ESP,33.7,128034,,,Hubert Hurkacz,R,196.0,POL,24.8,7-6(6) 2-6 7-6(5),3,SF,164.0,1.0,2.0,96.0,64.0,50.0,20.0,16.0,1.0,4.0,24.0,3.0,120.0,80.0,62.0,20.0,16.0,6.0,7.0,19.0,2260.0,9.0,3706.0
3,2022-8888,Atp Cup,Hard,16,A,20220103,297,105807,,,Pablo Carreno Busta,R,188.0,ESP,30.4,126591,,,Jan Zielinski,R,,POL,25.1,6-2 6-1,3,SF,53.0,6.0,0.0,45.0,33.0,25.0,8.0,8.0,0.0,0.0,2.0,1.0,38.0,27.0,17.0,1.0,7.0,4.0,8.0,20.0,2230.0,860.0,18.0
4,2022-8888,Atp Cup,Hard,16,A,20220103,296,106421,,,Daniil Medvedev,R,198.0,RUS,25.8,200000,,,Felix Auger Aliassime,R,193.0,CAN,21.4,6-4 6-0,3,SF,68.0,6.0,4.0,41.0,25.0,22.0,10.0,8.0,0.0,0.0,6.0,2.0,48.0,35.0,22.0,4.0,8.0,3.0,7.0,2.0,8640.0,11.0,3308.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,20220304,4,209362,,,Aristotelis Thanos,R,,GRE,20.8,209080,,,Blaise Bicknell,R,188.0,USA,20.2,6-4 6-4,3,RR,115.0,0.0,4.0,70.0,48.0,35.0,12.0,10.0,5.0,5.0,1.0,3.0,96.0,68.0,42.0,12.0,10.0,11.0,13.0,1103.0,9.0,1130.0,8.0
2913,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,20220304,5,202065,,,Petros Tsitsipas,R,185.0,GRE,21.5,202120,,,Rowland Phillips,R,,JAM,28.0,4-6 6-4 6-4,3,RR,161.0,7.0,2.0,94.0,55.0,37.0,23.0,15.0,9.0,12.0,1.0,1.0,94.0,56.0,40.0,20.0,15.0,4.0,8.0,808.0,23.0,1390.0,4.0
2914,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,20220304,1,138846,,,Hong Kit Jack Wong,R,,HKG,23.4,105278,,,Alexis Klegou,U,,BEN,33.0,1-6 6-4 4-1 RET,3,RR,250.0,4.0,8.0,56.0,33.0,20.0,13.0,11.0,3.0,7.0,0.0,3.0,72.0,54.0,29.0,8.0,11.0,6.0,10.0,1059.0,10.0,1881.0,1.0
2915,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,20220304,2,209409,,,Coleman Wong,R,,HKG,17.7,200583,,,Delmas Ntcha,R,,BEN,21.7,6-4 6-4,3,RR,98.0,2.0,2.0,68.0,38.0,24.0,18.0,10.0,6.0,8.0,1.0,5.0,58.0,39.0,24.0,7.0,10.0,5.0,9.0,,,,


In [140]:
drop = ['tourney_id', 'tourney_name', 'tourney_level', 'tourney_date', 'winner_ioc', 'loser_ioc', 'score', 'winner_seed', 'loser_seed', 'winner_entry', 'loser_entry']
df = df.drop(columns=drop)
df

Unnamed: 0,surface,draw_size,match_num,winner_id,winner_name,winner_hand,winner_ht,winner_age,loser_id,loser_name,loser_hand,loser_ht,loser_age,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,Hard,16,300,200000,Felix Auger Aliassime,R,193.0,21.4,105138,Roberto Bautista Agut,R,183.0,33.7,3,F,129.0,15.0,6.0,78.0,51.0,38.0,14.0,11.0,10.0,11.0,0.0,2.0,70.0,50.0,32.0,7.0,10.0,3.0,5.0,11.0,3308.0,19.0,2260.0
1,Hard,16,299,133430,Denis Shapovalov,L,185.0,22.7,105807,Pablo Carreno Busta,R,188.0,30.4,3,F,98.0,7.0,2.0,78.0,49.0,34.0,16.0,10.0,8.0,9.0,1.0,0.0,50.0,33.0,21.0,8.0,9.0,3.0,6.0,14.0,2475.0,20.0,2230.0
2,Hard,16,298,105138,Roberto Bautista Agut,R,183.0,33.7,128034,Hubert Hurkacz,R,196.0,24.8,3,SF,164.0,1.0,2.0,96.0,64.0,50.0,20.0,16.0,1.0,4.0,24.0,3.0,120.0,80.0,62.0,20.0,16.0,6.0,7.0,19.0,2260.0,9.0,3706.0
3,Hard,16,297,105807,Pablo Carreno Busta,R,188.0,30.4,126591,Jan Zielinski,R,,25.1,3,SF,53.0,6.0,0.0,45.0,33.0,25.0,8.0,8.0,0.0,0.0,2.0,1.0,38.0,27.0,17.0,1.0,7.0,4.0,8.0,20.0,2230.0,860.0,18.0
4,Hard,16,296,106421,Daniil Medvedev,R,198.0,25.8,200000,Felix Auger Aliassime,R,193.0,21.4,3,SF,68.0,6.0,4.0,41.0,25.0,22.0,10.0,8.0,0.0,0.0,6.0,2.0,48.0,35.0,22.0,4.0,8.0,3.0,7.0,2.0,8640.0,11.0,3308.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912,Clay,4,4,209362,Aristotelis Thanos,R,,20.8,209080,Blaise Bicknell,R,188.0,20.2,3,RR,115.0,0.0,4.0,70.0,48.0,35.0,12.0,10.0,5.0,5.0,1.0,3.0,96.0,68.0,42.0,12.0,10.0,11.0,13.0,1103.0,9.0,1130.0,8.0
2913,Clay,4,5,202065,Petros Tsitsipas,R,185.0,21.5,202120,Rowland Phillips,R,,28.0,3,RR,161.0,7.0,2.0,94.0,55.0,37.0,23.0,15.0,9.0,12.0,1.0,1.0,94.0,56.0,40.0,20.0,15.0,4.0,8.0,808.0,23.0,1390.0,4.0
2914,Hard,4,1,138846,Hong Kit Jack Wong,R,,23.4,105278,Alexis Klegou,U,,33.0,3,RR,250.0,4.0,8.0,56.0,33.0,20.0,13.0,11.0,3.0,7.0,0.0,3.0,72.0,54.0,29.0,8.0,11.0,6.0,10.0,1059.0,10.0,1881.0,1.0
2915,Hard,4,2,209409,Coleman Wong,R,,17.7,200583,Delmas Ntcha,R,,21.7,3,RR,98.0,2.0,2.0,68.0,38.0,24.0,18.0,10.0,6.0,8.0,1.0,5.0,58.0,39.0,24.0,7.0,10.0,5.0,9.0,,,,


In [141]:
df 

Unnamed: 0,surface,draw_size,match_num,winner_id,winner_name,winner_hand,winner_ht,winner_age,loser_id,loser_name,loser_hand,loser_ht,loser_age,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,Hard,16,300,200000,Felix Auger Aliassime,R,193.0,21.4,105138,Roberto Bautista Agut,R,183.0,33.7,3,F,129.0,15.0,6.0,78.0,51.0,38.0,14.0,11.0,10.0,11.0,0.0,2.0,70.0,50.0,32.0,7.0,10.0,3.0,5.0,11.0,3308.0,19.0,2260.0
1,Hard,16,299,133430,Denis Shapovalov,L,185.0,22.7,105807,Pablo Carreno Busta,R,188.0,30.4,3,F,98.0,7.0,2.0,78.0,49.0,34.0,16.0,10.0,8.0,9.0,1.0,0.0,50.0,33.0,21.0,8.0,9.0,3.0,6.0,14.0,2475.0,20.0,2230.0
2,Hard,16,298,105138,Roberto Bautista Agut,R,183.0,33.7,128034,Hubert Hurkacz,R,196.0,24.8,3,SF,164.0,1.0,2.0,96.0,64.0,50.0,20.0,16.0,1.0,4.0,24.0,3.0,120.0,80.0,62.0,20.0,16.0,6.0,7.0,19.0,2260.0,9.0,3706.0
3,Hard,16,297,105807,Pablo Carreno Busta,R,188.0,30.4,126591,Jan Zielinski,R,,25.1,3,SF,53.0,6.0,0.0,45.0,33.0,25.0,8.0,8.0,0.0,0.0,2.0,1.0,38.0,27.0,17.0,1.0,7.0,4.0,8.0,20.0,2230.0,860.0,18.0
4,Hard,16,296,106421,Daniil Medvedev,R,198.0,25.8,200000,Felix Auger Aliassime,R,193.0,21.4,3,SF,68.0,6.0,4.0,41.0,25.0,22.0,10.0,8.0,0.0,0.0,6.0,2.0,48.0,35.0,22.0,4.0,8.0,3.0,7.0,2.0,8640.0,11.0,3308.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912,Clay,4,4,209362,Aristotelis Thanos,R,,20.8,209080,Blaise Bicknell,R,188.0,20.2,3,RR,115.0,0.0,4.0,70.0,48.0,35.0,12.0,10.0,5.0,5.0,1.0,3.0,96.0,68.0,42.0,12.0,10.0,11.0,13.0,1103.0,9.0,1130.0,8.0
2913,Clay,4,5,202065,Petros Tsitsipas,R,185.0,21.5,202120,Rowland Phillips,R,,28.0,3,RR,161.0,7.0,2.0,94.0,55.0,37.0,23.0,15.0,9.0,12.0,1.0,1.0,94.0,56.0,40.0,20.0,15.0,4.0,8.0,808.0,23.0,1390.0,4.0
2914,Hard,4,1,138846,Hong Kit Jack Wong,R,,23.4,105278,Alexis Klegou,U,,33.0,3,RR,250.0,4.0,8.0,56.0,33.0,20.0,13.0,11.0,3.0,7.0,0.0,3.0,72.0,54.0,29.0,8.0,11.0,6.0,10.0,1059.0,10.0,1881.0,1.0
2915,Hard,4,2,209409,Coleman Wong,R,,17.7,200583,Delmas Ntcha,R,,21.7,3,RR,98.0,2.0,2.0,68.0,38.0,24.0,18.0,10.0,6.0,8.0,1.0,5.0,58.0,39.0,24.0,7.0,10.0,5.0,9.0,,,,


In [142]:
# 2. Engenharia de recursos
# Criando variáveis para o histórico de confrontos entre os jogadores (exemplo simplificado)
def calculate_head_to_head(player1, player2, df):
    matches = df[(df['winner_name'] == player1) & (df['loser_name'] == player2)].shape[0]
    matches += df[(df['winner_name'] == player2) & (df['loser_name'] == player1)].shape[0]
    return matches

df['head_to_head'] = df.apply(lambda x: calculate_head_to_head('Kyle Edmund', 'Dominic Thiem', df), axis=1)
df['head_to_head']

0       0
1       0
2       0
3       0
4       0
       ..
2912    0
2913    0
2914    0
2915    0
2916    0
Name: head_to_head, Length: 2917, dtype: int64

In [143]:
# Instanciando o objeto LabelEncoder
label_encoder = LabelEncoder()

# Iterando sobre as colunas do DataFrame
for column in df.columns:
    # Ignorando as colunas 'winner_name' e 'loser_name'
    if column in ['surface', 'winner_hand', 'loser_hand', 'round']:
        # Aplicando Label Encoding na coluna
        df[column] = label_encoder.fit_transform(df[column])


In [144]:
df

Unnamed: 0,surface,draw_size,match_num,winner_id,winner_name,winner_hand,winner_ht,winner_age,loser_id,loser_name,loser_hand,loser_ht,loser_age,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,head_to_head
0,2,16,300,200000,Felix Auger Aliassime,1,193.0,21.4,105138,Roberto Bautista Agut,1,183.0,33.7,3,0,129.0,15.0,6.0,78.0,51.0,38.0,14.0,11.0,10.0,11.0,0.0,2.0,70.0,50.0,32.0,7.0,10.0,3.0,5.0,11.0,3308.0,19.0,2260.0,0
1,2,16,299,133430,Denis Shapovalov,0,185.0,22.7,105807,Pablo Carreno Busta,1,188.0,30.4,3,0,98.0,7.0,2.0,78.0,49.0,34.0,16.0,10.0,8.0,9.0,1.0,0.0,50.0,33.0,21.0,8.0,9.0,3.0,6.0,14.0,2475.0,20.0,2230.0,0
2,2,16,298,105138,Roberto Bautista Agut,1,183.0,33.7,128034,Hubert Hurkacz,1,196.0,24.8,3,7,164.0,1.0,2.0,96.0,64.0,50.0,20.0,16.0,1.0,4.0,24.0,3.0,120.0,80.0,62.0,20.0,16.0,6.0,7.0,19.0,2260.0,9.0,3706.0,0
3,2,16,297,105807,Pablo Carreno Busta,1,188.0,30.4,126591,Jan Zielinski,1,,25.1,3,7,53.0,6.0,0.0,45.0,33.0,25.0,8.0,8.0,0.0,0.0,2.0,1.0,38.0,27.0,17.0,1.0,7.0,4.0,8.0,20.0,2230.0,860.0,18.0,0
4,2,16,296,106421,Daniil Medvedev,1,198.0,25.8,200000,Felix Auger Aliassime,1,193.0,21.4,3,7,68.0,6.0,4.0,41.0,25.0,22.0,10.0,8.0,0.0,0.0,6.0,2.0,48.0,35.0,22.0,4.0,8.0,3.0,7.0,2.0,8640.0,11.0,3308.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912,0,4,4,209362,Aristotelis Thanos,1,,20.8,209080,Blaise Bicknell,1,188.0,20.2,3,6,115.0,0.0,4.0,70.0,48.0,35.0,12.0,10.0,5.0,5.0,1.0,3.0,96.0,68.0,42.0,12.0,10.0,11.0,13.0,1103.0,9.0,1130.0,8.0,0
2913,0,4,5,202065,Petros Tsitsipas,1,185.0,21.5,202120,Rowland Phillips,1,,28.0,3,6,161.0,7.0,2.0,94.0,55.0,37.0,23.0,15.0,9.0,12.0,1.0,1.0,94.0,56.0,40.0,20.0,15.0,4.0,8.0,808.0,23.0,1390.0,4.0,0
2914,2,4,1,138846,Hong Kit Jack Wong,1,,23.4,105278,Alexis Klegou,2,,33.0,3,6,250.0,4.0,8.0,56.0,33.0,20.0,13.0,11.0,3.0,7.0,0.0,3.0,72.0,54.0,29.0,8.0,11.0,6.0,10.0,1059.0,10.0,1881.0,1.0,0
2915,2,4,2,209409,Coleman Wong,1,,17.7,200583,Delmas Ntcha,1,,21.7,3,6,98.0,2.0,2.0,68.0,38.0,24.0,18.0,10.0,6.0,8.0,1.0,5.0,58.0,39.0,24.0,7.0,10.0,5.0,9.0,,,,,0


In [145]:
df = df.fillna(0)

In [151]:
df

Unnamed: 0,surface,draw_size,match_num,winner_id,winner_name,winner_hand,winner_ht,winner_age,loser_id,loser_name,loser_hand,loser_ht,loser_age,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,head_to_head
0,2,16,300,200000,Felix Auger Aliassime,1,193.0,21.4,105138,Roberto Bautista Agut,1,183.0,33.7,3,0,129.0,15.0,6.0,78.0,51.0,38.0,14.0,11.0,10.0,11.0,0.0,2.0,70.0,50.0,32.0,7.0,10.0,3.0,5.0,11.0,3308.0,19.0,2260.0,0
1,2,16,299,133430,Denis Shapovalov,0,185.0,22.7,105807,Pablo Carreno Busta,1,188.0,30.4,3,0,98.0,7.0,2.0,78.0,49.0,34.0,16.0,10.0,8.0,9.0,1.0,0.0,50.0,33.0,21.0,8.0,9.0,3.0,6.0,14.0,2475.0,20.0,2230.0,0
2,2,16,298,105138,Roberto Bautista Agut,1,183.0,33.7,128034,Hubert Hurkacz,1,196.0,24.8,3,7,164.0,1.0,2.0,96.0,64.0,50.0,20.0,16.0,1.0,4.0,24.0,3.0,120.0,80.0,62.0,20.0,16.0,6.0,7.0,19.0,2260.0,9.0,3706.0,0
3,2,16,297,105807,Pablo Carreno Busta,1,188.0,30.4,126591,Jan Zielinski,1,0.0,25.1,3,7,53.0,6.0,0.0,45.0,33.0,25.0,8.0,8.0,0.0,0.0,2.0,1.0,38.0,27.0,17.0,1.0,7.0,4.0,8.0,20.0,2230.0,860.0,18.0,0
4,2,16,296,106421,Daniil Medvedev,1,198.0,25.8,200000,Felix Auger Aliassime,1,193.0,21.4,3,7,68.0,6.0,4.0,41.0,25.0,22.0,10.0,8.0,0.0,0.0,6.0,2.0,48.0,35.0,22.0,4.0,8.0,3.0,7.0,2.0,8640.0,11.0,3308.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912,0,4,4,209362,Aristotelis Thanos,1,0.0,20.8,209080,Blaise Bicknell,1,188.0,20.2,3,6,115.0,0.0,4.0,70.0,48.0,35.0,12.0,10.0,5.0,5.0,1.0,3.0,96.0,68.0,42.0,12.0,10.0,11.0,13.0,1103.0,9.0,1130.0,8.0,0
2913,0,4,5,202065,Petros Tsitsipas,1,185.0,21.5,202120,Rowland Phillips,1,0.0,28.0,3,6,161.0,7.0,2.0,94.0,55.0,37.0,23.0,15.0,9.0,12.0,1.0,1.0,94.0,56.0,40.0,20.0,15.0,4.0,8.0,808.0,23.0,1390.0,4.0,0
2914,2,4,1,138846,Hong Kit Jack Wong,1,0.0,23.4,105278,Alexis Klegou,2,0.0,33.0,3,6,250.0,4.0,8.0,56.0,33.0,20.0,13.0,11.0,3.0,7.0,0.0,3.0,72.0,54.0,29.0,8.0,11.0,6.0,10.0,1059.0,10.0,1881.0,1.0,0
2915,2,4,2,209409,Coleman Wong,1,0.0,17.7,200583,Delmas Ntcha,1,0.0,21.7,3,6,98.0,2.0,2.0,68.0,38.0,24.0,18.0,10.0,6.0,8.0,1.0,5.0,58.0,39.0,24.0,7.0,10.0,5.0,9.0,0.0,0.0,0.0,0.0,0


In [146]:
# 5. Divisão do conjunto de dados
# Aqui, vamos criar um DataFrame para armazenar os nomes dos vencedores e perdedores como rótulos
labels = df[['winner_name', 'loser_name']]
features = df.drop(columns=['winner_name', 'loser_name'])

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

In [147]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# 6. Treinamento do modelo
# Regressão logística
logistic_regression = LogisticRegression(random_state=42)
logistic_regression.fit(X_train, y_train['winner_name'])

# RandomForestClassifier
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train['winner_name'])

# 7. Validação do modelo
y_pred_logistic = logistic_regression.predict(X_test)
y_pred_random_forest = random_forest.predict(X_test)

# 8. Avaliação do modelo
print("Acurácia da Regressão Logística:", accuracy_score(y_test['winner_name'], y_pred_logistic))
print("F1-Score da Regressão Logística:", f1_score(y_test['winner_name'], y_pred_logistic, average='weighted'))

print("Acurácia do RandomForest:", accuracy_score(y_test['winner_name'], y_pred_random_forest))
print("F1-Score do RandomForest:", f1_score(y_test['winner_name'], y_pred_random_forest, average='weighted'))

# 9. Ajuste do modelo (se necessário)
# Baseado nos resultados da avaliação, você pode ajustar os hiperparâmetros do seu modelo, se necessário.


Acurácia da Regressão Logística: 0.06621004566210045
F1-Score da Regressão Logística: 0.043146411157464516
Acurácia do RandomForest: 0.8755707762557078
F1-Score do RandomForest: 0.8532323876792598


In [149]:
def extract_features(player1_name, player2_name, df):
    player1_stats = df[df['winner_name'] == player1_name].iloc[-1]  # Carrega as informações do último jogo do jogador 1
    player2_stats = df[df['winner_name'] == player2_name].iloc[-1]  # Carrega as informações do último jogo do jogador 2

    # Aqui, você pode selecionar as características relevantes e criar um vetor de características combinadas
    # Por exemplo:
    combined_features = [
        player1_stats['w_ace'] - player2_stats['w_ace'],
        player1_stats['w_df'] - player2_stats['w_df'],
        player1_stats['w_svpt'] - player2_stats['w_svpt'],
        # Adicione outras características conforme necessário
    ]

    return np.array([combined_features])  # Retorna um vetor NumPy em formato apropriado para o modelo


In [150]:
# Certifique-se de que o nome dos jogadores esteja correto e exista no conjunto de dados
player1_name = "Kyle Edmund"
player2_name = "Dominic Thiem"

# Extrair características
X_new = extract_features(player1_name, player2_name, df)

# Prever a probabilidade de vitória
probabilities = random_forest.predict_proba(X_new)

# Probabilidade do primeiro jogador ganhar
player1_win_probability = probabilities[0][0]  # Retorna a probabilidade de vitória do jogador 1

print(f"A probabilidade de {player1_name} ganhar é de {player1_win_probability:.2%}")


ValueError: X has 3 features, but RandomForestClassifier is expecting 37 features as input.