In [None]:
import pandas as pd
import numpy as np
import shap
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

Definindo funções auxiliares para a modelagem

In [None]:
# Helper that replaces the position column with a numeric scoring weight
def chance_of_goal(position):
    """Map a player's position label to a heuristic goal-scoring weight.

    Parameters
    ----------
    position : object
        Position label, compared by equality against 'Forward',
        'Midfielder' and 'Defender'.

    Returns
    -------
    int or float
        1 for 'Forward', 0.7 for 'Midfielder', 0.3 for 'Defender' and 0 for
        any other value.
    """
    weight_by_position = (
        ('Forward', 1),       # high chance of scoring
        ('Midfielder', 0.7),  # intermediate chance
        ('Defender', 0.3),    # low chance of scoring
    )
    for label, weight in weight_by_position:
        if position == label:
            return weight
    return 0  # no chance for any other value
    
# Squash raw predictions with a logistic sigmoid and report them as whole percentages (0-100)
def sigmoid(x):
    """Return the logistic sigmoid of ``x`` as a rounded percentage.

    Parameters
    ----------
    x : scalar, list, tuple, numpy.ndarray or pandas object
        Raw score(s). Lists and tuples are converted to a float array; any
        other input is used as-is, so array-like containers keep their type.

    Returns
    -------
    Same container kind as the (converted) input
        ``round(100 / (1 + exp(-x)))`` element-wise, i.e. values in
        [0, 100], not in [0, 1].
    """
    if isinstance(x, (list, tuple)):
        # Plain sequences do not support arithmetic; promote them to a float array
        x = np.asarray(x, dtype=float)
    # sigmoid(x) == 0.5 * (1 + tanh(x / 2)). tanh saturates at +/-1, whereas
    # exp(-x) overflows (RuntimeWarning) for large negative x. The result is
    # rounded to whole percents, so the tiny precision difference is irrelevant.
    probability = 0.5 * (1 + np.tanh(0.5 * x))
    return np.round(probability * 100)

In [None]:
# Teams are addressed by number to avoid typing mistakes while experimenting.
# The position in this list is the team's numeric id (0-19).
times = dict(enumerate([
    'Vitória',
    'Flamengo',
    'Cruzeiro',
    'Botafogo',
    'Grêmio',
    'Fluminense',
    'São Paulo',
    'Palmeiras',
    'Atlético Mineiro',
    'Atlético PR',
    'Corinthians',
    'Vasco da Gama',
    'Bahia',
    'Atlético GO',
    'Internacional',
    'Bragantino',
    'Criciúma',
    'Juventude',
    'Cuiabá',
    'Fortaleza',
]))

In [None]:
# Parse the players CSV only once and give each modelling pipeline its own
# independent copy, so changes made for one model never affect the other.
jogadores_raw = pd.read_csv('./Data/jogadores.csv')
df_rf = jogadores_raw.copy()
df_lasso = jogadores_raw.copy()

In [None]:
# Columns removed from the Lasso dataset before modelling.
# Each column is listed exactly once (the original list repeated
# 'accurate_crosses_per_game_overall'); the order is otherwise unchanged.
features_to_drop_lasso = [
    'age', 'birthday', 'birthday_GMT', 'league', 'season', 'nationality',
    'clean_sheets_overall', 'clean_sheets_home', 'clean_sheets_away',
    'conceded_overall', 'conceded_away', 'conceded_home',
    'yellow_cards_overall', 'red_cards_overall',
    'min_per_conceded_overall', 'min_per_card_overall', 'cards_per_90_overall',
    'rank_in_league_top_attackers', 'rank_in_league_top_midfielders',
    'rank_in_league_top_defenders', 'rank_in_club_top_scorer',
    'passes_per_90_overall', 'passes_per_game_overall',
    'passes_per90_percentile_overall', 'passes_total_overall',
    'passes_completed_per_game_overall', 'passes_completed_total_overall',
    'pass_completion_rate_percentile_overall', 'passes_completed_per_90_overall',
    'passes_completed_per90_percentile_overall',
    'short_passes_per_game_overall', 'long_passes_per_game_overall',
    'key_passes_per_game_overall', 'key_passes_total_overall',
    'through_passes_per_game_overall', 'crosses_per_game_overall',
    'dispossesed_per_game_overall', 'possession_regained_per_game_overall',
    'pressures_per_game_overall', 'saves_per_game_overall',
    'interceptions_per_game_overall', 'shots_faced_per_game_overall',
    'shots_per_goal_scored_overall', 'shots_off_target_per_game_overall',
    'distance_travelled_per_game_overall',
    'possession_regained_per_90_overall', 'possession_regained_total_overall',
    'possession_regained_per90_percentile_overall',
    'additional_info',
    'shots_off_target_total_overall', 'shots_off_target_per_90_overall',
    'shots_off_target_per90_percentile_overall',
    'games_subbed_out',
    'interceptions_total_overall', 'interceptions_per_90_overall',
    'interceptions_per90_percentile_overall',
    'crosses_total_overall', 'cross_completion_rate_percentile_overall',
    'crosses_per_90_overall', 'crosses_per90_percentile_overall',
    'through_passes_total_overall', 'through_passes_per_90_overall',
    'through_passes_per90_percentile_overall',
    'long_passes_total_overall', 'long_passes_per_90_overall',
    'long_passes_per90_percentile_overall',
    'short_passes_total_overall', 'short_passes_per_90_overall',
    'short_passes_per90_percentile_overall',
    'key_passes_per_90_overall', 'key_passes_per90_percentile_overall',
    'dribbles_total_overall', 'dribbles_per_90_overall',
    'dribbles_per90_percentile_overall',
    'dribbles_successful_total_overall', 'dribbles_successful_per_90_overall',
    'dribbles_successful_percentage_overall',
    'chances_created_total_overall', 'chances_created_per_90_overall',
    'chances_created_per90_percentile_overall',
    'saves_total_overall', 'save_percentage_percentile_overall',
    'saves_per_90_overall', 'saves_per90_percentile_overall',
    'shots_faced_total_overall', 'shots_per_goal_conceded_overall',
    'shots_faced_per_90_overall', 'shots_faced_per90_percentile_overall',
    'xg_faced_per_90_overall', 'xg_faced_per90_percentile_overall',
    'xg_faced_per_game_overall', 'xg_faced_total_overall',
    'save_percentage_overall',
    'pressures_total_overall', 'pressures_per_90_overall',
    'pressures_per90_percentile_overall',
    'xg_total_overall', 'market_value', 'market_value_percentile',
    'pass_completion_rate_overall',
    'dribbled_past_per90_percentile_overall', 'dribbled_past_per_game_overall',
    'dribbled_past_per_90_overall', 'dribbled_past_total_overall',
    'inside_box_saves_total_overall',
    'blocks_per_game_overall', 'blocks_per_90_overall',
    'blocks_total_overall', 'blocks_per90_percentile_overall',
    'ratings_total_overall',
    'clearances_per_game_overall', 'clearances_total_overall',
    'clearances_per_90_overall',
    'pen_save_percentage_overall', 'pen_committed_total_overall',
    'pen_committed_per_90_overall', 'pen_committed_per90_percentile_overall',
    'pen_committed_per_game_overall', 'pens_saved_total_overall',
    'pens_taken_total_overall',
    'hit_woodwork_total_overall', 'hit_woodwork_per_90_overall',
    'punches_total_overall', 'punches_per_game_overall', 'punches_per_90_overall',
    'offsides_per_90_overall', 'offsides_per_game_overall', 'offsides_total_overall',
    'shot_conversion_rate_overall', 'shot_conversion_rate_percentile_overall',
    'sm_minutes_played_per90_percentile_overall', 'sm_minutes_played_recorded_overall',
    'min_per_goal_percentile_overall', 'min_per_conceded_percentile_overall',
    'xa_total_overall', 'xa_per90_percentile_overall',
    'xa_per_game_overall', 'xa_per_90_overall',
    'npxg_total_overall', 'npxg_per90_percentile_overall',
    'npxg_per_game_overall', 'npxg_per_90_overall',
    'fouls_drawn_per90_percentile_overall', 'fouls_drawn_total_overall',
    'fouls_drawn_per_game_overall', 'fouls_drawn_per_90_overall',
    'fouls_committed_per_90_overall', 'fouls_committed_per_game_overall',
    'fouls_committed_per90_percentile_overall', 'fouls_committed_total_overall',
    'xg_per_90_overall', 'xg_per90_percentile_overall',
    'average_rating_percentile_overall',
    'clearances_per90_percentile_overall', 'hit_woodwork_per90_percentile_overall',
    'punches_per90_percentile_overall', 'offsides_per90_percentile_overall',
    'aerial_duels_total_overall', 'aerial_duels_per_90_overall',
    'aerial_duels_per90_percentile_overall', 'aerial_duels_won_percentage_overall',
    'duels_per_game_overall', 'duels_total_overall', 'duels_won_total_overall',
    'duels_won_per90_percentile_overall', 'duels_per90_percentile_overall',
    'duels_won_per_90_overall', 'duels_won_per_game_overall',
    'duels_won_percentage_overall',
    'dispossesed_total_overall', 'dispossesed_per_90_overall',
    'dispossesed_per90_percentile_overall',
    'progressive_passes_total_overall', 'cross_completion_rate_overall',
    'distance_travelled_total_overall', 'distance_travelled_per_90_overall',
    'distance_travelled_per90_percentile_overall',
    'accurate_crosses_total_overall', 'accurate_crosses_per_game_overall',
    'accurate_crosses_per_90_overall', 'accurate_crosses_per90_percentile_overall',
    'sm_matches_recorded_total_overall',
    'games_started_percentile_overall', 'games_subbed_in_percentile_overall',
    'games_subbed_out_percentile_overall',
    'hattricks_total_overall', 'two_goals_in_a_game_total_overall',
    'three_goals_in_a_game_total_overall',
    'two_goals_in_a_game_percentage_overall', 'three_goals_in_a_game_percentage_overall',
    'man_of_the_match_total_overall',
    'annual_salary_eur', 'annual_salary_eur_percentile',
    'clean_sheets_percentage_percentile_overall', 'min_per_card_percentile_overall',
    'cards_per90_percentile_overall',
    'booked_over05_overall', 'booked_over05_percentage_overall',
    'booked_over05_percentage_percentile_overall',
    'shirt_number', 'annual_salary_gbp', 'annual_salary_usd',
    'z_score', 'is_outlier',
]
# Columns used as Random Forest features.
# NOTE(review): 'minutes_played_overall' and 'shots_per90_percentile_overall' are
# also multiplied together to build the 'chance_primeiro_gol' target in a later
# cell, so the model can reconstruct much of the target directly from its inputs
# (possible target leakage) — confirm this is intended.
features_to_keep_rf = [
    'minutes_played_overall',
    'shots_per90_percentile_overall',
    'goals_per90_percentile_overall',
    'assists_overall',
    'annual_salary_eur_percentile',
    'minutes_played_home',
    'booked_over05_overall',
    'minutes_played_away',
    'clean_sheets_percentage_percentile_overall',
    'cards_per90_percentile_overall',
    'penalty_goals',
    'shirt_number',
    'three_goals_in_a_game_total_overall',
    'booked_over05_percentage_overall',
    'goals_home',
    'goals_away',
    'min_per_card_percentile_overall',
    'sm_goals_conceded_total_overall',
    'sm_goals_scored_total_overall',
    'two_goals_in_a_game_total_overall',
]

In [None]:
# Binary flag: 1 for players who scored at least one goal, 0 otherwise
df_rf['gols_binario'] = (df_rf['goals_overall'] > 0).astype(int)
# Target ("chance of scoring the first goal"):
# scorer flag * minutes played * shots-per-90 percentile / 10000
df_rf['chance_primeiro_gol'] = (
    df_rf['gols_binario']
    * df_rf['minutes_played_overall']
    * df_rf['shots_per90_percentile_overall']
    / 10000
)

# Split into features (X) and target (y).
# .copy() makes X_rf an independent frame: a later cell assigns imputed and
# scaled values into X_rf, which on a plain column selection of df_rf raises
# SettingWithCopyWarning and leaves it ambiguous whether df_rf is modified too.
X_rf = df_rf[features_to_keep_rf].copy()
y_rf = df_rf['chance_primeiro_gol']

Realizando as limpezas necessárias para cada modelo

In [None]:
# Remove the columns that are not used by the Lasso model.
# NOTE(review): this overwrites df_lasso, so re-running the cell on the same
# kernel fails because the columns are already gone; re-run the loading cell first.
df_lasso = df_lasso.drop(columns=features_to_drop_lasso)

# Keep the identifying text columns aside; only numeric columns are imputed
non_numeric_cols = df_lasso[['full_name', 'position', 'Current Club']].copy()
df_numeric = df_lasso.select_dtypes(include=['number']).copy()

# Replace missing numeric values with the column mean
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
df_numeric_imputed = pd.DataFrame(
    imp_mean.fit_transform(df_numeric),
    columns=df_numeric.columns,
    # Preserve the row labels: pd.concat(axis=1) aligns on the index, so a fresh
    # 0..n-1 index would silently misalign rows if df_lasso were ever filtered
    # or re-indexed before this cell.
    index=df_numeric.index,
)

# Re-attach the text columns to the imputed numeric columns, row by row
data_lasso = pd.concat([non_numeric_cols, df_numeric_imputed], axis=1)

In [None]:
# Work on an explicit copy: X_rf was selected from df_rf, and assigning columns
# into such a selection triggers SettingWithCopyWarning (and may or may not
# write through to df_rf, depending on the pandas version).
X_rf = X_rf.copy()

numericas = X_rf.select_dtypes(include=['float64', 'int64']).columns
categoricas = X_rf.select_dtypes(include=['object']).columns
imputer_numerico = SimpleImputer(strategy='median')
imputer_categorico = SimpleImputer(strategy='most_frequent')

# Fill missing numeric values with the column median
X_rf[numericas] = imputer_numerico.fit_transform(X_rf[numericas])
# Fill missing values in the categorical columns, if there are any
if not categoricas.empty:
    X_rf[categoricas] = imputer_categorico.fit_transform(X_rf[categoricas])

# Rescale every numeric feature with MinMaxScaler.
# NOTE(review): the imputers and the scaler are fitted on all rows here, before
# the train/test split done in a later cell, so test-set statistics influence
# the preprocessing — consider fitting them on the training rows only.
scaler = MinMaxScaler()
X_rf[numericas] = scaler.fit_transform(X_rf[numericas])

In [None]:
# Handle missing values in y with a SimpleImputer (median strategy)
imputer_numerico = SimpleImputer(strategy='median')

# SimpleImputer expects 2-D input, so turn y into a single-column array.
# np.asarray accepts both the original Series and an ndarray, which keeps this
# cell re-runnable: after the first run y_rf is already an ndarray and
# y_rf.to_numpy() would raise AttributeError.
y_rf = np.asarray(y_rf).reshape(-1, 1)
y_imputado_rf = imputer_numerico.fit_transform(y_rf)

Definindo os times que irão jogar

In [None]:
# The two clubs that will play, picked by their numeric ids in `times`
time1, time2 = times[0], times[1]

In [None]:
# Rows of df_rf whose club is one of the two selected teams (independent copy)
df_teams_rf = df_rf[df_rf['Current Club'].isin([time1, time2])].copy()

In [None]:
# Filter the data: take each club's rows separately, then stack them
# (all time1 rows first, followed by all time2 rows)
club_column = data_lasso['Current Club']
data_time1_lasso = data_lasso.loc[club_column == time1]
data_time2_lasso = data_lasso.loc[club_column == time2]
data_filtered_lasso = pd.concat([data_time1_lasso, data_time2_lasso])

In [None]:
# Target: heuristic scoring weight derived from each player's position
y_lasso = data_filtered_lasso['position'].apply(chance_of_goal)
# Features: every numeric column of the two selected squads (independent copy)
X_lasso = data_filtered_lasso.select_dtypes(include='number').copy()

In [None]:
# Standardise the Lasso features; the fitted scaler is kept in `scaler`
scaler = StandardScaler()
X_lasso_scaled = scaler.fit(X_lasso).transform(X_lasso)

In [None]:
# Hold out 30% of the rows for testing; the imputed target is flattened to 1-D
X_train, X_test, y_train, y_test = train_test_split(
    X_rf,
    y_imputado_rf.ravel(),
    test_size=0.3,
    random_state=42,
)

Utilizando GridSearch para buscar os melhores hiperparâmetros

In [None]:
# Hyperparameters to tune
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30, 40],
}

rf = RandomForestRegressor(random_state=42)

# Configure the grid search: 5-fold cross-validation, all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2)

# Fit the grid search on the training set
grid_search.fit(X_train, y_train)

# Show the best hyperparameters
print("Melhores parâmetros encontrados: ", grid_search.best_params_)

# With the default refit=True, GridSearchCV has already refitted the best
# estimator on the whole training set, so no extra .fit() call is needed.
best_model = grid_search.best_estimator_

Executando o treinamento dos modelos

In [None]:
# Baseline forest with fixed hyperparameters, trained on the training split.
# NOTE(review): the predictions below come from `best_model` (the grid-search
# winner), not from this `rf`; drop this fit if nothing else uses `rf`.
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, y_train)

# Predict the held-out test rows with the tuned model.
# The prediction table (predicoes_df, res, resultado_teams_*) is built in the
# next cell, which previously repeated the same code found here.
predicoes_teams = best_model.predict(X_test)

In [None]:
coluna_previsao = 'Previsão de Gols'

# One-column frame of predictions, labelled with the test rows' index
predicoes_df = pd.DataFrame({coluna_previsao: predicoes_teams}, index=X_test.index)

# Attach the predictions to the test features
res = X_test.join(predicoes_df)

# Keep only the players whose predicted value is >= 1
resultado_teams_filtrado = res.loc[res[coluna_previsao] >= 1]

# Order the players from highest to lowest prediction
resultado_teams_ordenado = resultado_teams_filtrado.sort_values(
    by=coluna_previsao,
    ascending=False,
)

In [None]:
# Fit the Lasso on the scaled features and predict on the same rows
model = Lasso(alpha=0.01, random_state=42)
predictions = model.fit(X_lasso_scaled, y_lasso).predict(X_lasso_scaled)

# Pass the raw predictions through `sigmoid` and store the result per player
data_filtered_lasso['predicted_probability'] = sigmoid(predictions)

# Rank the players from highest to lowest value and show the key columns
best = data_filtered_lasso.sort_values(
    by='predicted_probability',
    ascending=False,
)
best[['Current Club', 'full_name', 'predicted_probability']]


Comparando as métricas dos modelos

In [None]:
# NOTE(review): the Lasso is scored on the same rows it was fitted on, while the
# Random Forest is scored on the held-out test split, so the two rows of this
# table are not directly comparable.
metricas = (mean_squared_error, mean_absolute_error, r2_score)

mse_lasso, mae_lasso, r2_lasso = (
    metrica(y_lasso, predictions) for metrica in metricas
)
mse_rf, mae_rf, r2_rf = (
    metrica(y_test, predicoes_teams) for metrica in metricas
)

metrics_data = {
    'Modelo': ['Lasso', 'Random Forest'],
    'MSE': [mse_lasso, mse_rf],
    'MAE': [mae_lasso, mae_rf],
    'R²': [r2_lasso, r2_rf],
}

# Turn the dictionary into a DataFrame (one row per model)
metrics_df = pd.DataFrame(metrics_data)

# Display the DataFrame
metrics_df
