In [6]:
import pandas as pd
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np
import shap
import matplotlib.pyplot as plt
import joblib
import json
from sklearn.model_selection import KFold

In [7]:
import warnings
warnings.resetwarnings()
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("default", ConvergenceWarning)

In [11]:
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")
matches['Date'] = pd.to_datetime(matches['Date'])

In [12]:
matches=matches.drop(columns=['player1_bet_odds','player2_bet_odds',"w_ace_avg", "l_ace_avg", "w_CO_ace_avg", "l_CO_ace_avg","w_df_avg", "l_df_avg", "w_CO_df_avg", "l_CO_df_avg", "w_2ndIn_avg","l_2ndIn_avg","w_CO_2ndIn_avg", "l_CO_2ndIn_avg"])

In [13]:
test_data = matches[matches["Date"].dt.year == 2023]
train_data = matches[matches["Date"].dt.year != 2023]

In [14]:
X_train=train_data.drop(columns=['target', 'match_id', 'Date'])
y_train=train_data['target']
X_test=test_data.drop(columns=['target', 'match_id', 'Date'])
y_test=test_data['target']

### wstępna selekcja za pomocą shapa

In [15]:
def feature_selection_with_shap(X, y, percent):
    model = xgb.XGBClassifier( eval_metric="mlogloss")
    model.fit(X, y)

    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)

    # Obliczenie średniej absolutnej ważności cech
    feature_importance = np.abs(shap_values.values).mean(axis=0)

    # Wybór najważniejszych cech 
    importance_threshold = np.percentile(feature_importance, 100-percent)  
    selected_features = np.where(feature_importance > importance_threshold)[0]

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} with top {percent}%.")
    return X.iloc[:, selected_features], selected_features
    

In [16]:
def objective_log(trial, X_train, y_train):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
    }
    model = xgb.XGBClassifier(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="neg_log_loss")

    return -scores.mean()

### optymalizacja i szukanie najlepszego zbioru cech

In [17]:
percentages = [ 50, 65, 75, 85, 100]  
best_logloss = float("inf")
best_features = None
best_params = None
best_num_features = 0

for percent in percentages:
    print(f"Testing top {percent}% features...")
    X_train_selected, selected_features = feature_selection_with_shap(X_train, y_train, percent)
    X_test_selected = X_test.iloc[:, selected_features]

    # Optuna optymalizacja dla aktualnego podzbioru cech
    def wrapped_objective(trial):
        return objective_log(trial, X_train_selected, y_train)

    study_logloss = optuna.create_study(direction="minimize",sampler=optuna.samplers.RandomSampler(seed=42))
    study_logloss.optimize(wrapped_objective, n_trials=200, timeout=3600)

    # Sprawdzenie wyników
    if study_logloss.best_value < best_logloss:
        best_logloss = study_logloss.best_value
        best_features = selected_features
        best_params = study_logloss.best_params
        best_num_features = len(selected_features)

print(f"Best logloss: {best_logloss}")
print(f"Best number of features: {best_num_features}")
print(f"Best parameters: {best_params}")

Testing top 50% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 47 features out of 95 with top 50%.


[I 2025-01-17 16:30:36,809] Trial 0 finished with value: 0.7572365750013799 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7572365750013799.
[I 2025-01-17 16:31:59,418] Trial 1 finished with value: 0.6344759057160022 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6344759057160022.
[I 2025-01-17 16:32:15,958] Trial 2 finished with value: 0.6362293772104755 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.569746930326

Testing top 65% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 62 features out of 95 with top 65%.


[I 2025-01-17 17:27:11,665] Trial 0 finished with value: 0.7606992579910284 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7606992579910284.
[I 2025-01-17 17:28:00,076] Trial 1 finished with value: 0.6355637369222029 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6355637369222029.
[I 2025-01-17 17:28:15,729] Trial 2 finished with value: 0.638428246096217 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.5697469303260

Testing top 75% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 71 features out of 95 with top 75%.


[I 2025-01-17 17:55:38,707] Trial 0 finished with value: 0.7628419712202491 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7628419712202491.
[I 2025-01-17 17:56:20,732] Trial 1 finished with value: 0.6355041989550784 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6355041989550784.
[I 2025-01-17 17:56:33,921] Trial 2 finished with value: 0.638397371394029 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.5697469303260

Testing top 85% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 80 features out of 95 with top 85%.


[I 2025-01-17 18:41:11,442] Trial 0 finished with value: 0.7561098501065473 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7561098501065473.
[I 2025-01-17 18:42:34,887] Trial 1 finished with value: 0.6355872295000375 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6355872295000375.
[I 2025-01-17 18:43:16,632] Trial 2 finished with value: 0.6370042736536135 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.569746930326

Testing top 100% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 91 features out of 95 with top 100%.


[I 2025-01-17 19:38:41,148] Trial 0 finished with value: 0.760334626584326 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.760334626584326.
[I 2025-01-17 19:39:25,143] Trial 1 finished with value: 0.6355142687845603 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6355142687845603.
[I 2025-01-17 19:39:39,846] Trial 2 finished with value: 0.6364110774548812 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.56974693032602

Best logloss: 0.6209425208362582
Best number of features: 91
Best parameters: {'max_depth': 4, 'learning_rate': 0.011323698336394534, 'n_estimators': 525, 'subsample': 0.7824205666313082, 'colsample_bytree': 0.5328543197141762, 'gamma': 3.877638083475053, 'min_child_weight': 5, 'lambda': 0.12518753190194004, 'alpha': 0.0579496727458303}


In [18]:
final_model = xgb.XGBClassifier(**best_params)
X_train_selected = X_train.iloc[:, best_features]
X_test_selected = X_test.iloc[:, best_features]
final_model.fit(X_train_selected, y_train)

# Predykcje na zbiorze testowym
y_pred = final_model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print("Final model accuracy on test data:", accuracy)


Final model accuracy on test data: 0.6458333333333334


## tutaj odpalam study dla najlepszych cech, żeby wykresy dostać

In [19]:
def wrapped_objective(trial):
    return objective_log(trial, X_train_selected, y_train)
study_best_features_log = optuna.create_study(direction="minimize", sampler=optuna.samplers.RandomSampler(seed=42))
study_best_features_log.optimize(wrapped_objective, n_trials=200, timeout=3600)
best_params_log = study_best_features_log.best_params
print("Best parameters:", best_params_log)

[I 2025-01-17 20:38:14,588] A new study created in memory with name: no-name-88323f1d-c210-47b6-a4d3-84114ed4c4f7
[I 2025-01-17 20:38:27,277] Trial 0 finished with value: 0.760334626584326 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.760334626584326.
[I 2025-01-17 20:39:44,977] Trial 1 finished with value: 0.6355142687845603 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6355142687845603.
[I 2025-01-17 20:40:13,188] Trial 2 finished with value: 0.6364110774548812 and par

Best parameters: {'max_depth': 4, 'learning_rate': 0.011323698336394534, 'n_estimators': 525, 'subsample': 0.7824205666313082, 'colsample_bytree': 0.5328543197141762, 'gamma': 3.877638083475053, 'min_child_weight': 5, 'lambda': 0.12518753190194004, 'alpha': 0.0579496727458303}


In [49]:
from sklearn.metrics import log_loss, brier_score_loss

In [None]:
X_train_selected = X_train.iloc[:, best_features]
X_test_selected = X_test.iloc[:, best_features]
final_model_log = xgb.XGBClassifier(**best_params_log)
final_model_log.fit(X_train_selected, y_train)
y_pred_log = final_model_log.predict(X_test_selected)


## wizualizacja optymalizacji, szukanie zaleznosci jaka siatka i jakie hiperparametry modyfikowac

In [None]:
values = np.array([t.value for t in study_best_features_log.trials if t.state == optuna.trial.TrialState.COMPLETE])
values.sort()
quantile_80 = np.percentile(values, 80)
fig=optuna.visualization.plot_edf(study_best_features_log)
fig.update_layout(
    title="",
    template="plotly_white", 
    font=dict(size=14),
)
fig.add_vline(
    x=quantile_80,
    line_dash="dash",
    line_color="red",
)

# Zapis do wysokiej jakości pliku PDF
#pio.write_image(fig, "edf_plot_xgb.pdf", format="pdf", scale=10)  # Skalowanie dla wysokiej jakości

# Wyświetlenie wykresu
fig.show()

In [None]:
fig=optuna.visualization.plot_rank(study_best_features_log, params=["learning_rate", "gamma", "subsample"])
fig.update_layout(
    title="",
    template="plotly_white",  
    font=dict(size=14),  
    plot_bgcolor='white',  
    paper_bgcolor='white',  
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),  
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')   
)
#pio.write_image(fig, "xgb_plot_rank.pdf", format="pdf", scale=3)

fig.show()

In [None]:
fig=optuna.visualization.plot_slice(study_best_features_log, params=["learning_rate", "gamma", "subsample"])
fig.update_layout(
    title="",
    template="plotly_white",
    font=dict(size=14),
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')
)

# Save as high-quality PDF
#pio.write_image(fig, "xgb_plot_slice.pdf", format="pdf", scale=3)

# Show the plot
fig.show()

In [None]:
fig=optuna.visualization.plot_param_importances(study_best_features_log)
fig.update_layout(
    title="",
    template="plotly_white",  # Jasne tło
    font=dict(size=14),  # Ustawienie czcionki
    plot_bgcolor='white',  # Tło wykresu
    paper_bgcolor='white',  # Tło całego dokumentu
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),  # Siatka na osi X
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')   # Siatka na osi Y
)

# Zapis do wysokiej jakości pliku PDF
#pio.write_image(fig, "param_importance_plot_xgb.pdf", format="pdf", scale=3)  # Skalowanie dla świetnej jakości

# Wyświetlenie wykresu
fig.show()

In [25]:
optuna.visualization.plot_optimization_history(study_best_features_log)

In [21]:
fig=optuna.visualization.plot_param_importances(study_best_features_log)
fig.update_layout(title="")

In [22]:
fig=optuna.visualization.plot_slice(study_best_features_log, params=["learning_rate", "gamma", "subsample"])
fig.update_layout(title="")

In [23]:
fig=optuna.visualization.plot_rank(study_best_features_log, params=["learning_rate", "gamma", "subsample"])
fig.update_layout(title="")

In [24]:
fig=optuna.visualization.plot_edf(study_best_features_log)
fig.update_layout(title="")

In [30]:
optuna.visualization.plot_terminator_improvement(study_best_features_log)


plot_terminator_improvement is experimental (supported from v3.2.0). The interface can change in the future.


RegretBoundEvaluator is experimental (supported from v3.2.0). The interface can change in the future.


CrossValidationErrorEvaluator is experimental (supported from v3.2.0). The interface can change in the future.

100%|██████████| 200/200 [00:14<00:00, 13.63it/s]


In [31]:
optuna.visualization.plot_parallel_coordinate(study_best_features_log)

In [32]:
joblib.dump(final_model_log, "../XGBoost/best_models/best_model_log_cros.pkl")

['../XGBoost/best_models/best_model_log_cros.pkl']

In [33]:
np.save("../XGBoost/best_models/best_features_log_cros.npy", best_features)

In [34]:
with open("../XGBoost/best_models/best_params_log_cros.json", "w") as f:
    json.dump(best_params_log, f)

In [35]:
with open("../XGBoost/best_models/best_model_log_cros_as_txt.txt", "w") as f:
    f.write(str(final_model_log))

In [12]:
with open("../XGBoost/best_models/best_params_log.json", "r") as f:
    best_params_log = json.load(f)
best_features_log = np.load("../XGBoost/best_models/best_features_log.npy")
best_model_log = joblib.load("../XGBoost/best_models/best_model_log.pkl")

## na podstawie wykresów zmodyfikujmy siatkę hiperparametrów

In [37]:
def objective_log_improve(trial, X_train, y_train):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.04, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 3.5, 8),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
    }
    model = xgb.XGBClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss")

    return -scores.mean()

In [38]:
def wrapped_objective(trial):
    return objective_log_improve(trial, X_train_selected, y_train)
study_log_improve = optuna.create_study(direction="minimize", sampler=optuna.samplers.RandomSampler(seed=42))
study_log_improve.optimize(wrapped_objective, n_trials=200, timeout=3600)
best_params_log_improve = study_log_improve.best_params
print("Best parameters:", best_params_log_improve)

[I 2025-01-17 23:40:27,193] A new study created in memory with name: no-name-6c456279-8da4-439f-a18e-714b7e7f8e03
[I 2025-01-17 23:40:32,734] Trial 0 finished with value: 0.6235439789329265 and parameters: {'max_depth': 5, 'learning_rate': 0.03735829498998231, 'n_estimators': 759, 'subsample': 0.9197316968394074, 'colsample_bytree': 0.5780093202212182, 'gamma': 4.201975341512912, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.6235439789329265.
[I 2025-01-17 23:40:47,570] Trial 1 finished with value: 0.6237990926330114 and parameters: {'max_depth': 8, 'learning_rate': 0.01028947225504226, 'n_estimators': 973, 'subsample': 0.9664885281600843, 'colsample_bytree': 0.6061695553391381, 'gamma': 4.318212352431953, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 0 with value: 0.6235439789329265.
[I 2025-01-17 23:40:55,201] Trial 2 finished with value: 0.621570203323404 and para

Best parameters: {'max_depth': 4, 'learning_rate': 0.015913351454109003, 'n_estimators': 769, 'subsample': 0.8321519792096617, 'colsample_bytree': 0.9089835120595311, 'gamma': 7.244603800809839, 'min_child_weight': 6, 'lambda': 0.001060580131290621, 'alpha': 0.014065414421987705}


In [38]:
optuna.visualization.plot_optimization_history(study_log_improve)

In [42]:
fig=optuna.visualization.plot_param_importances(study_log_improve)
fig.update_layout(title="")

In [43]:
fig=optuna.visualization.plot_slice(study_log_improve) 
fig.update_layout(title="")

In [41]:
optuna.visualization.plot_rank(study_log_improve)

In [44]:
fig=optuna.visualization.plot_edf(study_log_improve)
fig.update_layout(title="")

In [None]:
final_model_log_improve = xgb.XGBClassifier(**best_params_log_improve)
final_model_log_improve.fit(X_train_selected, y_train)
y_pred_log_improve = final_model_log_improve.predict(X_test_selected)
accuracy_log_improve = accuracy_score(y_test, y_pred_log_improve)
print("Final model accuracy on test data:", accuracy_log_improve)
print("Logloss on test data:", log_loss(y_test, final_model_log_improve.predict_proba(X_test_selected)))
print("Brier score on test data:", brier_score_loss(y_test, final_model_log_improve.predict_proba(X_test_selected)[:, 1]))

Final model accuracy on test data: 0.6372767857142857
Logloss on test data: 0.6252536073981937
Brier score on test data: 0.21834964874050367


In [58]:
joblib.dump(final_model_log_improve, "../XGBoost/best_models/best_model_log_cros.pkl")
np.save("../XGBoost/best_models/best_features_log_cros.npy", best_features)
with open("../XGBoost/best_models/best_params_log_cros.json", "w") as f:
    json.dump(best_params_log_improve, f)
with open("../XGBoost/best_models/best_model_log_cros_as_txt.txt", "w") as f:
    f.write(str(final_model_log_improve))