In [1]:
import pandas as pd
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
import numpy as np
import shap
import matplotlib.pyplot as plt
import joblib
import json
from sklearn.model_selection import KFold

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings
warnings.resetwarnings()
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("default", ConvergenceWarning)

In [3]:
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")

In [4]:
matches['Date'] = pd.to_datetime(matches['Date'])
print(matches['Date'].head())
print(matches['Date'].dtype)

0   2017-12-31
1   2017-12-31
2   2018-01-01
3   2018-01-01
4   2018-01-01
Name: Date, dtype: datetime64[ns]
datetime64[ns]


In [5]:
matches=matches.drop(columns=['player1_bet_odds','player2_bet_odds',"w_ace_avg", "l_ace_avg", "w_CO_ace_avg", "l_CO_ace_avg","w_df_avg", "l_df_avg", "w_CO_df_avg", "l_CO_df_avg", "w_2ndIn_avg","l_2ndIn_avg","w_CO_2ndIn_avg", "l_CO_2ndIn_avg"])

In [6]:
test_data = matches[matches["Date"].dt.year == 2023]
train_data = matches[matches["Date"].dt.year != 2023]

In [7]:
X_train_data = train_data.drop(columns=['target','Date'])
y_train_data = train_data['target']


In [8]:
X_test_data  = test_data.drop(columns=['target','Date'])
y_test_data= test_data['target']

In [9]:
def rolling_origin_split(data):
    data = data.sort_values(by="Date")

    
    splits = []
    for year in range(2018, 2022):
        training_data = data[data["Date"].dt.year <= year]
        val_data = data[data["Date"].dt.year == year + 1]

        if not val_data.empty:
            splits.append((training_data, val_data))

    return splits


### wstępna selekcja za pomocą shapa

In [10]:
def feature_selection_with_shap(X, y, percent):
    if "Date" in X.columns:
        X = X.drop(columns=["Date"])
    
    model = xgb.XGBClassifier( eval_metric="mlogloss")
    model.fit(X, y)

    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)

    # Obliczenie średniej absolutnej ważności cech
    feature_importance = np.abs(shap_values.values).mean(axis=0)

    # Wybór najważniejszych cech 
    importance_threshold = np.percentile(feature_importance, 100-percent)  
    selected_features = np.where(feature_importance > importance_threshold)[0]

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} with top {percent}%.")
    return X.iloc[:, selected_features], selected_features
    

In [11]:
def objective_log(trial, data):
    # Suggest hyperparameters
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
    }

    # Prepare rolling origin splits
    splits = rolling_origin_split(data)

    # Initialize model
    model = xgb.XGBClassifier(**params, random_state=42)

    log_loss_scores = []

    for training_data, val_data in splits:
        # Ensure proper sorting to avoid leakage
        training_data = training_data.sort_values("Date")
        val_data = val_data.sort_values("Date")

        # Separate features and target
        X_train = training_data.drop(columns=["target", "Date"])
        y_train = training_data["target"]
        X_val = val_data.drop(columns=["target", "Date"])
        y_val = val_data["target"]

        # Train the model on the training set
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

        # Predict probabilities for the validation set
        y_pred_proba = model.predict_proba(X_val)[:, 1]

        # Calculate log loss for the validation set
        log_loss_scores.append(log_loss(y_val, y_pred_proba))

    # Return the mean log loss as the objective to minimize
    return np.mean(log_loss_scores)


In [None]:
# def objective_log(trial, X_train, y_train):
#     params = {
#         "objective": "binary:logistic",
#         "eval_metric": "logloss",
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
#         "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
#         "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
#         "gamma": trial.suggest_float("gamma", 0, 5),
#         "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
#         "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
#         "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
#     }
#     model = xgb.XGBClassifier(**params)
#     scores = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss")

#     return -scores.mean()

### optymalizacja i szukanie najlepszego zbioru cech

In [14]:
percentages = [ 50, 65, 75, 85, 100]  
best_logloss = float("inf")
best_features = None
best_params = None
best_num_features = 0

for percent in percentages:
    print(f"Testing top {percent}% features...")
    X_train_selected, selected_features = feature_selection_with_shap(X_train_data, y_train_data, percent)
    train_data_selected = train_data[["Date", "target"] + list(X_train_selected.columns)]

    # Optuna optymalizacja dla aktualnego podzbioru cech
    def wrapped_objective(trial):
        return objective_log(trial, train_data_selected)

    study_logloss = optuna.create_study(direction="minimize",sampler=optuna.samplers.RandomSampler(seed=42))
    study_logloss.optimize(wrapped_objective, n_trials=200, timeout=3600)

    # Sprawdzenie wyników
    if study_logloss.best_value < best_logloss:
        best_logloss = study_logloss.best_value
        best_features = selected_features
        best_params = study_logloss.best_params
        best_num_features = len(selected_features)

print(f"Best logloss: {best_logloss}")
print(f"Best number of features: {best_num_features}")
print(f"Best parameters: {best_params}")

Testing top 50% features...




Selected 47 features out of 95 with top 50%.


[I 2025-01-02 18:21:12,046] Trial 0 finished with value: 0.7586357481865926 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7586357481865926.
[I 2025-01-02 18:21:36,356] Trial 1 finished with value: 0.6397104278801929 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6397104278801929.
[I 2025-01-02 18:21:45,825] Trial 2 finished with value: 0.646719093956974 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.5697469303260

Testing top 65% features...




Selected 62 features out of 95 with top 65%.


[I 2025-01-02 18:42:33,605] Trial 0 finished with value: 0.7578135735711069 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7578135735711069.
[I 2025-01-02 18:43:03,582] Trial 1 finished with value: 0.6426769350418301 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6426769350418301.
[I 2025-01-02 18:43:15,561] Trial 2 finished with value: 0.6467906210895035 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.569746930326

Testing top 75% features...




Selected 71 features out of 95 with top 75%.


[I 2025-01-02 19:09:41,744] Trial 0 finished with value: 0.7573152896659876 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7573152896659876.
[I 2025-01-02 19:10:14,301] Trial 1 finished with value: 0.6428936671844151 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6428936671844151.
[I 2025-01-02 19:10:27,279] Trial 2 finished with value: 0.6507475089642314 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.569746930326

Testing top 85% features...




Selected 80 features out of 95 with top 85%.


[I 2025-01-02 19:39:51,828] Trial 0 finished with value: 0.7631990154117505 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7631990154117505.
[I 2025-01-02 19:40:26,582] Trial 1 finished with value: 0.6441793215982993 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6441793215982993.
[I 2025-01-02 19:40:41,097] Trial 2 finished with value: 0.6479393813489263 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.569746930326

Testing top 100% features...




Selected 93 features out of 95 with top 100%.


[I 2025-01-02 20:12:03,556] Trial 0 finished with value: 0.7600838939248982 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7600838939248982.
[I 2025-01-02 20:12:48,946] Trial 1 finished with value: 0.6416076381507769 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6416076381507769.
[I 2025-01-02 20:13:08,017] Trial 2 finished with value: 0.647260867714294 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.5697469303260

Best logloss: 0.6215920929985139
Best number of features: 80
Best parameters: {'max_depth': 4, 'learning_rate': 0.011323698336394534, 'n_estimators': 525, 'subsample': 0.7824205666313082, 'colsample_bytree': 0.5328543197141762, 'gamma': 3.877638083475053, 'min_child_weight': 5, 'lambda': 0.12518753190194004, 'alpha': 0.0579496727458303}


In [15]:
final_model = xgb.XGBClassifier(**best_params)
X_train_selected = X_train_data.iloc[:, best_features]
X_test_selected = X_test_data.iloc[:, best_features]
final_model.fit(X_train_selected, y_train_data)

# Predykcje na zbiorze testowym
y_pred = final_model.predict(X_test_selected)
accuracy = accuracy_score(y_test_data, y_pred)
print("Final model accuracy on test data:", accuracy)


Final model accuracy on test data: 0.6462053571428571


## tutaj odpalam study dla najlepszych cech, żeby wykresy dostać

In [16]:
train_data_selected = train_data[["Date", "target"] + list(X_train_selected.columns)]
def wrapped_objective(trial):
    return objective_log(trial, train_data_selected)
study_best_features_log = optuna.create_study(direction="minimize", sampler=optuna.samplers.RandomSampler(seed=42))
study_best_features_log.optimize(wrapped_objective, n_trials=200, timeout=3600)
best_params_log = study_best_features_log.best_params
print("Best parameters:", best_params_log)

[I 2025-01-03 01:19:23,684] A new study created in memory with name: no-name-dc92475b-e148-436c-8b05-6c4b25ccc336
[I 2025-01-03 01:19:31,641] Trial 0 finished with value: 0.7631990154117505 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7631990154117505.
[I 2025-01-03 01:20:06,209] Trial 1 finished with value: 0.6441793215982993 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6441793215982993.
[I 2025-01-03 01:20:20,635] Trial 2 finished with value: 0.6479393813489263 and p

Best parameters: {'max_depth': 4, 'learning_rate': 0.011323698336394534, 'n_estimators': 525, 'subsample': 0.7824205666313082, 'colsample_bytree': 0.5328543197141762, 'gamma': 3.877638083475053, 'min_child_weight': 5, 'lambda': 0.12518753190194004, 'alpha': 0.0579496727458303}


In [17]:
X_train_selected = X_train_data.iloc[:, best_features]
X_test_selected = X_test_data.iloc[:, best_features]
final_model_log = xgb.XGBClassifier(**best_params_log)
final_model_log.fit(X_train_selected, y_train_data)
y_pred_log = final_model_log.predict(X_test_selected)
accuracy_log = accuracy_score(y_test_data, y_pred_log)
print("Final model accuracy on test data:", accuracy_log)

Final model accuracy on test data: 0.6462053571428571


## wizualizacja optymalizacji, szukanie zaleznosci jaka siatka i jakie hiperparametry modyfikowac

In [18]:
optuna.visualization.plot_optimization_history(study_best_features_log)

In [19]:
optuna.visualization.plot_param_importances(study_best_features_log)

In [20]:
optuna.visualization.plot_slice(study_best_features_log)

In [21]:
optuna.visualization.plot_rank(study_best_features_log)

In [22]:
optuna.visualization.plot_edf(study_best_features_log)

In [23]:
optuna.visualization.plot_terminator_improvement(study_best_features_log)


plot_terminator_improvement is experimental (supported from v3.2.0). The interface can change in the future.


RegretBoundEvaluator is experimental (supported from v3.2.0). The interface can change in the future.



ModuleNotFoundError: No module named 'torch'

In [24]:
optuna.visualization.plot_parallel_coordinate(study_best_features_log)

In [25]:
joblib.dump(final_model_log, "../XGBoost/best_models/best_model_log_nested.pkl")

['../XGBoost/best_models/best_model_log_nested.pkl']

In [26]:
np.save("../XGBoost/best_models/best_features_log_nested.npy", best_features)

In [27]:
with open("../XGBoost/best_models/best_params_log_nested.json", "w") as f:
    json.dump(best_params_log, f)

In [28]:
with open("../XGBoost/best_models/best_model_log_nested_as_txt.txt", "w") as f:
    f.write(str(final_model_log))

In [12]:
with open("../XGBoost/best_models/best_params_log.json", "r") as f:
    best_params_log = json.load(f)
best_features_log = np.load("../XGBoost/best_models/best_features_log.npy")
best_model_log = joblib.load("../XGBoost/best_models/best_model_log.pkl")

## na podstawie wykresów zmodyfikujmy siatkę hiperparametrów

In [15]:
X_train_selected = X_train_data.iloc[:, best_features_log]
X_test_selected = X_test_data.iloc[:, best_features_log]

In [29]:
def objective_log_improve(trial, data):
    # Suggest hyperparameters
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.04, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 3.5, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
    }

    # Prepare rolling origin splits
    splits = rolling_origin_split(data)

    # Initialize model
    model = xgb.XGBClassifier(**params)

    log_loss_scores = []

    for training_data, val_data in splits:
        # Ensure proper sorting to avoid leakage
        training_data = training_data.sort_values("Date")
        val_data = val_data.sort_values("Date")

        # Separate features and target
        X_train = training_data.drop(columns=["target", "Date"])
        y_train = training_data["target"]
        X_val = val_data.drop(columns=["target", "Date"])
        y_val = val_data["target"]

        # Train the model on the training set
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

        # Predict probabilities for the validation set
        y_pred_proba = model.predict_proba(X_val)[:, 1]

        # Calculate log loss for the validation set
        log_loss_scores.append(log_loss(y_val, y_pred_proba))

    # Return the mean log loss as the objective to minimize
    return np.mean(log_loss_scores)


In [13]:
# def objective_log_improve(trial, X_train, y_train):
#     params = {
#         "objective": "binary:logistic",
#         "eval_metric": "logloss",
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.04, log=True),
#         "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
#         "subsample": trial.suggest_float("subsample", 0.8, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
#         "gamma": trial.suggest_float("gamma", 3.5, 5),
#         "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
#         "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
#         "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
#     }
#     model = xgb.XGBClassifier(**params)
#     scores = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss")

#     return -scores.mean()

In [30]:
train_data_selected = train_data[["Date", "target"] + list(X_train_selected.columns)]
def wrapped_objective(trial):
    return objective_log_improve(trial, train_data_selected)
study_log_improve = optuna.create_study(direction="minimize", sampler=optuna.samplers.RandomSampler(seed=42))
study_log_improve.optimize(wrapped_objective, n_trials=200, timeout=3600)
best_params_log_improve = study_log_improve.best_params
print("Best parameters:", best_params_log_improve)

[I 2025-01-03 02:03:07,910] A new study created in memory with name: no-name-90968de5-1282-4cf0-8d2b-9093faeb9cda
[I 2025-01-03 02:03:15,881] Trial 0 finished with value: 0.6254835740869009 and parameters: {'max_depth': 5, 'learning_rate': 0.03735829498998231, 'n_estimators': 759, 'subsample': 0.9197316968394074, 'colsample_bytree': 0.5780093202212182, 'gamma': 3.733991780504304, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.6254835740869009.
[I 2025-01-03 02:03:32,321] Trial 1 finished with value: 0.6250049898794656 and parameters: {'max_depth': 8, 'learning_rate': 0.01028947225504226, 'n_estimators': 973, 'subsample': 0.9664885281600843, 'colsample_bytree': 0.6061695553391381, 'gamma': 3.772737450810651, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6250049898794656.
[I 2025-01-03 02:03:44,404] Trial 2 finished with value: 0.624303824833887 and para

Best parameters: {'max_depth': 4, 'learning_rate': 0.01255734611818063, 'n_estimators': 833, 'subsample': 0.93303944413924, 'colsample_bytree': 0.7615327123845597, 'gamma': 4.038245726185254, 'min_child_weight': 9, 'lambda': 0.03713474360443033, 'alpha': 1.8467059900514502}


## dodatkowa optymalizacja nie przyniosła już lepszych wyników logloss