In [15]:
import json
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import plotly.io as pio
import shap
import xgboost as xgb
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, cross_val_score

In [3]:
# Warning configuration. `warnings` and `ConvergenceWarning` are imported in
# the notebook's import cell so that every dependency is declared in one place.
# resetwarnings() discards every filter installed so far; the "default" action
# then reports each ConvergenceWarning once per code location.
warnings.resetwarnings()
warnings.simplefilter("default", ConvergenceWarning)

In [4]:
# Load the prepared matches table produced by the data-preparation step.
matches = pd.read_csv(
    filepath_or_buffer="../../preparation_before_models/data/matches.csv",
)

In [None]:
# Convert "Date" to a datetime column (the year-based splits rely on `.dt`),
# then print a short preview followed by the resulting dtype.
matches["Date"] = pd.to_datetime(matches["Date"])
print(matches["Date"].head(), matches["Date"].dtype, sep="\n")

In [6]:
# Remove the columns that are not used as model inputs.
# NOTE: not idempotent - re-running this cell raises KeyError because the
# columns are already gone.
matches = matches.drop(
    [
        "match_id",
        "player1_bet_odds",
        "player2_bet_odds",
        "w_ace_avg",
        "l_ace_avg",
        "w_CO_ace_avg",
        "l_CO_ace_avg",
        "w_df_avg",
        "l_df_avg",
        "w_CO_df_avg",
        "l_CO_df_avg",
        "w_2ndIn_avg",
        "l_2ndIn_avg",
        "w_CO_2ndIn_avg",
        "l_CO_2ndIn_avg",
    ],
    axis="columns",
)

In [7]:
# Hold out the 2023 matches as the test set; every other row is training data.
# NOTE(review): "not 2023" would also put any post-2023 rows into the training
# set - confirm the file contains none.
test_data = matches.loc[matches["Date"].dt.year.eq(2023)]
train_data = matches.loc[matches["Date"].dt.year.ne(2023)]

In [8]:
# Training features are all columns except the label and the raw date.
X_train_data = train_data.drop(["target", "Date"], axis="columns")
y_train_data = train_data.loc[:, "target"]


In [9]:
# Test features / label, built exactly like the training ones.
X_test_data = test_data.drop(["target", "Date"], axis="columns")
y_test_data = test_data.loc[:, "target"]

In [10]:
def rolling_origin_split(data, first_year=2018, last_year=2021):
    """Build expanding-window (rolling-origin) train/validation folds by year.

    For every cutoff year ``y`` in ``first_year..last_year`` (inclusive) the
    training part contains all rows dated in year ``y`` or earlier and the
    validation part contains the rows dated in year ``y + 1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime-typed ``"Date"`` column.
    first_year, last_year : int, optional
        First and last cutoff year. The defaults reproduce the previously
        hard-coded range (cutoffs 2018, 2019, 2020, 2021).

    Returns
    -------
    list[tuple[pandas.DataFrame, pandas.DataFrame]]
        ``(training_data, val_data)`` pairs in chronological order, each
        sorted by ``"Date"``. Folds whose training or validation part is
        empty are skipped, since they can be neither fitted nor scored.
    """
    ordered = data.sort_values(by="Date")
    # Extract the year once instead of twice per cutoff.
    years = ordered["Date"].dt.year

    splits = []
    for cutoff in range(first_year, last_year + 1):
        training_data = ordered[years <= cutoff]
        val_data = ordered[years == cutoff + 1]

        if training_data.empty or val_data.empty:
            continue
        splits.append((training_data, val_data))

    return splits


### Wstępna selekcja cech za pomocą SHAP

In [10]:
def feature_selection_with_shap(X, y, percent):
    """Keep the top ``percent`` % of features ranked by mean absolute SHAP value.

    An XGBoost classifier with default hyper-parameters is fitted on ``X``/``y``
    and each feature is scored by the mean of its absolute SHAP values over all
    rows of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. A ``"Date"`` column, if present, is dropped first.
    y : array-like
        Target labels.
    percent : float
        Share of features to keep, within ``[0, 100]``. Exactly
        ``ceil(n_features * percent / 100)`` features are kept, so ``100``
        returns every feature.

    Returns
    -------
    tuple[pandas.DataFrame, numpy.ndarray]
        ``X`` restricted to the selected columns (original column order) and
        the positional indices of those columns within ``X`` after the
        ``"Date"`` column has been removed.

    Raises
    ------
    ValueError
        If ``percent`` lies outside ``[0, 100]``.
    """
    if not 0 <= percent <= 100:
        raise ValueError(f"percent must be within [0, 100], got {percent!r}")

    if "Date" in X.columns:
        X = X.drop(columns=["Date"])

    # "logloss" is the binary metric; the previous "mlogloss" is the
    # multi-class variant. No eval_set is passed, so this does not affect the fit.
    model = xgb.XGBClassifier(eval_metric="logloss")
    model.fit(X, y)

    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)

    feature_importance = np.abs(shap_values.values).mean(axis=0)

    # Rank-based selection. A strict ">" comparison against a percentile
    # threshold always discards the least important feature, so "top 100%"
    # could never return the full feature set.
    n_keep = int(np.ceil(X.shape[1] * percent / 100))
    ranked = np.argsort(-feature_importance, kind="stable")
    # Sort the kept indices so the selected columns keep their original order.
    selected_features = np.sort(ranked[:n_keep])

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} with top {percent}%.")
    return X.iloc[:, selected_features], selected_features
    

In [11]:
def objective_log(trial, data):
    """Optuna objective: mean validation log-loss over rolling-origin folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the XGBoost hyper-parameters.
    data : pandas.DataFrame
        Frame containing the feature columns plus ``"Date"`` and ``"target"``;
        it is split by ``rolling_origin_split``.

    Returns
    -------
    float
        Mean log-loss of the positive-class probabilities across all folds.

    Raises
    ------
    ValueError
        If ``rolling_origin_split`` yields no folds. Previously the mean of an
        empty list silently produced NaN.
    """
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
    }

    splits = rolling_origin_split(data)
    if not splits:
        raise ValueError("rolling_origin_split produced no folds; cannot score the trial.")

    # One estimator is reused; each fit() call below retrains it for that fold.
    model = xgb.XGBClassifier(**params, random_state=42)

    log_loss_scores = []

    for training_data, val_data in splits:
        training_data = training_data.sort_values("Date")
        val_data = val_data.sort_values("Date")

        X_train = training_data.drop(columns=["target", "Date"])
        y_train = training_data["target"]
        X_val = val_data.drop(columns=["target", "Date"])
        y_val = val_data["target"]

        # No eval_set: without early stopping the per-round validation scores
        # were computed and then discarded, which only slowed every trial down.
        model.fit(X_train, y_train)

        y_pred_proba = model.predict_proba(X_val)[:, 1]
        log_loss_scores.append(log_loss(y_val, y_pred_proba))

    return np.mean(log_loss_scores)


### Optymalizacja hiperparametrów i szukanie najlepszego zbioru cech

In [None]:
# For each candidate share of SHAP-ranked features, run a random-search Optuna
# study and remember the subset (and its hyper-parameters) with the lowest
# mean rolling-origin log-loss.
percentages = [50, 65, 75, 85, 100]
best_logloss = float("inf")
best_features, best_params, best_num_features = None, None, 0

for percent in percentages:
    print(f"Testing top {percent}% features...")
    X_train_selected, selected_features = feature_selection_with_shap(
        X_train_data, y_train_data, percent
    )
    # objective_log needs the date and label next to the selected features.
    train_data_selected = train_data[["Date", "target", *X_train_selected.columns]]

    def wrapped_objective(trial):
        return objective_log(trial, train_data_selected)

    study_logloss = optuna.create_study(
        sampler=optuna.samplers.RandomSampler(seed=42),
        direction="minimize",
    )
    study_logloss.optimize(wrapped_objective, n_trials=200, timeout=3600)

    # Only a strictly better study replaces the current best.
    if not (study_logloss.best_value < best_logloss):
        continue
    best_logloss = study_logloss.best_value
    best_features = selected_features
    best_params = study_logloss.best_params
    best_num_features = len(selected_features)

print(f"Best logloss: {best_logloss}")
print(f"Best number of features: {best_num_features}")
print(f"Best parameters: {best_params}")

In [None]:
# Refit on the whole training set using the winning feature subset and the
# hyper-parameters found for it.
X_train_selected, X_test_selected = (
    frame.iloc[:, best_features] for frame in (X_train_data, X_test_data)
)
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train_selected, y_train_data)

In [None]:
# Dedicated study on the best feature subset; the plots further down read it.
train_data_selected = train_data[["Date", "target", *X_train_selected.columns]]


def wrapped_objective(trial):
    return objective_log(trial, train_data_selected)


study_best_features_log = optuna.create_study(
    sampler=optuna.samplers.RandomSampler(seed=42),
    direction="minimize",
)
study_best_features_log.optimize(wrapped_objective, timeout=3600, n_trials=200)
best_params_log = study_best_features_log.best_params
print("Best parameters:", best_params_log)

In [None]:
# Train the final log-loss model on the best feature subset using the
# parameters from study_best_features_log.
X_train_selected, X_test_selected = (
    frame.iloc[:, best_features] for frame in (X_train_data, X_test_data)
)
final_model_log = xgb.XGBClassifier(**best_params_log)
final_model_log.fit(X_train_selected, y_train_data)

## Wizualizacja optymalizacji – szukanie zależności: jak zmienić siatkę i które hiperparametry modyfikować

In [None]:
# Hyper-parameter importance plot for the study on the best feature subset,
# restyled with a white background and light grid, exported to PDF and shown.
fig = optuna.visualization.plot_param_importances(study_best_features_log)
fig.update_layout(
    title="",
    template="plotly_white",
    font={"size": 14},
    plot_bgcolor="white",
    paper_bgcolor="white",
    xaxis={"showgrid": True, "gridwidth": 1, "gridcolor": "LightGrey"},
    yaxis={"showgrid": True, "gridwidth": 1, "gridcolor": "LightGrey"},
)
pio.write_image(fig, "param_importance_plot_xgb_nested.pdf", format="pdf", scale=3)
fig.show()

In [None]:
# Slice plot: objective value against learning_rate, gamma and subsample.
fig = optuna.visualization.plot_slice(
    study_best_features_log,
    params=["learning_rate", "gamma", "subsample"],
)
fig.update_layout(
    title="",
    template="plotly_white",
    font={"size": 14},
    plot_bgcolor="white",
    paper_bgcolor="white",
    xaxis={"showgrid": True, "gridwidth": 1, "gridcolor": "LightGrey"},
    yaxis={"showgrid": True, "gridwidth": 1, "gridcolor": "LightGrey"},
)
pio.write_image(fig, "xgb_plot_slice_nested.pdf", format="pdf", scale=3)
fig.show()

In [None]:
# Rank plot for the same three hyper-parameters (learning_rate, gamma, subsample).
fig = optuna.visualization.plot_rank(
    study_best_features_log,
    params=["learning_rate", "gamma", "subsample"],
)
fig.update_layout(
    title="",
    template="plotly_white",
    font={"size": 14},
    plot_bgcolor="white",
    paper_bgcolor="white",
    xaxis={"showgrid": True, "gridwidth": 1, "gridcolor": "LightGrey"},
    yaxis={"showgrid": True, "gridwidth": 1, "gridcolor": "LightGrey"},
)
pio.write_image(fig, "xgb_plot_rank_nested.pdf", format="pdf", scale=3)
fig.show()

In [None]:
# Empirical distribution of the completed trials' objective values, with a
# dashed red marker at their 80th percentile.
values = np.array(
    sorted(
        trial.value
        for trial in study_best_features_log.trials
        if trial.state == optuna.trial.TrialState.COMPLETE
    )
)
quantile_80 = np.percentile(values, 80)

fig = optuna.visualization.plot_edf(study_best_features_log)
fig.update_layout(title="", template="plotly_white", font={"size": 14})
fig.add_vline(x=quantile_80, line_dash="dash", line_color="red")
# Large scale factor for a high-quality export.
pio.write_image(fig, "edf_plot_xgb_nested.pdf", format="pdf", scale=10)
fig.show()

In [None]:
# Persist the fitted model.
joblib.dump(
    value=final_model_log,
    filename="../XGBoost/best_models/best_model_log_nested.pkl",
)

In [26]:
# Persist the positional indices of the selected feature columns.
np.save(
    file="../XGBoost/best_models/best_features_log_nested.npy",
    arr=best_features,
)

In [27]:
# Persist the tuned hyper-parameters as JSON.
with open("../XGBoost/best_models/best_params_log_nested.json", mode="w") as params_file:
    json.dump(best_params_log, fp=params_file)

In [28]:
# Also store the model's textual representation for quick inspection.
with open("../XGBoost/best_models/best_model_log_nested_as_txt.txt", mode="w") as repr_file:
    repr_file.write(str(final_model_log))

In [12]:
# Reload previously saved parameters, feature indices and model.
# NOTE(review): these are the files WITHOUT the "_nested" suffix, whereas the
# cells above write "*_nested.*" files - confirm this is intentional, because
# best_params_log is overwritten here.
# NOTE: joblib.load unpickles the file; only load artifacts you trust.
with open("../XGBoost/best_models/best_params_log.json", mode="r") as params_file:
    best_params_log = json.load(fp=params_file)
best_model_log = joblib.load("../XGBoost/best_models/best_model_log.pkl")
best_features_log = np.load("../XGBoost/best_models/best_features_log.npy")

## Na podstawie wykresów zmodyfikujmy siatkę hiperparametrów

In [15]:
# Restrict train and test features to the reloaded column indices.
X_train_selected, X_test_selected = (
    frame.iloc[:, best_features_log] for frame in (X_train_data, X_test_data)
)

In [29]:
def objective_log_improve(trial, data):
    """Optuna objective with a narrowed search grid: mean rolling-origin log-loss.

    Same procedure as ``objective_log``, but ``learning_rate`` is sampled from
    [0.01, 0.04], ``subsample`` from [0.8, 1.0] and ``gamma`` from [3.5, 5].

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the XGBoost hyper-parameters.
    data : pandas.DataFrame
        Frame containing the feature columns plus ``"Date"`` and ``"target"``;
        it is split by ``rolling_origin_split``.

    Returns
    -------
    float
        Mean log-loss of the positive-class probabilities across all folds.

    Raises
    ------
    ValueError
        If ``rolling_origin_split`` yields no folds. Previously the mean of an
        empty list silently produced NaN.
    """
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.04, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 3.5, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
    }

    splits = rolling_origin_split(data)
    if not splits:
        raise ValueError("rolling_origin_split produced no folds; cannot score the trial.")

    # Same fixed seed as objective_log, so both grids are scored under the
    # same row/column subsampling randomness.
    model = xgb.XGBClassifier(**params, random_state=42)

    log_loss_scores = []

    for training_data, val_data in splits:
        training_data = training_data.sort_values("Date")
        val_data = val_data.sort_values("Date")

        X_train = training_data.drop(columns=["target", "Date"])
        y_train = training_data["target"]
        X_val = val_data.drop(columns=["target", "Date"])
        y_val = val_data["target"]

        # No eval_set: without early stopping the per-round validation scores
        # were computed and then discarded, which only slowed every trial down.
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_val)[:, 1]
        log_loss_scores.append(log_loss(y_val, y_pred_proba))

    return np.mean(log_loss_scores)


In [None]:
# Random-search study over the narrowed grid, using the reloaded feature subset.
train_data_selected = train_data[["Date", "target", *X_train_selected.columns]]


def wrapped_objective(trial):
    return objective_log_improve(trial, train_data_selected)


study_log_improve = optuna.create_study(
    sampler=optuna.samplers.RandomSampler(seed=42),
    direction="minimize",
)
study_log_improve.optimize(wrapped_objective, timeout=3600, n_trials=200)
best_params_log_improve = study_log_improve.best_params
print("Best parameters:", best_params_log_improve)