In [20]:
import pandas as pd
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np
import shap
import matplotlib.pyplot as plt
import joblib

In [2]:
import warnings
warnings.resetwarnings()
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("default", ConvergenceWarning)

In [3]:
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")

In [4]:
X=matches.drop(columns=['target','player1_bet_odds','player2_bet_odds'])
y=matches['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

### wstępna selekcja za pomocą shapa

In [6]:
def feature_selection_with_shap(X, y, percent):
    model = xgb.XGBClassifier( eval_metric="mlogloss")
    model.fit(X, y)

    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)

    # Obliczenie średniej absolutnej ważności cech
    feature_importance = np.abs(shap_values.values).mean(axis=0)

    # Wybór najważniejszych cech 
    importance_threshold = np.percentile(feature_importance, 100-percent)  
    selected_features = np.where(feature_importance > importance_threshold)[0]

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} with top {percent}%.")
    return X.iloc[:, selected_features], selected_features
    

In [7]:
def objective_log(trial, X_train, y_train):
    # Proponowane hiperparametry przez Optunę
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
    }
    model = xgb.XGBClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss")

    # Negatywna wartość accuracy (Optuna minimalizuje, więc -accuracy)
    return -scores.mean()

### optymalizacja i szukanie najlepszego zbioru cech

In [9]:
percentages = [ 50, 65, 75, 85, 100]  
best_logloss = float("inf")
best_features = None
best_params = None
best_num_features = 0

for percent in percentages:
    print(f"Testing top {percent}% features...")
    X_train_selected, selected_features = feature_selection_with_shap(X_train, y_train, percent)
    X_test_selected = X_test.iloc[:, selected_features]

    # Optuna optymalizacja dla aktualnego podzbioru cech
    def wrapped_objective(trial):
        return objective_log(trial, X_train_selected, y_train)

    study_logloss = optuna.create_study(direction="minimize",sampler=optuna.samplers.RandomSampler(seed=42))
    study_logloss.optimize(wrapped_objective, n_trials=200, timeout=3600)

    # Sprawdzenie wyników
    if study_logloss.best_value < best_logloss:
        best_logloss = study_logloss.best_value
        best_features = selected_features
        best_params = study_logloss.best_params
        best_num_features = len(selected_features)

print(f"Best logloss: {best_logloss}")
print(f"Best number of features: {best_num_features}")
print(f"Best parameters: {best_params}")

Testing top 50% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 40 features out of 80 with top 50%.


[I 2024-12-20 22:52:01,047] Trial 0 finished with value: 0.7631547987953355 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7631547987953355.
[I 2024-12-20 22:52:21,088] Trial 1 finished with value: 0.6377235074508654 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6377235074508654.
[I 2024-12-20 22:52:27,513] Trial 2 finished with value: 0.6419049988778118 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.569746930326

Testing top 65% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 52 features out of 80 with top 65%.


[I 2024-12-20 23:07:12,048] Trial 0 finished with value: 0.7703403499514847 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7703403499514847.
[I 2024-12-20 23:07:36,390] Trial 1 finished with value: 0.6382833688666367 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6382833688666367.
[I 2024-12-20 23:07:44,064] Trial 2 finished with value: 0.638768597079556 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.5697469303260

Testing top 75% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 60 features out of 80 with top 75%.


[I 2024-12-20 23:24:43,706] Trial 0 finished with value: 0.7656278788592351 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7656278788592351.
[I 2024-12-20 23:25:08,467] Trial 1 finished with value: 0.6381890712567143 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6381890712567143.
[I 2024-12-20 23:25:16,404] Trial 2 finished with value: 0.6394702338865937 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.569746930326

Testing top 85% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 68 features out of 80 with top 85%.


[I 2024-12-20 23:43:41,900] Trial 0 finished with value: 0.7652958576467757 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7652958576467757.
[I 2024-12-20 23:44:09,014] Trial 1 finished with value: 0.6356429757832268 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6356429757832268.
[I 2024-12-20 23:44:18,032] Trial 2 finished with value: 0.6394782582567653 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.569746930326

Testing top 100% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 79 features out of 80 with top 100%.


[I 2024-12-21 00:04:11,881] Trial 0 finished with value: 0.7571818003413139 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7571818003413139.
[I 2024-12-21 00:04:43,546] Trial 1 finished with value: 0.636386904351156 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.636386904351156.
[I 2024-12-21 00:04:53,942] Trial 2 finished with value: 0.6391855851542606 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'subsample': 0.56974693032602

Best logloss: 0.6215267525672743
Best number of features: 60
Best parameters: {'max_depth': 4, 'learning_rate': 0.01727212894967222, 'n_estimators': 268, 'subsample': 0.6425475843469235, 'colsample_bytree': 0.5866867976473774, 'gamma': 4.483827123132126, 'min_child_weight': 1, 'lambda': 0.125327263862507, 'alpha': 0.04381141781653137}


In [10]:
final_model = xgb.XGBClassifier(**best_params)
X_train_selected = X_train.iloc[:, best_features]
X_test_selected = X_test.iloc[:, best_features]
final_model.fit(X_train_selected, y_train)

# Predykcje na zbiorze testowym
y_pred = final_model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print("Final model accuracy on test data:", accuracy)


Final model accuracy on test data: 0.6544638639584317


## tutaj odpalam study dla najlepszych cech, żeby wykresy dostać

In [11]:
def wrapped_objective(trial):
    return objective_log(trial, X_train_selected, y_train)
study_best_features_log = optuna.create_study(direction="minimize", sampler=optuna.samplers.RandomSampler(seed=42))
study_best_features_log.optimize(wrapped_objective, n_trials=200, timeout=3600)
best_params_log = study_best_features_log.best_params
print("Best parameters:", best_params_log)

[I 2024-12-21 01:08:54,273] A new study created in memory with name: no-name-a5aec8c4-86fc-46c6-b84a-a3295a6dbac1
[I 2024-12-21 01:08:59,982] Trial 0 finished with value: 0.7656278788592351 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'min_child_weight': 1, 'lambda': 2.9154431891537547, 'alpha': 0.2537815508265665}. Best is trial 0 with value: 0.7656278788592351.
[I 2024-12-21 01:09:32,588] Trial 1 finished with value: 0.6381890712567143 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.6061695553391381, 'gamma': 0.9091248360355031, 'min_child_weight': 2, 'lambda': 0.016480446427978974, 'alpha': 0.12561043700013558}. Best is trial 1 with value: 0.6381890712567143.
[I 2024-12-21 01:09:44,144] Trial 2 finished with value: 0.6394702338865937 and p

Best parameters: {'max_depth': 4, 'learning_rate': 0.01727212894967222, 'n_estimators': 268, 'subsample': 0.6425475843469235, 'colsample_bytree': 0.5866867976473774, 'gamma': 4.483827123132126, 'min_child_weight': 1, 'lambda': 0.125327263862507, 'alpha': 0.04381141781653137}


In [None]:
X_train_selected = X_train.iloc[:, best_features]
X_test_selected = X_test.iloc[:, best_features]
final_model_log = xgb.XGBClassifier(**best_params_log)
final_model_log.fit(X_train_selected, y_train)
y_pred_log = final_model_log.predict(X_test_selected)
accuracy_log = accuracy_score(y_test, y_pred_log)
print("Final model accuracy on test data:", accuracy_log)

Final model accuracy on test data: 0.6544638639584317


## wizualizacja optymalizacji, szukanie zaleznosci jaka siatka i jakie hiperparametry modyfikowac

In [13]:
optuna.visualization.plot_optimization_history(study_best_features_log)

In [14]:
optuna.visualization.plot_param_importances(study_best_features_log)

In [20]:
optuna.visualization.plot_contour(study_best_features_log, params=["max_depth", "learning_rate"])

In [15]:
optuna.visualization.plot_slice(study_best_features_log)

In [16]:
optuna.visualization.plot_rank(study_best_features_log)

In [17]:
optuna.visualization.plot_edf(study_best_features_log)

In [18]:
optuna.visualization.plot_terminator_improvement(study_best_features_log)


plot_terminator_improvement is experimental (supported from v3.2.0). The interface can change in the future.


RegretBoundEvaluator is experimental (supported from v3.2.0). The interface can change in the future.


CrossValidationErrorEvaluator is experimental (supported from v3.2.0). The interface can change in the future.

100%|██████████| 200/200 [00:12<00:00, 16.25it/s]


In [19]:
optuna.visualization.plot_parallel_coordinate(study_best_features_log)

In [26]:
joblib.dump(final_model_log, "../XGBoost/best_models/best_model_log.pkl")

['../XGBoost/best_models/best_model_log.pkl']

In [27]:
np.save("../XGBoost/best_models/best_features_log.npy", best_features)

In [29]:
import json
with open("../XGBoost/best_models/best_params_log.json", "w") as f:
    json.dump(best_params_log, f)

In [33]:
with open("../XGBoost/best_models/best_model_log_as_txt.txt", "w") as f:
    f.write(str(final_model_log))

In [None]:
with open("../XGBoost/best_models/best_params_log.json", "r") as f:
    best_params_log = json.load(f)
best_features_log = np.load("../XGBoost/best_models/best_features_log.npy")
best_model_log = joblib.load("../XGBoost/best_models/best_model_log.pkl")

## weźmy po duwagę najważniejsze hiperparametry i zmodyfikujmy przedział po analizie wykresów

In [37]:
def objective_4(trial, X_train, y_train):
    # Proponowane hiperparametry przez Optunę
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
    }
    model = xgb.XGBClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")

    # Negatywna wartość accuracy (Optuna minimalizuje, więc -accuracy)
    return scores.mean()

In [38]:
def wrapped_objective_4(trial):
    return objective_4(trial, X_train, y_train)
study_4 = optuna.create_study(direction="maximize", sampler=optuna.samplers.RandomSampler(seed=42))
study_4.optimize(wrapped_objective_4, n_trials=200, timeout=3600)
best_params_4 = study_4.best_params
print("Best parameters:", best_params_4)

[I 2024-12-16 15:41:18,527] A new study created in memory with name: no-name-8c253457-b078-44a3-86cb-f8da6c3fadcf
[I 2024-12-16 15:41:20,073] Trial 0 finished with value: 0.6482728949930815 and parameters: {'learning_rate': 0.018272261776066247, 'subsample': 0.9753571532049581, 'gamma': 3.6599697090570253, 'lambda': 0.24810409748678125, 'alpha': 0.004207988669606638}. Best is trial 0 with value: 0.6482728949930815.
[I 2024-12-16 15:41:21,395] Trial 1 finished with value: 0.6479695075078153 and parameters: {'learning_rate': 0.012853916978930137, 'subsample': 0.5290418060840998, 'gamma': 4.330880728874676, 'lambda': 0.2537815508265665, 'alpha': 0.679657809075816}. Best is trial 0 with value: 0.6482728949930815.
[I 2024-12-16 15:41:22,882] Trial 2 finished with value: 0.6436163583252191 and parameters: {'learning_rate': 0.010336843570697411, 'subsample': 0.9849549260809971, 'gamma': 4.162213204002109, 'lambda': 0.0070689749506246055, 'alpha': 0.005337032762603957}. Best is trial 0 with va

Best parameters: {'learning_rate': 0.020820463933005757, 'subsample': 0.8100662989007683, 'gamma': 1.3869059149056633, 'lambda': 0.005655677524618509, 'alpha': 0.07158031739371042}


## co ciekawe wynik sie minimalnie pogorszyl, w kazdym razie nie powiedzialem jeszcze ostatniego słowa w tej kwestii, ale na ten moment skaczę na inny kwiatek

In [39]:
final_model_4 = xgb.XGBClassifier(**best_params_4)
final_model_4.fit(X_train, y_train)
y_pred_4 = final_model_4.predict(X_test)
accuracy_4 = accuracy_score(y_test, y_pred_4)
print("Final model accuracy on test data:", accuracy_4)

Final model accuracy on test data: 0.6532829475673122


In [35]:
optuna.visualization.plot_param_importances(study_4)

In [40]:
optuna.visualization.plot_slice(study_4)