In [1]:
import pandas as pd
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np
import shap
import matplotlib.pyplot as plt


In [2]:
import warnings
warnings.resetwarnings()
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("default", ConvergenceWarning)

In [3]:
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")

In [4]:
X=matches.drop(columns=['target'])
y=matches['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

### wstępna selekcja za pomocą shapa

In [8]:
def feature_selection_with_shap(X, y, percent):
    model = xgb.XGBClassifier( eval_metric="mlogloss")
    model.fit(X, y)

    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)

    # Obliczenie średniej absolutnej ważności cech
    feature_importance = np.abs(shap_values.values).mean(axis=0)

    # Wybór najważniejszych cech 
    importance_threshold = np.percentile(feature_importance, 100-percent)  
    selected_features = np.where(feature_importance > importance_threshold)[0]

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} with top {percent}%.")
    return X.iloc[:, selected_features], selected_features
    

In [6]:
def objective(trial, X_train, y_train):
    # Proponowane hiperparametry przez Optunę
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
    }
    model = xgb.XGBClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")

    # Negatywna wartość accuracy (Optuna minimalizuje, więc -accuracy)
    return scores.mean()

### odpalałem model na różnych ilosciach cech w zaleznosci od shapa, top 25%, 35% itd. finalnie okazuje sie ze pozbycie sie jakichkolwiek cech pogarsza wyłącznie wynik (ewentualnie mozna dla 38 najlepszych zrobic, tam jest drobny potencjal, na pewno do tego wrócę), zostawiam kod i wyniki dla potomnych ale nie ma potrzeby odpalać poniższej komorki, liczy sie okolo 1.5h. 

In [10]:
percentages = range(25, 101, 10)  # Testowanie od 25% do 100% co 10%
best_accuracy = 0
best_features = None
best_params = None
best_num_features = 0

for percent in percentages:
    print(f"Testing top {percent}% features...")
    X_train_selected, selected_features = feature_selection_with_shap(X_train, y_train, percent)
    X_test_selected = X_test.iloc[:, selected_features]

    # Optuna optymalizacja dla aktualnego podzbioru cech
    def wrapped_objective(trial):
        return objective(trial, X_train_selected, y_train)

    study_accuracy = optuna.create_study(direction="maximize")
    study_accuracy.optimize(wrapped_objective, n_trials=200, timeout=3600)

    # Sprawdzenie wyników
    if study_accuracy.best_value > best_accuracy:
        best_accuracy = study_accuracy.best_value
        best_features = selected_features
        best_params = study_accuracy.best_params
        best_num_features = len(selected_features)

print(f"Best accuracy: {best_accuracy}")
print(f"Best number of features: {best_num_features}")
print(f"Best parameters: {best_params}")

Testing top 25% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 10 features out of 41 with top 25%.


[I 2024-12-15 13:12:03,652] Trial 0 finished with value: 0.6230630348998105 and parameters: {'max_depth': 8, 'learning_rate': 0.03664728817211668, 'n_estimators': 474, 'subsample': 0.5607248574836226, 'colsample_bytree': 0.6212412220152462, 'gamma': 0.2032134040268474, 'min_child_weight': 4, 'lambda': 0.004853031264314473, 'alpha': 0.0021232956198438675}. Best is trial 0 with value: 0.6230630348998105.
[I 2024-12-15 13:12:05,713] Trial 1 finished with value: 0.6231634295085328 and parameters: {'max_depth': 8, 'learning_rate': 0.06049113175367161, 'n_estimators': 376, 'subsample': 0.803325686415635, 'colsample_bytree': 0.9876556897066286, 'gamma': 0.961522330935366, 'min_child_weight': 4, 'lambda': 0.0031162437414247505, 'alpha': 0.12776242140354205}. Best is trial 1 with value: 0.6231634295085328.
[I 2024-12-15 13:12:07,063] Trial 2 finished with value: 0.6437176241480038 and parameters: {'max_depth': 5, 'learning_rate': 0.03036633160252253, 'n_estimators': 443, 'subsample': 0.57580597

Testing top 35% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 14 features out of 41 with top 35%.


[I 2024-12-15 13:20:09,304] Trial 0 finished with value: 0.6246841080305437 and parameters: {'max_depth': 6, 'learning_rate': 0.08908176411838248, 'n_estimators': 257, 'subsample': 0.5695639247534618, 'colsample_bytree': 0.9556493072415648, 'gamma': 1.0520879308040172, 'min_child_weight': 9, 'lambda': 0.07399580287458549, 'alpha': 0.002727643487380904}. Best is trial 0 with value: 0.6246841080305437.
[I 2024-12-15 13:20:14,201] Trial 1 finished with value: 0.6369327115256496 and parameters: {'max_depth': 6, 'learning_rate': 0.032363805754738635, 'n_estimators': 555, 'subsample': 0.6392611772680534, 'colsample_bytree': 0.5293290488450109, 'gamma': 0.9284903321289173, 'min_child_weight': 9, 'lambda': 9.064574552670253, 'alpha': 0.02999226682513022}. Best is trial 1 with value: 0.6369327115256496.
[I 2024-12-15 13:20:17,468] Trial 2 finished with value: 0.6380477117818889 and parameters: {'max_depth': 8, 'learning_rate': 0.12072084689720877, 'n_estimators': 957, 'subsample': 0.72594135247

Testing top 45% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 18 features out of 41 with top 45%.


[I 2024-12-15 13:27:11,413] Trial 0 finished with value: 0.633593604263824 and parameters: {'max_depth': 5, 'learning_rate': 0.02890778068856871, 'n_estimators': 762, 'subsample': 0.852135413885464, 'colsample_bytree': 0.742343278966547, 'gamma': 0.8657749948711935, 'min_child_weight': 10, 'lambda': 0.18764461031967097, 'alpha': 0.14122721988320328}. Best is trial 0 with value: 0.633593604263824.
[I 2024-12-15 13:27:14,713] Trial 1 finished with value: 0.6440216778557885 and parameters: {'max_depth': 5, 'learning_rate': 0.03368976949703383, 'n_estimators': 251, 'subsample': 0.6320957132092451, 'colsample_bytree': 0.8671178009094878, 'gamma': 2.246277542117638, 'min_child_weight': 8, 'lambda': 0.0011689835092461782, 'alpha': 0.04935626463132151}. Best is trial 1 with value: 0.6440216778557885.
[I 2024-12-15 13:27:16,273] Trial 2 finished with value: 0.6497915748475376 and parameters: {'max_depth': 4, 'learning_rate': 0.07119514358367802, 'n_estimators': 123, 'subsample': 0.6730395210065

Testing top 55% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 22 features out of 41 with top 55%.


[I 2024-12-15 13:39:20,357] Trial 0 finished with value: 0.6098998616307078 and parameters: {'max_depth': 7, 'learning_rate': 0.14679871021262533, 'n_estimators': 574, 'subsample': 0.5558746805727783, 'colsample_bytree': 0.8400842233495878, 'gamma': 0.5076871956699935, 'min_child_weight': 2, 'lambda': 0.0013250697490239695, 'alpha': 1.555947978180067}. Best is trial 0 with value: 0.6098998616307078.
[I 2024-12-15 13:39:25,761] Trial 1 finished with value: 0.6270127094757341 and parameters: {'max_depth': 10, 'learning_rate': 0.20810432243184873, 'n_estimators': 608, 'subsample': 0.5758995174425615, 'colsample_bytree': 0.9548334917992773, 'gamma': 4.506112040386524, 'min_child_weight': 3, 'lambda': 3.873983737104779, 'alpha': 0.09743696303123343}. Best is trial 1 with value: 0.6270127094757341.
[I 2024-12-15 13:39:30,413] Trial 2 finished with value: 0.6460453031312459 and parameters: {'max_depth': 7, 'learning_rate': 0.025414323514121387, 'n_estimators': 286, 'subsample': 0.721708023670

Testing top 65% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 26 features out of 41 with top 65%.


[I 2024-12-15 13:58:01,481] Trial 0 finished with value: 0.6469567980320811 and parameters: {'max_depth': 5, 'learning_rate': 0.02099841385463229, 'n_estimators': 566, 'subsample': 0.5581821100368944, 'colsample_bytree': 0.8686843045118466, 'gamma': 3.197569674823834, 'min_child_weight': 7, 'lambda': 0.00831437684855189, 'alpha': 8.939818648004952}. Best is trial 0 with value: 0.6469567980320811.
[I 2024-12-15 13:58:07,047] Trial 1 finished with value: 0.6338962230308 and parameters: {'max_depth': 10, 'learning_rate': 0.05062168014155483, 'n_estimators': 974, 'subsample': 0.5466492382376021, 'colsample_bytree': 0.5887646509488988, 'gamma': 2.943473454416279, 'min_child_weight': 4, 'lambda': 0.023837266351724794, 'alpha': 0.34851384660840967}. Best is trial 0 with value: 0.6469567980320811.
[I 2024-12-15 13:58:12,299] Trial 2 finished with value: 0.6480706195869421 and parameters: {'max_depth': 9, 'learning_rate': 0.019763471261769115, 'n_estimators': 838, 'subsample': 0.701686856528873

Testing top 75% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 30 features out of 41 with top 75%.


[I 2024-12-15 14:13:58,834] Trial 0 finished with value: 0.6442215958591708 and parameters: {'max_depth': 7, 'learning_rate': 0.09081752287104089, 'n_estimators': 928, 'subsample': 0.6320685351352547, 'colsample_bytree': 0.9120204489671411, 'gamma': 4.798231973813269, 'min_child_weight': 9, 'lambda': 0.19363276044276814, 'alpha': 3.6560730595226985}. Best is trial 0 with value: 0.6442215958591708.
[I 2024-12-15 14:14:02,963] Trial 1 finished with value: 0.6451336544867525 and parameters: {'max_depth': 6, 'learning_rate': 0.11893927260818588, 'n_estimators': 559, 'subsample': 0.8571762027969647, 'colsample_bytree': 0.6719555035005157, 'gamma': 4.449886776191372, 'min_child_weight': 3, 'lambda': 0.02274195853184917, 'alpha': 0.08652145978022699}. Best is trial 1 with value: 0.6451336544867525.
[I 2024-12-15 14:14:18,224] Trial 2 finished with value: 0.6378444114180291 and parameters: {'max_depth': 9, 'learning_rate': 0.017536834536121187, 'n_estimators': 382, 'subsample': 0.8144985932389

Testing top 85% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 34 features out of 41 with top 85%.


[I 2024-12-15 14:29:24,842] Trial 0 finished with value: 0.6267077333060012 and parameters: {'max_depth': 6, 'learning_rate': 0.05701100456491693, 'n_estimators': 504, 'subsample': 0.5329711841113464, 'colsample_bytree': 0.9995644029482085, 'gamma': 0.9284180377031759, 'min_child_weight': 10, 'lambda': 0.8745440880187977, 'alpha': 0.1654078837429343}. Best is trial 0 with value: 0.6267077333060012.
[I 2024-12-15 14:29:25,423] Trial 1 finished with value: 0.6456407010710807 and parameters: {'max_depth': 6, 'learning_rate': 0.14643025478835606, 'n_estimators': 111, 'subsample': 0.9799314504188633, 'colsample_bytree': 0.5564957109354765, 'gamma': 3.5243944342349796, 'min_child_weight': 9, 'lambda': 0.9946213061881567, 'alpha': 0.04116815881384766}. Best is trial 1 with value: 0.6456407010710807.
[I 2024-12-15 14:29:28,409] Trial 2 finished with value: 0.6495885307231076 and parameters: {'max_depth': 10, 'learning_rate': 0.02718995241769843, 'n_estimators': 339, 'subsample': 0.879515668256

Testing top 95% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Selected 38 features out of 41 with top 95%.


[I 2024-12-15 14:38:43,652] Trial 0 finished with value: 0.6494871624045508 and parameters: {'max_depth': 4, 'learning_rate': 0.05381017082946398, 'n_estimators': 771, 'subsample': 0.8445235358394194, 'colsample_bytree': 0.7219548686769526, 'gamma': 4.935919772236389, 'min_child_weight': 8, 'lambda': 2.86222880634823, 'alpha': 0.17542351586403582}. Best is trial 0 with value: 0.6494871624045508.
[I 2024-12-15 14:38:47,578] Trial 1 finished with value: 0.6462473735458413 and parameters: {'max_depth': 8, 'learning_rate': 0.02879573310010986, 'n_estimators': 109, 'subsample': 0.9374201193373993, 'colsample_bytree': 0.8162349293718598, 'gamma': 0.3704102843444357, 'min_child_weight': 8, 'lambda': 0.5299931110276461, 'alpha': 0.41947496485663055}. Best is trial 0 with value: 0.6494871624045508.
[I 2024-12-15 14:38:51,313] Trial 2 finished with value: 0.6450321324245375 and parameters: {'max_depth': 3, 'learning_rate': 0.026732276433213468, 'n_estimators': 747, 'subsample': 0.849937866854161

Best accuracy: 0.6560680059447548
Best number of features: 38
Best parameters: {'max_depth': 7, 'learning_rate': 0.011932360252505183, 'n_estimators': 592, 'subsample': 0.7395714060268065, 'colsample_bytree': 0.5904044687299073, 'gamma': 4.408009054249332, 'min_child_weight': 5, 'lambda': 7.4370725049490565, 'alpha': 0.01485280025650714}


In [11]:
final_model = xgb.XGBClassifier(**best_params)
X_train_selected = X_train.iloc[:, best_features]
X_test_selected = X_test.iloc[:, best_features]
final_model.fit(X_train_selected, y_train)

# Predykcje na zbiorze testowym
y_pred = final_model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print("Final model accuracy on test data:", accuracy)

Final model accuracy on test data: 0.6556447803495512


## tutaj dla wszystkich cech odpalam model 

In [14]:
def wrapped_objective_3(trial):
    return objective(trial, X_train, y_train)
study_3 = optuna.create_study(direction="maximize", sampler=optuna.samplers.RandomSampler(seed=30))
study_3.optimize(wrapped_objective_3, n_trials=200, timeout=3600)
best_params_3 = study_3.best_params
print("Best parameters:", best_params_3)

[I 2024-12-16 14:34:08,886] A new study created in memory with name: no-name-c238b332-49ea-4540-922c-b579d54449cd
[I 2024-12-16 14:34:21,218] Trial 0 finished with value: 0.6252891405729514 and parameters: {'max_depth': 8, 'learning_rate': 0.036509987708738215, 'n_estimators': 697, 'subsample': 0.5818253630513767, 'colsample_bytree': 0.9813039068371594, 'gamma': 1.7333092018988283, 'min_child_weight': 10, 'lambda': 0.008714281448909188, 'alpha': 0.22017959781029373}. Best is trial 0 with value: 0.6252891405729514.
[I 2024-12-16 14:34:24,686] Trial 1 finished with value: 0.649487623635525 and parameters: {'max_depth': 6, 'learning_rate': 0.015894051382520313, 'n_estimators': 590, 'subsample': 0.7590881734129227, 'colsample_bytree': 0.8834275531492527, 'gamma': 4.669250716592899, 'min_child_weight': 1, 'lambda': 0.0060685512932250624, 'alpha': 9.479266587302769}. Best is trial 1 with value: 0.649487623635525.
[I 2024-12-16 14:34:29,863] Trial 2 finished with value: 0.6435141187925998 and

Best parameters: {'max_depth': 8, 'learning_rate': 0.018775734274607978, 'n_estimators': 605, 'subsample': 0.7257772113161631, 'colsample_bytree': 0.7145110535468082, 'gamma': 4.935012552117339, 'min_child_weight': 2, 'lambda': 0.30830821673396996, 'alpha': 1.0656369443726696}


In [15]:
final_model_3 = xgb.XGBClassifier(**best_params_3)
final_model_3.fit(X_train, y_train)
y_pred_3 = final_model_3.predict(X_test)
accuracy_3 = accuracy_score(y_test, y_pred_3)
print("Final model accuracy on test data:", accuracy_3)

Final model accuracy on test data: 0.6547000472366556


## wizualizacja optymalizacji, szukanie zaleznosci jaka siatka i jakie hiperparametry modyfikowac

In [16]:
optuna.visualization.plot_optimization_history(study_3)

In [17]:
optuna.visualization.plot_param_importances(study_3)

In [20]:
optuna.visualization.plot_contour(study_3, params=["max_depth", "learning_rate"])

In [23]:
optuna.visualization.plot_slice(study_3)

In [25]:
optuna.visualization.plot_rank(study_3)

In [26]:
optuna.visualization.plot_edf(study_3)

In [28]:
optuna.visualization.plot_terminator_improvement(study_3)


plot_terminator_improvement is experimental (supported from v3.2.0). The interface can change in the future.


RegretBoundEvaluator is experimental (supported from v3.2.0). The interface can change in the future.


CrossValidationErrorEvaluator is experimental (supported from v3.2.0). The interface can change in the future.

100%|██████████| 200/200 [00:23<00:00,  8.45it/s]


In [30]:
optuna.visualization.plot_parallel_coordinate(study_3)

## weźmy po duwagę najważniejsze hiperparametry i zmodyfikujmy przedział po analizie wykresów

In [37]:
def objective_4(trial, X_train, y_train):
    # Proponowane hiperparametry przez Optunę
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
    }
    model = xgb.XGBClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")

    # Negatywna wartość accuracy (Optuna minimalizuje, więc -accuracy)
    return scores.mean()

In [38]:
def wrapped_objective_4(trial):
    return objective_4(trial, X_train, y_train)
study_4 = optuna.create_study(direction="maximize", sampler=optuna.samplers.RandomSampler(seed=42))
study_4.optimize(wrapped_objective_4, n_trials=200, timeout=3600)
best_params_4 = study_4.best_params
print("Best parameters:", best_params_4)

[I 2024-12-16 15:41:18,527] A new study created in memory with name: no-name-8c253457-b078-44a3-86cb-f8da6c3fadcf
[I 2024-12-16 15:41:20,073] Trial 0 finished with value: 0.6482728949930815 and parameters: {'learning_rate': 0.018272261776066247, 'subsample': 0.9753571532049581, 'gamma': 3.6599697090570253, 'lambda': 0.24810409748678125, 'alpha': 0.004207988669606638}. Best is trial 0 with value: 0.6482728949930815.
[I 2024-12-16 15:41:21,395] Trial 1 finished with value: 0.6479695075078153 and parameters: {'learning_rate': 0.012853916978930137, 'subsample': 0.5290418060840998, 'gamma': 4.330880728874676, 'lambda': 0.2537815508265665, 'alpha': 0.679657809075816}. Best is trial 0 with value: 0.6482728949930815.
[I 2024-12-16 15:41:22,882] Trial 2 finished with value: 0.6436163583252191 and parameters: {'learning_rate': 0.010336843570697411, 'subsample': 0.9849549260809971, 'gamma': 4.162213204002109, 'lambda': 0.0070689749506246055, 'alpha': 0.005337032762603957}. Best is trial 0 with va

Best parameters: {'learning_rate': 0.020820463933005757, 'subsample': 0.8100662989007683, 'gamma': 1.3869059149056633, 'lambda': 0.005655677524618509, 'alpha': 0.07158031739371042}


## co ciekawe wynik sie minimalnie pogorszyl, w kazdym razie nie powiedzialem jeszcze ostatniego słowa w tej kwestii, ale na ten moment skaczę na inny kwiatek

In [39]:
final_model_4 = xgb.XGBClassifier(**best_params_4)
final_model_4.fit(X_train, y_train)
y_pred_4 = final_model_4.predict(X_test)
accuracy_4 = accuracy_score(y_test, y_pred_4)
print("Final model accuracy on test data:", accuracy_4)

Final model accuracy on test data: 0.6532829475673122


In [35]:
optuna.visualization.plot_param_importances(study_4)

In [40]:
optuna.visualization.plot_slice(study_4)