In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from scipy.stats import kstest
import shap
import joblib
import json
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from sklearn.metrics import accuracy_score

In [2]:
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")

In [3]:
matches['Date'] = pd.to_datetime(matches['Date'])
print(matches['Date'].head())
print(matches['Date'].dtype)

0   2017-12-31
1   2017-12-31
2   2018-01-01
3   2018-01-01
4   2018-01-01
Name: Date, dtype: datetime64[ns]
datetime64[ns]


In [4]:
matches=matches.drop(columns=['player1_bet_odds','player2_bet_odds',"w_ace_avg", "l_ace_avg", "w_CO_ace_avg", "l_CO_ace_avg","w_df_avg", "l_df_avg", "w_CO_df_avg", "l_CO_df_avg", "w_2ndIn_avg","l_2ndIn_avg","w_CO_2ndIn_avg", "l_CO_2ndIn_avg"])

In [5]:
test_data = matches[matches["Date"].dt.year == 2023]
train_data = matches[matches["Date"].dt.year != 2023]

In [8]:
X_train_data = train_data.drop(columns=['target','Date', 'match_id'])
y_train_data = train_data[['target', 'Date']]

In [9]:
X_test_data  = test_data.drop(columns=['target','Date', 'match_id'])
y_test_data= test_data['target']

In [15]:
def scale_features(X):
    scaled_X = X.copy()
    for col in X.columns:
        if kstest(X[col], 'norm').pvalue > 0.05:
            # Normal distribution: StandardScaler
            scaler = StandardScaler()
        else:
            # Non-normal distribution: MinMaxScaler
            scaler = MinMaxScaler()
        scaled_X[col] = scaler.fit_transform(X[[col]])
    return scaled_X

In [12]:
X_train_scaled = scale_features(X_train_data)
X_test_scaled = scale_features(X_test_data)  

In [16]:
def rolling_origin_split(data):
    data = data.sort_values(by="Date")

    
    splits = []
    for year in range(2018, 2022):
        training_data = data[data["Date"].dt.year <= year]
        val_data = data[data["Date"].dt.year == year + 1]

        if not val_data.empty:
            splits.append((training_data, val_data))

    return splits

In [22]:
def feature_selection_with_shap(X, y, percent):
    if "Date" in X.columns:
        X = X.drop(columns=["Date"])
    
    model=LogisticRegression(max_iter=1000)
    model.fit(X, y)

    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)

    # Obliczenie średniej absolutnej ważności cech
    feature_importance = np.abs(shap_values.values).mean(axis=0)

    # Wybór najważniejszych cech 
    importance_threshold = np.percentile(feature_importance, 100-percent)  
    selected_features = np.where(feature_importance > importance_threshold)[0]

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} with top {percent}%.")
    return X.iloc[:, selected_features], selected_features

In [26]:
def objective_logistic_nested(trial, data):
    solver = trial.suggest_categorical("solver", ["liblinear", "saga", "lbfgs"])
    penalty = trial.suggest_categorical("penalty", ["l1", "l2", None])
    C = trial.suggest_float("C", 1e-4, 1e4, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
    valid_combinations = {
        "lbfgs": ["l2", None],
        "liblinear": ["l1", "l2"],
        "saga": ["l1", "l2", None]
    }
    
    # Skip invalid combinations
    if penalty not in valid_combinations[solver]:
        raise optuna.exceptions.TrialPruned()
    
    splits = rolling_origin_split(data)
    model=LogisticRegression(
        solver=solver,
        penalty=penalty,
        C=C,
        class_weight=class_weight,
        max_iter=1000,
        random_state=42
    )
    log_loss_scores = []

    for training_data, val_data in splits:
        # Ensure proper sorting to avoid leakage
        training_data = training_data.sort_values("Date")
        val_data = val_data.sort_values("Date")

        # Separate features and target
        X_train = training_data.drop(columns=["target", "Date"])
        y_train = training_data["target"]
        X_val = val_data.drop(columns=["target", "Date"])
        y_val = val_data["target"]

        # Train the model on the training set
        model.fit(X_train, y_train)

        # Predict probabilities for the validation set
        y_pred_proba = model.predict_proba(X_val)[:, 1]

        # Calculate log loss for the validation set
        log_loss_scores.append(log_loss(y_val, y_pred_proba))

    # Return the mean log loss as the objective to minimize
    return np.mean(log_loss_scores)

In [27]:
percentages = [ 50, 65, 75, 85, 100]  
best_logloss = float("inf")
best_features = None
best_params = None
best_num_features = 0
for percent in percentages:
    X_train_selected, selected_features = feature_selection_with_shap(X_train_scaled, y_train_data['target'], percent)
    train_data_selected = pd.concat([X_train_selected, y_train_data], axis=1)

    def wrapped_objective(trial):
        return objective_logistic_nested(trial, train_data_selected)
    
    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.RandomSampler(seed=42))
    study.optimize(wrapped_objective, n_trials=200, timeout=3600)

    if study.best_value < best_logloss:
        best_logloss = study.best_value
        best_features = selected_features
        best_params = study.best_params
        best_num_features = len(selected_features)
print(f"Best logloss: {best_logloss}")
print(f"Best number of features: {best_num_features}")
print(f"Best parameters: {best_params}")

[I 2025-01-04 18:46:04,740] A new study created in memory with name: no-name-db8f1ce4-484e-4093-a928-47a09c1e0c1c
[I 2025-01-04 18:46:04,869] Trial 0 finished with value: 0.6955195672920569 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6955195672920569.
[I 2025-01-04 18:46:04,871] Trial 1 pruned. 


Selected 47 features out of 95 with top 50%.


[I 2025-01-04 18:46:05,382] Trial 2 finished with value: 0.617463091022921 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.617463091022921.
[I 2025-01-04 18:46:18,381] Trial 3 finished with value: 0.6180393268894604 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.617463091022921.
[I 2025-01-04 18:46:18,821] Trial 4 finished with value: 0.617463091022921 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.617463091022921.
[I 2025-01-04 18:46:18,823] Trial 5 pruned. 
[I 2025-01-04 18:46:26,026] Trial 6 finished with value: 0.6179286174672516 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.617463091022921.
[I 2025-01-04 18:46:26,438] Trial 7 finished with va

Selected 62 features out of 95 with top 65%.


[I 2025-01-04 18:55:31,227] Trial 2 finished with value: 0.6187916589182622 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6187916589182622.
[I 2025-01-04 18:55:51,246] Trial 3 finished with value: 0.6192035310792675 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6187916589182622.
[I 2025-01-04 18:55:51,714] Trial 4 finished with value: 0.6187916589182622 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6187916589182622.
[I 2025-01-04 18:55:51,715] Trial 5 pruned. 
[I 2025-01-04 18:56:03,426] Trial 6 finished with value: 0.619173216069222 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6187916589182622.
[I 2025-01-04 18:56:03,880] Trial 7 finished wi

Selected 71 features out of 95 with top 75%.


[I 2025-01-04 19:09:47,882] Trial 2 finished with value: 0.6201439249758438 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6201439249758438.
[I 2025-01-04 19:10:11,040] Trial 3 finished with value: 0.6206178908360157 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6201439249758438.
[I 2025-01-04 19:10:11,603] Trial 4 finished with value: 0.6201439249758438 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6201439249758438.
[I 2025-01-04 19:10:11,604] Trial 5 pruned. 
[I 2025-01-04 19:10:25,240] Trial 6 finished with value: 0.6206612470087847 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6201439249758438.
[I 2025-01-04 19:10:25,830] Trial 7 finished w

Selected 80 features out of 95 with top 85%.


[I 2025-01-04 19:26:49,312] Trial 2 finished with value: 0.6228917372937086 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6228917372937086.
[I 2025-01-04 19:27:18,207] Trial 3 finished with value: 0.624136940211941 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6228917372937086.
[I 2025-01-04 19:27:18,905] Trial 4 finished with value: 0.6228917372937086 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6228917372937086.
[I 2025-01-04 19:27:18,907] Trial 5 pruned. 
[I 2025-01-04 19:27:36,294] Trial 6 finished with value: 0.6240131829529243 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6228917372937086.
[I 2025-01-04 19:27:37,075] Trial 7 finished wi

Selected 94 features out of 95 with top 100%.


[I 2025-01-04 19:46:58,986] Trial 1 pruned. 
[I 2025-01-04 19:46:59,841] Trial 2 finished with value: 0.6374030032805471 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6374030032805471.
[I 2025-01-04 19:47:33,670] Trial 3 finished with value: 0.6349753967766634 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 3 with value: 0.6349753967766634.
[I 2025-01-04 19:47:34,511] Trial 4 finished with value: 0.6374030032805471 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 3 with value: 0.6349753967766634.
[I 2025-01-04 19:47:34,512] Trial 5 pruned. 
[I 2025-01-04 19:47:53,609] Trial 6 finished with value: 0.6345504397027046 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 6 with value: 0.6345504397027046.
[

Best logloss: 0.6157210198420695
Best number of features: 47
Best parameters: {'solver': 'saga', 'penalty': 'l2', 'C': 0.9673101205310934, 'class_weight': 'balanced'}


In [29]:
final_model = LogisticRegression(**best_params, max_iter=1000, random_state=42)
X_train_selected = X_train_scaled.iloc[:, best_features]
X_test_selected = X_test_scaled.iloc[:, best_features]
final_model.fit(X_train_selected, y_train_data['target'])

y_pred = final_model.predict(X_test_selected)
y_pred_proba = final_model.predict_proba(X_test_selected)[:, 1]
accuracy= accuracy_score(y_test_data, y_pred)
logloss= log_loss(y_test_data, y_pred_proba)
print(f"Accuracy: {accuracy}")
print(f"Log loss: {logloss}")

Accuracy: 0.6454613095238095
Log loss: 0.6194132642354017


In [30]:
train_data_selected = pd.concat([X_train_selected, y_train_data], axis=1)
def wrapped_objective(trial):
    return objective_logistic_nested(trial, train_data_selected)
study_best_features = optuna.create_study(direction="minimize", sampler=optuna.samplers.RandomSampler(seed=42))
study_best_features.optimize(wrapped_objective, n_trials=200, timeout=3600)
best_params = study_best_features.best_params
print(f"Best parameters: {best_params}")

[I 2025-01-04 20:24:27,869] A new study created in memory with name: no-name-9f0f0077-225f-47be-a5e3-70be4a9864b9
[I 2025-01-04 20:24:28,036] Trial 0 finished with value: 0.6955195672920569 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6955195672920569.
[I 2025-01-04 20:24:28,037] Trial 1 pruned. 
[I 2025-01-04 20:24:28,443] Trial 2 finished with value: 0.617463091022921 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.617463091022921.
[I 2025-01-04 20:24:40,032] Trial 3 finished with value: 0.6180393268894604 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.617463091022921.
[I 2025-01-04 20:24:40,446] Trial 4 finished with value: 0.617463091022921 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_wei

Best parameters: {'solver': 'saga', 'penalty': 'l2', 'C': 0.9673101205310934, 'class_weight': 'balanced'}


In [31]:
optuna.visualization.plot_optimization_history(study_best_features)

In [32]:
optuna.visualization.plot_param_importances(study_best_features)

In [33]:
optuna.visualization.plot_slice(study_best_features)

In [34]:
optuna.visualization.plot_parallel_coordinate(study_best_features)

In [35]:
optuna.visualization.plot_edf(study_best_features)

In [37]:
optuna.visualization.plot_rank(study_best_features)

In [39]:
optuna.visualization.plot_terminator_improvement(study_best_features)


plot_terminator_improvement is experimental (supported from v3.2.0). The interface can change in the future.


RegretBoundEvaluator is experimental (supported from v3.2.0). The interface can change in the future.


CrossValidationErrorEvaluator is experimental (supported from v3.2.0). The interface can change in the future.

100%|██████████| 200/200 [00:10<00:00, 19.04it/s]


In [40]:
joblib.dump(final_model, "../logistic_regression/best_models/best_model_log_nested.pkl")

['../logistic_regression/best_models/best_model_log_nested.pkl']

In [41]:
np.save("../logistic_regression/best_models/best_features_log_nested.npy", best_features)

In [42]:
with open("../logistic_regression/best_models/best_params_log_nested.json", "w") as f:
    json.dump(best_params, f)

In [43]:
with open("../logistic_regression/best_models/best_model_log_nested_as_txt.txt", "w") as f:
    f.write(str(final_model))