In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from scipy.stats import kstest
import shap
import joblib
import json
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from sklearn.metrics import accuracy_score
import plotly.io as pio

In [2]:
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")

In [3]:
matches['Date'] = pd.to_datetime(matches['Date'])
print(matches['Date'].head())
print(matches['Date'].dtype)

0   2017-12-31
1   2017-12-31
2   2018-01-01
3   2018-01-01
4   2018-01-01
Name: Date, dtype: datetime64[ns]
datetime64[ns]


In [4]:
def combine_player_columns(df):
    """
    Combines player1 and player2 columns into a single column representing their difference.
    """
    combined_df = df.copy()
    difference_columns = {}

    # Iterate over all columns to find matching player1/player2 columns
    for col in df.columns:
        if col.startswith('player1_'):
            suffix = col[len('player1_'):]
            player2_col = f'player2_{suffix}'
            
            if player2_col in df.columns:
                # Compute the difference
                diff_col_name = f'diff_{suffix}'
                combined_df[diff_col_name] = df[col] - df[player2_col]
                
                # Track columns to drop
                difference_columns[col] = diff_col_name
                difference_columns[player2_col] = diff_col_name

    # Drop original player1/player2 columns
    combined_df = combined_df.drop(columns=difference_columns.keys())
    
    return combined_df

# Example usage
matches = combine_player_columns(matches)


print("Columns after combining:")
print(matches.columns)

Columns after combining:
Index(['Date', 'outdoor', 'match_id', 'tournament_level', 'best_of',
       'w_ace_avg', 'l_ace_avg', 'w_CO_ace_avg', 'l_CO_ace_avg', 'w_df_avg',
       'l_df_avg', 'w_CO_df_avg', 'l_CO_df_avg', 'w_2ndIn_avg', 'l_2ndIn_avg',
       'w_CO_2ndIn_avg', 'l_CO_2ndIn_avg', 'non_CO_uncertainty',
       'CO_uncertainty', 'Round_Num', 'temperature_2m', 'relative_humidity_2m',
       'windspeed_10m', 'apparent_temperature', 'Surface_Clay',
       'Surface_Grass', 'Surface_Hard', 'target', 'diff_bet_odds',
       'diff_right_handed', 'diff_age', 'diff_rank', 'diff_entry_LL',
       'diff_entry_Q', 'diff_entry_WC', 'diff_is_seeded',
       'diff_1st_serve_in_pct_avg', 'diff_CO_1st_serve_in_pct_avg',
       'diff_1st_serve_win_pct_avg', 'diff_CO_1st_serve_win_pct_avg',
       'diff_2nd_serve_in_pct_avg', 'diff_CO_2nd_serve_in_pct_avg',
       'diff_2nd_serve_win_pct_avg', 'diff_CO_2nd_serve_win_pct_avg',
       'diff_service_games_won_pct_avg', 'diff_CO_service_games_won_pc

In [5]:
matches=matches.drop(columns=['non_CO_uncertainty', 'CO_uncertainty',"outdoor", "tournament_level", "best_of", "Round_Num", "Surface_Clay", "Surface_Grass", "Surface_Hard", "temperature_2m", "relative_humidity_2m", "windspeed_10m",  "apparent_temperature", "diff_right_handed","diff_bet_odds","w_ace_avg", "l_ace_avg", "w_CO_ace_avg", "l_CO_ace_avg","w_df_avg", "l_df_avg", "w_CO_df_avg", "l_CO_df_avg", "w_2ndIn_avg","l_2ndIn_avg","w_CO_2ndIn_avg", "l_CO_2ndIn_avg"])

In [6]:
matches.columns

Index(['Date', 'match_id', 'target', 'diff_age', 'diff_rank', 'diff_entry_LL',
       'diff_entry_Q', 'diff_entry_WC', 'diff_is_seeded',
       'diff_1st_serve_in_pct_avg', 'diff_CO_1st_serve_in_pct_avg',
       'diff_1st_serve_win_pct_avg', 'diff_CO_1st_serve_win_pct_avg',
       'diff_2nd_serve_in_pct_avg', 'diff_CO_2nd_serve_in_pct_avg',
       'diff_2nd_serve_win_pct_avg', 'diff_CO_2nd_serve_win_pct_avg',
       'diff_service_games_won_pct_avg', 'diff_CO_service_games_won_pct_avg',
       'diff_1st_serve_return_win_pct_avg',
       'diff_CO_1st_serve_return_win_pct_avg',
       'diff_2nd_serve_return_win_pct_avg',
       'diff_CO_2nd_serve_return_win_pct_avg', 'diff_return_games_win_pct_avg',
       'diff_CO_return_games_win_pct_avg', 'diff_bp_won_pct_avg',
       'diff_CO_bp_won_pct_avg', 'diff_bp_saved_pct_avg',
       'diff_CO_bp_saved_pct_avg', 'diff_elo', 'diff_surface_elo',
       'diff_blended_elo', 'diff_fatigue_score', 'diff_h2h_wins',
       'diff_h2h_surface_wins', 'diff

In [6]:
test_data = matches[matches["Date"].dt.year == 2023]
train_data = matches[matches["Date"].dt.year != 2023]

In [7]:
X_train_data = train_data.drop(columns=['target','Date', 'match_id'])
y_train_data = train_data[['target', 'Date']]

In [8]:
X_test_data  = test_data.drop(columns=['target','Date', 'match_id'])
y_test_data= test_data['target']

In [9]:
def scale_features(X):
    scaled_X = X.copy()
    for col in X.columns:
        if kstest(X[col], 'norm').pvalue > 0.05:
            # Normal distribution: StandardScaler
            scaler = StandardScaler()
        else:
            # Non-normal distribution: MinMaxScaler
            scaler = MinMaxScaler()
        scaled_X[col] = scaler.fit_transform(X[[col]])
    return scaled_X

In [10]:
X_train_scaled = scale_features(X_train_data)
X_test_scaled = scale_features(X_test_data)  

In [11]:
def rolling_origin_split(data):
    data = data.sort_values(by="Date")

    
    splits = []
    for year in range(2018, 2022):
        training_data = data[data["Date"].dt.year <= year]
        val_data = data[data["Date"].dt.year == year + 1]

        if not val_data.empty:
            splits.append((training_data, val_data))

    return splits

In [13]:
def feature_selection_with_shap(X, y, percent):
    if "Date" in X.columns:
        X = X.drop(columns=["Date"])
    
    model=LogisticRegression(max_iter=1000, fit_intercept=False)
    model.fit(X, y)

    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)

    # Obliczenie średniej absolutnej ważności cech
    feature_importance = np.abs(shap_values.values).mean(axis=0)

    # Wybór najważniejszych cech 
    importance_threshold = np.percentile(feature_importance, 100-percent)  
    selected_features = np.where(feature_importance > importance_threshold)[0]

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} with top {percent}%.")
    return X.iloc[:, selected_features], selected_features

In [12]:
def objective_logistic_nested(trial, data):
    solver = trial.suggest_categorical("solver", ["liblinear", "saga", "lbfgs"])
    penalty = trial.suggest_categorical("penalty", ["l1", "l2", None])
    C = trial.suggest_float("C", 1e-4, 1e4, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
    valid_combinations = {
        "lbfgs": ["l2", None],
        "liblinear": ["l1", "l2"],
        "saga": ["l1", "l2", None]
    }
    
    # Skip invalid combinations
    if penalty not in valid_combinations[solver]:
        raise optuna.exceptions.TrialPruned()
    
    splits = rolling_origin_split(data)
    model=LogisticRegression(
        solver=solver,
        penalty=penalty,
        C=C,
        class_weight=class_weight,
        max_iter=1000,
        random_state=42,
        fit_intercept=False
    )
    log_loss_scores = []

    for training_data, val_data in splits:
        # Ensure proper sorting to avoid leakage
        training_data = training_data.sort_values("Date")
        val_data = val_data.sort_values("Date")

        # Separate features and target
        X_train = training_data.drop(columns=["target", "Date"])
        y_train = training_data["target"]
        X_val = val_data.drop(columns=["target", "Date"])
        y_val = val_data["target"]

        # Train the model on the training set
        model.fit(X_train, y_train)

        # Predict probabilities for the validation set
        y_pred_proba = model.predict_proba(X_val)[:, 1]

        # Calculate log loss for the validation set
        log_loss_scores.append(log_loss(y_val, y_pred_proba))

    # Return the mean log loss as the objective to minimize
    return np.mean(log_loss_scores)

In [15]:
percentages = [ 50, 65, 75, 85, 100]  
best_logloss = float("inf")
best_features = None
best_params = None
best_num_features = 0
for percent in percentages:
    X_train_selected, selected_features = feature_selection_with_shap(X_train_scaled, y_train_data['target'], percent)
    train_data_selected = pd.concat([X_train_selected, y_train_data], axis=1)

    def wrapped_objective(trial):
        return objective_logistic_nested(trial, train_data_selected)
    
    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.RandomSampler(seed=42))
    study.optimize(wrapped_objective, n_trials=200, timeout=3600)

    if study.best_value < best_logloss:
        best_logloss = study.best_value
        best_features = selected_features
        best_params = study.best_params
        best_num_features = len(selected_features)
print(f"Best logloss: {best_logloss}")
print(f"Best number of features: {best_num_features}")
print(f"Best parameters: {best_params}")

[I 2025-01-18 02:17:54,730] A new study created in memory with name: no-name-c19a347b-0eb1-4b05-9f68-123d44019b0a
[I 2025-01-18 02:17:54,812] Trial 0 finished with value: 0.6931471805599454 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6931471805599454.
[I 2025-01-18 02:17:54,814] Trial 1 pruned. 


Selected 20 features out of 40 with top 50%.


[I 2025-01-18 02:17:54,986] Trial 2 finished with value: 0.6172064319760147 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6172064319760147.
[I 2025-01-18 02:17:59,450] Trial 3 finished with value: 0.6180314705914248 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6172064319760147.
[I 2025-01-18 02:17:59,607] Trial 4 finished with value: 0.6172064319760147 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6172064319760147.
[I 2025-01-18 02:17:59,609] Trial 5 pruned. 
[I 2025-01-18 02:18:02,487] Trial 6 finished with value: 0.6180342118541768 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6172064319760147.
[I 2025-01-18 02:18:02,663] Trial 7 finished w

Selected 26 features out of 40 with top 65%.


[I 2025-01-18 02:22:00,463] Trial 2 finished with value: 0.6187635840023372 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6187635840023372.
[I 2025-01-18 02:22:04,887] Trial 3 finished with value: 0.6191157529855693 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6187635840023372.
[I 2025-01-18 02:22:05,075] Trial 4 finished with value: 0.6187635840023372 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6187635840023372.
[I 2025-01-18 02:22:05,075] Trial 5 pruned. 
[I 2025-01-18 02:22:08,266] Trial 6 finished with value: 0.6191198131404935 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6187635840023372.
[I 2025-01-18 02:22:08,445] Trial 7 finished w

Selected 30 features out of 40 with top 75%.


[I 2025-01-18 02:26:57,252] Trial 2 finished with value: 0.6184400693505934 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6184400693505934.
[I 2025-01-18 02:27:02,482] Trial 3 finished with value: 0.6187784056573251 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6184400693505934.
[I 2025-01-18 02:27:02,725] Trial 4 finished with value: 0.6184400693505934 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6184400693505934.
[I 2025-01-18 02:27:02,726] Trial 5 pruned. 
[I 2025-01-18 02:27:06,418] Trial 6 finished with value: 0.6187845174639199 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6184400693505934.
[I 2025-01-18 02:27:06,657] Trial 7 finished w

Selected 34 features out of 40 with top 85%.


[I 2025-01-18 02:32:29,321] Trial 2 finished with value: 0.6191117204187591 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6191117204187591.
[I 2025-01-18 02:32:35,734] Trial 3 finished with value: 0.6195792361312216 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6191117204187591.
[I 2025-01-18 02:32:35,999] Trial 4 finished with value: 0.6191117204187591 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6191117204187591.
[I 2025-01-18 02:32:36,001] Trial 5 pruned. 
[I 2025-01-18 02:32:40,235] Trial 6 finished with value: 0.6195987295399072 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6191117204187591.
[I 2025-01-18 02:32:40,490] Trial 7 finished w

Selected 39 features out of 40 with top 100%.


[I 2025-01-18 02:38:48,099] Trial 2 finished with value: 0.6200090425530191 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6200090425530191.
[I 2025-01-18 02:38:55,811] Trial 3 finished with value: 0.6199069240059949 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 3 with value: 0.6199069240059949.
[I 2025-01-18 02:38:56,056] Trial 4 finished with value: 0.6200090425530191 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 3 with value: 0.6199069240059949.
[I 2025-01-18 02:38:56,057] Trial 5 pruned. 
[I 2025-01-18 02:39:01,422] Trial 6 finished with value: 0.619923516723406 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 3 with value: 0.6199069240059949.
[I 2025-01-18 02:39:01,667] Trial 7 finished wi

Best logloss: 0.6164970915623564
Best number of features: 20
Best parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 2.269409056622677, 'class_weight': 'balanced'}


In [None]:
final_model = LogisticRegression(**best_params, max_iter=1000, random_state=42)
X_train_selected = X_train_scaled.iloc[:, best_features]
X_test_selected = X_test_scaled.iloc[:, best_features]
final_model.fit(X_train_selected, y_train_data['target'])

y_pred = final_model.predict(X_test_selected)
y_pred_proba = final_model.predict_proba(X_test_selected)[:, 1]
accuracy= accuracy_score(y_test_data, y_pred)
logloss= log_loss(y_test_data, y_pred_proba)


In [15]:
best_features=np.load("../logistic_regression/best_models/best_features_log_nested.npy")
X_train_selected = X_train_scaled.iloc[:, best_features]

In [16]:
train_data_selected = pd.concat([X_train_selected, y_train_data], axis=1)
def wrapped_objective(trial):
    return objective_logistic_nested(trial, train_data_selected)
study_best_features = optuna.create_study(direction="minimize", sampler=optuna.samplers.RandomSampler(seed=42))
study_best_features.optimize(wrapped_objective, n_trials=200, timeout=3600)
best_params = study_best_features.best_params
print(f"Best parameters: {best_params}")

[I 2025-01-18 23:07:14,656] A new study created in memory with name: no-name-412dbeda-52ad-45a8-9b53-792208a47083
[I 2025-01-18 23:07:14,782] Trial 0 finished with value: 0.6931471805599454 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6931471805599454.
[I 2025-01-18 23:07:14,784] Trial 1 pruned. 
[I 2025-01-18 23:07:14,988] Trial 2 finished with value: 0.6172064319760147 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6172064319760147.
[I 2025-01-18 23:07:19,390] Trial 3 finished with value: 0.6180314705914248 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6172064319760147.
[I 2025-01-18 23:07:19,550] Trial 4 finished with value: 0.6172064319760147 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class

Best parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 2.269409056622677, 'class_weight': 'balanced'}


In [17]:
optuna.visualization.plot_optimization_history(study_best_features)

In [18]:
fig=optuna.visualization.plot_param_importances(study_best_features)
fig.update_layout(title="")

In [21]:
fig=optuna.visualization.plot_param_importances(study_best_features)
# Poprawienie estetyki wykresu (tło, czcionka, styl)
fig.update_layout(
    title="",
    template="plotly_white",  # Jasne tło
    font=dict(size=14),  # Ustawienie czcionki
    plot_bgcolor='white',  # Tło wykresu
    paper_bgcolor='white',  # Tło całego dokumentu
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),  # Siatka na osi X
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')   # Siatka na osi Y
)

# Zapis do wysokiej jakości pliku PDF
pio.write_image(fig, "param_importance_plot_nested.pdf", format="pdf", scale=3)  # Skalowanie dla świetnej jakości

# Wyświetlenie wykresu
fig.show()

In [20]:
fig=optuna.visualization.plot_slice(study_best_features)
fig.update_layout(title="")

In [21]:
fig=optuna.visualization.plot_slice(study_best_features, params=["C", "solver"])
fig.update_layout(title="")

In [34]:
optuna.visualization.plot_parallel_coordinate(study_best_features)

In [22]:
fig=optuna.visualization.plot_edf(study_best_features)
fig.update_layout(title="")

In [25]:
values = np.array([t.value for t in study_best_features.trials if t.state == optuna.trial.TrialState.COMPLETE])
values.sort()
quantile_80 = np.quantile(values, 0.80)
fig=optuna.visualization.plot_edf(study_best_features)
fig.update_layout(
    title="",
    template="plotly_white", 
    font=dict(size=14),
)
fig.add_vline(
    x=quantile_80,
    line_dash="dash",
    line_color="red",
)


# Zapis do wysokiej jakości pliku PDF
pio.write_image(fig, "edf_plot_nested.pdf", format="pdf", scale=3)  # Skalowanie dla wysokiej jakości

# Wyświetlenie wykresu
fig.show()

In [40]:
filtered_trials = [trial for trial in study_best_features.trials if trial.params.get("solver") == "saga"]

# Create a new study with only the filtered trials
filtered_study = optuna.create_study(direction=study_best_features.direction)
for trial in filtered_trials:
    filtered_study.add_trial(trial)

# Generate slice plot for C
fig = optuna.visualization.plot_slice(filtered_study, params=["C"])

# Improve aesthetics
fig.update_layout(
    title="",
    template="plotly_white",
    font=dict(size=14),
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')
)

# Save as high-quality PDF
#pio.write_image(fig, "slice_plot_C_saga.pdf", format="pdf", scale=3)

# Show the plot
fig.show()

[I 2025-01-18 23:53:32,464] A new study created in memory with name: no-name-4108f447-bfe3-49ee-8c1d-134f9b26021c


In [41]:
filtered_trials = [trial for trial in study_best_features.trials if trial.params.get("solver") == "lbfgs"]

# Create a new study with only the filtered trials
filtered_study = optuna.create_study(direction=study_best_features.direction)
for trial in filtered_trials:
    filtered_study.add_trial(trial)

# Generate slice plot for C
fig = optuna.visualization.plot_slice(filtered_study, params=["C"])

# Improve aesthetics
fig.update_layout(
    title="",
    template="plotly_white",
    font=dict(size=14),
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')
)

# Save as high-quality PDF
#pio.write_image(fig, "slice_plot_C_saga.pdf", format="pdf", scale=3)

# Show the plot
fig.show()

[I 2025-01-18 23:53:40,634] A new study created in memory with name: no-name-50554ae4-3560-493f-a7cc-b859b12f348a


In [42]:
filtered_trials = [trial for trial in study_best_features.trials if trial.params.get("solver") == "liblinear"]

# Create a new study with only the filtered trials
filtered_study = optuna.create_study(direction=study_best_features.direction)
for trial in filtered_trials:
    filtered_study.add_trial(trial)

# Generate slice plot for C
fig = optuna.visualization.plot_slice(filtered_study, params=["C"])

# Improve aesthetics
fig.update_layout(
    title="",
    template="plotly_white",
    font=dict(size=14),
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')
)

# Save as high-quality PDF
#pio.write_image(fig, "slice_plot_C_saga.pdf", format="pdf", scale=3)

# Show the plot
fig.show()

[I 2025-01-18 23:53:49,119] A new study created in memory with name: no-name-96983d18-c0e5-43b8-a3df-121c2acd8593


In [37]:
filtered_trials = [trial for trial in study_best_features.trials if trial.params.get("penalty") == None]

# Create a new study with only the filtered trials
filtered_study = optuna.create_study(direction=study_best_features.direction)
for trial in filtered_trials:
    filtered_study.add_trial(trial)

# Generate slice plot for C
fig = optuna.visualization.plot_slice(filtered_study, params=["C"])

# Improve aesthetics
fig.update_layout(
    title="",
    template="plotly_white",
    font=dict(size=14),
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')
)

# Save as high-quality PDF
#pio.write_image(fig, "slice_plot_C_saga.pdf", format="pdf", scale=3)

# Show the plot
fig.show()

[I 2025-01-18 23:49:34,538] A new study created in memory with name: no-name-547ab807-d7fc-4fbb-b38e-f1d925dcd4e3


In [38]:
filtered_trials = [trial for trial in study_best_features.trials if trial.params.get("penalty") == "l1"]

# Create a new study with only the filtered trials
filtered_study = optuna.create_study(direction=study_best_features.direction)
for trial in filtered_trials:
    filtered_study.add_trial(trial)

# Generate slice plot for C
fig = optuna.visualization.plot_slice(filtered_study, params=["C"])

# Improve aesthetics
fig.update_layout(
    title="",
    template="plotly_white",
    font=dict(size=14),
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')
)

# Save as high-quality PDF
#pio.write_image(fig, "slice_plot_C_saga.pdf", format="pdf", scale=3)

# Show the plot
fig.show()

[I 2025-01-18 23:50:09,940] A new study created in memory with name: no-name-dbcc7c27-4ad3-4ccf-acd1-6a186a1cb403


In [39]:
filtered_trials = [trial for trial in study_best_features.trials if trial.params.get("penalty") == "l2"]

# Create a new study with only the filtered trials
filtered_study = optuna.create_study(direction=study_best_features.direction)
for trial in filtered_trials:
    filtered_study.add_trial(trial)

# Generate slice plot for C
fig = optuna.visualization.plot_slice(filtered_study, params=["C"])

# Improve aesthetics
fig.update_layout(
    title="",
    template="plotly_white",
    font=dict(size=14),
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')
)

# Save as high-quality PDF
#pio.write_image(fig, "slice_plot_C_saga.pdf", format="pdf", scale=3)

# Show the plot
fig.show()

[I 2025-01-18 23:50:37,417] A new study created in memory with name: no-name-33564500-54e8-4407-850c-7fb31ffa0763


In [44]:
fig = optuna.visualization.plot_slice(study_best_features, params=["solver"])

# Improve aesthetics
fig.update_layout(
    title="",
    template="plotly_white",
    font=dict(size=14),
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')
)

# Save as high-quality PDF
#pio.write_image(fig, "slice_plot_C_saga.pdf", format="pdf", scale=3)

# Show the plot
fig.show()

In [24]:
fig=optuna.visualization.plot_rank(study_best_features)
fig.update_layout(title="")

In [37]:
fig=optuna.visualization.plot_rank(study_best_features, params=["C", "solver", "penalty"])
fig.update_layout(title="")

In [None]:
fig=optuna.visualization.plot_rank(study_best_features, params=["C", "solver", "penalty", "class_weight"])
fig.update_layout(
    title="",
    template="plotly_white",  
    font=dict(size=14),  
    plot_bgcolor='white',  
    paper_bgcolor='white',  
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),  
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')   
)
pio.write_image(fig, "param_rank_plot_nested.pdf", format="pdf", scale=10, width=1200, height=800)  

# Wyświetlenie wykresu
fig.show()

In [56]:
fig=optuna.visualization.plot_rank(study_best_features, params=["solver", "C"])
fig.update_layout(
    title="",
    template="plotly_white",  
    font=dict(size=14),  
    plot_bgcolor='white',  
    paper_bgcolor='white',  
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),  
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')   
)
pio.write_image(fig, "param_rank_plot_nested_solver_C.pdf", format="pdf", scale=10)  

# Wyświetlenie wykresu
fig.show()

In [57]:
fig=optuna.visualization.plot_rank(study_best_features, params=["penalty", "C"])
fig.update_layout(
    title="",
    template="plotly_white",  
    font=dict(size=14),  
    plot_bgcolor='white',  
    paper_bgcolor='white',  
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),  
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')   
)
pio.write_image(fig, "param_rank_plot_nested_penalty_C.pdf", format="pdf", scale=10)  

# Wyświetlenie wykresu
fig.show()

In [58]:
fig=optuna.visualization.plot_rank(study_best_features, params=["class_weight", "C"])
fig.update_layout(
    title="",
    template="plotly_white",  
    font=dict(size=14),  
    plot_bgcolor='white',  
    paper_bgcolor='white',  
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),  
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')   
)
pio.write_image(fig, "param_rank_plot_nested_class_C.pdf", format="pdf", scale=10)  

# Wyświetlenie wykresu
fig.show()

In [39]:
optuna.visualization.plot_terminator_improvement(study_best_features)


plot_terminator_improvement is experimental (supported from v3.2.0). The interface can change in the future.


RegretBoundEvaluator is experimental (supported from v3.2.0). The interface can change in the future.


CrossValidationErrorEvaluator is experimental (supported from v3.2.0). The interface can change in the future.

100%|██████████| 200/200 [00:10<00:00, 19.04it/s]


In [59]:
def objective_logistic_nested_improve(trial, data):
    solver = trial.suggest_categorical("solver", ["saga", "lbfgs"])
    penalty = trial.suggest_categorical("penalty", [ "l2"])
    C = trial.suggest_float("C", 0.6, 1.5, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
    valid_combinations = {
        "lbfgs": ["l2", None],
        "saga": [ "l2", None]
    }
    
    # Skip invalid combinations
    if penalty not in valid_combinations[solver]:
        raise optuna.exceptions.TrialPruned()
    
    splits = rolling_origin_split(data)
    model=LogisticRegression(
        solver=solver,
        penalty=penalty,
        C=C,
        class_weight=class_weight,
        max_iter=1000,
        random_state=42,
        fit_intercept=False
    )
    log_loss_scores = []

    for training_data, val_data in splits:
        # Ensure proper sorting to avoid leakage
        training_data = training_data.sort_values("Date")
        val_data = val_data.sort_values("Date")

        # Separate features and target
        X_train = training_data.drop(columns=["target", "Date"])
        y_train = training_data["target"]
        X_val = val_data.drop(columns=["target", "Date"])
        y_val = val_data["target"]

        # Train the model on the training set
        model.fit(X_train, y_train)

        # Predict probabilities for the validation set
        y_pred_proba = model.predict_proba(X_val)[:, 1]

        # Calculate log loss for the validation set
        log_loss_scores.append(log_loss(y_val, y_pred_proba))

    # Return the mean log loss as the objective to minimize
    return np.mean(log_loss_scores)

In [13]:
best_features=np.load("../logistic_regression/best_models/best_features_log_nested.npy")

In [26]:
X_train_selected, selected_features = feature_selection_with_shap(X_train_scaled, y_train_data['target'], 50)

Selected 20 features out of 40 with top 50%.


In [60]:
train_data_selected = pd.concat([X_train_selected, y_train_data], axis=1)
def wrapped_objective(trial):
    return objective_logistic_nested_improve(trial, train_data_selected)
study_best_features_improve = optuna.create_study(direction="minimize", sampler=optuna.samplers.RandomSampler(seed=42))
study_best_features_improve.optimize(wrapped_objective, n_trials=200, timeout=3600)
best_params_improve = study_best_features_improve.best_params
final_model_improve = LogisticRegression(**best_params_improve, max_iter=1000, random_state=42)
print(f"Best parameters: {best_params_improve}")

[I 2025-01-19 01:39:29,028] A new study created in memory with name: no-name-9c252f59-c99e-4247-91e3-4a2441561a99
[I 2025-01-19 01:39:29,224] Trial 0 finished with value: 0.6168179207136804 and parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 1.1733860640735183, 'class_weight': None}. Best is trial 0 with value: 0.6168179207136804.
[I 2025-01-19 01:39:29,445] Trial 1 finished with value: 0.6165852536990611 and parameters: {'solver': 'saga', 'penalty': 'l2', 'C': 1.326897553487662, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.6165852536990611.
[I 2025-01-19 01:39:29,572] Trial 2 finished with value: 0.6166845031977465 and parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 1.286510935395048, 'class_weight': None}. Best is trial 1 with value: 0.6165852536990611.
[I 2025-01-19 01:39:29,700] Trial 3 finished with value: 0.6168029761785239 and parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.9704492513615651, 'class_weight': None}. Best is trial 1 with value: 0.6

Best parameters: {'solver': 'saga', 'penalty': 'l2', 'C': 1.4592140219942689, 'class_weight': None}


In [None]:
final_model_improve.fit(X_train_selected, y_train_data['target'])
X_test_selected = X_test_scaled.iloc[:, selected_features]
y_pred_improve = final_model_improve.predict(X_test_selected)
y_pred_proba_improve = final_model_improve.predict_proba(X_test_selected)[:, 1]
accuracy_improve = accuracy_score(y_test_data, y_pred_improve)
logloss_improve = log_loss(y_test_data, y_pred_proba_improve)

In [32]:
fig=optuna.visualization.plot_param_importances(study_best_features_improve)
fig.update_layout(title="")

In [63]:
values = np.array([t.value for t in study_best_features_improve.trials if t.state == optuna.trial.TrialState.COMPLETE])
values.sort()
quantile_80 = np.quantile(values, 0.80)
fig=optuna.visualization.plot_edf(study_best_features_improve)
fig.update_layout(
    title="",
    template="plotly_white", 
    font=dict(size=14),
)
fig.add_vline(
    x=quantile_80,
    line_dash="dash",
    line_color="red",
)

# Zapis do wysokiej jakości pliku PDF
pio.write_image(fig, "edf_plot_nested_2.pdf", format="pdf", scale=10)  # Skalowanie dla wysokiej jakości

# Wyświetlenie wykresu
fig.show()

In [39]:
def convert_categorical_params(study):
    df = study.trials_dataframe()
    for param in df.columns:
        if df[param].dtype == "object":  # Check if it's a categorical parameter
            df[param] = df[param].astype(str)  # Ensure categorical values are strings
    return df

In [40]:
import plotly.express as px

In [35]:
joblib.dump(final_model, "../logistic_regression/best_models/best_model_log_nested.pkl")

['../logistic_regression/best_models/best_model_log_nested.pkl']

In [36]:
np.save("../logistic_regression/best_models/best_features_log_nested.npy", selected_features)

In [37]:
with open("../logistic_regression/best_models/best_params_log_nested.json", "w") as f:
    json.dump(best_params, f)

In [38]:
with open("../logistic_regression/best_models/best_model_log_nested_as_txt.txt", "w") as f:
    f.write(str(final_model))

In [39]:
joblib.dump(final_model_improve, "../logistic_regression/best_models/best_model_log_nested_improve.pkl")
with open("../logistic_regression/best_models/best_params_log_nested_improve.json", "w") as f:
    json.dump(best_params_improve, f)
with open("../logistic_regression/best_models/best_model_log_nested_improve_as_txt.txt", "w") as f:
    f.write(str(final_model_improve))