In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from scipy.stats import kstest
import shap
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import plotly.express as px
import plotly.io as pio

In [2]:
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")
matches['Date'] = pd.to_datetime(matches['Date'])

In [3]:
def combine_player_columns(df):
    """
    Combines player1 and player2 columns into a single column representing their difference.
    """
    combined_df = df.copy()
    difference_columns = {}

    # Iterate over all columns to find matching player1/player2 columns
    for col in df.columns:
        if col.startswith('player1_'):
            suffix = col[len('player1_'):]
            player2_col = f'player2_{suffix}'
            
            if player2_col in df.columns:
                # Compute the difference
                diff_col_name = f'diff_{suffix}'
                combined_df[diff_col_name] = df[col] - df[player2_col]
                
                # Track columns to drop
                difference_columns[col] = diff_col_name
                difference_columns[player2_col] = diff_col_name

    # Drop original player1/player2 columns
    combined_df = combined_df.drop(columns=difference_columns.keys())
    
    return combined_df

# Example usage
matches = combine_player_columns(matches)


print("Columns after combining:")
print(matches.columns)

Columns after combining:
Index(['Date', 'outdoor', 'match_id', 'tournament_level', 'best_of',
       'w_ace_avg', 'l_ace_avg', 'w_CO_ace_avg', 'l_CO_ace_avg', 'w_df_avg',
       'l_df_avg', 'w_CO_df_avg', 'l_CO_df_avg', 'w_2ndIn_avg', 'l_2ndIn_avg',
       'w_CO_2ndIn_avg', 'l_CO_2ndIn_avg', 'non_CO_uncertainty',
       'CO_uncertainty', 'Round_Num', 'temperature_2m', 'relative_humidity_2m',
       'windspeed_10m', 'apparent_temperature', 'Surface_Clay',
       'Surface_Grass', 'Surface_Hard', 'target', 'diff_bet_odds',
       'diff_right_handed', 'diff_age', 'diff_rank', 'diff_entry_LL',
       'diff_entry_Q', 'diff_entry_WC', 'diff_is_seeded',
       'diff_1st_serve_in_pct_avg', 'diff_CO_1st_serve_in_pct_avg',
       'diff_1st_serve_win_pct_avg', 'diff_CO_1st_serve_win_pct_avg',
       'diff_2nd_serve_in_pct_avg', 'diff_CO_2nd_serve_in_pct_avg',
       'diff_2nd_serve_win_pct_avg', 'diff_CO_2nd_serve_win_pct_avg',
       'diff_service_games_won_pct_avg', 'diff_CO_service_games_won_pc

In [4]:
matches=matches.drop(columns=['non_CO_uncertainty', 'CO_uncertainty',"outdoor", "tournament_level", "best_of", "Round_Num", "Surface_Clay", "Surface_Grass", "Surface_Hard", "temperature_2m", "relative_humidity_2m", "windspeed_10m",  "apparent_temperature", "diff_right_handed","diff_bet_odds","w_ace_avg", "l_ace_avg", "w_CO_ace_avg", "l_CO_ace_avg","w_df_avg", "l_df_avg", "w_CO_df_avg", "l_CO_df_avg", "w_2ndIn_avg","l_2ndIn_avg","w_CO_2ndIn_avg", "l_CO_2ndIn_avg"])

In [16]:
matches.columns

Index(['Date', 'match_id', 'target', 'diff_age', 'diff_rank', 'diff_entry_LL',
       'diff_entry_Q', 'diff_entry_WC', 'diff_is_seeded',
       'diff_1st_serve_in_pct_avg', 'diff_CO_1st_serve_in_pct_avg',
       'diff_1st_serve_win_pct_avg', 'diff_CO_1st_serve_win_pct_avg',
       'diff_2nd_serve_in_pct_avg', 'diff_CO_2nd_serve_in_pct_avg',
       'diff_2nd_serve_win_pct_avg', 'diff_CO_2nd_serve_win_pct_avg',
       'diff_service_games_won_pct_avg', 'diff_CO_service_games_won_pct_avg',
       'diff_1st_serve_return_win_pct_avg',
       'diff_CO_1st_serve_return_win_pct_avg',
       'diff_2nd_serve_return_win_pct_avg',
       'diff_CO_2nd_serve_return_win_pct_avg', 'diff_return_games_win_pct_avg',
       'diff_CO_return_games_win_pct_avg', 'diff_bp_won_pct_avg',
       'diff_CO_bp_won_pct_avg', 'diff_bp_saved_pct_avg',
       'diff_CO_bp_saved_pct_avg', 'diff_elo', 'diff_surface_elo',
       'diff_blended_elo', 'diff_fatigue_score', 'diff_h2h_wins',
       'diff_h2h_surface_wins', 'diff

In [5]:
test_data = matches[matches["Date"].dt.year == 2023]
train_data = matches[matches["Date"].dt.year != 2023]

In [6]:
X_train=train_data.drop(columns=['target', 'match_id', 'Date'])
y_train=train_data['target']
X_test=test_data.drop(columns=['target', 'match_id', 'Date'])
y_test=test_data['target']

In [7]:
def scale_features(X):
    scaled_X = X.copy()
    for col in X.columns:
        if kstest(X[col], 'norm').pvalue > 0.05:
            # Normal distribution: StandardScaler
            scaler = StandardScaler()
        else:
            # Non-normal distribution: MinMaxScaler
            scaler = MinMaxScaler()
        scaled_X[col] = scaler.fit_transform(X[[col]])
    return scaled_X

In [8]:
X_train_scaled = scale_features(X_train)
X_test_scaled = scale_features(X_test)  

In [21]:
def feature_selection_with_shap(X, y, percent):
    model=LogisticRegression(max_iter=1000, fit_intercept=False)
    model.fit(X, y)
    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)
    feature_importance = np.abs(shap_values.values).mean(axis=0)
    
    importance_threshold = np.percentile(feature_importance, 100-percent)  
    selected_features = np.where(feature_importance > importance_threshold)[0]

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} with top {percent}%.")
    return X.iloc[:, selected_features], selected_features

In [12]:
def objective_logistic(trial, X_train, y_train):
    solver = trial.suggest_categorical("solver", ["liblinear", "saga", "lbfgs"])
    penalty = trial.suggest_categorical("penalty", ["l1", "l2", None])
    C = trial.suggest_float("C", 1e-4, 1e4, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
    if (solver == "lbfgs" and penalty not in ["l2", None]) or \
       (solver == "liblinear" and penalty not in ["l1", "l2"]) or \
       (solver == "saga" and penalty not in ["l1", "l2", None]):
        raise optuna.exceptions.TrialPruned()
    model = LogisticRegression(
        C=C,
        penalty=penalty,
        solver=solver,
        max_iter=1000,
        class_weight=class_weight,
        random_state=42,
        fit_intercept=False
    )
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="neg_log_loss")
    return -scores.mean()

In [23]:
percentages = [50, 65, 75, 85, 100]
best_log_loss = float("inf")
best_features = None
best_params = None
best_num_features = 0

for percent in percentages:
    print(f"Testing top {percent}% features...")
    X_train_selected, selected_features = feature_selection_with_shap(X_train_scaled, y_train, percent)
    X_test_selected = X_test_scaled.iloc[:, selected_features]

    def wrapped_objective(trial):
        return objective_logistic(trial, X_train_selected, y_train)

    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(wrapped_objective, n_trials=200, timeout=3600)

    if study.best_value < best_log_loss:
        best_log_loss = study.best_value
        best_features = selected_features
        best_params = study.best_params
        best_num_features = len(selected_features)

print(f"Best log loss: {best_log_loss}")
print(f"Best number of features: {best_num_features}")
print(f"Best parameters: {best_params}")


Testing top 50% features...


[I 2025-01-18 01:50:14,665] A new study created in memory with name: no-name-167b5288-1644-485f-b1b0-34510eb3d730
[I 2025-01-18 01:50:14,751] Trial 0 finished with value: 0.6931471805599454 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6931471805599454.
[I 2025-01-18 01:50:14,753] Trial 1 pruned. 


Selected 20 features out of 40 with top 50%.


[I 2025-01-18 01:50:15,001] Trial 2 finished with value: 0.6175935585847383 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6175935585847383.
[I 2025-01-18 01:50:20,224] Trial 3 finished with value: 0.6177007195616333 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6175935585847383.
[I 2025-01-18 01:50:20,436] Trial 4 finished with value: 0.6175935585847383 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6175935585847383.
[I 2025-01-18 01:50:20,437] Trial 5 pruned. 
[I 2025-01-18 01:50:24,212] Trial 6 finished with value: 0.6177042751239264 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6175935585847383.
[I 2025-01-18 01:50:24,402] Trial 7 finished w

Testing top 65% features...


[I 2025-01-18 01:52:36,479] A new study created in memory with name: no-name-cb7b516b-92c3-4386-bf1c-430d5b1983d9
[I 2025-01-18 01:52:36,561] Trial 0 finished with value: 0.6931471805599454 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6931471805599454.
[I 2025-01-18 01:52:36,562] Trial 1 pruned. 


Selected 26 features out of 40 with top 65%.


[I 2025-01-18 01:52:36,819] Trial 2 finished with value: 0.6179797544966633 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6179797544966633.
[I 2025-01-18 01:52:45,105] Trial 3 finished with value: 0.6180587241284404 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6179797544966633.
[I 2025-01-18 01:52:45,366] Trial 4 finished with value: 0.6179797544966633 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6179797544966633.
[I 2025-01-18 01:52:45,367] Trial 5 pruned. 
[I 2025-01-18 01:52:51,241] Trial 6 finished with value: 0.6180621637870993 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6179797544966633.
[I 2025-01-18 01:52:51,476] Trial 7 finished w

Testing top 75% features...


[I 2025-01-18 01:54:18,981] A new study created in memory with name: no-name-a8a9b32c-07d0-457c-9d0f-165afcefa938
[I 2025-01-18 01:54:19,060] Trial 0 finished with value: 0.6931471805599454 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6931471805599454.
[I 2025-01-18 01:54:19,062] Trial 1 pruned. 


Selected 30 features out of 40 with top 75%.


[I 2025-01-18 01:54:19,362] Trial 2 finished with value: 0.6177425751213221 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6177425751213221.
[I 2025-01-18 01:54:28,153] Trial 3 finished with value: 0.6180087840734372 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6177425751213221.
[I 2025-01-18 01:54:28,454] Trial 4 finished with value: 0.6177425751213221 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6177425751213221.
[I 2025-01-18 01:54:28,455] Trial 5 pruned. 
[I 2025-01-18 01:54:34,766] Trial 6 finished with value: 0.6180122980356554 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6177425751213221.
[I 2025-01-18 01:54:35,030] Trial 7 finished w

Testing top 85% features...


[I 2025-01-18 01:55:59,527] A new study created in memory with name: no-name-4eba9a97-a03f-413f-bef9-ae85fda13c9e
[I 2025-01-18 01:55:59,616] Trial 0 finished with value: 0.6931471805599454 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6931471805599454.
[I 2025-01-18 01:55:59,617] Trial 1 pruned. 


Selected 34 features out of 40 with top 85%.


[I 2025-01-18 01:55:59,926] Trial 2 finished with value: 0.6180220229889073 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6180220229889073.
[I 2025-01-18 01:56:10,128] Trial 3 finished with value: 0.6181632747729745 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6180220229889073.
[I 2025-01-18 01:56:10,425] Trial 4 finished with value: 0.6180220229889073 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6180220229889073.
[I 2025-01-18 01:56:10,426] Trial 5 pruned. 
[I 2025-01-18 01:56:17,444] Trial 6 finished with value: 0.6181670346893451 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6180220229889073.
[I 2025-01-18 01:56:17,770] Trial 7 finished w

Testing top 100% features...


[I 2025-01-18 01:59:06,308] A new study created in memory with name: no-name-e2122a3a-bff4-4c5c-a425-8aee2364cb63
[I 2025-01-18 01:59:06,400] Trial 0 finished with value: 0.6931471805599454 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6931471805599454.
[I 2025-01-18 01:59:06,401] Trial 1 pruned. 


Selected 39 features out of 40 with top 100%.


[I 2025-01-18 01:59:06,751] Trial 2 finished with value: 0.6182661330097293 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6182661330097293.
[I 2025-01-18 01:59:19,102] Trial 3 finished with value: 0.6186181494711485 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6182661330097293.
[I 2025-01-18 01:59:19,461] Trial 4 finished with value: 0.6182661330097293 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6182661330097293.
[I 2025-01-18 01:59:19,463] Trial 5 pruned. 
[I 2025-01-18 01:59:28,067] Trial 6 finished with value: 0.6186222452592538 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6182661330097293.
[I 2025-01-18 01:59:28,473] Trial 7 finished w

Best log loss: 0.6174935561661402
Best number of features: 20
Best parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 49.17579460893949, 'class_weight': None}


In [24]:
final_model = LogisticRegression(**best_params)
X_train_final = X_train_scaled.iloc[:, best_features]
X_test_final = X_test_scaled.iloc[:, best_features]
final_model.fit(X_train_final, y_train)
accuracy = final_model.score(X_test_final, y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6469494047619048


In [10]:
best_features=np.load("../logistic_regression/best_models/best_features_log_cros.npy")
X_train_final = X_train_scaled.iloc[:, best_features]

In [32]:
X_train_final.columns

Index(['diff_age', 'diff_rank', 'diff_entry_Q', 'diff_is_seeded',
       'diff_1st_serve_in_pct_avg', 'diff_CO_1st_serve_win_pct_avg',
       'diff_CO_2nd_serve_in_pct_avg', 'diff_service_games_won_pct_avg',
       'diff_CO_service_games_won_pct_avg',
       'diff_CO_2nd_serve_return_win_pct_avg',
       'diff_CO_return_games_win_pct_avg', 'diff_bp_won_pct_avg',
       'diff_CO_bp_saved_pct_avg', 'diff_elo', 'diff_surface_elo',
       'diff_blended_elo', 'diff_h2h_wins', 'diff_win_pct_last_10_surface',
       'diff_Game_Diff_Tournament', 'diff_total_wins_tournament_history'],
      dtype='object')

In [13]:
def wrapped_objective(trial):
    return objective_logistic(trial, X_train_final, y_train)
study_best_features = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
study_best_features.optimize(wrapped_objective, n_trials=200, timeout=3600)

[I 2025-01-18 19:42:00,285] A new study created in memory with name: no-name-ea9a5a59-90b4-448c-a326-c6e0cc195997
[I 2025-01-18 19:42:00,410] Trial 0 finished with value: 0.6931471805599454 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6931471805599454.
[I 2025-01-18 19:42:00,411] Trial 1 pruned. 
[I 2025-01-18 19:42:00,623] Trial 2 finished with value: 0.6175935585847383 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6175935585847383.
[I 2025-01-18 19:42:05,737] Trial 3 finished with value: 0.6177007195616333 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6175935585847383.
[I 2025-01-18 19:42:05,927] Trial 4 finished with value: 0.6175935585847383 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class

In [26]:
optuna.visualization.plot_optimization_history(study_best_features)

In [29]:
fig=optuna.visualization.plot_slice(study_best_features, params=["C","solver"])
fig.update_layout(width=700 ,title="")

In [17]:
optuna.visualization.plot_slice(study_best_features)

In [28]:
fig=optuna.visualization.plot_param_importances(study_best_features)
fig.update_layout(title="")

In [31]:
fig = optuna.visualization.plot_param_importances(study_best_features)

# Poprawienie estetyki wykresu (tło, czcionka, styl)
fig.update_layout(
    title="",
    template="plotly_white",  # Jasne tło
    font=dict(size=14),  # Ustawienie czcionki
    plot_bgcolor='white',  # Tło wykresu
    paper_bgcolor='white',  # Tło całego dokumentu
    xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey'),  # Siatka na osi X
    yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')   # Siatka na osi Y
)

# Zapis do wysokiej jakości pliku PDF
pio.write_image(fig, "param_importance_plot.pdf", format="pdf", scale=3)  # Skalowanie dla świetnej jakości

# Wyświetlenie wykresu
fig.show()

In [43]:
df = study_best_features.trials_dataframe()

fig = px.scatter_matrix(df, dimensions=["params_C", "params_class_weight", "params_solver", "params_penalty"], title="C vs Other Parameters")
fig.show()

In [15]:
optuna.visualization.plot_rank(study_best_features)

In [44]:
optuna.visualization.plot_parallel_coordinate(study_best_features)

In [14]:
fig=optuna.visualization.plot_edf(study_best_features)
fig.update_layout(title="")

In [39]:
values = np.array([t.value for t in study_best_features.trials if t.state == optuna.trial.TrialState.COMPLETE])

# Sortowanie wartości
values.sort()
probability_0_62 = np.sum(values <= 0.62) / len(values)

# Obliczenie kwantyla 0.95
quantile_95 = np.quantile(values, 0.95)

fig = optuna.visualization.plot_edf(study_best_features)

fig.update_layout(
    title="",
    template="plotly_white",  
    font=dict(size=14),
)

fig.add_vline(
    x=0.62,
    line_dash="dash",
    line_color="red",
)
fig.update_layout(
    yaxis=dict(
        tickvals=[0.2,0.4,0.6,0.8,probability_0_62],  # Lista wartości na osi y (tutaj tylko P(0.62))
        #ticktext=[f"{probability_0_62:.2f}"]  # Tekst z wartością na osi y
    )
)


# Zapis do wysokiej jakości pliku PDF
pio.write_image(fig, "edf_plot.pdf", format="pdf", scale=3)  # Skalowanie dla wysokiej jakości

# Wyświetlenie wykresu
fig.show()

In [19]:
optuna.visualization.plot_terminator_improvement(study_best_features)


plot_terminator_improvement is experimental (supported from v3.2.0). The interface can change in the future.


RegretBoundEvaluator is experimental (supported from v3.2.0). The interface can change in the future.


CrossValidationErrorEvaluator is experimental (supported from v3.2.0). The interface can change in the future.

 58%|█████▊    | 117/200 [00:09<00:09,  8.31it/s][W 2025-01-04 17:20:11,561] The optimization of kernel_params failed: 
linalg.cholesky: The factorization could not be completed because the input is not positive-definite (the leading minor of order 5 is not positive-definite).
The default initial kernel params will be used instead.
 74%|███████▎  | 147/200 [00:14<00:08,  6.61it/s][W 2025-01-04 17:20:15,810] The optimization of kernel_params failed: 
linalg.cholesky: The factorization could not be completed because the input is not positive-definite (the leading minor of order 1 is not positive-definite).
The default initial kernel params will be used instead.
 76%

In [34]:
import joblib
import json

In [35]:
joblib.dump(final_model, "../logistic_regression/best_models/best_model_log_cros.pkl")

['../logistic_regression/best_models/best_model_log_cros.pkl']

In [36]:
np.save("../logistic_regression/best_models/best_features_log_cros.npy", best_features)

In [37]:
with open("../logistic_regression/best_models/best_params_log_cros.json", "w") as f:
    json.dump(best_params, f)

In [38]:
with open("../logistic_regression/best_models/best_model_log_cros_as_txt.txt", "w") as f:
    f.write(str(final_model))