In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from scipy.stats import kstest
import shap
import joblib
import json
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from sklearn.metrics import accuracy_score
import plotly.io as pio

In [2]:
# Load the prepared match-level dataset (path is relative to this notebook's folder).
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")

In [None]:
# Convert 'Date' to a datetime dtype; the year-based splits below rely on `.dt.year`.
matches['Date'] = pd.to_datetime(matches['Date'])
# Sanity check: preview the parsed values and print the resulting dtype.
print(matches['Date'].head())
print(matches['Date'].dtype)

In [None]:
def combine_player_columns(df):
    """Replace paired ``player1_*`` / ``player2_*`` columns by their difference.

    For every column ``player1_<suffix>`` that has a ``player2_<suffix>``
    counterpart, ``diff_<suffix> = player1_<suffix> - player2_<suffix>`` is
    written to the result and both source columns are removed. Columns
    without a counterpart are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every matched pair has been replaced by a
        single ``diff_*`` column.
    """
    first_prefix = 'player1_'
    result = df.copy()
    consumed = []  # source columns that have been folded into a diff_* column

    for first_col in df.columns:
        if not first_col.startswith(first_prefix):
            continue

        suffix = first_col[len(first_prefix):]
        second_col = f'player2_{suffix}'
        if second_col not in df.columns:
            continue

        result[f'diff_{suffix}'] = df[first_col] - df[second_col]
        consumed.extend([first_col, second_col])

    return result.drop(columns=consumed)


# Collapse every player1_*/player2_* pair into a single diff_* column.
matches = combine_player_columns(matches)


# Show the resulting column set so the new diff_* features can be checked by eye.
print("Columns after combining:")
print(matches.columns)

In [5]:
# Remove the columns that are not used as features by this model.
matches = matches.drop(
    columns=[
        'non_CO_uncertainty',
        'CO_uncertainty',
        "outdoor",
        "tournament_level",
        "best_of",
        "Round_Num",
        "Surface_Clay",
        "Surface_Grass",
        "Surface_Hard",
        "temperature_2m",
        "relative_humidity_2m",
        "windspeed_10m",
        "apparent_temperature",
        "diff_right_handed",
        "diff_bet_odds",
        "w_ace_avg",
        "l_ace_avg",
        "w_CO_ace_avg",
        "l_CO_ace_avg",
        "w_df_avg",
        "l_df_avg",
        "w_CO_df_avg",
        "l_CO_df_avg",
        "w_2ndIn_avg",
        "l_2ndIn_avg",
        "w_CO_2ndIn_avg",
        "l_CO_2ndIn_avg",
    ]
)

In [None]:
# Display the columns that remain after the drop above.
matches.columns

In [6]:
# Hold out calendar year 2023 as the test set; every other year is training data.
test_data = matches[matches["Date"].dt.year == 2023]
train_data = matches[matches["Date"].dt.year != 2023]

In [7]:
# Training features: everything except the label, the date and the match identifier.
X_train_data = train_data.drop(columns=['target','Date', 'match_id'])
# 'Date' is kept next to 'target' because the frames are re-joined later and
# rolling_origin_split() needs the date to build its year-based folds.
y_train_data = train_data[['target', 'Date']]

In [8]:
# Test features use the same column exclusions as the training features.
X_test_data  = test_data.drop(columns=['target','Date', 'match_id'])
# Only the label is needed for the test set (a Series, unlike y_train_data).
y_test_data= test_data['target']

In [9]:
def scale_features(X, reference=None):
    """Scale every column of ``X`` independently.

    For each column a Kolmogorov-Smirnov test decides the scaler: a p-value
    above 0.05 selects ``StandardScaler``, otherwise ``MinMaxScaler``. Both the
    test and the scaler fit are computed on ``reference`` so that a held-out
    set can be transformed with statistics learned from the training set.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to transform; it is copied, not modified.
    reference : pandas.DataFrame, optional
        Frame used to choose and fit the scalers. Defaults to ``X`` itself,
        which reproduces the previous fit-and-transform-on-the-same-data
        behaviour. Must contain every column of ``X``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with every column replaced by its scaled values.

    Raises
    ------
    KeyError
        If ``reference`` lacks a column that is present in ``X``.
    """
    fit_frame = X if reference is None else reference
    scaled_X = X.copy()
    for col in X.columns:
        # NOTE(review): kstest(..., 'norm') compares against the *standard*
        # normal N(0, 1), so an unstandardised column will almost always be
        # rejected and end up min-max scaled - confirm this is intended.
        if kstest(fit_frame[col], 'norm').pvalue > 0.05:
            scaler = StandardScaler()
        else:
            scaler = MinMaxScaler()
        # Fit on the reference data only, then apply to the frame being scaled.
        scaler.fit(fit_frame[[col]])
        scaled_X[col] = scaler.transform(X[[col]])
    return scaled_X

In [10]:
X_train_scaled = scale_features(X_train_data)

# The test set must be transformed with scalers chosen and fitted on the
# TRAINING data. Fitting on the test set itself puts train and test features
# on different scales (and possibly different scaler types per column) and
# leaks test-set statistics into the evaluation.
X_test_scaled = X_test_data.copy()
for col in X_train_data.columns:
    # Same per-column rule as scale_features(), evaluated on the training column.
    if kstest(X_train_data[col], 'norm').pvalue > 0.05:
        scaler = StandardScaler()
    else:
        scaler = MinMaxScaler()
    scaler.fit(X_train_data[[col]])
    X_test_scaled[col] = scaler.transform(X_test_data[[col]])

In [11]:
def rolling_origin_split(data, first_train_year=2018, last_train_year=2021):
    """Build expanding-window (rolling-origin) train/validation folds by year.

    For every ``year`` from ``first_train_year`` to ``last_train_year``
    (inclusive) one fold is produced: all rows dated up to and including
    ``year`` form the training part, the rows of ``year + 1`` form the
    validation part. Folds in which either part is empty are skipped, since a
    model can neither be fitted nor scored on them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime ``"Date"`` column.
    first_train_year : int, default 2018
        Last training year of the first fold.
    last_train_year : int, default 2021
        Last training year of the final fold.

    Returns
    -------
    list[tuple[pandas.DataFrame, pandas.DataFrame]]
        ``(training_data, val_data)`` pairs, both sorted by ``"Date"``.
    """
    data = data.sort_values(by="Date")
    # Extract the year once instead of twice per fold.
    years = data["Date"].dt.year

    splits = []
    for year in range(first_train_year, last_train_year + 1):
        training_data = data[years <= year]
        val_data = data[years == year + 1]

        if training_data.empty or val_data.empty:
            continue
        splits.append((training_data, val_data))

    return splits

In [12]:
def feature_selection_with_shap(X, y, percent):
    """Keep the top ``percent`` % of features ranked by mean absolute SHAP value.

    An intercept-free logistic regression is fitted on ``X``/``y``, SHAP values
    are computed for every row of ``X``, and each feature's importance is the
    mean of its absolute SHAP values. Features whose importance reaches the
    ``100 - percent`` percentile of all importances are kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; a ``"Date"`` column, if present, is ignored.
    y : array-like
        Target passed to ``LogisticRegression.fit``.
    percent : float
        Share of features to keep, in the range accepted by ``np.percentile``
        for ``100 - percent`` (i.e. 0..100).

    Returns
    -------
    tuple[pandas.DataFrame, numpy.ndarray]
        The reduced feature frame and the positional indices of the selected
        columns (relative to ``X`` without ``"Date"``).
    """
    if "Date" in X.columns:
        X = X.drop(columns=["Date"])

    model = LogisticRegression(max_iter=1000, fit_intercept=False)
    model.fit(X, y)

    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)

    # Mean |SHAP| per feature.
    # Assumes shap_values.values is (n_samples, n_features) - TODO confirm.
    feature_importance = np.abs(shap_values.values).mean(axis=0)

    importance_threshold = np.percentile(feature_importance, 100 - percent)
    # Use `>=` so a feature lying exactly on the cut-off is kept. With a strict
    # `>` the least important feature was always discarded, so percent=100
    # never returned the full feature set.
    selected_features = np.where(feature_importance >= importance_threshold)[0]

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} with top {percent}%.")
    return X.iloc[:, selected_features], selected_features

In [13]:
def objective_logistic_nested(trial, data):
    """Optuna objective: mean validation log loss over the rolling-origin folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``solver``, ``penalty``, ``C`` and ``class_weight``.
    data : pandas.DataFrame
        Features plus the ``"target"`` and ``"Date"`` columns.

    Returns
    -------
    float
        Average of the per-fold log losses.

    Raises
    ------
    optuna.exceptions.TrialPruned
        When the sampled penalty is not supported by the sampled solver.
    """
    # All four parameters are sampled before the validity check, in this
    # order, so the sampler's random stream is consumed identically each trial.
    solver = trial.suggest_categorical("solver", ["liblinear", "saga", "lbfgs"])
    penalty = trial.suggest_categorical("penalty", ["l1", "l2", None])
    C = trial.suggest_float("C", 1e-4, 1e4, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    supported_penalties = {
        "lbfgs": ("l2", None),
        "liblinear": ("l1", "l2"),
        "saga": ("l1", "l2", None),
    }
    if penalty not in supported_penalties[solver]:
        raise optuna.exceptions.TrialPruned()

    folds = rolling_origin_split(data)
    classifier = LogisticRegression(
        solver=solver,
        penalty=penalty,
        C=C,
        class_weight=class_weight,
        max_iter=1000,
        random_state=42,
        fit_intercept=False,
    )

    def _features_and_target(frame):
        # Order the rows by date, then separate the label from the features.
        ordered = frame.sort_values("Date")
        return ordered.drop(columns=["target", "Date"]), ordered["target"]

    fold_losses = []
    for fold_train, fold_val in folds:
        X_fit, y_fit = _features_and_target(fold_train)
        X_eval, y_eval = _features_and_target(fold_val)

        classifier.fit(X_fit, y_fit)

        # The second probability column is scored against the labels.
        positive_proba = classifier.predict_proba(X_eval)[:, 1]
        fold_losses.append(log_loss(y_eval, positive_proba))

    return np.mean(fold_losses)

In [None]:
# Sweep over the share of SHAP-ranked features to keep. For every share a
# separate Optuna study tunes the logistic regression; the share whose best
# trial has the lowest mean validation log loss wins.
percentages = [50, 65, 75, 85, 100]
best_logloss, best_features, best_params, best_num_features = float("inf"), None, None, 0

for percent in percentages:
    X_train_selected, selected_features = feature_selection_with_shap(
        X_train_scaled, y_train_data['target'], percent
    )
    # Put 'target' and 'Date' back next to the features for the rolling-origin folds.
    train_data_selected = pd.concat([X_train_selected, y_train_data], axis=1)

    def wrapped_objective(trial):
        """Bind the current feature subset to the shared objective."""
        return objective_logistic_nested(trial, train_data_selected)

    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.RandomSampler(seed=42),
    )
    study.optimize(wrapped_objective, n_trials=200, timeout=3600)

    if study.best_value < best_logloss:
        best_logloss, best_features = study.best_value, selected_features
        best_params, best_num_features = study.best_params, len(selected_features)

print(f"Best logloss: {best_logloss}")
print(f"Best number of features: {best_num_features}")
print(f"Best parameters: {best_params}")

In [None]:
# Refit the winning configuration on the full training set.
# fit_intercept=False is passed explicitly: every tuning trial was evaluated
# without an intercept, so the final model must use the same setting for
# `best_params` to describe the model that is actually trained.
final_model = LogisticRegression(
    **best_params,
    max_iter=1000,
    random_state=42,
    fit_intercept=False,
)
# Restrict train and test to the feature subset that won the sweep.
X_train_selected = X_train_scaled.iloc[:, best_features]
X_test_selected = X_test_scaled.iloc[:, best_features]
final_model.fit(X_train_selected, y_train_data['target'])

In [None]:
# Re-run the hyper-parameter search on the winning feature subset so that a
# dedicated study object is available for the diagnostic plots below.
train_data_selected = pd.concat([X_train_selected, y_train_data], axis=1)
def wrapped_objective(trial):
    # Bind the selected training frame to the shared objective.
    return objective_logistic_nested(trial, train_data_selected)
study_best_features = optuna.create_study(direction="minimize", sampler=optuna.samplers.RandomSampler(seed=42))
study_best_features.optimize(wrapped_objective, n_trials=200, timeout=3600)
# NOTE: this overwrites the `best_params` found by the percentage sweep.
best_params = study_best_features.best_params
print(f"Best parameters: {best_params}")

In [None]:
# Hyper-parameter importance plot for the study on the selected features,
# restyled on a white background and exported as PDF.
fig = optuna.visualization.plot_param_importances(study_best_features)
fig.update_layout(
    title="",
    template="plotly_white",
    font={"size": 14},
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis={"showgrid": True, "gridwidth": 1, "gridcolor": 'LightGrey'},
    yaxis={"showgrid": True, "gridwidth": 1, "gridcolor": 'LightGrey'},
)

fig.write_image("param_importance_plot_nested.pdf", format="pdf", scale=3)

fig.show()

In [None]:
# Empirical distribution of the objective values, with a dashed red line at
# the 80 % quantile of the completed trials.
values = np.sort(
    np.array([
        t.value
        for t in study_best_features.trials
        if t.state == optuna.trial.TrialState.COMPLETE
    ])
)
quantile_80 = np.quantile(values, 0.80)

fig = optuna.visualization.plot_edf(study_best_features)
fig.update_layout(title="", template="plotly_white", font={"size": 14})
fig.add_vline(x=quantile_80, line_dash="dash", line_color="red")

fig.write_image("edf_plot_nested.pdf", format="pdf", scale=3)

fig.show()

In [21]:
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Objective value as a function of C (log axis), one line per solver.
df = study_best_features.trials_dataframe()
df["params_C"] = pd.to_numeric(df["params_C"], errors="coerce")
df_sorted = df.sort_values(by=["params_solver", "params_C"]).copy()

fig = go.Figure()

for solver in df_sorted["params_solver"].unique():
    # Trials of this solver that have both a C and an objective value, ordered by C.
    df_solver = (
        df_sorted.loc[df_sorted["params_solver"] == solver]
        .dropna(subset=["params_C", "value"])
        .sort_values(by="params_C")
    )

    fig.add_trace(
        go.Scatter(
            x=df_solver["params_C"],
            y=df_solver["value"],
            mode="lines+markers",
            name=solver,
            line={"width": 2},
            marker={"size": 6},
        )
    )

fig.update_layout(
    title="",
    xaxis={"title": "C", "type": "log", "showgrid": True, "gridcolor": 'LightGrey'},
    yaxis={"title": "Objective value", "showgrid": True, "gridcolor": 'LightGrey'},
    template="plotly_white",
    font={"size": 14},
    plot_bgcolor='white',
    paper_bgcolor='white',
)
fig.write_image("param_rank_plot_nested_solver_C.pdf", format="pdf", scale=10)

fig.show()

In [None]:
# Objective value as a function of C (log axis), one line per penalty.
df = study_best_features.trials_dataframe()
df["params_C"] = pd.to_numeric(df["params_C"], errors="coerce")

# Missing penalty entries are relabelled with the string "None" so they form
# their own group and get a legend entry.
df["params_penalty"] = df["params_penalty"].fillna("None")
df_sorted = df.sort_values(by=["params_penalty", "params_C"]).copy()

fig = go.Figure()

for penalty in df_sorted["params_penalty"].unique():
    # Trials with this penalty that have both a C and an objective value, ordered by C.
    df_penalty = (
        df_sorted.loc[df_sorted["params_penalty"] == penalty]
        .dropna(subset=["params_C", "value"])
        .sort_values(by="params_C")
    )

    fig.add_trace(
        go.Scatter(
            x=df_penalty["params_C"],
            y=df_penalty["value"],
            mode="lines+markers",
            name=penalty,
            line={"width": 2},
            marker={"size": 6},
        )
    )

fig.update_layout(
    title="",
    xaxis={"title": "C", "type": "log", "showgrid": True, "gridcolor": 'LightGrey'},
    yaxis={"title": "Objective value", "showgrid": True, "gridcolor": 'LightGrey'},
    template="plotly_white",
    font={"size": 14},
    plot_bgcolor='white',
    paper_bgcolor='white',
)
fig.write_image("param_rank_plot_nested_penalty_C.pdf", format="pdf", scale=10)

fig.show()

In [None]:
# Objective value as a function of C (log axis), one line per class_weight setting.
df = study_best_features.trials_dataframe()
df["params_C"] = pd.to_numeric(df["params_C"], errors="coerce")
# Missing class_weight entries are relabelled with the string "None" so they
# form their own group and get a legend entry.
df["params_class_weight"] = df["params_class_weight"].fillna("None")
df_sorted = df.sort_values(by=["params_class_weight", "params_C"]).copy()

fig = go.Figure()

for class_weight in df_sorted["params_class_weight"].unique():
    # Trials with this setting that have both a C and an objective value, ordered by C.
    df_class_weight = (
        df_sorted.loc[df_sorted["params_class_weight"] == class_weight]
        .dropna(subset=["params_C", "value"])
        .sort_values(by="params_C")
    )

    fig.add_trace(
        go.Scatter(
            x=df_class_weight["params_C"],
            y=df_class_weight["value"],
            mode="lines+markers",
            name=class_weight,
            line={"width": 2},
            marker={"size": 6},
        )
    )

fig.update_layout(
    title="",
    xaxis={"title": "C", "type": "log", "showgrid": True, "gridcolor": 'LightGrey'},
    yaxis={"title": "Objective value", "showgrid": True, "gridcolor": 'LightGrey'},
    template="plotly_white",
    font={"size": 14},
    plot_bgcolor='white',
    paper_bgcolor='white',
)
fig.write_image("param_rank_plot_nested_class_C.pdf", format="pdf", scale=10)

fig.show()

In [59]:
def objective_logistic_nested_improve(trial, data):
    """Optuna objective over a narrowed search space (l2 penalty, C in [0.6, 1.5]).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``solver``, ``penalty``, ``C`` and ``class_weight``.
    data : pandas.DataFrame
        Features plus the ``"target"`` and ``"Date"`` columns.

    Returns
    -------
    float
        Mean log loss over the folds returned by ``rolling_origin_split``.

    Raises
    ------
    optuna.exceptions.TrialPruned
        When the sampled penalty is not listed for the sampled solver.
    """
    # Sampling order is kept fixed so the sampler's random stream is used identically.
    solver = trial.suggest_categorical("solver", ["saga", "lbfgs"])
    penalty = trial.suggest_categorical("penalty", [ "l2"])
    C = trial.suggest_float("C", 0.6, 1.5, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    allowed_penalties = {"lbfgs": ["l2", None], "saga": ["l2", None]}
    if penalty not in allowed_penalties[solver]:
        raise optuna.exceptions.TrialPruned()

    folds = rolling_origin_split(data)
    estimator = LogisticRegression(
        solver=solver,
        penalty=penalty,
        C=C,
        class_weight=class_weight,
        max_iter=1000,
        random_state=42,
        fit_intercept=False,
    )

    def _fold_log_loss(fit_frame, eval_frame):
        # Fit on one fold's training rows and score its validation rows.
        fit_frame = fit_frame.sort_values("Date")
        eval_frame = eval_frame.sort_values("Date")

        estimator.fit(fit_frame.drop(columns=["target", "Date"]), fit_frame["target"])
        proba = estimator.predict_proba(eval_frame.drop(columns=["target", "Date"]))[:, 1]
        return log_loss(eval_frame["target"], proba)

    return np.mean([_fold_log_loss(fit_part, eval_part) for fit_part, eval_part in folds])

In [13]:
# Reload the feature indices persisted by an earlier run of this notebook.
# NOTE(review): this replaces any `best_features` computed in the current
# session - confirm the file on disk matches the current feature matrix.
best_features=np.load("../logistic_regression/best_models/best_features_log_nested.npy")

In [None]:
# Rebuild the training matrix from `best_features` so this cell does not
# depend on whatever `X_train_selected` an earlier cell left in the kernel.
X_train_selected = X_train_scaled.iloc[:, best_features]
train_data_selected = pd.concat([X_train_selected, y_train_data], axis=1)

def wrapped_objective(trial):
    """Bind the selected training frame to the narrowed-search objective."""
    return objective_logistic_nested_improve(trial, train_data_selected)

study_best_features_improve = optuna.create_study(direction="minimize", sampler=optuna.samplers.RandomSampler(seed=42))
study_best_features_improve.optimize(wrapped_objective, n_trials=200, timeout=3600)
best_params_improve = study_best_features_improve.best_params
# fit_intercept=False mirrors the setting every trial was evaluated with;
# otherwise the refitted model differs from the one that was tuned.
final_model_improve = LogisticRegression(
    **best_params_improve,
    max_iter=1000,
    random_state=42,
    fit_intercept=False,
)
print(f"Best parameters: {best_params_improve}")

In [None]:
# Fit the refined model on the selected training features.
final_model_improve.fit(X_train_selected, y_train_data['target'])
# Select the test columns with `best_features`. `selected_features` only holds
# the subset from the LAST iteration of the percentage sweep, which does not
# have to be the winning subset the model was trained on.
X_test_selected = X_test_scaled.iloc[:, best_features]
y_pred_improve = final_model_improve.predict(X_test_selected)

In [None]:
# Empirical distribution of the objective values of the refined study, with a
# dashed red line at the 80 % quantile of the completed trials.
values = np.sort(
    np.array([
        t.value
        for t in study_best_features_improve.trials
        if t.state == optuna.trial.TrialState.COMPLETE
    ])
)
quantile_80 = np.quantile(values, 0.80)

fig = optuna.visualization.plot_edf(study_best_features_improve)
fig.update_layout(title="", template="plotly_white", font={"size": 14})
fig.add_vline(x=quantile_80, line_dash="dash", line_color="red")

fig.write_image("edf_plot_nested_2.pdf", format="pdf", scale=10)

fig.show()

In [39]:
def convert_categorical_params(study):
    """Return the study's trials dataframe with object columns cast to ``str``.

    Parameters
    ----------
    study
        Object exposing ``trials_dataframe()`` (e.g. an Optuna study).

    Returns
    -------
    pandas.DataFrame
        The trials dataframe; columns whose dtype is ``object`` are converted
        with ``astype(str)``, all other columns are left unchanged.
    """
    trials = study.trials_dataframe()
    object_columns = [name for name in trials.columns if trials[name].dtype == "object"]
    for name in object_columns:
        trials[name] = trials[name].astype(str)
    return trials

In [40]:
import plotly.express as px

In [None]:
# Persist the fitted baseline model to disk.
joblib.dump(final_model, "../logistic_regression/best_models/best_model_log_nested.pkl")

In [36]:
# Persist the indices of the WINNING feature subset. `selected_features` only
# holds the subset from the last iteration of the percentage sweep, so saving
# it would store the wrong columns under the "best_features" file name.
np.save("../logistic_regression/best_models/best_features_log_nested.npy", best_features)

In [37]:
# Store the current `best_params` dictionary as JSON.
with open("../logistic_regression/best_models/best_params_log_nested.json", "w") as f:
    json.dump(best_params, f)

In [38]:
# Also keep the model's string representation as a human-readable text file.
with open("../logistic_regression/best_models/best_model_log_nested_as_txt.txt", "w") as f:
    f.write(str(final_model))

In [39]:
# Persist the refined model in three forms: pickled estimator, the best
# parameters as JSON, and the estimator's string representation as text.
joblib.dump(final_model_improve, "../logistic_regression/best_models/best_model_log_nested_improve.pkl")
with open("../logistic_regression/best_models/best_params_log_nested_improve.json", "w") as f:
    json.dump(best_params_improve, f)
with open("../logistic_regression/best_models/best_model_log_nested_improve_as_txt.txt", "w") as f:
    f.write(str(final_model_improve))