In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from scipy.stats import kstest
import shap
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import plotly.express as px
import plotly.io as pio

In [3]:
# Load the prepared match table and parse the Date column into datetimes in one chain.
matches = (
    pd.read_csv("../../preparation_before_models/data/matches.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

In [None]:
def combine_player_columns(df):
    """Collapse matching ``player1_*`` / ``player2_*`` columns into ``diff_*`` columns.

    For every column named ``player1_<suffix>`` that has a ``player2_<suffix>``
    counterpart, a column ``diff_<suffix>`` holding ``player1 - player2`` is added
    and both source columns are removed. Columns without a counterpart are kept
    unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain ``player1_*`` / ``player2_*`` column pairs.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the paired columns replaced by their differences.
    """
    first_prefix = 'player1_'
    second_prefix = 'player2_'

    result = df.copy()
    paired_columns = []

    for first_col in df.columns:
        if not first_col.startswith(first_prefix):
            continue

        stem = first_col[len(first_prefix):]
        second_col = f'{second_prefix}{stem}'
        if second_col not in df.columns:
            continue

        result[f'diff_{stem}'] = df[first_col] - df[second_col]
        paired_columns += [first_col, second_col]

    # Only the columns that were actually folded into a difference are removed.
    return result.drop(columns=paired_columns)


# Replace every player1_/player2_ column pair with a single diff_ column.
matches = combine_player_columns(matches)

# Show the resulting column index so the new diff_ columns can be checked.
print("Columns after combining:", matches.columns, sep="\n")

In [5]:
# Columns removed from the modelling table, grouped by the kind of information they hold.
columns_to_drop = [
    # uncertainty columns
    'non_CO_uncertainty', 'CO_uncertainty',
    # match / tournament context
    "outdoor", "tournament_level", "best_of", "Round_Num",
    "Surface_Clay", "Surface_Grass", "Surface_Hard",
    # weather
    "temperature_2m", "relative_humidity_2m", "windspeed_10m", "apparent_temperature",
    # player differences that are not used
    "diff_right_handed", "diff_bet_odds",
    # ace / double-fault / second-serve-in averages
    "w_ace_avg", "l_ace_avg", "w_CO_ace_avg", "l_CO_ace_avg",
    "w_df_avg", "l_df_avg", "w_CO_df_avg", "l_CO_df_avg",
    "w_2ndIn_avg", "l_2ndIn_avg", "w_CO_2ndIn_avg", "l_CO_2ndIn_avg",
]
matches = matches.drop(columns=columns_to_drop)

In [6]:
# Hold out every match played in 2023 as the test set; all other years form the training set.
is_test_year = matches["Date"].dt.year == 2023
test_data = matches[is_test_year]
train_data = matches[~is_test_year]

In [7]:
# Columns that identify a match or hold the label, and therefore are not model inputs.
non_feature_columns = ['target', 'match_id', 'Date']

X_train, y_train = train_data.drop(columns=non_feature_columns), train_data['target']
X_test, y_test = test_data.drop(columns=non_feature_columns), test_data['target']

In [8]:
def scale_features(X):
    scaled_X = X.copy()
    for col in X.columns:
        if kstest(X[col], 'norm').pvalue > 0.05:
            
            scaler = StandardScaler()
        else:
            
            scaler = MinMaxScaler()
        scaled_X[col] = scaler.fit_transform(X[[col]])
    return scaled_X

In [9]:
X_train_scaled = scale_features(X_train)
X_test_scaled = scale_features(X_test)  

In [10]:
def feature_selection_with_shap(X, y, percent):
    """Keep the top ``percent`` percent of features ranked by mean absolute SHAP value.

    A logistic regression without intercept is fitted on ``X`` and ``y``, SHAP
    values are computed for every row of ``X``, and each feature's importance is
    the mean of its absolute SHAP values.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : array-like
        Target labels aligned with ``X``.
    percent : float
        Share of features to keep, in the interval (0, 100]. ``100`` keeps
        every feature.

    Returns
    -------
    tuple
        ``(X_selected, selected_features)``: the reduced frame and the
        positional indices of the kept columns, in ascending order.

    Raises
    ------
    ValueError
        If ``percent`` is not in (0, 100].
    """
    if not 0 < percent <= 100:
        raise ValueError(f"percent must be in (0, 100], got {percent}")

    model = LogisticRegression(max_iter=1000, fit_intercept=False)
    model.fit(X, y)
    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)
    feature_importance = np.abs(shap_values.values).mean(axis=0)

    importance_threshold = np.percentile(feature_importance, 100 - percent)
    # `>=` keeps a feature that sits exactly on the threshold. With a strict `>`
    # the least important feature was dropped even for percent=100.
    selected_features = np.where(feature_importance >= importance_threshold)[0]

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} with top {percent}%.")
    return X.iloc[:, selected_features], selected_features

In [11]:
def objective_logistic(trial, X_train, y_train):
    """Optuna objective: mean 5-fold cross-validated log loss of a logistic regression.

    The trial samples the solver, penalty, inverse regularization strength ``C``
    and class weighting. Solver/penalty combinations outside the table below are
    pruned instead of being evaluated.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X_train, y_train
        Training features and labels passed to ``cross_val_score``.

    Returns
    -------
    float
        Mean log loss across the folds (lower is better).

    Raises
    ------
    optuna.exceptions.TrialPruned
        If the sampled solver/penalty combination is not in the table.
    """
    # The sampling order is kept fixed so a seeded sampler draws the same sequence.
    solver = trial.suggest_categorical("solver", ["liblinear", "saga", "lbfgs"])
    penalty = trial.suggest_categorical("penalty", ["l1", "l2", None])
    C = trial.suggest_float("C", 1e-4, 1e4, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    # Penalties accepted for each solver.
    allowed_penalties = {
        "lbfgs": ("l2", None),
        "liblinear": ("l1", "l2"),
        "saga": ("l1", "l2", None),
    }
    if penalty not in allowed_penalties[solver]:
        raise optuna.exceptions.TrialPruned()

    estimator = LogisticRegression(
        C=C,
        penalty=penalty,
        solver=solver,
        max_iter=1000,
        class_weight=class_weight,
        random_state=42,
        fit_intercept=False,
    )
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    neg_log_losses = cross_val_score(estimator, X_train, y_train, cv=folds, scoring="neg_log_loss")
    # The scorer returns negated log loss; flip the sign so the study can minimize it.
    return -neg_log_losses.mean()

In [None]:
# Shares of top-ranked features to try; each one gets its own hyperparameter search.
percentages = [50, 65, 75, 85, 100]

# Best configuration found so far across all feature shares.
best_log_loss = float("inf")
best_features, best_params = None, None
best_num_features = 0

for percent in percentages:
    print(f"Testing top {percent}% features...")
    X_train_selected, selected_features = feature_selection_with_shap(X_train_scaled, y_train, percent)
    X_test_selected = X_test_scaled.iloc[:, selected_features]

    # A fresh, identically seeded study per feature share keeps the searches comparable.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(
        lambda trial: objective_logistic(trial, X_train_selected, y_train),
        n_trials=200,
        timeout=3600,
    )

    if study.best_value < best_log_loss:
        best_log_loss, best_params = study.best_value, study.best_params
        best_features = selected_features
        best_num_features = len(selected_features)

print(f"Best log loss: {best_log_loss}")
print(f"Best number of features: {best_num_features}")
print(f"Best parameters: {best_params}")


In [None]:
# `best_params` holds only the hyperparameters sampled by Optuna (solver, penalty,
# C, class_weight). The fixed settings used inside `objective_logistic` must be
# repeated here, otherwise the final model is fitted with an intercept and the
# default iteration limit and is not the configuration that was cross-validated.
final_model = LogisticRegression(
    **best_params,
    max_iter=1000,
    random_state=42,
    fit_intercept=False,
)

# Restrict both sets to the feature subset that gave the best cross-validated log loss.
X_train_final = X_train_scaled.iloc[:, best_features]
X_test_final = X_test_scaled.iloc[:, best_features]

final_model.fit(X_train_final, y_train)
# Accuracy on the held-out test set.
accuracy = final_model.score(X_test_final, y_test)

In [None]:
# Repeat the hyperparameter search on the winning feature subset so the study
# object is available for the diagnostic plots.
study_best_features = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_best_features.optimize(
    lambda trial: objective_logistic(trial, X_train_final, y_train),
    n_trials=200,
    timeout=3600,
)

In [None]:
# Hyperparameter importance for the study run on the best feature subset.
fig = optuna.visualization.plot_param_importances(study_best_features)

# Both axes share the same light grid styling.
light_grid = dict(showgrid=True, gridwidth=1, gridcolor='LightGrey')
fig.update_layout(
    title="",
    template="plotly_white",
    font=dict(size=14),
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=light_grid,
    yaxis=light_grid,
)

# Export the figure as a PDF before displaying it.
pio.write_image(fig, "param_importance_plot.pdf", format="pdf", scale=3)

fig.show()

In [None]:
# Log-loss level marked on the plot and used for the summary statistic below.
log_loss_threshold = 0.62

# Objective values of all completed trials, in ascending order.
completed_values = [
    t.value
    for t in study_best_features.trials
    if t.state == optuna.trial.TrialState.COMPLETE
]
values = np.sort(np.array(completed_values))

# Share of completed trials at or below the threshold, and the 95% quantile of the values.
probability_0_62 = np.sum(values <= log_loss_threshold) / len(values)
quantile_95 = np.quantile(values, 0.95)

# Empirical distribution function of the objective values.
fig = optuna.visualization.plot_edf(study_best_features)

fig.update_layout(
    title="",
    template="plotly_white",
    font=dict(size=14),
    # NOTE(review): the 0.96 tick looks like a hard-coded copy of probability_0_62; confirm.
    yaxis=dict(tickvals=[0.2, 0.4, 0.6, 0.8, 0.96]),
)
fig.add_vline(
    x=log_loss_threshold,
    line_dash="dash",
    line_color="red",
)

pio.write_image(fig, "edf_plot_cros.pdf", format="pdf", scale=3)

fig.show()

In [34]:
import joblib
import json

In [None]:
# Persist the fitted final model.
best_model_path = "../logistic_regression/best_models/best_model_log_cros.pkl"
joblib.dump(final_model, best_model_path)

In [36]:
# Persist the positional indices of the selected feature columns.
best_features_path = "../logistic_regression/best_models/best_features_log_cros.npy"
np.save(best_features_path, best_features)

In [37]:
# Persist the hyperparameters chosen by Optuna as JSON.
best_params_path = "../logistic_regression/best_models/best_params_log_cros.json"
with open(best_params_path, "w") as params_file:
    params_file.write(json.dumps(best_params))

In [38]:
# Store the estimator's text representation next to the pickled model.
model_text_path = "../logistic_regression/best_models/best_model_log_cros_as_txt.txt"
with open(model_text_path, "w") as text_file:
    print(final_model, file=text_file, end="")