In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from scipy.stats import kstest
import shap
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")

In [3]:
matches=matches.drop(columns=['Date','player1_bet_odds','player2_bet_odds',"w_ace_avg", "l_ace_avg", "w_CO_ace_avg", "l_CO_ace_avg","w_df_avg", "l_df_avg", "w_CO_df_avg", "l_CO_df_avg", "w_2ndIn_avg","l_2ndIn_avg","w_CO_2ndIn_avg", "l_CO_2ndIn_avg"])

In [4]:
X=matches.drop(columns=['target', 'match_id'])
y=matches['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [6]:
def scale_features(X):
    scaled_X = X.copy()
    for col in X.columns:
        if kstest(X[col], 'norm').pvalue > 0.05:
            # Normal distribution: StandardScaler
            scaler = StandardScaler()
        else:
            # Non-normal distribution: MinMaxScaler
            scaler = MinMaxScaler()
        scaled_X[col] = scaler.fit_transform(X[[col]])
    return scaled_X

In [7]:
X_train_scaled = scale_features(X_train)
X_test_scaled = scale_features(X_test)  

In [8]:
def feature_selection_with_shap(X, y, percent):
    model=LogisticRegression(max_iter=1000)
    model.fit(X, y)
    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)
    feature_importance = np.abs(shap_values.values).mean(axis=0)
    
    importance_threshold = np.percentile(feature_importance, 100-percent)  
    selected_features = np.where(feature_importance > importance_threshold)[0]

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} with top {percent}%.")
    return X.iloc[:, selected_features], selected_features

In [9]:
def objective_logistic(trial, X_train, y_train):
    solver = trial.suggest_categorical("solver", ["liblinear", "saga", "lbfgs"])
    penalty = trial.suggest_categorical("penalty", ["l1", "l2", None])
    C = trial.suggest_float("C", 1e-4, 1e4, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
    if (solver == "lbfgs" and penalty not in ["l2", None]) or \
       (solver == "liblinear" and penalty not in ["l1", "l2"]) or \
       (solver == "saga" and penalty not in ["l1", "l2", None]):
        raise optuna.exceptions.TrialPruned()
    model = LogisticRegression(
        C=C,
        penalty=penalty,
        solver=solver,
        max_iter=1000,
        class_weight=class_weight,
        random_state=42,
    )
    
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss")
    return -scores.mean()

In [10]:
percentages = [50, 65, 75, 85, 100]
best_log_loss = float("inf")
best_features = None
best_params = None
best_num_features = 0

for percent in percentages:
    print(f"Testing top {percent}% features...")
    X_train_selected, selected_features = feature_selection_with_shap(X_train_scaled, y_train, percent)
    X_test_selected = X_test_scaled.iloc[:, selected_features]

    def wrapped_objective(trial):
        return objective_logistic(trial, X_train_selected, y_train)

    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(wrapped_objective, n_trials=200, timeout=3600)

    if study.best_value < best_log_loss:
        best_log_loss = study.best_value
        best_features = selected_features
        best_params = study.best_params
        best_num_features = len(selected_features)

print(f"Best log loss: {best_log_loss}")
print(f"Best number of features: {best_num_features}")
print(f"Best parameters: {best_params}")


Testing top 50% features...


[I 2025-01-04 15:16:52,964] A new study created in memory with name: no-name-b97aa44b-12ec-42f7-a9a4-9666bef453d5
[I 2025-01-04 15:16:53,071] Trial 0 finished with value: 0.6954123227695579 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6954123227695579.
[I 2025-01-04 15:16:53,072] Trial 1 pruned. 


Selected 47 features out of 95 with top 50%.


[I 2025-01-04 15:16:53,568] Trial 2 finished with value: 0.6218106584699603 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6218106584699603.
[I 2025-01-04 15:17:16,069] Trial 3 finished with value: 0.6220280475492912 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6218106584699603.
[I 2025-01-04 15:17:16,470] Trial 4 finished with value: 0.6218106584699603 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6218106584699603.
[I 2025-01-04 15:17:16,471] Trial 5 pruned. 
[I 2025-01-04 15:17:29,180] Trial 6 finished with value: 0.622028549878696 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6218106584699603.
[I 2025-01-04 15:17:29,563] Trial 7 finished wi

Testing top 65% features...


[I 2025-01-04 15:20:08,973] A new study created in memory with name: no-name-fbe5ee7e-77a1-40c8-924e-8d2e8acdfac1
[I 2025-01-04 15:20:09,097] Trial 0 finished with value: 0.6947913770902534 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6947913770902534.
[I 2025-01-04 15:20:09,098] Trial 1 pruned. 


Selected 62 features out of 95 with top 65%.


[I 2025-01-04 15:20:09,713] Trial 2 finished with value: 0.6226187530525711 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6226187530525711.
[I 2025-01-04 15:20:41,021] Trial 3 finished with value: 0.6228241369237336 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6226187530525711.
[I 2025-01-04 15:20:41,592] Trial 4 finished with value: 0.6226187530525711 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6226187530525711.
[I 2025-01-04 15:20:41,593] Trial 5 pruned. 
[I 2025-01-04 15:20:59,975] Trial 6 finished with value: 0.6228247750225961 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6226187530525711.
[I 2025-01-04 15:21:00,589] Trial 7 finished w

Testing top 75% features...


[I 2025-01-04 15:24:35,373] A new study created in memory with name: no-name-20576a5d-cd27-4acb-95ac-aeef04cd4576
[I 2025-01-04 15:24:35,504] Trial 0 finished with value: 0.6945162862576566 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6945162862576566.
[I 2025-01-04 15:24:35,506] Trial 1 pruned. 


Selected 71 features out of 95 with top 75%.


[I 2025-01-04 15:24:36,165] Trial 2 finished with value: 0.6237003907566423 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6237003907566423.
[I 2025-01-04 15:25:15,048] Trial 3 finished with value: 0.6240039208660335 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6237003907566423.
[I 2025-01-04 15:25:15,666] Trial 4 finished with value: 0.6237003907566423 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6237003907566423.
[I 2025-01-04 15:25:15,667] Trial 5 pruned. 
[I 2025-01-04 15:25:38,802] Trial 6 finished with value: 0.6240048561535665 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6237003907566423.
[I 2025-01-04 15:25:39,549] Trial 7 finished w

Testing top 85% features...


[I 2025-01-04 15:34:26,555] A new study created in memory with name: no-name-b284eed9-6395-45f6-9478-f86158907376
[I 2025-01-04 15:34:26,720] Trial 0 finished with value: 0.6943067605354905 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6943067605354905.
[I 2025-01-04 15:34:26,722] Trial 1 pruned. 


Selected 80 features out of 95 with top 85%.


[I 2025-01-04 15:34:27,566] Trial 2 finished with value: 0.624677461255825 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.624677461255825.
[I 2025-01-04 15:35:13,441] Trial 3 finished with value: 0.6250467863156732 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.624677461255825.
[I 2025-01-04 15:35:14,273] Trial 4 finished with value: 0.624677461255825 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.624677461255825.
[I 2025-01-04 15:35:14,274] Trial 5 pruned. 
[I 2025-01-04 15:35:41,256] Trial 6 finished with value: 0.6250485284364 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.624677461255825.
[I 2025-01-04 15:35:42,146] Trial 7 finished with value

Testing top 100% features...


[I 2025-01-04 16:14:40,976] A new study created in memory with name: no-name-8cb969da-8ef3-4764-ace0-48e58b14dd26
[I 2025-01-04 16:14:41,157] Trial 0 finished with value: 0.6941167526527658 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6941167526527658.


Selected 94 features out of 95 with top 100%.


[I 2025-01-04 16:14:41,160] Trial 1 pruned. 
[I 2025-01-04 16:14:42,389] Trial 2 finished with value: 0.6262931935238777 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6262931935238777.
[I 2025-01-04 16:15:40,858] Trial 3 finished with value: 0.6268978847904559 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6262931935238777.
[I 2025-01-04 16:15:42,145] Trial 4 finished with value: 0.6262931935238777 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class_weight': None}. Best is trial 2 with value: 0.6262931935238777.
[I 2025-01-04 16:15:42,147] Trial 5 pruned. 
[I 2025-01-04 16:16:16,788] Trial 6 finished with value: 0.6269002981718399 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.12865252594826798, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.6262931935238777.
[

Best log loss: 0.6207826178260134
Best number of features: 47
Best parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.6985949090068287, 'class_weight': None}


In [26]:
final_model = LogisticRegression(**best_params)
X_train_final = X_train_scaled.iloc[:, best_features]
X_test_final = X_test_scaled.iloc[:, best_features]
final_model.fit(X_train_final, y_train)
accuracy = final_model.score(X_test_final, y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6624940954180444


In [12]:
def wrapped_objective(trial):
    return objective_logistic(trial, X_train_final, y_train)
study_best_features = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
study_best_features.optimize(wrapped_objective, n_trials=200, timeout=3600)

[I 2025-01-04 17:06:16,806] A new study created in memory with name: no-name-57536013-2b97-4e5b-b7ac-ebc245929099
[I 2025-01-04 17:06:16,940] Trial 0 finished with value: 0.6954123227695579 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.00029152036385288323, 'class_weight': None}. Best is trial 0 with value: 0.6954123227695579.
[I 2025-01-04 17:06:16,941] Trial 1 pruned. 
[I 2025-01-04 17:06:17,375] Trial 2 finished with value: 0.6218106584699603 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.4452048365748854, 'class_weight': None}. Best is trial 2 with value: 0.6218106584699603.
[I 2025-01-04 17:06:39,650] Trial 3 finished with value: 0.6220280475492912 and parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 3900.1768308022133, 'class_weight': None}. Best is trial 2 with value: 0.6218106584699603.
[I 2025-01-04 17:06:40,069] Trial 4 finished with value: 0.6218106584699603 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.00018841183049085134, 'class

In [13]:
optuna.visualization.plot_optimization_history(study_best_features)

In [14]:
optuna.visualization.plot_slice(study_best_features)

In [15]:
optuna.visualization.plot_param_importances(study_best_features)

In [16]:
optuna.visualization.plot_rank(study_best_features)

In [17]:
optuna.visualization.plot_parallel_coordinate(study_best_features)

In [18]:
optuna.visualization.plot_edf(study_best_features)

In [19]:
optuna.visualization.plot_terminator_improvement(study_best_features)


plot_terminator_improvement is experimental (supported from v3.2.0). The interface can change in the future.


RegretBoundEvaluator is experimental (supported from v3.2.0). The interface can change in the future.


CrossValidationErrorEvaluator is experimental (supported from v3.2.0). The interface can change in the future.

 58%|█████▊    | 117/200 [00:09<00:09,  8.31it/s][W 2025-01-04 17:20:11,561] The optimization of kernel_params failed: 
linalg.cholesky: The factorization could not be completed because the input is not positive-definite (the leading minor of order 5 is not positive-definite).
The default initial kernel params will be used instead.
 74%|███████▎  | 147/200 [00:14<00:08,  6.61it/s][W 2025-01-04 17:20:15,810] The optimization of kernel_params failed: 
linalg.cholesky: The factorization could not be completed because the input is not positive-definite (the leading minor of order 1 is not positive-definite).
The default initial kernel params will be used instead.
 76%

In [20]:
import joblib
import json

In [21]:
joblib.dump(final_model, "../logistic_regression/best_models/best_model_log_cros.pkl")

['../logistic_regression/best_models/best_model_log_cros.pkl']

In [22]:
np.save("../logistic_regression/best_models/best_features_log_cros.npy", best_features)

In [23]:
with open("../logistic_regression/best_models/best_params_log_cros.json", "w") as f:
    json.dump(best_params, f)

In [25]:
with open("../logistic_regression/best_models/best_model_log_cros_as_txt.txt", "w") as f:
    f.write(str(final_model))