In [2]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from scipy.stats import kstest, norm

In [3]:
import warnings
warnings.resetwarnings()
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("default", ConvergenceWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [4]:
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")

In [5]:
X=matches.drop(columns=['target'])
y=matches['target']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

## standaryzujemy zmienne, które mają rozkład normalny, do identyfikacji użyjemy testu kolmogorova-smirnova

In [7]:
def identify_normal_columns(X):
    normal_columns = []
    for col in X.columns:
        column_values = X[col]
        standarized_column = (column_values - np.mean(column_values)) / np.std(column_values)
        stat, p = kstest(standarized_column, 'norm')
        if p > 0.05:  # Hipoteza zerowa: zmienna ma rozkład normalny
            normal_columns.append(col)
    return normal_columns

normal_columns = identify_normal_columns(X_train)
print(f"Columns with normal ds: {normal_columns}")

Columns with normal ds: ['diff_age', 'diff_elo', 'diff_surface_elo', 'diff_blended_elo']


In [8]:
# Standaryzacja tylko dla zmiennych o rozkładzie normalnym
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()


X_train_scaled[normal_columns] = scaler.fit_transform(X_train[normal_columns])
X_test_scaled[normal_columns] = scaler.transform(X_test[normal_columns])

In [7]:
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## funkcja do wyboru top% cech według shap

In [9]:
def feature_selection_with_shap(X, y, percent):
    model = SVC(kernel='linear', probability=True)
    model.fit(X, y)

    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)

    # Obliczenie średniej absolutnej ważności cech
    feature_importance = np.abs(shap_values.values).mean(axis=0)

    # Wybór cech na podstawie procentowego progu
    importance_threshold = np.percentile(feature_importance, 100 - percent)
    selected_features = np.where(feature_importance >= importance_threshold)[0]

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} with top {percent}%.")
    return X.iloc[:, selected_features], selected_features

## funkcja celu dla optuny

In [8]:
def objective_svm_logloss(trial, X_train, y_train):
    kernel = trial.suggest_categorical("kernel", ["linear", "rbf", "poly"])
    params = {
        "C": trial.suggest_float("C", 1e-3, 1e3, log=True),
        "kernel": kernel
    }
    if kernel in ["rbf", "poly"]:
        params["gamma"] = trial.suggest_float("gamma", 1e-4, 1e0, log=True)
    if kernel == "poly":
        params["degree"] = trial.suggest_int("degree", 2, 5)

    model = SVC(**params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss")

    return -scores.mean()

## Testowanie w pętli accuracy dla różnego % cech i optymalizacja

In [None]:
percentages = [ 50, 65, 75, 85, 100]  # Procent cech do wybrania
best_logloss = float("inf")
best_features = None
best_params = None
best_num_features = 0

for percent in percentages:
    print(f"Testing top {percent}% features...")
    
    # Jeśli procent wynosi 100%, nie przeprowadzamy selekcji
    if percent == 100:
        X_train_selected, selected_features = X_train_scaled, list(range(X_train_scaled.shape[1]))
    else:
        X_train_selected, selected_features = feature_selection_with_shap(X_train_scaled, y_train, percent)
    
    X_test_selected = X_test_scaled.iloc[:, selected_features]

    # Optuna optymalizacja dla aktualnego podzbioru cech
    def wrapped_objective(trial):
        return objective_svm_logloss(trial, X_train_selected, y_train)

    study_svm_logloss = optuna.create_study(direction="minimize")
    study_svm_logloss.optimize(wrapped_objective, n_trials=100, timeout=3600)

    if study_svm_logloss.best_value < best_logloss:
        best_logloss = study_svm_logloss.best_value
        best_features = selected_features
        best_params = study_svm_logloss.best_params
        best_num_features = len(selected_features)

print(f"Best logloss: {best_logloss}")
print(f"Best number of features: {best_num_features}")
print(f"Best parameters: {best_params}")

Testing top 50% features...


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
[I 2024-12-19 16:22:22,421] A new study created in memory with name: no-name-bdcbe084-562e-4bce-b95d-35067ca93f68


Selected 21 features out of 41 with top 50%.


  "C": trial.suggest_loguniform("C", 1e-3, 1e3),
  params["gamma"] = trial.suggest_loguniform("gamma", 1e-4, 1e0)
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


## Finalny model

In [None]:
X_train_final = X_train_scaled.iloc[:, best_features]
X_test_final = X_test_scaled.iloc[:, best_features]

final_model = SVC(**best_params)
final_model.fit(X_train_final, y_train)

y_pred_final = final_model.predict(X_test_final)
test_accuracy = accuracy_score(y_test, y_pred_final)
print(f"Final test accuracy: {test_accuracy}")

## shap bardzo dlugo zajmuje, więc zobacze na wstepie dla wszystkich cech jak sytuacja wygląda

In [13]:
def wrapped_objective(trial):
    return objective_svm_logloss(trial, X_train_scaled, y_train)
study_svm = optuna.create_study(direction="maximize", sampler=optuna.samplers.RandomSampler(seed=42))
study_svm.optimize(wrapped_objective, n_trials=200, timeout=3600)
best_params_svm = study_svm.best_params
print("Best parameters:", best_params_svm)

[I 2024-12-19 20:28:21,973] A new study created in memory with name: no-name-ef487da6-5f3e-4cb1-8811-27b30ecb2596
[I 2024-12-19 20:28:42,463] Trial 0 finished with value: 0.651512376364475 and parameters: {'kernel': 'rbf', 'C': 3.907967156822881, 'gamma': 0.00042079886696066364}. Best is trial 0 with value: 0.651512376364475.
[I 2024-12-19 20:29:35,941] Trial 1 finished with value: 0.5638345718239123 and parameters: {'kernel': 'poly', 'C': 4.0428727350273315, 'gamma': 0.06796578090758151, 'degree': 2}. Best is trial 0 with value: 0.651512376364475.
[I 2024-12-19 20:29:48,515] Trial 2 finished with value: 0.6528281658381592 and parameters: {'kernel': 'linear', 'C': 0.012329623163659839}. Best is trial 2 with value: 0.6528281658381592.
[I 2024-12-19 20:30:03,909] Trial 3 finished with value: 0.5009620253164556 and parameters: {'kernel': 'poly', 'C': 0.3905441275210791, 'gamma': 0.0014618962793704966, 'degree': 4}. Best is trial 2 with value: 0.6528281658381592.
[I 2024-12-19 20:30:43,281

In [None]:
final_model_svm = SVC(**best_params_svm)
final_model_svm.fit(X_train, y_train)
y_pred_svm = final_model_svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Final model accuracy on test data:", accuracy_svm)