In [2]:
import sys
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

sys.path.append(str(Path().resolve().parent))  # one level up, to project/
import utils.data_loader as data_loader

In [6]:
# Fetch the training features and labels through the project's data loader.
(X_train, y_train) = data_loader.get_training_data()

In [9]:
# Hyperparameter search on the full training set would be slow, so draw a
# random 15000-sample subset for it. The remainder of the split is discarded.
X_sub, _, y_sub, _ = train_test_split(
    X_train,
    y_train,
    random_state=42,
    stratify=y_train,  # stratify on the labels; important for classification problems
    train_size=15000,
)

In [11]:
# Search space of the first randomized search: a linear kernel tuned over C
# only, and an RBF kernel tuned over C and gamma.
C_range = [0.1, 1, 10, 100, 1000]
gamma_range = [1, 0.1, 0.01, 0.001, 0.0001]
kernels = ['linear', 'rbf']  # not referenced by the search space below
param_dist = [
    {'kernel': ['linear'], 'C': C_range},
    {'kernel': ['rbf'], 'C': C_range, 'gamma': gamma_range},
]

# 25 sampled candidates, each scored with 5-fold cross-validation on the subset.
random_search = RandomizedSearchCV(
    SVC(random_state=42),
    param_dist,
    n_iter=25,
    refit=True,
    cv=5,
    verbose=10,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_sub, y_sub)

# Persist the per-candidate CV table so it can be inspected without re-running the search.
results = random_search.cv_results_
df_results = pd.DataFrame(results)
df_results.to_csv("random_search_results.csv", index=False)

print(f'The best parameter is {random_search.best_params_}')
print(f'The best score is {random_search.best_score_}')
print(f'The best estimator {random_search.best_estimator_}')

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5; 1/25] START C=1000, gamma=0.01, kernel=rbf.............................
[CV 4/5; 1/25] START C=1000, gamma=0.01, kernel=rbf.............................
[CV 5/5; 1/25] START C=1000, gamma=0.01, kernel=rbf.............................
[CV 3/5; 1/25] START C=1000, gamma=0.01, kernel=rbf.............................
[CV 1/5; 2/25] START C=10, gamma=1, kernel=rbf..................................
[CV 3/5; 2/25] START C=10, gamma=1, kernel=rbf..................................
[CV 2/5; 1/25] START C=1000, gamma=0.01, kernel=rbf.............................
[CV 2/5; 2/25] START C=10, gamma=1, kernel=rbf..................................
[CV 1/5; 1/25] END C=1000, gamma=0.01, kernel=rbf;, score=0.962 total time= 2.0min
[CV 4/5; 2/25] START C=10, gamma=1, kernel=rbf..................................
[CV 5/5; 1/25] END C=1000, gamma=0.01, kernel=rbf;, score=0.964 total time= 2.0min
[CV 4/5; 1/25] END C=1000, gamma=0.01, kern

In [5]:
# Reload the saved table of the first search and list the candidates from the
# lowest to the highest mean CV score, keeping only the columns of interest.
df = pd.read_csv("random_search_results.csv")
df_results = df
df_sorted = df.sort_values(by="mean_test_score", ascending=True)[
    ['param_C', 'param_kernel', 'param_gamma', 'mean_test_score']
]
df_sorted

Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_score
12,0.1,rbf,1.0,0.112333
18,1000.0,rbf,1.0,0.179867
24,100.0,rbf,1.0,0.179867
1,10.0,rbf,1.0,0.179867
5,0.1,rbf,0.0001,0.2
4,0.1,rbf,0.001,0.855533
14,1.0,rbf,0.1,0.889667
21,1000.0,rbf,0.1,0.895467
11,10.0,rbf,0.1,0.895467
20,100.0,rbf,0.1,0.895467


In [12]:
# Draw a new stratified 15000-sample subset for the second search. The seed (43)
# differs from the first split (42), so this is a different random draw;
# X_sub / y_sub from the first search are overwritten.
X_sub, _, y_sub, _ = train_test_split(
    X_train,
    y_train,
    random_state=43,
    stratify=y_train,
    train_size=15000,
)

In [14]:
# Second search: same space as before, but gamma = 1 is dropped from the grid
# because those candidates scored worst in the first run.
C_range = [0.1, 1, 10, 100, 1000]
gamma_range = [0.1, 0.01, 0.001, 0.0001]
param_dist = [
    {'kernel': ['linear'], 'C': C_range},
    {'kernel': ['rbf'], 'C': C_range, 'gamma': gamma_range},
]

# 20 sampled candidates, 5-fold CV each. Training scores are recorded as well
# (return_train_score=True) so over-fitting can be judged later.
random_search = RandomizedSearchCV(
    SVC(random_state=42),
    param_dist,
    n_iter=20,
    refit=True,
    cv=5,
    verbose=10,
    n_jobs=-1,
    random_state=42,
    return_train_score=True,
)
random_search.fit(X_sub, y_sub)

# Save into results/, which is where the plotting cell reads the table from.
# The directory is created on demand so a fresh checkout can run top to bottom.
results = random_search.cv_results_
df_results = pd.DataFrame(results)
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)
df_results.to_csv(results_dir / "random_search_results20.csv", index=False)

print(f'The best parameter is {random_search.best_params_}')
print(f'The best score is {random_search.best_score_}')
print(f'The best estimator {random_search.best_estimator_}')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5; 2/20] START C=10, gamma=0.0001, kernel=rbf.............................
[CV 5/5; 1/20] START C=0.1, gamma=0.0001, kernel=rbf............................
[CV 2/5; 1/20] START C=0.1, gamma=0.0001, kernel=rbf............................
[CV 4/5; 1/20] START C=0.1, gamma=0.0001, kernel=rbf............................
[CV 3/5; 1/20] START C=0.1, gamma=0.0001, kernel=rbf............................
[CV 1/5; 1/20] START C=0.1, gamma=0.0001, kernel=rbf............................
[CV 2/5; 2/20] START C=10, gamma=0.0001, kernel=rbf.............................
[CV 3/5; 2/20] START C=10, gamma=0.0001, kernel=rbf.............................
[CV 1/5; 2/20] END C=10, gamma=0.0001, kernel=rbf;, score=(train=0.919, test=0.912) total time= 3.8min
[CV 4/5; 2/20] START C=10, gamma=0.0001, kernel=rbf.............................
[CV 3/5; 2/20] END C=10, gamma=0.0001, kernel=rbf;, score=(train=0.918, test=0.909) total time= 3.9min
[CV

In [17]:
def save_rbf_heatmap(df_scores, score_col, colorbar_label, title, out_path):
    """Save a gamma-by-C heatmap of one score column of the RBF results.

    Parameters
    ----------
    df_scores : pd.DataFrame
        One row per (param_gamma, param_C) pair, containing ``score_col``.
    score_col : str
        Name of the column whose values colour the cells.
    colorbar_label : str
        Label shown next to the colour bar.
    title : str
        Figure title.
    out_path : str
        File the figure is written to (PNG, 300 dpi).
    """
    grid = df_scores.pivot(index="param_gamma", columns="param_C", values=score_col)

    fig, ax = plt.subplots()
    im = ax.imshow(grid.values, origin="lower", aspect="auto", cmap="Oranges")
    fig.colorbar(im, ax=ax, label=colorbar_label)
    # Tick labels are taken from the pivot's own axes so they always line up
    # with the plotted cells.
    ax.set_xticks(range(len(grid.columns)))
    ax.set_xticklabels([str(c) for c in grid.columns], rotation=45, ha="right")
    ax.set_yticks(range(len(grid.index)))
    ax.set_yticklabels([str(g) for g in grid.index])
    ax.set_xlabel("C")
    ax.set_ylabel("gamma")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_path, dpi=300)
    plt.close(fig)  # close the figure to free memory


# ---- load results ----
df = pd.read_csv("results/random_search_results20.csv")
df.columns = df.columns.str.strip()
# Training scores only exist when the search ran with return_train_score=True.
has_train_scores = "mean_train_score" in df.columns

# ---- LINEAR: C vs. Accuracy ----
df_lin = df[df["param_kernel"] == "linear"].copy()
if not df_lin.empty:
    df_lin = df_lin.dropna(subset=["param_C"]).sort_values("param_C")

    fig, ax = plt.subplots()
    ax.plot(df_lin["param_C"], df_lin["mean_test_score"], marker="o", label="Validation (CV mean)")
    if has_train_scores:
        ax.plot(df_lin["param_C"], df_lin["mean_train_score"], marker="o", label="Training (CV mean)")
    ax.set_xscale("log")
    ax.set_xlabel("C")
    ax.set_ylabel("Accuracy")
    ax.set_title("SVM Linear — Accuracy vs C")
    ax.legend()
    fig.tight_layout()
    fig.savefig("results/svm_linear_accuracy_vs_C.png", dpi=300)
    plt.close(fig)  # close the figure to free memory

# ---- RBF: Heatmaps ----
df_rbf = df[df["param_kernel"] == "rbf"].copy()
if not df_rbf.empty:
    # Aggregate only the score columns that are actually present; the training
    # column is optional, exactly as in the linear plot above.
    score_cols = ["mean_test_score"] + (["mean_train_score"] if has_train_scores else [])
    df_rbf = (
        df_rbf.dropna(subset=["param_C", "param_gamma"])
              .groupby(["param_gamma", "param_C"], as_index=False)[score_cols]
              .mean()
    )

    # --- Validation Heatmap ---
    save_rbf_heatmap(
        df_rbf,
        "mean_test_score",
        "Mean CV Accuracy (Validation)",
        "SVM RBF — Validation Accuracy",
        "results/svm_rbf_validation_heatmap.png",
    )

    # --- Training Heatmap ---
    if has_train_scores:
        save_rbf_heatmap(
            df_rbf,
            "mean_train_score",
            "Mean CV Accuracy (Training)",
            "SVM RBF — Training Accuracy",
            "results/svm_rbf_training_heatmap.png",
        )



In [18]:
# Rank the candidates of the second search from best to worst mean CV score and
# show the training score next to it to make over-fitting visible.
df_sorted = df.sort_values("mean_test_score", ascending=False)[
    ['param_C', 'param_kernel', 'param_gamma', 'mean_test_score', 'mean_train_score']
]
df_sorted

Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_score,mean_train_score
18,100.0,rbf,0.01,0.968533,1.0
8,1000.0,rbf,0.01,0.968533,1.0
3,1000.0,rbf,0.001,0.944667,1.0
12,10.0,rbf,0.001,0.936533,0.956667
19,1000.0,rbf,0.0001,0.9296,0.9811
2,0.1,linear,,0.9296,0.969517
15,100.0,rbf,0.0001,0.929533,0.94935
7,1.0,linear,,0.9164,0.994333
4,1.0,rbf,0.001,0.914467,0.9206
1,10.0,rbf,0.0001,0.911667,0.917717


In [22]:
# Best hyperparameter combination found by the most recent randomized search.
params = random_search.best_params_
params

{'kernel': 'rbf', 'gamma': 0.01, 'C': 1000}

In [24]:
# Final model. The search reported C = 1000, but C = 100 reached the identical
# mean CV score with gamma = 0.01 (see the ranking table above), so the smaller
# C is used to limit over-fitting.
# Fit on the complete training set, not just the search subset.
model = SVC(C=100, gamma=0.01, kernel='rbf', random_state=42).fit(X_train, y_train)

# Persist the fitted model to disk.
joblib.dump(model, "svm_model.pkl")