In [23]:
# Setup

import numpy as np

# Ein Zufalls-Seed für Reproduzierbarkeit
np.random.seed(42)

from datetime import datetime
import pandas as pd
from scipy import stats
import sklearn as sklearn
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, classification_report, matthews_corrcoef
from sklearn.impute import KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from IPython.display import display, clear_output

In [24]:
# Daten importieren
data = pd.read_csv('data/diabetes.csv')

# Nullen durch NaN ersetzen
data["Glucose"] = data["Glucose"].replace(0, np.nan)
data["BloodPressure"] = data["BloodPressure"].replace(0, np.nan)
data["SkinThickness"] = data["SkinThickness"].replace(0, np.nan)
data["Insulin"] = data["Insulin"].replace(0, np.nan)
data["BMI"] = data["BMI"].replace(0, np.nan)
data["DiabetesPedigreeFunction"] = data["DiabetesPedigreeFunction"].replace(0, np.nan)
data["Age"] = data["Age"].replace(0, np.nan)

# NaN Werte imputieren
imputer = KNNImputer(n_neighbors=3)

data_imputed = pd.DataFrame(imputer.fit_transform(data))
data_imputed.columns = data.columns

In [25]:
def svm_func(random_state):
    # Input-Daten
    X = data_imputed[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']].values
    # Der Wert, der vorhergesagt werden soll
    y = data_imputed['Outcome'].values

    # Train-Test-Split als 70/30
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

    clf = make_pipeline(StandardScaler(), svm.SVC(C=300, kernel='poly', gamma='scale', degree=6, verbose=True))
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_test)

    #report = classification_report(y_test, predictions)
    #matrix = confusion_matrix(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='macro')
    mcc = matthews_corrcoef(y_test, predictions)
    
    return clf, f1, mcc

    #print(matrix)
    #print(report)

In [26]:
# 100 verschiedene Modelle trainieren und in einen DataFrame speichern
# Das kann ein paar Stunden dauern

results = {}

for i in np.arange(50):
    model, f1, mcc = svm_func(i)
    results[i] = [model, f1, mcc]
    clear_output()
    df_results = pd.DataFrame.from_dict(results, orient='index', columns=['model', 'f1', 'mcc'])
    display(df_results)

Unnamed: 0,model,f1,mcc
0,"(StandardScaler(), SVC(C=300, degree=6, kernel...",0.927233,0.858153
1,"(StandardScaler(), SVC(C=300, degree=6, kernel...",0.931811,0.870929
2,"(StandardScaler(), SVC(C=300, degree=6, kernel...",0.92377,0.851021
3,"(StandardScaler(), SVC(C=300, degree=6, kernel...",0.924114,0.858049
4,"(StandardScaler(), SVC(C=300, degree=6, kernel...",0.935218,0.876336
5,"(StandardScaler(), SVC(C=300, degree=6, kernel...",0.943668,0.890251
6,"(StandardScaler(), SVC(C=300, degree=6, kernel...",0.92971,0.861975
7,"(StandardScaler(), SVC(C=300, degree=6, kernel...",0.92783,0.862541
8,"(StandardScaler(), SVC(C=300, degree=6, kernel...",0.952422,0.907753
9,"(StandardScaler(), SVC(C=300, degree=6, kernel...",0.928679,0.864718


In [27]:
# DataFrame nach bestem F1-Score sortieren
df_results_sorted = df_results.sort_values(by=['f1', 'mcc'], ascending=False)

In [28]:
df_results.describe()

Unnamed: 0,f1,mcc
count,50.0,50.0
mean,0.931264,0.867328
std,0.009481,0.017997
min,0.914001,0.834452
25%,0.925343,0.856986
50%,0.929465,0.863937
75%,0.936458,0.87679
max,0.952422,0.907753
