In [25]:
# Setup

import numpy as np

# Ein Zufalls-Seed für Reproduzierbarkeit
np.random.seed(42)

from datetime import datetime
import pandas as pd
from scipy import stats
import sklearn as sklearn
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, classification_report
from sklearn.impute import KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from IPython.display import display, clear_output

In [26]:
# Daten importieren
data = pd.read_csv('data/diabetes.csv')

# Nullen durch NaN ersetzen
data["Glucose"] = data["Glucose"].replace(0, np.nan)
data["BloodPressure"] = data["BloodPressure"].replace(0, np.nan)
data["SkinThickness"] = data["SkinThickness"].replace(0, np.nan)
data["Insulin"] = data["Insulin"].replace(0, np.nan)
data["BMI"] = data["BMI"].replace(0, np.nan)
data["DiabetesPedigreeFunction"] = data["DiabetesPedigreeFunction"].replace(0, np.nan)
data["Age"] = data["Age"].replace(0, np.nan)

# NaN Werte imputieren
imputer = KNNImputer(n_neighbors=3)

data_imputed = pd.DataFrame(imputer.fit_transform(data))
data_imputed.columns = data.columns

In [27]:
def svm_func(random_state):
    # Input-Daten
    X = data_imputed[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']].values
    # Der Wert, der vorhergesagt werden soll
    y = data_imputed['Outcome'].values

    # Train-Test-Split als 70/30
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

    clf = make_pipeline(StandardScaler(), svm.SVC(C=300, kernel='poly', gamma='scale', degree=8, verbose=True))
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_test)

    #report = classification_report(y_test, predictions)
    #matrix = confusion_matrix(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='macro')
    
    return clf, f1

    #print(matrix)
    #print(report)

In [28]:
# 100 verschiedene Modelle trainieren und in einen DataFrame speichern
# Das kann ein paar Stunden dauern

results = {}

for i in np.arange(50):
    model, f1 = svm_func(i)
    results[i] = [model, f1]
    clear_output()
    df_results = pd.DataFrame.from_dict(results, orient='index', columns=['model', 'f1'])
    display(df_results)

Unnamed: 0,model,f1
0,"(StandardScaler(), SVC(C=300, degree=8, kernel...",0.90513
1,"(StandardScaler(), SVC(C=300, degree=8, kernel...",0.912051
2,"(StandardScaler(), SVC(C=300, degree=8, kernel...",0.889334
3,"(StandardScaler(), SVC(C=300, degree=8, kernel...",0.91664
4,"(StandardScaler(), SVC(C=300, degree=8, kernel...",0.901602
5,"(StandardScaler(), SVC(C=300, degree=8, kernel...",0.931808
6,"(StandardScaler(), SVC(C=300, degree=8, kernel...",0.905542
7,"(StandardScaler(), SVC(C=300, degree=8, kernel...",0.905745
8,"(StandardScaler(), SVC(C=300, degree=8, kernel...",0.934794
9,"(StandardScaler(), SVC(C=300, degree=8, kernel...",0.914124


In [29]:
# DataFrame nach bestem F1-Score sortieren
df_results_sorted = df_results.sort_values(by=['f1'], ascending=False)

In [30]:
df_results.describe()

Unnamed: 0,f1
count,50.0
mean,0.910707
std,0.010432
min,0.887695
25%,0.905238
50%,0.90929
75%,0.916365
max,0.938167
