In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from lime.lime_tabular import LimeTabularExplainer

# Schritt 1: Daten laden und vorbereiten
file_path = "data/Obesity_Dataset_FE_WOBMI.xlsx"
data = pd.read_excel(file_path)

# Ziel- und Feature-Spalten definieren
target_column = 'Class'
X = data.drop(columns=[target_column])  # Features
y = data[target_column]  # Zielvariable

# Klassenlabels auf 0-basierte Indizes umstellen
y = y - y.min()

# Schritt 2: Daten in Training und Test aufteilen
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Daten skalieren
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Benutzerdefinierte Klassenlabels
class_names = ["Untergewicht", "Normalgewicht", "Übergewicht", "Adipositas"]
feature_names = ["Sex", "Age",	"Height", "Overweight_Obese_Family", "Consumption_of_Fast_Food", "Frequency_of_Consuming_Vegetables", "Number_of_Main_Meals_Daily",	"Food_Intake_Between_Meals", "Smoking",	"Liquid_Intake_Daily",	"Calculation_of_Calorie_Intake", "Physical_Excercise",	"Schedule_Dedicated_to_Technology",	"Type_of_Transportation_Used",	"FoodConsumption", "Activity"]


# Schritt 3: Feature-Auswahl
selector = SelectFromModel(LinearSVC(penalty="l2", dual=False, random_state=42)).fit(X_train, y_train)
selected_features = X.columns[selector.get_support()]
print("Ausgewählte Features:", selected_features)
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)

# Sicherstellen, dass Features ausgewählt wurden
if X_train.shape[1] == 0:
    raise ValueError("Keine Features nach der Auswahl übrig. Überprüfen Sie die Feature-Auswahl.")

# Schritt 4: Verschiedene Modelle definieren
models = {
    #'SVM': {
    #  'model': Pipeline([('classification', LinearSVC(dual=False, max_iter=1000, random_state=42))]),
    #  'params': {'classification__C': [0.01, 0.1, 1, 10]}
    #},
    'Random Forest': {
        'model': Pipeline([('classification', RandomForestClassifier(random_state=42))]),
        'params': {'classification__n_estimators': [50, 100, 200], 'classification__max_depth': [None, 10, 20]}
    },
    #'Logistic Regression': {
    #    'model': Pipeline([('classification', LogisticRegression(max_iter=500, random_state=42))]),
    #    'params': {'classification__C': [0.01, 0.1, 1, 10]}
    #},
    
}

# Schritt 5: Hyperparameter-Tuning und Training
best_models = {}
for name, config in models.items():
    grid_search = GridSearchCV(config['model'], config['params'], cv=5, scoring='accuracy', n_jobs=-1)
    best_models[name] = grid_search


def plot_learning_curves(model, X_train, y_train, X_dev, y_dev):
    train_fs, dev_fs = [], []
    for m in range(10, X_train.shape[0],10):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.best_estimator_.predict(X_train[:m])
        y_dev_predict = model.best_estimator_.predict(X_dev)
        train_fs.append(f1_score(y_train[:m], y_train_predict,average='weighted'))
        dev_fs.append(f1_score(y_dev, y_dev_predict, average='weighted'))
        #print("\nKlassifikationsbericht:\n", classification_report(y_dev, y_dev_predict, target_names=class_labels))
    plt.plot(train_fs, "r-+", linewidth=2, label="train")
    plt.plot(dev_fs, "b-", linewidth=3, label="dev")
    plt.legend()
    plt.xlabel('Index der Trainingsiteration')
    plt.ylabel('F-Score des Lerners')
    plt.show()  # Zeigt den Plot an


# Schritt 6: Ergebnisse der besten Modelle auswerten
#for name, grid_search in best_models.items():
    #plot_learning_curves(grid_search, X_train, y_train, X_test, y_test)

# Schritt 7: Modell evaluieren
for name, grid_search in best_models.items():
    #Pipeline definieren
    c = grid_search

    # Pipeline trainieren
    c.fit(X_train, y_train)

    # Wahrscheinlichkeiten für ein Testbeispiel vorhersagen
    idx = 1
    print(c.predict_proba([X_test[idx]]).round(3))

    # LIME Tabular Explainer initialisieren
    explainer = LimeTabularExplainer(
        training_data=X_train,  # Trainingsdaten als NumPy-Array
        feature_names=feature_names,  # Spaltennamen
        class_names=class_names,  # Klassenlabels
        mode='classification'
    )

    # Testbeispiel auswählen
    sample = X_test[idx]  # Zeile als NumPy-Array

    # Erklärung mit LIME für tabellarische Daten erzeugen
    exp = explainer.explain_instance(
    sample,
    c.predict_proba,
    num_features=5, # Anzahl der erklärbaren Features
    top_labels=2
    )

    # Ausgabe
    print('Document id: %d' % idx)
    print('Predicted class:', class_names[c.predict([X_test[idx]])[0]])
    print('True class:', class_names[y_test.iloc[idx]])

    # Welche beiden Labels wurden vorgeschlagen?
    top_labels= exp.available_labels()


    # Ausgabe textuell

    print ('Explanation for class %s' % class_names[top_labels[0]])
    print ('\n'.join(map(str, exp.as_list(top_labels[0]))))

    print ('Explanation for class %s' % class_names[top_labels[1]])
    print ('\n'.join(map(str, exp.as_list(top_labels[1]))))

    

Ausgewählte Features: Index(['Sex', 'Age', 'Number_of_Main_Meals_Daily', 'Smoking',
       'Calculation_of_Calorie_Intake', 'Physical_Excercise',
       'Type_of_Transportation_Used'],
      dtype='object')
[[0.    0.612 0.388 0.   ]]
Document id: 1
Predicted class: Normalgewicht
True class: Normalgewicht
Explanation for class Normalgewicht
('Age <= -0.82', 0.4116342741902539)
('-0.93 < Frequency_of_Consuming_Vegetables <= -0.19', 0.04583713655558833)
('-1.36 < Height <= 0.21', 0.033552306389893434)
('-1.10 < Number_of_Main_Meals_Daily <= 0.21', 0.027996545793167918)
('-1.52 < Overweight_Obese_Family <= 0.66', 0.0030420171323058093)
Explanation for class Übergewicht
('Age <= -0.82', -0.16674160498869184)
('-0.93 < Frequency_of_Consuming_Vegetables <= -0.19', 0.04901520693182352)
('-1.36 < Height <= 0.21', 0.02750351852928668)
('-1.52 < Overweight_Obese_Family <= 0.66', -0.005661832576923768)
('-1.10 < Number_of_Main_Meals_Daily <= 0.21', 0.0040734446600769825)
