In [16]:
import time
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, matthews_corrcoef,
    mean_absolute_error, mean_squared_error
)

def run_experiment(data, n_repeats=10):
    # Säubere die Daten, entferne unnötige Spalten (z. B. 'Unnamed')
    data = data.loc[:, ~data.columns.str.contains('^Unnamed')]

    # Überprüfen der Datenstruktur
    print("Datensatz-Form:", data.shape)
    print("Spalten:", data.columns)

    # Features (X) und Zielvariable (y) definieren
    X = data.iloc[:, :-1].values  # alle Spalten außer der letzten
    y = data.iloc[:, -1].values   # letzte Spalte als Zielvariable

    accuracies = []
    training_times = []
    evaluation_times = []

    for i in range(n_repeats):
        # Splitte die Daten in Trainings- und Testdaten
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)

        # Trainingszeit messen
        start_train = time.time()
        model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        model.fit(X_train, y_train)
        end_train = time.time()
        training_time = end_train - start_train
        training_times.append(training_time)

        # Evaluationszeit messen
        start_eval = time.time()
        y_pred = model.predict(X_test)
        end_eval = time.time()
        evaluation_time = end_eval - start_eval
        evaluation_times.append(evaluation_time)

        # Metriken berechnen
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)

        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        cm = confusion_matrix(y_test, y_pred)

        # AUC-Berechnung
        proba = model.predict_proba(X_test)
        unique_classes = np.unique(y_test)
        if len(unique_classes) > 2:
            roc_auc = roc_auc_score(y_test, proba, multi_class='ovr')
        else:
            roc_auc = roc_auc_score(y_test, proba[:, 1])

        mcc = matthews_corrcoef(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)



    return np.array(accuracies), np.array(training_times), np.array(evaluation_times)


# Beispielaufruf
data_path = "C:\\Users\\jakob\\Documents\\Praktikum\\ML_Praktikum\\ml_praktikum_jagoetz_wkathari\\dataset\\clf_num\\Higgs.csv"
data = pd.read_csv(data_path)
accuracies, training_times, evaluation_times = run_experiment(data, n_repeats=10)

print("Accuracies:", accuracies)
print("Training Times:", training_times)
print("Evaluation Times:", evaluation_times)

# Berechnen von Mittelwert und Standardfehler (SE = std / sqrt(n))
def mean_and_se(values):
    mean_val = np.mean(values)
    se_val = np.std(values, ddof=1) / np.sqrt(len(values))
    return mean_val, se_val

mean_acc, se_acc = mean_and_se(accuracies)
mean_tt, se_tt = mean_and_se(training_times)
mean_et, se_et = mean_and_se(evaluation_times)

print("\nZusammenfassung (Mittelwert ± SE):")
print(f"Accuracy: {mean_acc:.4f} ± {se_acc:.4f}")
print(f"Training Time: {mean_tt:.4f}s ± {se_tt:.4f}s")
print(f"Evaluation Time: {mean_et:.4f}s ± {se_et:.4f}s")


Datensatz-Form: (940160, 25)
Spalten: Index(['lepton_pT', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude',
       'missing_energy_phi', 'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_2_pt',
       'jet_2_eta', 'jet_2_phi', 'jet_3_pt', 'jet_3_eta', 'jet_3_phi',
       'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv',
       'm_bb', 'm_wbb', 'm_wwbb', 'target'],
      dtype='object')


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Accuracies: [0.73705539 0.73674162 0.73680012 0.73706071 0.73690117 0.7372309
 0.73534824 0.7360077  0.73689585 0.73619916]
Training Times: [3.48956251 3.46041369 3.4388051  3.38158131 3.38998175 3.52903509
 3.51561236 3.78859925 3.67370248 3.77179384]
Evaluation Times: [0.06981015 0.06021237 0.05895448 0.07591462 0.06287837 0.06227303
 0.05713439 0.06990886 0.06632233 0.06914854]

Zusammenfassung (Mittelwert ± SE):
Accuracy: 0.7366 ± 0.0002
Training Time: 3.5439s ± 0.0472s
Evaluation Time: 0.0653s ± 0.0019s
