In [6]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    matthews_corrcoef,
    mean_absolute_error,
    mean_squared_error,
)
from sklearn.model_selection import cross_val_score, train_test_split
import wittgenstein as lw

# Datei laden
print("Lade die CSV-Datei...")
csv_file_path = "C:\\Users\\jakob\\Documents\\Praktikum\\ML_Praktikum\\ml_praktikum_jagoetz_wkathari\\dataset\\clf_cat\\default-of-credit-card-clients.csv"
data = pd.read_csv(csv_file_path)

# Zielvariable setzen (Spalte: "y")
target_column = "y"

if target_column not in data.columns:
    raise ValueError(f"Die Datei enthält keine '{target_column}'-Spalte. Bitte überprüfen.")

# Features und Zielvariable trennen
X = data.drop(columns=[target_column])
y = data[target_column]

# Zielvariable in nominal umwandeln (falls numerisch)
if y.dtype != 'object':
    print(f"Konvertiere Zielvariable '{target_column}' in nominale Werte...")
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

# Diskretisierung der numerischen Features
print("Diskretisiere numerische Features...")
discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
X = pd.DataFrame(discretizer.fit_transform(X), columns=X.columns)

# Positive Klasse definieren
pos_class = 1  # Annahme: 1 ist die positive Klasse

# Anzahl der Wiederholungen
n_experiments = 10

# Matrizen für Ergebnisse
train_times = []
eval_times = []
accuracies = []
precisions = []
recalls = []
f1_scores = []
roc_aucs = []
mccs = []
maes = []
rmses = []

for i in range(n_experiments):
    print(f"\nStarte Experiment {i + 1}/{n_experiments} mit random_seed={i}...")

    # Train-Test-Split mit zufälligem Seed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=i)

    # RIPPER initialisieren mit optimierten Parametern
    ripper = lw.RIPPER(
        n_discretize_bins=10,  # Diskretisierungsintervall
        verbosity=0,          # Deaktiviert Ausgaben
    )

    # **Trainingszeit messen**
    start_time_train = time.time()
    ripper.fit(
        pd.concat([X_train, pd.Series(y_train, name=target_column)], axis=1),
        class_feat=target_column,
        pos_class=pos_class,  # Positive Klasse definieren
    )
    end_time_train = time.time()
    train_time = end_time_train - start_time_train

    # **Vorhersagen und Evaluierung**
    start_time_eval = time.time()
    y_pred = ripper.predict(X_test)
    end_time_eval = time.time()
    eval_time = end_time_eval - start_time_eval

    # Berechnung der Metriken
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    mcc = matthews_corrcoef(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    roc_auc = roc_auc_score(y_test, y_pred) if len(np.unique(y_test)) == 2 else None

    # Ergebnisse speichern
    train_times.append(train_time)
    eval_times.append(eval_time)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    mccs.append(mcc)
    maes.append(mae)
    rmses.append(rmse)
    roc_aucs.append(roc_auc if roc_auc is not None else None)

    # Ergebnisse dieses Experiments ausgeben
    print(f"Trainingszeit: {train_time:.4f} s, Evaluierungszeit: {eval_time:.4f} s")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

# **Matrizen erstellen**
train_times_matrix = np.array(train_times).reshape(1, -1)
eval_times_matrix = np.array(eval_times).reshape(1, -1)
metrics_matrix = {
    "Accuracy": np.array(accuracies).reshape(1, -1),
    "Precision": np.array(precisions).reshape(1, -1),
    "Recall": np.array(recalls).reshape(1, -1),
    "F1-Score": np.array(f1_scores).reshape(1, -1),
    "MCC": np.array(mccs).reshape(1, -1),
    "MAE": np.array(maes).reshape(1, -1),
    "RMSE": np.array(rmses).reshape(1, -1),
    "ROC AUC": np.array(roc_aucs).reshape(1, -1),
}

# **Ergebnisse ausgeben**
print("\n=== Ergebnis-Matrizen ===")
print("\nTrainingszeiten (s):")
print(train_times_matrix)

print("\nEvaluierungszeiten (s):")
print(eval_times_matrix)

print("\nMetriken:")
for metric_name, matrix in metrics_matrix.items():
    print(f"\n{metric_name}:")
    print(matrix)


Lade die CSV-Datei...
Konvertiere Zielvariable 'y' in nominale Werte...
Diskretisiere numerische Features...

Starte Experiment 1/10 mit random_seed=0...
Trainingszeit: 2.6784 s, Evaluierungszeit: 0.1335 s
Accuracy: 0.4881, Precision: 0.5000, Recall: 0.0009, F1-Score: 0.0018

Starte Experiment 2/10 mit random_seed=1...
Trainingszeit: 2.7130 s, Evaluierungszeit: 0.0899 s
Accuracy: 0.5068, Precision: 1.0000, Recall: 0.0009, F1-Score: 0.0018

Starte Experiment 3/10 mit random_seed=2...
Trainingszeit: 2.8067 s, Evaluierungszeit: 0.0832 s
Accuracy: 0.5014, Precision: 0.3333, Recall: 0.0009, F1-Score: 0.0018

Starte Experiment 4/10 mit random_seed=3...
Trainingszeit: 3.3108 s, Evaluierungszeit: 0.0869 s
Accuracy: 0.5039, Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000

Starte Experiment 5/10 mit random_seed=4...
Trainingszeit: 3.2151 s, Evaluierungszeit: 0.1095 s
Accuracy: 0.5112, Precision: 1.0000, Recall: 0.0014, F1-Score: 0.0028

Starte Experiment 6/10 mit random_seed=5...
Trainingsze

In [11]:
import pandas as pd
import numpy as np
import time
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    matthews_corrcoef,
    mean_absolute_error,
    mean_squared_error,
)
from sklearn.model_selection import train_test_split


# Datei laden
print("Lade die CSV-Datei...")
csv_file_path = "C:\\Users\\jakob\\Documents\\Praktikum\\ML_Praktikum\\ml_praktikum_jagoetz_wkathari\\dataset\\clf_num\\Higgs.csv"
data = pd.read_csv(csv_file_path)

# Zielvariable setzen (Spalte: "y")
target_column = "target"

if target_column not in data.columns:
    raise ValueError(f"Die Datei enthält keine '{target_column}'-Spalte. Bitte überprüfen.")

# Features und Zielvariable trennen
X = data.drop(columns=[target_column])
y = data[target_column]

# Zielvariable in nominal umwandeln (falls numerisch)
if y.dtype != 'object':
    print(f"Konvertiere Zielvariable '{target_column}' in nominale Werte...")
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

# Diskretisierung der numerischen Features
print("Diskretisiere numerische Features...")
discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
X = pd.DataFrame(discretizer.fit_transform(X), columns=X.columns)

# Positive Klasse definieren
pos_class = 1  # Annahme: 1 ist die positive Klasse

# Anzahl der Wiederholungen
n_experiments = 10

# Matrizen für Ergebnisse
train_times = []
eval_times = []
accuracies = []
precisions = []
recalls = []
f1_scores = []
roc_aucs = []
mccs = []
maes = []
rmses = []

for i in range(n_experiments):
    print(f"\nStarte Experiment {i + 1}/{n_experiments} mit random_seed={i}...")

    # Train-Test-Split mit zufälligem Seed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=i)

    # PART (Entscheidungsbaum mit Regelableitung)
    part_model = DecisionTreeClassifier(
        criterion="gini",  # Ähnlich wie PART: Gini-Index für Teilung
        max_depth=None,    # Keine Begrenzung der Tiefe
        min_samples_split=2,  # Minimaler Split
        random_state=i
    )

    # **Trainingszeit messen**
    start_time_train = time.time()
    part_model.fit(X_train, y_train)
    end_time_train = time.time()
    train_time = end_time_train - start_time_train

    # **Vorhersagen und Evaluierung**
    start_time_eval = time.time()
    y_pred = part_model.predict(X_test)
    end_time_eval = time.time()
    eval_time = end_time_eval - start_time_eval

    # Berechnung der Metriken
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    mcc = matthews_corrcoef(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    roc_auc = roc_auc_score(y_test, y_pred) if len(np.unique(y_test)) == 2 else None

    # Ergebnisse speichern
    train_times.append(train_time)
    eval_times.append(eval_time)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    mccs.append(mcc)
    maes.append(mae)
    rmses.append(rmse)
    roc_aucs.append(roc_auc if roc_auc is not None else None)

    # Ergebnisse dieses Experiments ausgeben
    print(f"Trainingszeit: {train_time:.4f} s, Evaluierungszeit: {eval_time:.4f} s")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

    # Regeln des Entscheidungsbaums ausgeben (optional)
    rules = export_text(part_model, feature_names=list(X.columns))
    print("\nRegeln des Entscheidungsbaums:")
    print(rules)

# **Matrizen erstellen**
train_times_matrix = np.array(train_times).reshape(1, -1)
eval_times_matrix = np.array(eval_times).reshape(1, -1)
metrics_matrix = {
    "Accuracy": np.array(accuracies).reshape(1, -1),
    "Precision": np.array(precisions).reshape(1, -1),
    "Recall": np.array(recalls).reshape(1, -1),
    "F1-Score": np.array(f1_scores).reshape(1, -1),
    "MCC": np.array(mccs).reshape(1, -1),
    "MAE": np.array(maes).reshape(1, -1),
    "RMSE": np.array(rmses).reshape(1, -1),
    "ROC AUC": np.array(roc_aucs).reshape(1, -1),
}

# **Ergebnisse ausgeben**
print("\n=== Ergebnis-Matrizen ===")
print("\nTrainingszeiten (s):")
print(train_times_matrix)

print("\nEvaluierungszeiten (s):")
print(eval_times_matrix)

print("\nMetriken:")
for metric_name, matrix in metrics_matrix.items():
    print(f"\n{metric_name}:")
    print(matrix)


Lade die CSV-Datei...
Konvertiere Zielvariable 'target' in nominale Werte...
Diskretisiere numerische Features...

Starte Experiment 1/10 mit random_seed=0...
Trainingszeit: 9.3809 s, Evaluierungszeit: 0.2590 s
Accuracy: 0.5626, Precision: 0.5619, Recall: 0.5627, F1-Score: 0.5623

Regeln des Entscheidungsbaums:
|--- m_bb <= 0.50
|   |--- jet_1_pt <= 0.50
|   |   |--- jet_2_pt <= 0.50
|   |   |   |--- m_wwbb <= 0.50
|   |   |   |   |--- jet_4_pt <= 0.50
|   |   |   |   |   |--- jet_3_pt <= 0.50
|   |   |   |   |   |   |--- missing_energy_magnitude <= 0.50
|   |   |   |   |   |   |   |--- lepton_pT <= 2.50
|   |   |   |   |   |   |   |   |--- lepton_pT <= 1.50
|   |   |   |   |   |   |   |   |   |--- lepton_pT <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- m_jlv <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 37
|   |   |   |   |   |   |   |   |   |   |--- m_jlv >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 20
