# 1. Introduction

In this notebook, the IGANN-IT model is applied to 23 structured data sets and compared with interpretable and non-interpretable baselines.

# 2. Setup

In [79]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score, log_loss, average_precision_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, Ridge, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from interpret.glassbox import ExplainableBoostingRegressor, ExplainableBoostingClassifier
from pathlib import Path
import importlib
import time

import igann
importlib.reload(igann)
from igann import IGANN

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import os

# Root folder that holds one sub-folder per dataset. Set the IGANN_DATA_PATH environment
# variable to run the notebook on another machine; when it is unset (or empty) the
# original local directory is used, so existing setups keep working unchanged.
data_path = os.environ.get("IGANN_DATA_PATH") or "/Users/maximilianveitl/data/shared"
# Single seed reused for every train/test split and every model that accepts random_state.
seed = 42

# 3. Overview Datasets

In [3]:
import yaml

# Load the dataset registry (YAML) that describes every benchmark dataset.
with open("/Users/maximilianveitl/workspace/master/datasets-main/datasets.yml", "r") as file:
    data = yaml.safe_load(file)

# Build one overview record per dataset; datasets without any categorical
# feature are left out.
dataset_rows = []
for name, cfg in data["datasets"].items():
    num_features = cfg.get("numerical_features", [])
    cat_features = cfg.get("categorical_features", [])

    if len(cat_features) < 1:
        continue

    dataset_rows.append(dict(
        name=name,
        file_name=cfg.get("file_name"),
        task_type=cfg.get("task_type"),
        n_num_features=len(num_features),
        n_cat_features=len(cat_features),
        target=cfg.get("target"),
        separator=cfg.get("separator", ","),
        folder_name=cfg.get("folder_name", "shared"),
        num_features=num_features,
        cat_features=cat_features,
    ))

df_overview = pd.DataFrame(dataset_rows)

df_overview.head(23)

Unnamed: 0,name,file_name,task_type,n_num_features,n_cat_features,target,separator,folder_name,num_features,cat_features
0,stroke,healthcare-dataset-stroke-data.csv,binary,3,7,stroke,",",shared,"[age, avg_glucose_level, bmi]","[gender, hypertension, heart_disease, ever_mar..."
1,churn,WA_Fn-UseC_-Telco-Customer-Churn.csv,binary,3,16,Churn,",",shared,"[tenure, MonthlyCharges, TotalCharges]","[gender, SeniorCitizen, Partner, Dependents, P..."
2,fico,fico_heloc_dataset_v1.csv,binary,21,2,RiskPerformance,",",shared,"[ExternalRiskEstimate, MSinceOldestTradeOpen, ...","[MaxDelq2PublicRecLast12M, MaxDelqEver]"
3,bank,bank-full.csv,binary,6,9,y,;,shared,"[age, balance, day, campaign, pdays, previous]","[job, marital, education, default, housing, lo..."
4,adult,adult_census_income.csv,binary,6,8,income,",",shared,"[age, fnlwgt, education.num, capital.gain, cap...","[workclass, education, marital.status, occupat..."
5,airline,airline_train.csv,binary,18,4,satisfaction,",",shared,"[Age, Flight Distance, Inflight wifi service, ...","[Gender, Customer Type, Type of Travel, Class]"
6,college,college_data.csv,binary,4,6,will_go_to_college,",",shared,"[parent_age, parent_salary, house_area, averag...","[type_school, school_accreditation, gender, in..."
7,weather,weatherAUS.csv,binary,16,5,RainTomorrow,",",shared,"[MinTemp, MaxTemp, Rainfall, Evaporation, Suns...","[Location, WindGustDir, WindDir9am, WindDir3pm..."
8,compas,compas-scores-two-years.csv,binary,7,5,two_year_recid,",",shared,"[age, juv_fel_count, juv_misd_count, juv_other...","[sex, age_cat, race, c_charge_degree, c_charge..."
9,car,car.data,regression,13,11,price,",",shared,"[wheel-base, length, width, height, curb-weigh...","[symboling, make, fuel-type, aspiration, num-o..."


In [4]:
# Report the raw (rows, columns) shape of every dataset listed in the overview.
for ds_name, ds_file, ds_sep in zip(df_overview["name"], df_overview["file_name"], df_overview["separator"]):
    data = pd.read_csv(f"{data_path}/{ds_name}/{ds_file}", sep=ds_sep)
    print(f"Dataset: {ds_name}, Shape: {data.shape}")

Dataset: stroke, Shape: (5110, 12)
Dataset: churn, Shape: (7043, 21)
Dataset: fico, Shape: (10459, 24)
Dataset: bank, Shape: (45211, 17)
Dataset: adult, Shape: (32561, 15)
Dataset: airline, Shape: (103904, 25)
Dataset: college, Shape: (1000, 11)
Dataset: weather, Shape: (145460, 23)
Dataset: compas, Shape: (7214, 53)
Dataset: car, Shape: (201, 26)
Dataset: student, Shape: (649, 33)
Dataset: bike, Shape: (17379, 17)
Dataset: insurance, Shape: (1338, 7)
Dataset: crab, Shape: (3893, 9)
Dataset: diamond, Shape: (53943, 10)
Dataset: productivity, Shape: (1197, 15)
Dataset: diabetes, Shape: (442, 12)


# 4. Evaluation Regression Tasks

In [None]:
# Collector for the regression benchmark: the cells below append one row per (model, dataset) run.
regression_result_columns = ["model", "dataset", "mse", "r2", "train_duration"]
results_regression = pd.DataFrame(columns=regression_result_columns)

## 4.1 Lasso and Ridge Regression

In [6]:
# Lasso and Ridge baselines on the regression datasets.
# Linear models need numeric inputs, so the categorical features are one-hot encoded.
# All metrics are computed on the standardized target (train mean / train std).
datasets_filtered = df_overview[(df_overview["task_type"] == "regression") & (df_overview["name"] != "car")]

models = {
    "Lasso": Lasso(alpha=0.01),
    "Ridge": Ridge(alpha=0.01)
}

for idx, row in datasets_filtered.iterrows():
    # load data
    file_name = row["file_name"]
    separator = row["separator"]
    folder_name = row["name"]
    target = row["target"]
    file_path = f"{data_path}/{folder_name}/{file_name}"

    data = pd.read_csv(file_path, sep=separator)
    
    X = data.drop(columns=target)
    y = data[target]

    # Preprocessing
    # 1. Train-Test-Split (done first so every fitted transformer only sees training data)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # 2. Split features into numerical and categorical
    X_train_num = X_train[row["num_features"]]
    X_train_cat = X_train[row["cat_features"]]
    X_test_num = X_test[row["num_features"]]
    X_test_cat = X_test[row["cat_features"]]

    # 3. Replace empty strings and other chars in numerical features with np.nan
    X_train_num = X_train_num.replace({"": np.nan, " ": np.nan, "?": np.nan})
    X_test_num = X_test_num.replace({"": np.nan, " ": np.nan, "?": np.nan})

    # 4. Correct data types
    # NOTE(review): on object-dtype columns astype(str) turns NaN into the literal string
    # 'nan', in which case the categorical imputer below has nothing to impute — confirm
    # that treating missing categories as their own level is intended.
    X_train_num = X_train_num.astype(float)
    X_test_num = X_test_num.astype(float)
    X_train_cat = X_train_cat.astype(str)
    X_test_cat = X_test_cat.astype(str)

    # 5. Impute missing values (statistics are learned on the training split only)
    imputer_num = SimpleImputer(strategy='mean')
    imputer_cat = SimpleImputer(strategy='most_frequent')
    X_train_num = imputer_num.fit_transform(X_train_num)
    X_train_cat = imputer_cat.fit_transform(X_train_cat)
    X_test_num = imputer_num.transform(X_test_num)
    X_test_cat = imputer_cat.transform(X_test_cat)

    # 6. Scale numerical features and target and one-hot encode categorical features
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(X_train_num)
    X_test_num = scaler.transform(X_test_num)
    y_train_mean = y_train.mean()
    y_train_std = y_train.std()
    y_train = (y_train - y_train_mean) / y_train_std
    y_test = (y_test - y_train_mean) / y_train_std
    X_train_cat = pd.DataFrame(X_train_cat, columns=row["cat_features"])
    X_test_cat = pd.DataFrame(X_test_cat, columns=row["cat_features"])
    X_train_cat = pd.get_dummies(X_train_cat, drop_first=True)
    # Encode the test split with ALL of its levels and then keep exactly the training columns.
    # Using drop_first=True here would drop whichever level sorts first *within the test
    # split*; if that differs from the level dropped in training, reindex would re-create the
    # column filled with zeros and those rows would silently be encoded as the baseline level.
    X_test_cat = pd.get_dummies(X_test_cat)
    X_test_cat = X_test_cat.reindex(columns=X_train_cat.columns, fill_value=0)

    # 7. Concatenate numerical and categorical features
    # (both parts carry a fresh 0..n-1 index at this point, so axis=1 concat aligns row by row)
    X_train_num = pd.DataFrame(X_train_num, columns=row["num_features"])
    X_test_num = pd.DataFrame(X_test_num, columns=row["num_features"])

    X_train = pd.concat([X_train_num, X_train_cat], axis=1)
    X_test = pd.concat([X_test_num, X_test_cat], axis=1)

    # 8. Train Models (only the fit call is timed)
    for name, model in models.items():
        print(f"Training {name} on {folder_name} dataset...")
        start_time = time.time()
        model.fit(X_train, y_train)
        train_duration = time.time() - start_time
        y_pred = model.predict(X_test)

        # 9. Evaluate Models
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        results_regression.loc[len(results_regression)] = {
            'model': name,
            'dataset': folder_name,
            'mse': mse,
            'r2': r2,
            'train_duration': train_duration}

Training Lasso on student dataset...
Training Ridge on student dataset...
Training Lasso on bike dataset...
Training Ridge on bike dataset...
Training Lasso on insurance dataset...
Training Ridge on insurance dataset...
Training Lasso on crab dataset...
Training Ridge on crab dataset...
Training Lasso on diamond dataset...
Training Ridge on diamond dataset...
Training Lasso on productivity dataset...
Training Ridge on productivity dataset...
Training Lasso on diabetes dataset...
Training Ridge on diabetes dataset...


## 4.2 Decision Tree and Random Forest

In [7]:
# Tree-based regression baselines: DecisionTree and RandomForest.
# Categorical features are ordinal-encoded; levels unseen during training map to -1.
is_regression = df_overview["task_type"] == "regression"
is_not_car = df_overview["name"] != "car"
datasets_filtered = df_overview[is_regression & is_not_car]

models = dict(
    DecisionTree=DecisionTreeRegressor(max_depth=5, random_state=seed),
    RandomForest=RandomForestRegressor(n_estimators=100, max_depth=10, random_state=seed),
)

# Placeholder strings that stand for a missing value in raw numerical columns.
missing_markers = {"": np.nan, " ": np.nan, "?": np.nan}

for idx, row in datasets_filtered.iterrows():
    # Dataset metadata and location
    folder_name, target = row["name"], row["target"]
    num_cols, cat_cols = row["num_features"], row["cat_features"]
    file_path = f"{data_path}/{folder_name}/{row['file_name']}"

    data = pd.read_csv(file_path, sep=row["separator"])
    X, y = data.drop(columns=target), data[target]

    # 1. Hold out 25% of the rows for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # 2.-4. Select columns per type, map missing markers to NaN, cast dtypes
    X_train_num = X_train[num_cols].replace(missing_markers).astype(float)
    X_test_num = X_test[num_cols].replace(missing_markers).astype(float)
    X_train_cat = X_train[cat_cols].astype(str)
    X_test_cat = X_test[cat_cols].astype(str)

    # 5. Impute missing values; imputers are fitted on the training split only
    imputer_num = SimpleImputer(strategy='mean')
    imputer_cat = SimpleImputer(strategy='most_frequent')
    X_train_num = imputer_num.fit_transform(X_train_num)
    X_test_num = imputer_num.transform(X_test_num)
    X_train_cat = pd.DataFrame(imputer_cat.fit_transform(X_train_cat), columns=cat_cols)
    X_test_cat = pd.DataFrame(imputer_cat.transform(X_test_cat), columns=cat_cols)

    # 6a. Standardize numerical features
    scaler = StandardScaler()
    X_train_num = pd.DataFrame(scaler.fit_transform(X_train_num), columns=num_cols)
    X_test_num = pd.DataFrame(scaler.transform(X_test_num), columns=num_cols)

    # 6b. Standardize the target with training statistics
    y_train_mean, y_train_std = y_train.mean(), y_train.std()
    y_train = (y_train - y_train_mean) / y_train_std
    y_test = (y_test - y_train_mean) / y_train_std

    # 6c. Ordinal-encode categorical features
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    X_train_cat = pd.DataFrame(encoder.fit_transform(X_train_cat), columns=cat_cols)
    X_test_cat = pd.DataFrame(encoder.transform(X_test_cat), columns=cat_cols)

    # 7. Assemble the final design matrices
    X_train = pd.concat([X_train_num, X_train_cat], axis=1)
    X_test = pd.concat([X_test_num, X_test_cat], axis=1)

    # 8./9. Fit each model (timing the fit only) and record its test metrics
    for name, model in models.items():
        print(f"Training {name} on {folder_name} dataset...")
        start_time = time.time()
        model.fit(X_train, y_train)
        train_duration = time.time() - start_time
        y_pred = model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        results_regression.loc[len(results_regression)] = dict(
            model=name,
            dataset=folder_name,
            mse=mse,
            r2=r2,
            train_duration=train_duration,
        )

Training DecisionTree on student dataset...
Training RandomForest on student dataset...
Training DecisionTree on bike dataset...
Training RandomForest on bike dataset...
Training DecisionTree on insurance dataset...
Training RandomForest on insurance dataset...
Training DecisionTree on crab dataset...
Training RandomForest on crab dataset...
Training DecisionTree on diamond dataset...
Training RandomForest on diamond dataset...
Training DecisionTree on productivity dataset...
Training RandomForest on productivity dataset...
Training DecisionTree on diabetes dataset...
Training RandomForest on diabetes dataset...


## 4.3 EBM

In [8]:
# EBM baseline on the regression datasets.
# The categorical features are handed to the model as strings; no manual encoding is applied.

regression_mask = (df_overview["task_type"] == "regression") & (df_overview["name"] != "car")
datasets_filtered = df_overview[regression_mask]

# Placeholder strings that stand for a missing value in raw numerical columns.
missing_markers = {"": np.nan, " ": np.nan, "?": np.nan}

for idx, row in datasets_filtered.iterrows():
    # Dataset metadata and location
    folder_name, target = row["name"], row["target"]
    num_cols, cat_cols = row["num_features"], row["cat_features"]
    file_path = f"{data_path}/{folder_name}/{row['file_name']}"

    data = pd.read_csv(file_path, sep=row["separator"])
    X, y = data.drop(columns=target), data[target]

    # 1. Hold out 25% of the rows for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # 2.-4. Select columns per type, map missing markers to NaN, cast dtypes
    X_train_num = X_train[num_cols].replace(missing_markers).astype(float)
    X_test_num = X_test[num_cols].replace(missing_markers).astype(float)
    X_train_cat = X_train[cat_cols].astype(str)
    X_test_cat = X_test[cat_cols].astype(str)

    # 5. Impute missing values; imputers are fitted on the training split only
    imputer_num = SimpleImputer(strategy='mean')
    imputer_cat = SimpleImputer(strategy='most_frequent')
    X_train_num = imputer_num.fit_transform(X_train_num)
    X_test_num = imputer_num.transform(X_test_num)
    X_train_cat = pd.DataFrame(imputer_cat.fit_transform(X_train_cat), columns=cat_cols)
    X_test_cat = pd.DataFrame(imputer_cat.transform(X_test_cat), columns=cat_cols)

    # 6. Standardize numerical features and the target (training statistics only)
    scaler = StandardScaler()
    X_train_num = pd.DataFrame(scaler.fit_transform(X_train_num), columns=num_cols)
    X_test_num = pd.DataFrame(scaler.transform(X_test_num), columns=num_cols)
    y_train_mean, y_train_std = y_train.mean(), y_train.std()
    y_train = (y_train - y_train_mean) / y_train_std
    y_test = (y_test - y_train_mean) / y_train_std

    # 7. Assemble the final design matrices
    X_train = pd.concat([X_train_num, X_train_cat], axis=1)
    X_test = pd.concat([X_test_num, X_test_cat], axis=1)

    # 8. Fit the model; only the fit call is timed
    model = ExplainableBoostingRegressor(
        interactions=1,
        max_bins=64,
        learning_rate=0.1,
        max_leaves=3,
        min_samples_leaf=2,
        random_state=seed,
    )
    print(f"Training EBM on {folder_name} dataset...")
    start_time = time.time()
    model.fit(X_train, y_train)
    train_duration = time.time() - start_time
    y_pred = model.predict(X_test)

    # 9. Record the test metrics (on the standardized target)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results_regression.loc[len(results_regression)] = dict(
        model='EBM',
        dataset=folder_name,
        mse=mse,
        r2=r2,
        train_duration=train_duration,
    )

Training EBM on student dataset...
Training EBM on bike dataset...
Training EBM on insurance dataset...
Training EBM on crab dataset...
Training EBM on diamond dataset...
Training EBM on productivity dataset...
Training EBM on diabetes dataset...


## 4.4 IGANN

In [9]:
# IGANN (igann_it=False) on the regression datasets.
# The categorical features are handed to the model as strings; no manual encoding is applied.

regression_mask = (df_overview["task_type"] == "regression") & (df_overview["name"] != "car")
datasets_filtered = df_overview[regression_mask]

# Placeholder strings that stand for a missing value in raw numerical columns.
missing_markers = {"": np.nan, " ": np.nan, "?": np.nan}

for idx, row in datasets_filtered.iterrows():
    # Dataset metadata and location
    folder_name, target = row["name"], row["target"]
    num_cols, cat_cols = row["num_features"], row["cat_features"]
    file_path = f"{data_path}/{folder_name}/{row['file_name']}"

    data = pd.read_csv(file_path, sep=row["separator"])
    X, y = data.drop(columns=target), data[target]

    # 1. Hold out 25% of the rows for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # 2.-4. Select columns per type, map missing markers to NaN, cast dtypes
    X_train_num = X_train[num_cols].replace(missing_markers).astype(float)
    X_test_num = X_test[num_cols].replace(missing_markers).astype(float)
    X_train_cat = X_train[cat_cols].astype(str)
    X_test_cat = X_test[cat_cols].astype(str)

    # 5. Impute missing values; imputers are fitted on the training split only
    imputer_num = SimpleImputer(strategy='mean')
    imputer_cat = SimpleImputer(strategy='most_frequent')
    X_train_num = imputer_num.fit_transform(X_train_num)
    X_test_num = imputer_num.transform(X_test_num)
    X_train_cat = pd.DataFrame(imputer_cat.fit_transform(X_train_cat), columns=cat_cols)
    X_test_cat = pd.DataFrame(imputer_cat.transform(X_test_cat), columns=cat_cols)

    # 6. Standardize numerical features and the target (training statistics only)
    scaler = StandardScaler()
    X_train_num = pd.DataFrame(scaler.fit_transform(X_train_num), columns=num_cols)
    X_test_num = pd.DataFrame(scaler.transform(X_test_num), columns=num_cols)
    y_train_mean, y_train_std = y_train.mean(), y_train.std()
    y_train = (y_train - y_train_mean) / y_train_std
    y_test = (y_test - y_train_mean) / y_train_std

    # 7. Assemble the final design matrices
    X_train = pd.concat([X_train_num, X_train_cat], axis=1)
    X_test = pd.concat([X_test_num, X_test_cat], axis=1)

    # 8. Fit the model; only the fit call is timed
    print(f"Training IGANN on {folder_name} dataset...")
    model = IGANN(task="regression", n_hid=10, igann_it=False)
    start_time = time.time()
    model.fit(X_train, y_train)
    train_duration = time.time() - start_time
    y_pred = model.predict(X_test)

    # 9. Record the test metrics (on the standardized target)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results_regression.loc[len(results_regression)] = dict(
        model='IGANN',
        dataset=folder_name,
        mse=mse,
        r2=r2,
        train_duration=train_duration,
    )

Training IGANN on student dataset...
Training IGANN on bike dataset...
Training IGANN on insurance dataset...
Training IGANN on crab dataset...
Training IGANN on diamond dataset...
Training IGANN on productivity dataset...
Training IGANN on diabetes dataset...


## 4.5 IGANN-IT

In [10]:
# IGANN-IT on the regression datasets (IGANN with interaction_detection_method="rulefit").
# The categorical features are handed to the model as strings; no manual encoding is applied.
# All metrics are computed on the standardized target (train mean / train std).

datasets_filtered = df_overview[(df_overview["task_type"] == "regression") & (df_overview["name"] != "car")]

for idx, row in datasets_filtered.iterrows():
    # load data
    file_name = row["file_name"]
    separator = row["separator"]
    folder_name = row["name"]
    target = row["target"]
    file_path = f"{data_path}/{folder_name}/{file_name}"

    data = pd.read_csv(file_path, sep=separator)
    
    X = data.drop(columns=target)
    y = data[target]

    # Preprocessing
    # 1. Train-Test-Split (done first so every fitted transformer only sees training data)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # 2. Split features into numerical and categorical
    X_train_num = X_train[row["num_features"]]
    X_train_cat = X_train[row["cat_features"]]
    X_test_num = X_test[row["num_features"]]
    X_test_cat = X_test[row["cat_features"]]

    # 3. Replace empty strings and other chars in numerical features with np.nan
    X_train_num = X_train_num.replace({"": np.nan, " ": np.nan, "?": np.nan})
    X_test_num = X_test_num.replace({"": np.nan, " ": np.nan, "?": np.nan})

    # 4. Correct data types
    X_train_num = X_train_num.astype(float)
    X_test_num = X_test_num.astype(float)
    X_train_cat = X_train_cat.astype(str)
    X_test_cat = X_test_cat.astype(str)

    # 5. Impute missing values (statistics are learned on the training split only)
    imputer_num = SimpleImputer(strategy='mean')
    imputer_cat = SimpleImputer(strategy='most_frequent')
    X_train_num = imputer_num.fit_transform(X_train_num)
    X_train_cat = imputer_cat.fit_transform(X_train_cat)
    X_test_num = imputer_num.transform(X_test_num)
    X_test_cat = imputer_cat.transform(X_test_cat)

    # 6. Scale numerical features and target (categorical features stay as strings)
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(X_train_num)
    X_test_num = scaler.transform(X_test_num)
    y_train_mean = y_train.mean()
    y_train_std = y_train.std()
    y_train = (y_train - y_train_mean) / y_train_std
    y_test = (y_test - y_train_mean) / y_train_std

    # 7. Concatenate numerical and categorical features
    # (both parts carry a fresh 0..n-1 index at this point, so axis=1 concat aligns row by row)
    X_train_num = pd.DataFrame(X_train_num, columns=row["num_features"])
    X_test_num = pd.DataFrame(X_test_num, columns=row["num_features"])
    X_train_cat = pd.DataFrame(X_train_cat, columns=row["cat_features"])
    X_test_cat = pd.DataFrame(X_test_cat, columns=row["cat_features"])

    X_train = pd.concat([X_train_num, X_train_cat], axis=1)
    X_test = pd.concat([X_test_num, X_test_cat], axis=1)

    # 8. Train Model (only the fit call is timed)
    # The progress message names IGANN-IT so the log matches the 'model' label stored below.
    print(f"Training IGANN-IT on {folder_name} dataset...")
    model = IGANN(task="regression", n_hid=10, interaction_detection_method="rulefit")
    start_time = time.time()
    model.fit(X_train, y_train)
    train_duration = time.time() - start_time
    y_pred = model.predict(X_test)

    # 9. Evaluate Model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results_regression.loc[len(results_regression)] = {
        'model': 'IGANN-IT',
        'dataset': folder_name,
        'mse': mse,
        'r2': r2,
        'train_duration': train_duration}

Training IGANN on student dataset...
No feature combination found. Model does not capture interactions. Try different feature interaction detection method.
Training IGANN on bike dataset...
Training IGANN on insurance dataset...
Training IGANN on crab dataset...
Training IGANN on diamond dataset...
Training IGANN on productivity dataset...
Training IGANN on diabetes dataset...


In [11]:
# Display the collected regression results (one row per model/dataset run).
results_regression

Unnamed: 0,model,dataset,mse,r2,train_duration
0,Lasso,student,0.767566,0.199821,0.003276
1,Ridge,student,0.784711,0.181947,0.00255
2,Lasso,bike,0.591985,0.389836,0.029584
3,Ridge,bike,0.586052,0.395951,0.004884
4,Lasso,insurance,0.245697,0.763513,0.003799
5,Ridge,insurance,0.241802,0.767262,0.002962
6,Lasso,crab,0.485787,0.484257,0.007466
7,Ridge,crab,0.452826,0.519251,0.003008
8,Lasso,diamond,0.108228,0.887382,0.039186
9,Ridge,diamond,0.074707,0.922263,0.013417


## 4.6 Results Regression Tasks

In [None]:
def create_metric_df(metric, results=None):
    """Pivot a long results table into a model x dataset table for one metric.

    Parameters
    ----------
    metric : str
        Name of the value column to pivot (e.g. 'mse', 'r2', 'train_duration').
    results : pandas.DataFrame, optional
        Long-format results with at least the columns 'model', 'dataset' and
        ``metric``. Defaults to the notebook-level ``results_regression`` so
        existing calls keep working; pass another frame to reuse the helper
        for a different results table.

    Returns
    -------
    pandas.DataFrame
        Rows are models, columns are datasets, preceded by a 'mean' column
        holding the per-model average across the dataset columns. Repeated
        (model, dataset) entries are averaged by ``pivot_table``.
    """
    if results is None:
        results = results_regression
    pivot_df = results.pivot_table(index='model', columns='dataset', values=metric)
    # Remember the dataset columns before adding 'mean' so it is placed first.
    dataset_cols = [col for col in pivot_df.columns if col != 'mean']
    pivot_df['mean'] = pivot_df.mean(axis=1)
    return pivot_df[['mean'] + dataset_cols]

# One model x dataset table per recorded metric.
mse_df, r2_df, train_time_df = (
    create_metric_df(metric_name) for metric_name in ('mse', 'r2', 'train_duration')
)

In [13]:
# Display MSE per model and dataset (computed on the standardized target).
mse_df

dataset,mean,bike,crab,diabetes,diamond,insurance,productivity,student
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DecisionTree,0.460575,0.346287,0.506275,0.706907,0.067993,0.160075,0.564664,0.871822
EBM,0.362584,0.162454,0.447842,0.463721,0.040074,0.131755,0.542617,0.749625
IGANN,0.429819,0.393025,0.432301,0.466449,0.073215,0.250985,0.653038,0.73972
IGANN-IT,0.412374,0.370538,0.429943,0.466449,0.083083,0.143847,0.653038,0.73972
Lasso,0.48566,0.591985,0.485787,0.461463,0.108228,0.245697,0.738891,0.767566
RandomForest,0.352222,0.076867,0.440953,0.495727,0.019196,0.1568,0.461513,0.814498
Ridge,0.484018,0.586052,0.452826,0.469776,0.074707,0.241802,0.778256,0.784711


In [14]:
# Display R^2 per model and dataset.
r2_df

dataset,mean,bike,crab,diabetes,diamond,insurance,productivity,student
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DecisionTree,0.510148,0.643079,0.462506,0.224923,0.929249,0.845925,0.37422,0.091135
EBM,0.613903,0.832557,0.524542,0.49156,0.958301,0.873184,0.398653,0.218524
IGANN,0.544555,0.594905,0.541041,0.48857,0.923816,0.758423,0.27628,0.22885
IGANN-IT,0.561489,0.618083,0.543545,0.48857,0.913547,0.861545,0.27628,0.22885
Lasso,0.485711,0.389836,0.484257,0.494037,0.887382,0.763513,0.181135,0.199821
RandomForest,0.625376,0.920773,0.531856,0.456468,0.980025,0.849078,0.488536,0.150895
Ridge,0.487015,0.395951,0.519251,0.484921,0.922263,0.767262,0.13751,0.181947


In [15]:
# Display the fit time per model and dataset (seconds, measured with time.time()).
train_time_df

dataset,mean,bike,crab,diabetes,diamond,insurance,productivity,student
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DecisionTree,0.013839,0.017874,0.008155,0.002927,0.058551,0.002749,0.003563,0.003051
EBM,6.228575,4.802202,1.880723,1.693768,21.923022,1.168456,1.98141,10.150442
IGANN,2.269603,10.725619,0.986557,0.765108,1.887702,0.433337,0.90865,0.180249
IGANN-IT,4.911774,17.286643,2.49191,1.053885,8.639091,2.84588,1.383201,0.681804
Lasso,0.013115,0.029584,0.007466,0.00318,0.039186,0.003799,0.005316,0.003276
RandomForest,1.805201,2.475679,1.1106,0.257446,7.887241,0.285077,0.329367,0.290999
Ridge,0.004609,0.004884,0.003008,0.002117,0.013417,0.002962,0.003322,0.00255


# 5. Evaluation Classification Tasks

In [None]:
# Binary-classification datasets used in this section, with their shape and class balance.
excluded_datasets = ["adult", "airline", "weather", "compas"]
is_binary = df_overview["task_type"] == "binary"
is_included = ~df_overview["name"].isin(excluded_datasets)
datasets_filtered = df_overview[is_binary & is_included]

for idx, row in datasets_filtered.iterrows():
    ds_name, ds_target = row["name"], row["target"]
    file_path = f"{data_path}/{ds_name}/{row['file_name']}"
    data = pd.read_csv(file_path, sep=row["separator"])

    print(f"\nDataset: {ds_name}, Shape: {data.shape}")
    print(data[ds_target].value_counts())


Dataset: stroke, Shape: (5110, 12)
stroke
0    4861
1     249
Name: count, dtype: int64

Dataset: churn, Shape: (7043, 21)
Churn
No     5174
Yes    1869
Name: count, dtype: int64

Dataset: fico, Shape: (10459, 24)
RiskPerformance
Bad     5459
Good    5000
Name: count, dtype: int64

Dataset: bank, Shape: (45211, 17)
y
no     39922
yes     5289
Name: count, dtype: int64

Dataset: adult, Shape: (32561, 15)
income
<=50K    24720
>50K      7841
Name: count, dtype: int64

Dataset: airline, Shape: (103904, 25)
satisfaction
neutral or dissatisfied    58879
satisfied                  45025
Name: count, dtype: int64

Dataset: college, Shape: (1000, 11)
will_go_to_college
True     500
False    500
Name: count, dtype: int64

Dataset: weather, Shape: (145460, 23)
RainTomorrow
No     110316
Yes     31877
Name: count, dtype: int64

Dataset: compas, Shape: (7214, 53)
two_year_recid
0    3963
1    3251
Name: count, dtype: int64


In [109]:
# Collector for the classification benchmark: the cells below append one row per (model, dataset) run.
classification_result_columns = ["model", "dataset", "log_loss", "pr_auc", "f1", "train_duration"]
results_classification = pd.DataFrame(columns=classification_result_columns)

## 5.1 Logistic Regression (L1 and L2)

In [110]:
# Logistic Regression with L1 and L2 regularization (one-hot encoding needed)
# The minority class of the training split is treated as the positive class (label 1).
datasets_filtered = df_overview[(df_overview["task_type"] == "binary") & (df_overview["name"] != "adult") & (df_overview["name"] != "airline") & (df_overview["name"] != "weather") & (df_overview["name"] != "compas")]

models = {
    "LogisticRegression_L1": LogisticRegression(penalty='l1', solver='liblinear', C=1.0, max_iter=1000, random_state=seed),
    "LogisticRegression_L2": LogisticRegression(penalty='l2', solver='liblinear', C=1.0, max_iter=1000, random_state=seed),
}

for idx, row in datasets_filtered.iterrows():
    # load data
    file_name = row["file_name"]
    separator = row["separator"]
    folder_name = row["name"]
    target = row["target"]
    file_path = f"{data_path}/{folder_name}/{file_name}"

    data = pd.read_csv(file_path, sep=separator)
    
    X = data.drop(columns=target)
    y = data[target]

    # Preprocessing
    # 1. Train-Test-Split (done first so every fitted transformer only sees training data)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # 2. Split features into numerical and categorical
    X_train_num = X_train[row["num_features"]]
    X_train_cat = X_train[row["cat_features"]]
    X_test_num = X_test[row["num_features"]]
    X_test_cat = X_test[row["cat_features"]]

    # 3. Replace empty strings and other chars in numerical features with np.nan
    X_train_num = X_train_num.replace({"": np.nan, " ": np.nan, "?": np.nan})
    X_test_num = X_test_num.replace({"": np.nan, " ": np.nan, "?": np.nan})

    # 4. Correct data types
    # NOTE(review): on object-dtype columns astype(str) turns NaN into the literal string
    # 'nan', in which case the categorical imputer below has nothing to impute — confirm
    # that treating missing categories as their own level is intended.
    X_train_num = X_train_num.astype(float)
    X_test_num = X_test_num.astype(float)
    X_train_cat = X_train_cat.astype(str)
    X_test_cat = X_test_cat.astype(str)

    # 5. Impute missing values (statistics are learned on the training split only)
    imputer_num = SimpleImputer(strategy='mean')
    imputer_cat = SimpleImputer(strategy='most_frequent')
    X_train_num = imputer_num.fit_transform(X_train_num)
    X_train_cat = imputer_cat.fit_transform(X_train_cat)
    X_test_num = imputer_num.transform(X_test_num)
    X_test_cat = imputer_cat.transform(X_test_cat)

    # 6. Scale numerical features and one-hot encode categorical features
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(X_train_num)
    X_test_num = scaler.transform(X_test_num)
    X_train_cat = pd.DataFrame(X_train_cat, columns=row["cat_features"])
    X_test_cat = pd.DataFrame(X_test_cat, columns=row["cat_features"])
    X_train_cat = pd.get_dummies(X_train_cat, drop_first=True)
    # Encode the test split with ALL of its levels and then keep exactly the training columns.
    # Using drop_first=True here would drop whichever level sorts first *within the test
    # split*; if that differs from the level dropped in training, reindex would re-create the
    # column filled with zeros and those rows would silently be encoded as the baseline level.
    X_test_cat = pd.get_dummies(X_test_cat)
    X_test_cat = X_test_cat.reindex(columns=X_train_cat.columns, fill_value=0)

    # 7. Concatenate numerical and categorical features
    # (both parts carry a fresh 0..n-1 index at this point, so axis=1 concat aligns row by row)
    X_train_num = pd.DataFrame(X_train_num, columns=row["num_features"])
    X_test_num = pd.DataFrame(X_test_num, columns=row["num_features"])

    X_train = pd.concat([X_train_num, X_train_cat], axis=1)
    X_test = pd.concat([X_test_num, X_test_cat], axis=1)

    # 8. y_train and y_test to 0 and 1 (1 = minority class of the training split)
    minority_class = y_train.value_counts().idxmin()
    y_train = (y_train == minority_class).astype(int)
    y_test = (y_test == minority_class).astype(int)

    # 9. Train Models (only the fit call is timed)
    for name, model in models.items():
        print(f"Training {name} on {folder_name} dataset...")
        start_time = time.time()
        model.fit(X_train, y_train)
        train_duration = time.time() - start_time
        y_pred = model.predict(X_test)
        # Column 1 of predict_proba is the probability of the positive (minority) class.
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # 10. Evaluate Models
        logloss = log_loss(y_test, y_pred_proba)
        pr_auc = average_precision_score(y_test, y_pred_proba)
        f1 = f1_score(y_test, y_pred)
        results_classification.loc[len(results_classification)] = {
            'model': name,
            'dataset': folder_name,
            'log_loss': logloss,
            'pr_auc': pr_auc,
            'f1': f1,
            'train_duration': train_duration}

Training LogisticRegression_L1 on stroke dataset...
Training LogisticRegression_L2 on stroke dataset...
Training LogisticRegression_L1 on churn dataset...
Training LogisticRegression_L2 on churn dataset...
Training LogisticRegression_L1 on fico dataset...
Training LogisticRegression_L2 on fico dataset...
Training LogisticRegression_L1 on bank dataset...
Training LogisticRegression_L2 on bank dataset...
Training LogisticRegression_L1 on college dataset...
Training LogisticRegression_L2 on college dataset...


## 5.2 Decision Tree and Random Forest

In [111]:
# Decision Tree and Random Forest baselines on the binary classification datasets.
# Categorical features are ordinal-encoded in this cell (they are NOT one-hot encoded).
excluded_datasets = ["adult", "airline", "weather", "compas"]
datasets_filtered = df_overview[
    (df_overview["task_type"] == "binary")
    & ~df_overview["name"].isin(excluded_datasets)
]

models = {
    "DecisionTree": DecisionTreeClassifier(max_depth=5, random_state=seed),
    "RandomForest": RandomForestClassifier(n_estimators=100, max_depth=10, random_state=seed),
}

# Placeholder strings that stand for a missing value in numerical columns.
missing_tokens = {"": np.nan, " ": np.nan, "?": np.nan}

for idx, row in datasets_filtered.iterrows():
    # Load data
    file_name = row["file_name"]
    separator = row["separator"]
    folder_name = row["name"]
    target = row["target"]
    num_cols = row["num_features"]
    cat_cols = row["cat_features"]
    file_path = f"{data_path}/{folder_name}/{file_name}"

    data = pd.read_csv(file_path, sep=separator)

    X = data.drop(columns=target)
    y = data[target]

    # Preprocessing
    # 1. Train-test split (done first so every transformer below is fitted on the training part only)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # 2. Split features into numerical and categorical
    X_train_num = X_train[num_cols]
    X_train_cat = X_train[cat_cols]
    X_test_num = X_test[num_cols]
    X_test_cat = X_test[cat_cols]

    # 3. Replace empty strings and other placeholder chars in numerical features with np.nan
    X_train_num = X_train_num.replace(missing_tokens)
    X_test_num = X_test_num.replace(missing_tokens)

    # 4. Correct data types
    # NOTE(review): astype(str) may turn NaN into the literal string "nan", in which case the
    # categorical imputer in step 5 has nothing to impute and missing values act as their own
    # category -- confirm this is intended.
    X_train_num = X_train_num.astype(float)
    X_test_num = X_test_num.astype(float)
    X_train_cat = X_train_cat.astype(str)
    X_test_cat = X_test_cat.astype(str)

    # 5. Impute missing values (statistics are learned on the training split only)
    imputer_num = SimpleImputer(strategy='mean')
    imputer_cat = SimpleImputer(strategy='most_frequent')
    X_train_num = imputer_num.fit_transform(X_train_num)
    X_train_cat = imputer_cat.fit_transform(X_train_cat)
    X_test_num = imputer_num.transform(X_test_num)
    X_test_cat = imputer_cat.transform(X_test_cat)

    # 6. Scale numerical features and ordinal-encode categorical features
    #    (categories unseen during fit are encoded as -1)
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(X_train_num)
    X_test_num = scaler.transform(X_test_num)
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    X_train_cat = encoder.fit_transform(X_train_cat)
    X_test_cat = encoder.transform(X_test_cat)

    # 7. Concatenate numerical and categorical features
    #    (all four frames are rebuilt from arrays, so they share a fresh 0..n-1 index)
    X_train_num = pd.DataFrame(X_train_num, columns=num_cols)
    X_test_num = pd.DataFrame(X_test_num, columns=num_cols)
    X_train_cat = pd.DataFrame(X_train_cat, columns=cat_cols)
    X_test_cat = pd.DataFrame(X_test_cat, columns=cat_cols)

    X_train = pd.concat([X_train_num, X_train_cat], axis=1)
    X_test = pd.concat([X_test_num, X_test_cat], axis=1)

    # 8. Binarize the target: the minority class of the training split becomes the positive class (1)
    minority_class = y_train.value_counts().idxmin()
    y_train = (y_train == minority_class).astype(int)
    y_test = (y_test == minority_class).astype(int)

    # 9. Train models
    for name, model in models.items():
        print(f"Training {name} on {folder_name} dataset...")
        # perf_counter is monotonic, so the duration is unaffected by system clock adjustments
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        train_duration = time.perf_counter() - start_time
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # 10. Evaluate models
        # NOTE(review): each run appends new rows; re-running this cell without resetting
        # results_classification presumably leaves duplicate (model, dataset) rows.
        logloss = log_loss(y_test, y_pred_proba)
        pr_auc = average_precision_score(y_test, y_pred_proba)
        f1 = f1_score(y_test, y_pred)
        results_classification.loc[len(results_classification)] = {
            'model': name,
            'dataset': folder_name,
            'log_loss': logloss,
            'pr_auc': pr_auc,
            'f1': f1,
            'train_duration': train_duration}

Training DecisionTree on stroke dataset...
Training RandomForest on stroke dataset...
Training DecisionTree on churn dataset...
Training RandomForest on churn dataset...
Training DecisionTree on fico dataset...
Training RandomForest on fico dataset...
Training DecisionTree on bank dataset...
Training RandomForest on bank dataset...
Training DecisionTree on college dataset...
Training RandomForest on college dataset...


## 5.3 EBM

In [112]:
# EBM (Explainable Boosting Machine) on the binary classification datasets.
# Categorical features are passed to the model as (imputed) strings, without any encoding.
excluded_datasets = ["adult", "airline", "weather", "compas"]
datasets_filtered = df_overview[
    (df_overview["task_type"] == "binary")
    & ~df_overview["name"].isin(excluded_datasets)
]

# Placeholder strings that stand for a missing value in numerical columns.
missing_tokens = {"": np.nan, " ": np.nan, "?": np.nan}

for idx, row in datasets_filtered.iterrows():
    # Load data
    file_name = row["file_name"]
    separator = row["separator"]
    folder_name = row["name"]
    target = row["target"]
    num_cols = row["num_features"]
    cat_cols = row["cat_features"]
    file_path = f"{data_path}/{folder_name}/{file_name}"

    data = pd.read_csv(file_path, sep=separator)

    X = data.drop(columns=target)
    y = data[target]

    # Preprocessing
    # 1. Train-test split (done first so every transformer below is fitted on the training part only)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # 2. Split features into numerical and categorical
    X_train_num = X_train[num_cols]
    X_train_cat = X_train[cat_cols]
    X_test_num = X_test[num_cols]
    X_test_cat = X_test[cat_cols]

    # 3. Replace empty strings and other placeholder chars in numerical features with np.nan
    X_train_num = X_train_num.replace(missing_tokens)
    X_test_num = X_test_num.replace(missing_tokens)

    # 4. Correct data types
    # NOTE(review): astype(str) may turn NaN into the literal string "nan", in which case the
    # categorical imputer in step 5 has nothing to impute and missing values act as their own
    # category -- confirm this is intended.
    X_train_num = X_train_num.astype(float)
    X_test_num = X_test_num.astype(float)
    X_train_cat = X_train_cat.astype(str)
    X_test_cat = X_test_cat.astype(str)

    # 5. Impute missing values (statistics are learned on the training split only)
    imputer_num = SimpleImputer(strategy='mean')
    imputer_cat = SimpleImputer(strategy='most_frequent')
    X_train_num = imputer_num.fit_transform(X_train_num)
    X_train_cat = imputer_cat.fit_transform(X_train_cat)
    X_test_num = imputer_num.transform(X_test_num)
    X_test_cat = imputer_cat.transform(X_test_cat)

    # 6. Scale numerical features (categorical features are left unencoded in this cell)
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(X_train_num)
    X_test_num = scaler.transform(X_test_num)

    # 7. Concatenate numerical and categorical features
    #    (all four frames are rebuilt from arrays, so they share a fresh 0..n-1 index)
    X_train_num = pd.DataFrame(X_train_num, columns=num_cols)
    X_test_num = pd.DataFrame(X_test_num, columns=num_cols)
    X_train_cat = pd.DataFrame(X_train_cat, columns=cat_cols)
    X_test_cat = pd.DataFrame(X_test_cat, columns=cat_cols)

    X_train = pd.concat([X_train_num, X_train_cat], axis=1)
    X_test = pd.concat([X_test_num, X_test_cat], axis=1)

    # 8. Binarize the target: the minority class of the training split becomes the positive class (1)
    minority_class = y_train.value_counts().idxmin()
    y_train = (y_train == minority_class).astype(int)
    y_test = (y_test == minority_class).astype(int)

    # 9. Train model (a fresh estimator per dataset)
    model = ExplainableBoostingClassifier(interactions=1, max_bins=64, learning_rate=0.1, max_leaves=3, min_samples_leaf=2, random_state=seed)
    print(f"Training EBM on {folder_name} dataset...")
    # perf_counter is monotonic, so the duration is unaffected by system clock adjustments
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_duration = time.perf_counter() - start_time
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # 10. Evaluate model
    # NOTE(review): each run appends a new row; re-running this cell without resetting
    # results_classification presumably leaves duplicate (model, dataset) rows.
    logloss = log_loss(y_test, y_pred_proba)
    pr_auc = average_precision_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)
    results_classification.loc[len(results_classification)] = {
        'model': 'EBM',
        'dataset': folder_name,
        'log_loss': logloss,
        'pr_auc': pr_auc,
        'f1': f1,
        'train_duration': train_duration}

Training EBM on stroke dataset...
Training EBM on churn dataset...
Training EBM on fico dataset...
Training EBM on bank dataset...
Training EBM on college dataset...


## 5.4 IGANN

In [113]:
# IGANN without interaction terms (igann_it=False) on the binary classification datasets.
# Categorical features are passed to the model as (imputed) strings, without any encoding.
excluded_datasets = ["adult", "airline", "weather", "compas"]
datasets_filtered = df_overview[
    (df_overview["task_type"] == "binary")
    & ~df_overview["name"].isin(excluded_datasets)
]

# Placeholder strings that stand for a missing value in numerical columns.
missing_tokens = {"": np.nan, " ": np.nan, "?": np.nan}

for idx, row in datasets_filtered.iterrows():
    # Load data
    file_name = row["file_name"]
    separator = row["separator"]
    folder_name = row["name"]
    target = row["target"]
    num_cols = row["num_features"]
    cat_cols = row["cat_features"]
    file_path = f"{data_path}/{folder_name}/{file_name}"

    data = pd.read_csv(file_path, sep=separator)

    X = data.drop(columns=target)
    y = data[target]

    # Preprocessing
    # 1. Train-test split (done first so every transformer below is fitted on the training part only)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # 2. Split features into numerical and categorical
    X_train_num = X_train[num_cols]
    X_train_cat = X_train[cat_cols]
    X_test_num = X_test[num_cols]
    X_test_cat = X_test[cat_cols]

    # 3. Replace empty strings and other placeholder chars in numerical features with np.nan
    X_train_num = X_train_num.replace(missing_tokens)
    X_test_num = X_test_num.replace(missing_tokens)

    # 4. Correct data types
    # NOTE(review): astype(str) may turn NaN into the literal string "nan", in which case the
    # categorical imputer in step 5 has nothing to impute and missing values act as their own
    # category -- confirm this is intended.
    X_train_num = X_train_num.astype(float)
    X_test_num = X_test_num.astype(float)
    X_train_cat = X_train_cat.astype(str)
    X_test_cat = X_test_cat.astype(str)

    # 5. Impute missing values (statistics are learned on the training split only)
    imputer_num = SimpleImputer(strategy='mean')
    imputer_cat = SimpleImputer(strategy='most_frequent')
    X_train_num = imputer_num.fit_transform(X_train_num)
    X_train_cat = imputer_cat.fit_transform(X_train_cat)
    X_test_num = imputer_num.transform(X_test_num)
    X_test_cat = imputer_cat.transform(X_test_cat)

    # 6. Scale numerical features (categorical features are left unencoded in this cell)
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(X_train_num)
    X_test_num = scaler.transform(X_test_num)

    # 7. Concatenate numerical and categorical features
    #    (all four frames are rebuilt from arrays, so they share a fresh 0..n-1 index)
    X_train_num = pd.DataFrame(X_train_num, columns=num_cols)
    X_test_num = pd.DataFrame(X_test_num, columns=num_cols)
    X_train_cat = pd.DataFrame(X_train_cat, columns=cat_cols)
    X_test_cat = pd.DataFrame(X_test_cat, columns=cat_cols)

    X_train = pd.concat([X_train_num, X_train_cat], axis=1)
    X_test = pd.concat([X_test_num, X_test_cat], axis=1)

    # 8. Binarize the target: the minority class of the training split becomes the positive class (1)
    minority_class = y_train.value_counts().idxmin()
    y_train = (y_train == minority_class).astype(int)
    y_test = (y_test == minority_class).astype(int)

    # 9. Train model (a fresh estimator per dataset)
    model = IGANN(task="classification", n_hid=10, igann_it=False)
    print(f"Training IGANN on {folder_name} dataset...")
    # perf_counter is monotonic, so the duration is unaffected by system clock adjustments
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_duration = time.perf_counter() - start_time
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # 10. Evaluate model
    # NOTE(review): each run appends a new row; re-running this cell without resetting
    # results_classification presumably leaves duplicate (model, dataset) rows.
    logloss = log_loss(y_test, y_pred_proba)
    pr_auc = average_precision_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)
    results_classification.loc[len(results_classification)] = {
        'model': 'IGANN',
        'dataset': folder_name,
        'log_loss': logloss,
        'pr_auc': pr_auc,
        'f1': f1,
        'train_duration': train_duration}

Training IGANN on stroke dataset...
Training IGANN on churn dataset...
Training IGANN on fico dataset...
Training IGANN on bank dataset...
Training IGANN on college dataset...


## 5.5 IGANN-IT

In [114]:
# IGANN-IT (IGANN with interaction terms, detected via "rulefit") on the binary classification datasets.
# Categorical features are passed to the model as (imputed) strings, without any encoding.
excluded_datasets = ["adult", "airline", "weather", "compas"]
datasets_filtered = df_overview[
    (df_overview["task_type"] == "binary")
    & ~df_overview["name"].isin(excluded_datasets)
]

# Placeholder strings that stand for a missing value in numerical columns.
missing_tokens = {"": np.nan, " ": np.nan, "?": np.nan}

for idx, row in datasets_filtered.iterrows():
    # Load data
    file_name = row["file_name"]
    separator = row["separator"]
    folder_name = row["name"]
    target = row["target"]
    num_cols = row["num_features"]
    cat_cols = row["cat_features"]
    file_path = f"{data_path}/{folder_name}/{file_name}"

    data = pd.read_csv(file_path, sep=separator)

    X = data.drop(columns=target)
    y = data[target]

    # Preprocessing
    # 1. Train-test split (done first so every transformer below is fitted on the training part only)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # 2. Split features into numerical and categorical
    X_train_num = X_train[num_cols]
    X_train_cat = X_train[cat_cols]
    X_test_num = X_test[num_cols]
    X_test_cat = X_test[cat_cols]

    # 3. Replace empty strings and other placeholder chars in numerical features with np.nan
    X_train_num = X_train_num.replace(missing_tokens)
    X_test_num = X_test_num.replace(missing_tokens)

    # 4. Correct data types
    # NOTE(review): astype(str) may turn NaN into the literal string "nan", in which case the
    # categorical imputer in step 5 has nothing to impute and missing values act as their own
    # category -- confirm this is intended.
    X_train_num = X_train_num.astype(float)
    X_test_num = X_test_num.astype(float)
    X_train_cat = X_train_cat.astype(str)
    X_test_cat = X_test_cat.astype(str)

    # 5. Impute missing values (statistics are learned on the training split only)
    imputer_num = SimpleImputer(strategy='mean')
    imputer_cat = SimpleImputer(strategy='most_frequent')
    X_train_num = imputer_num.fit_transform(X_train_num)
    X_train_cat = imputer_cat.fit_transform(X_train_cat)
    X_test_num = imputer_num.transform(X_test_num)
    X_test_cat = imputer_cat.transform(X_test_cat)

    # 6. Scale numerical features (categorical features are left unencoded in this cell)
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(X_train_num)
    X_test_num = scaler.transform(X_test_num)

    # 7. Concatenate numerical and categorical features
    #    (all four frames are rebuilt from arrays, so they share a fresh 0..n-1 index)
    X_train_num = pd.DataFrame(X_train_num, columns=num_cols)
    X_test_num = pd.DataFrame(X_test_num, columns=num_cols)
    X_train_cat = pd.DataFrame(X_train_cat, columns=cat_cols)
    X_test_cat = pd.DataFrame(X_test_cat, columns=cat_cols)

    X_train = pd.concat([X_train_num, X_train_cat], axis=1)
    X_test = pd.concat([X_test_num, X_test_cat], axis=1)

    # 8. Binarize the target: the minority class of the training split becomes the positive class (1)
    minority_class = y_train.value_counts().idxmin()
    y_train = (y_train == minority_class).astype(int)
    y_test = (y_test == minority_class).astype(int)

    # 9. Train model (a fresh estimator per dataset)
    model = IGANN(task="classification", n_hid=10, igann_it=True, interaction_detection_method="rulefit")
    # The message names IGANN-IT so this log can be told apart from the plain IGANN run.
    print(f"Training IGANN-IT on {folder_name} dataset...")
    # perf_counter is monotonic, so the duration is unaffected by system clock adjustments
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_duration = time.perf_counter() - start_time
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # 10. Evaluate model
    # NOTE(review): each run appends a new row; re-running this cell without resetting
    # results_classification presumably leaves duplicate (model, dataset) rows.
    logloss = log_loss(y_test, y_pred_proba)
    pr_auc = average_precision_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)
    results_classification.loc[len(results_classification)] = {
        'model': 'IGANN-IT',
        'dataset': folder_name,
        'log_loss': logloss,
        'pr_auc': pr_auc,
        'f1': f1,
        'train_duration': train_duration}

Training IGANN on stroke dataset...
Training IGANN on churn dataset...
Training IGANN on fico dataset...
Training IGANN on bank dataset...
Training IGANN on college dataset...
No feature combination found. Model does not capture interactions. Try different feature interaction detection method.


## 5.6 Results Classification Tasks

In [116]:
def create_classification_metric_df(metric, results=None):
    """Pivot one evaluation metric into a model x dataset table with a leading 'mean' column.

    Parameters
    ----------
    metric : str
        Name of the results column to pivot (e.g. 'log_loss', 'pr_auc', 'f1', 'train_duration').
    results : pandas.DataFrame, optional
        Long-format results containing the columns 'model', 'dataset' and ``metric``.
        When omitted, the notebook-level ``results_classification`` table is used.

    Returns
    -------
    pandas.DataFrame
        One row per model and one column per dataset, preceded by a 'mean' column holding
        the row-wise mean over the dataset columns.

    Notes
    -----
    ``pivot_table`` aggregates with the mean by default, so duplicate (model, dataset)
    rows are averaged rather than reported as an error.
    """
    if results is None:
        # Backward-compatible default: fall back to the table filled by the training cells.
        results = results_classification
    pivot_df = results.pivot_table(index='model', columns='dataset', values=metric)
    dataset_cols = [col for col in pivot_df.columns if col != 'mean']
    pivot_df['mean'] = pivot_df.mean(axis=1)
    # Put the summary column first so it stays visible in wide tables.
    return pivot_df[['mean'] + dataset_cols]

# Build one model x dataset table per recorded metric.
# 'f1' is included in case F1 was measured.
logloss_df, prauc_df, f1_df, train_time_df = (
    create_classification_metric_df(metric_name)
    for metric_name in ('log_loss', 'pr_auc', 'f1', 'train_duration')
)

In [117]:
# Log loss per model (rows) and dataset (columns); the first column is the row-wise mean.
logloss_df

dataset,mean,bank,churn,college,fico,stroke
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DecisionTree,0.611913,0.34197,0.504942,0.878982,0.647117,0.686555
EBM,0.343272,0.298289,0.3996,0.270529,0.56017,0.187771
IGANN,0.352934,0.302892,0.406086,0.294825,0.572233,0.188633
IGANN-IT,0.352431,0.300378,0.406086,0.294825,0.572233,0.188633
LogisticRegression_L1,0.361939,0.305582,0.406098,0.326884,0.5825,0.188633
LogisticRegression_L2,0.362872,0.305557,0.406027,0.331676,0.581795,0.189305
RandomForest,0.354042,0.300309,0.410178,0.271934,0.565926,0.221863


In [118]:
# PR-AUC (average precision) per model (rows) and dataset (columns); the first column is the row-wise mean.
prauc_df

dataset,mean,bank,churn,college,fico,stroke
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DecisionTree,0.552865,0.328163,0.624367,0.914755,0.71095,0.186089
EBM,0.625602,0.433338,0.701364,0.964446,0.760377,0.268485
IGANN,0.617536,0.418484,0.686792,0.955758,0.74336,0.283288
IGANN-IT,0.619451,0.428059,0.686792,0.955758,0.74336,0.283288
LogisticRegression_L1,0.61114,0.415437,0.686904,0.939384,0.730686,0.283288
LogisticRegression_L2,0.609905,0.415376,0.686721,0.936816,0.731517,0.279093
RandomForest,0.616262,0.42592,0.686032,0.969084,0.754586,0.245686


In [119]:
# F1 score per model (rows) and dataset (columns); the first column is the row-wise mean.
f1_df

dataset,mean,bank,churn,college,fico,stroke
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DecisionTree,0.496846,0.303132,0.620619,0.837398,0.659924,0.063158
EBM,0.500544,0.308807,0.598817,0.885496,0.684909,0.024691
IGANN,0.494086,0.291762,0.628959,0.870588,0.67912,0.0
IGANN-IT,0.496064,0.301651,0.628959,0.870588,0.67912,0.0
LogisticRegression_L1,0.489626,0.285052,0.628959,0.862745,0.671375,0.0
LogisticRegression_L2,0.487773,0.283227,0.628249,0.854902,0.672485,0.0
RandomForest,0.482935,0.282339,0.563177,0.884615,0.684541,0.0


In [120]:
# Training duration in seconds per model (rows) and dataset (columns); the first column is the row-wise mean.
train_time_df

dataset,mean,bank,churn,college,fico,stroke
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DecisionTree,0.021938,0.056432,0.013702,0.004217,0.027123,0.008215
EBM,3.863355,6.548609,2.565856,1.146001,2.225947,6.83036
IGANN,3.649745,13.807416,0.506658,0.581314,2.596665,0.756672
IGANN-IT,4.160512,11.500682,2.657727,0.560003,3.7389,2.34525
LogisticRegression_L1,0.28666,0.510881,0.385161,0.006539,0.510507,0.02021
LogisticRegression_L2,0.067794,0.212105,0.01934,0.003388,0.093327,0.010809
RandomForest,0.875594,2.28217,0.551217,0.199228,0.967871,0.377486


In [None]:
# np.random.seed(1)

# num_samples = 5000

# fahrleistung = np.random.randint(5000, 50000, num_samples)  # Annual mileage in km
# fahrzeugtyp = np.random.choice(["Kleinwagen", "SUV", "Sportwagen", "Transporter"], num_samples)
# alter_fahrer = np.random.randint(18, 80, num_samples)  # Age of driver
# anzahl_unfälle = np.random.poisson(0.5, num_samples)  # Previous accidents (Poisson distribution)
# region = np.random.choice(["ländlich", "städtisch", "Metropole"], num_samples)

# # Define probability of an accident based on features
# base_prob = 0.02  # Base probability of accident

# # Interaction effect: Higher mileage increases risk for Sportwagen & Transporter more
# prob_fahrleistung = np.zeros(num_samples)

# # Different effects for vehicle types
# fahrzeugtyp_effect = np.zeros(num_samples)

# interaktion_fahrleistung_typ = np.array([
#     (f / 50000) * (
#         0.01 if t == "Kleinwagen" else
#         0.03 if t == "SUV" else
#         0.10 if t == "Sportwagen" else
#         0.08  # Transporter
#     ) for f, t in zip(fahrleistung, fahrzeugtyp)
# ])

# # Age effect: Younger and older drivers have higher risk
# alter_effect = np.where((alter_fahrer < 25) | (alter_fahrer > 65), 0.05, 0.02)

# # More past accidents -> higher probability
# unfall_effect = anzahl_unfälle * 0.04

# # Regional effect: More accidents in cities & metropolitan areas
# region_effect = np.array([
#     0.01 if r == "ländlich" else
#     0.03 if r == "städtisch" else
#     0.05 for r in region
# ])

# # Compute final probability (capped at 1)
# final_prob = np.clip(
#     base_prob + interaktion_fahrleistung_typ + alter_effect + unfall_effect + region_effect,
#     0, 1
# )
# # Generate binary target variable (1 = accident, 0 = no accident) using probabilities
# y = np.random.binomial(1, final_prob, num_samples)

# # Create DataFrame
# X = pd.DataFrame({
#     "Jährliche Fahrleistung (km)": fahrleistung,
#     "Fahrzeugtyp": fahrzeugtyp,
#     "Alter Fahrer": alter_fahrer,
#     "Anzahl früherer Unfälle": anzahl_unfälle,
#     "Region": region
#     })

In [None]:
# import numpy as np
# import pandas as pd

# num_samples = 10000

# # Generate features
# fahrleistung = np.random.randint(5000, 50000, num_samples)  # Annual mileage in km
# fahrzeugtyp = np.random.choice(["Kleinwagen", "SUV", "Sportwagen", "Transporter"], num_samples)
# alter_fahrer = np.random.randint(18, 80, num_samples)  # Age of driver
# anzahl_unfälle = np.random.poisson(0.5, num_samples)  # Previous accidents
# region = np.random.choice(["ländlich", "städtisch", "Metropole"], num_samples)

# base_prob = 0.350  # instead of 0.02

# interaktion_fahrleistung_typ = np.array([
#     (f / 50000) * (
#         0.02 if t == "Kleinwagen" else
#         0.05 if t == "SUV" else
#         0.25 if t == "Sportwagen" else
#         0.20  # Transporter
#     ) for f, t in zip(fahrleistung, fahrzeugtyp)
# ])

# # Age effect: higher risk for younger and older drivers
# alter_effect = np.where((alter_fahrer < 25) | (alter_fahrer > 65), 0.05, 0.02)

# # Accident history
# unfall_effect = anzahl_unfälle * 0.04

# # Regional effect
# region_effect = np.array([
#     0.01 if r == "ländlich" else
#     0.03 if r == "städtisch" else
#     0.05 for r in region
# ])

# # Final accident probability
# final_prob = np.clip(
#     base_prob + interaktion_fahrleistung_typ + alter_effect + unfall_effect + region_effect,
#     0, 1
# )

# # Target variable (accident: 1 = yes, 0 = no)
# y = np.random.binomial(1, final_prob, num_samples)

# # DataFrame
# X = pd.DataFrame({
#     "Jährliche Fahrleistung (km)": fahrleistung,
#     "Fahrzeugtyp": fahrzeugtyp,
#     "Alter Fahrer": alter_fahrer,
#     "Anzahl früherer Unfälle": anzahl_unfälle,
#     "Region": region
# })


In [None]:
# # scale data
# scaler = StandardScaler()
# continuous_features = sorted(['Jährliche Fahrleistung (km)', 'Alter Fahrer', 'Anzahl früherer Unfälle'])
# X_num = scaler.fit_transform(X[continuous_features])
# X_cat = X.drop(columns=continuous_features)
# X = pd.concat([pd.DataFrame(X_num, columns=continuous_features), X_cat], axis=1)

# # train-test-split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

In [None]:
# model = IGANN(task='classification', n_hid=10, igann_it=False)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

# # Evaluate the model
# from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
# accuracy = accuracy_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)
# roc_auc = roc_auc_score(y_test, y_pred)
# print(f"Accuracy: {accuracy:.2f}")
# print(f"F1 Score: {f1:.2f}")
# print(f"ROC AUC: {roc_auc:.2f}")

Accuracy: 0.55
F1 Score: 0.55
ROC AUC: 0.55


In [None]:
# model = IGANN(task='classification', n_hid=10, igann_it=True, interaction_detection_method="rulefit")
# model.fit(X, y)
# y_pred = model.predict(X_test)

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)
# roc_auc = roc_auc_score(y_test, y_pred)
# print(f"Accuracy: {accuracy:.2f}")
# print(f"F1 Score: {f1:.2f}")
# print(f"ROC AUC: {roc_auc:.2f}")

Accuracy: 0.56
F1 Score: 0.55
ROC AUC: 0.56
