# Random Forest

In [1]:
import pandas as pd
import numpy as np
from scipy.signal import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Pfad zur Datei
file_path = ['/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S14/S14.pkl',
             '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S13/S13.pkl',
            '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S10/S10.pkl',
            '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S5/S5.pkl',
            '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S7/S7.pkl',
            '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S9/S9.pkl',
            '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S15/S15.pkl',
            '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S2/S2.pkl',
            '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S6/S6.pkl',
            '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S3/S3.pkl',
            '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S4/S4.pkl',
            '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S16/S16.pkl',
            '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S17/S17.pkl'
            ]

features = []
window_labels = []

for file in file_path:
# Laden
    with open(file, 'rb') as f:
        data = pd.read_pickle(f)
    
    # Zugriff auf Wrist-Daten
    wrist = data['signal']['wrist']
    eda = wrist['EDA']      # shape (n_samples, 1)
    temp = wrist['TEMP']    # shape (n_samples, 1)
    acc = wrist['ACC']      # shape (n_samples, 3)
    
    # Label (ursprünglich 700 Hz, z. B. 3.8 Mio lang)
    labels = data['label']
    
    # Resampling der Labels auf Länge von EDA (64 Hz)
    labels_resampled = resample(labels.astype(float), len(eda))
    labels_resampled = np.round(labels_resampled).astype(int)
    
    # Features kombinieren (z. B. EDA, TEMP, ACC)
    from scipy.signal import resample
    
    # Resample ACC auf Länge von EDA (z. B. 22192 Zeilen)
    acc_resampled = resample(acc, len(eda))
    
    X_raw = np.hstack([eda, temp, acc_resampled])  # shape: (n_samples, 6)

    X_raw
    
    # Sliding Window-Feature-Bildung (z. B. 60s bei 64 Hz = 384 Samples)
    window_size = 384  # 60s Fenster
    step_size = 384    # ohne Überschneidung

    
    for start in range(0, len(X_raw) - window_size, step_size):
        end = start + window_size
        window = X_raw[start:end]
        label_window = labels_resampled[start:end]
        
        STRESS_LABEL = 2  # passe das an, falls Stress bei dir eine andere Zahl ist (z. B. 3)
        
        # --- Labels binär mappen: Stress -> 2, alles andere -> 1
        # (optional) sicherstellen, dass es ints sind:
        label_window = label_window.astype(int, copy=False)
        binary_window = np.where(label_window == STRESS_LABEL, 2, 1)
        
        # Feature-Vektor: Mittelwert + Std jeder Sensor-Spalte
        feature_vector = np.concatenate([window.mean(axis=0), window.std(axis=0)])
        features.append(feature_vector)
        
        # Mehrheits-Label (jetzt zwischen 1 = Nicht-Stress und 2 = Stress)
        # np.bincount über Indizes 0..2; wir interessieren uns für 1 und 2
        majority_label = np.bincount(binary_window, minlength=3)[1:].argmax() + 1
        window_labels.append(majority_label)
        
        # In Arrays umwandeln
        X = np.array(features)
        y = np.array(window_labels)

print(f"Datensätze: {X.shape}, Klassenverteilung: {np.bincount(y)}")




Datensätze: (786, 10), Klassenverteilung: [  0 698  88]


In [2]:
X_raw[8]

array([  1.968359  ,  33.34      , -49.9914787 , -38.16717886,
         5.37479379])

In [3]:
# Modell trainieren auf 80:20 Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,     # 20% Testdaten
    random_state=42,   # Reproduzierbarkeit
)
clf_testdaten = RandomForestClassifier(n_estimators=105, random_state=42)
clf_testdaten.fit(X_train, y_train)


# Test auf 80:20 Split
y_pred_test = clf_testdaten.predict(X_test)
print(classification_report(y_test, y_pred_test, digits=3))

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

def finde_bestes_n_estimators(X_train, X_test, y_train, y_test, 
                              n_min=10, n_max=200, step=10, average="weighted"):
    ergebnisse = {}
    
    for n in range(n_min, n_max+1, step):
        clf = RandomForestClassifier(n_estimators=n, random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        acc = accuracy_score(y_test, y_pred)
        f1  = f1_score(y_test, y_pred, average=average)
        
        ergebnisse[n] = {"accuracy": acc, "f1_score": f1}
        #print(f"n_estimators={n}: Accuracy={acc:.3f}, F1={f1:.3f}")
    
    # Auswahl nach bestem F1-Score
    best_n = max(ergebnisse, key=lambda k: ergebnisse[k]["f1_score"])
    best_vals = ergebnisse[best_n]
    
    print(f"\n➡️ Bestes n_estimators: {best_n} "
          f"mit Accuracy={best_vals['accuracy']:.3f}, F1={best_vals['f1_score']:.3f}")
    
    return best_n, ergebnisse


finde_bestes_n_estimators(X_train, X_test, y_train, y_test, n_min=10, n_max=200, step=1)


import joblib  # für Speichern/Laden

# Modell trainieren auf kombinierten Trainingsdaten
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X, y)
# ✅ Speichern des gesamten Modells ohne Testdatensplit
joblib.dump(clf, "/kaggle/working/random_forest_model.pkl")


              precision    recall  f1-score   support

           1      0.947     1.000     0.973       143
           2      1.000     0.467     0.636        15

    accuracy                          0.949       158
   macro avg      0.974     0.733     0.805       158
weighted avg      0.952     0.949     0.941       158


➡️ Bestes n_estimators: 19 mit Accuracy=0.956, F1=0.949


['/kaggle/working/random_forest_model.pkl']

## Daten der S8 ziehen und einsetzen!

In [4]:
import pandas as pd
import numpy as np
from scipy.signal import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 📁 Pfad zur Datei (S12)
file_path_S12 = '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S8/S8.pkl'

# 🔄 Daten laden
with open(file_path_S12, 'rb') as f:
    data_S12 = pd.read_pickle(f)

# 📦 Zugriff auf Wrist-Signale
wrist_S12 = data_S12['signal']['wrist']
eda_S12 = wrist_S12['EDA']
temp_S12 = wrist_S12['TEMP']
acc_S12 = wrist_S12['ACC']

# 🏷️ Labels (ursprünglich 700 Hz)
labels_S12 = data_S12['label']

# 🔁 Resampling der Labels auf EDA-Länge (64 Hz)
labels_resampled_S12 = resample(labels_S12.astype(float), len(eda_S12))
labels_resampled_S12 = np.round(labels_resampled_S12).astype(int)

# 🔁 Resample ACC auf EDA-Länge
acc_resampled_S12 = resample(acc_S12, len(eda_S12))

# 🧩 Features kombinieren: EDA, TEMP, ACC → (n_samples, 6)
X_raw_S12 = np.hstack([eda_S12, temp_S12, acc_resampled_S12])

# 🪟 Sliding-Window Parameter
window_size_S12 = 384  # 60s Fenster
step_size_S12 = 384

features_S12 = []
window_labels_S12 = []

# 🚶 Sliding-Window-Verarbeitung
for start in range(0, len(X_raw_S12) - window_size_S12, step_size_S12):
    end = start + window_size_S12
    window_S12 = X_raw_S12[start:end]
    label_window_S12 = np.round(labels_resampled_S12[start:end]).astype(int)

    unique_labels_S12 = np.unique(label_window_S12)
    
    # ✅ Nur Fenster mit ausschließlich Label 1 oder 2
    if set(unique_labels_S12).issubset({1, 2}):
        majority_label_S12 = np.bincount(label_window_S12).argmax()
        feature_vector_S12 = np.concatenate([window_S12.mean(axis=0), window_S12.std(axis=0)])
        features_S12.append(feature_vector_S12)
        window_labels_S12.append(majority_label_S12)

# 🎯 Nur Label 1 & 2 verwenden
mask_S12 = (np.array(window_labels_S12) == 1) | (np.array(window_labels_S12) == 2)
X_S12 = np.array(features_S12)[mask_S12]
y_S12 = np.array(window_labels_S12)[mask_S12]

# ✅ Sicherheits-Check
assert len(X_S12) == len(y_S12), "Unterschiedliche Anzahl von Features und Labels!"

# ℹ️ Ergebnis anzeigen
print(f"Datensätze: {X_S12.shape}, Klassenverteilung: {np.bincount(y_S12)}")


Datensätze: (17, 10), Klassenverteilung: [ 0 11  6]


In [5]:
X_S12

array([[ 5.23162565e-01,  3.34719792e+01,  3.78846147e+01,
        -2.70410639e+01, -2.54880501e+01,  2.10455005e-02,
         3.27592160e-02,  5.75877152e+00,  1.75788670e+01,
         2.81943140e+01],
       [ 4.51848799e-01,  3.35931250e+01,  3.96075191e+01,
        -4.68303445e+01,  8.72669253e+00,  2.34686786e-02,
         3.53792695e-02,  2.16568695e+00,  4.48472408e+00,
         6.43844201e+00],
       [ 3.91071099e-01,  3.36761458e+01,  3.95512693e+01,
        -4.49509871e+01,  5.49668004e+00,  1.06702552e-02,
         2.67266172e-02,  4.44301177e+00,  8.89256624e+00,
         1.40801337e+01],
       [ 3.44738273e-01,  3.37189583e+01,  4.85954461e+01,
        -2.81452186e+01,  2.13172117e+01,  1.25650825e-02,
         1.60714010e-02,  5.25298952e+00,  1.32988807e+01,
         1.11939771e+01],
       [ 3.11184562e-01,  3.37604167e+01,  5.65316527e+01,
         3.88648454e+00,  2.42003414e+01,  6.79087833e-03,
         2.14046659e-02,  5.99852439e+00,  5.25096019e+00,
         1.

In [6]:
print(X_S12.shape, y_S12.shape)
print(X.shape, y.shape)

# ⬇️ Laden
rf_loaded = joblib.load("/kaggle/working/random_forest_model.pkl")

# Testen ob es klappt
print("TEST nach Laden", rf_loaded.predict(X_S12[:5]))

# Test auf Subject S12
y_pred = clf.predict(X_S12)
print(classification_report(y_S12, y_pred, digits=3))

(17, 10) (17,)
(786, 10) (786,)
TEST nach Laden [1 1 1 1 1]
              precision    recall  f1-score   support

           1      0.647     1.000     0.786        11
           2      0.000     0.000     0.000         6

    accuracy                          0.647        17
   macro avg      0.324     0.500     0.393        17
weighted avg      0.419     0.647     0.508        17



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

def finde_bestes_n_estimators(X_train, X_test, y_train, y_test, 
                              n_min=10, n_max=200, step=10, average="weighted"):
    ergebnisse = {}
    
    for n in range(n_min, n_max+1, step):
        clf = RandomForestClassifier(n_estimators=n, random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        acc = accuracy_score(y_test, y_pred)
        f1  = f1_score(y_test, y_pred, average=average)
        
        ergebnisse[n] = {"accuracy": acc, "f1_score": f1}
        #print(f"n_estimators={n}: Accuracy={acc:.3f}, F1={f1:.3f}")
    
    # Auswahl nach bestem F1-Score
    best_n = max(ergebnisse, key=lambda k: ergebnisse[k]["f1_score"])
    best_vals = ergebnisse[best_n]
    
    print(f"\n➡️ Bestes n_estimators: {best_n} "
          f"mit Accuracy={best_vals['accuracy']:.3f}, F1={best_vals['f1_score']:.3f}")
    
    return best_n, ergebnisse

print("Übersicht LOSO nur S8", X_S12.shape, y_S12.shape)
print("Übersicht WeSad der Rest(alle)",X.shape, y.shape)

# Setze X und Y (alle außer S8 ein) und schaue was der beste n_estimator ist
best_estimator = finde_bestes_n_estimators(X, X_S12, y, y_S12, n_min=10, n_max=200, step=1,average="weighted")

# Bestes n_estimators: 27 mit Accuracy=0.941
# Bestes n_estimators: 27 mit Accuracy=0.941, F1=0.940

Übersicht LOSO nur S8 (17, 10) (17,)
Übersicht WeSad der Rest(alle) (786, 10) (786,)

➡️ Bestes n_estimators: 10 mit Accuracy=0.647, F1=0.508


## Alternative zu Random Forest testen

## XGBoost

In [8]:
!pip install xgboost --quiet


In [9]:
import xgboost as xgb
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.metrics import classification_report
import numpy as np

# ✅ Filter: Nur Labels 1 (entspannt) & 2 (gestresst)
mask_train = (y == 1) | (y == 2)
mask_test = (y_S12 == 1) | (y_S12 == 2)

X_train_filtered = X[mask_train]
y_train_filtered = y[mask_train]

X_test_filtered = X_S12[mask_test]
y_test_filtered = y_S12[mask_test]


# Optional: Konvertiere in DMatrix (XGBoost-eigenes Format) – ist aber nicht zwingend
# dtrain = xgb.DMatrix(X, label=y)
# dtest = xgb.DMatrix(X_S12, label=y_S12)

# Mapping: 1 → 0 (Entspannt), 2 → 1 (Stress)
y_train_remap = (y_train_filtered - 1).astype(int)
y_test_remap = (y_test_filtered - 1).astype(int)


xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',  # da nur 2 Klassen nach dem Mapping
    n_estimators=200,
    max_depth=5,
    learning_rate=0.27,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='auc',
    random_state=42
)

xgb_clf.fit(X_train_filtered, y_train_remap)
y_pred = xgb_clf.predict(X_test_filtered)

# Evaluation wieder mit Original-Labels:
from sklearn.metrics import classification_report
print(classification_report(y_test_remap, y_pred, digits=3))


              precision    recall  f1-score   support

           0      0.647     1.000     0.786        11
           1      0.000     0.000     0.000         6

    accuracy                          0.647        17
   macro avg      0.324     0.500     0.393        17
weighted avg      0.419     0.647     0.508        17



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-> Gutes Ergebnis:
Alle Personen außer S8; sonst alle drin:

              precision    recall  f1-score   support

           0      0.909     0.909     0.909        11
           1      0.833     0.833     0.833         6

    accuracy                          0.882        17
   macro avg      0.871     0.871     0.871        17
weighted avg      0.882     0.882     0.882        17

-> 5/6 Stressfälle erkannt. 
Lernrate manuell angepasst -> Automatisiert noch besser.

-> Interpretation: XGBoost besser als Random Forest, da Gradiant Boosting: Fehler aus vorherigen Bäumen finden + sich verbessern.

Es kann auch komplexere Zusammenhänge modellieren, wo Random Forest eher Mittelwerte bildet.

Boosting ist oft besser für kleine, saubere Zeitfenster-Daten als vorher.

Settings:
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',  # da nur 2 Klassen nach dem Mapping
    n_estimators=200,
    max_depth=5,
    learning_rate=0.27,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)





## Modell speichern

In [10]:
#import joblib

# Modell speichern
#joblib.dump(xgb_clf, "/kaggle/working/xgb_stress_model.pkl")

## Herausfinden, welche Parameter am besten geeignet sind

In [11]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ParameterGrid

def finde_beste_xgb_parameter(X_train, X_test, y_train, y_test, param_grid, average="weighted"):
    ergebnisse = []
    
    for params in ParameterGrid(param_grid):
        clf = xgb.XGBClassifier(
            objective='binary:logistic',
            use_label_encoder=False,
            eval_metric='auc',
            random_state=42,
            **params
        )
        
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        acc = accuracy_score(y_test, y_pred)
        f1  = f1_score(y_test, y_pred, average=average)
        
        ergebnisse.append((params, acc, f1))
        #print(f"Params={params} | Accuracy={acc:.3f}, F1={f1:.3f}")
    
    # Sortierung nach F1, dann Accuracy
    best_params, best_acc, best_f1 = max(ergebnisse, key=lambda x: (x[2], x[1]))
    
    print(f"\n➡️ Beste Parameter: {best_params}")
    print(f"   Accuracy={best_acc:.3f}, F1={best_f1:.3f}")
    
    return best_params, best_acc, best_f1


In [12]:
X_train_filtered = X[mask_train]
y_train_filtered = y[mask_train]

X_train_filtered = (X_train_filtered - 1).astype(int)
y_train_filtered = (y_train_filtered - 1).astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X_train_filtered, y_train_filtered, 
    test_size=0.2,     # 20% Testdaten
    random_state=42,   # Reproduzierbarkeit
)
param_grid = {
    "n_estimators": [10, 20, 30, 40, 50, 60,70, 80, 100, 150,175, 200,225, 250, 300],
    "max_depth": [3, 5, 7, 9, 11, 13, 15],
    "learning_rate": [0.05, 0.1,0.15,0.2, 0.25, 0.30],
    "subsample": [0.6,0.7,0.8,0.9, 1.0],
    "colsample_bytree": [0.6, 0.7,0.75, 0.8,0.9,1.0]
}

print("Beste Parameter mit 80 20")
beste_params, beste_acc, beste_f1 = finde_beste_xgb_parameter(
    X_train, X_test, y_train, y_test, param_grid
)


Beste Parameter mit 80 20

➡️ Beste Parameter: {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 225, 'subsample': 1.0}
   Accuracy=0.968, F1=0.965


In [13]:
xgb_clf_optimmized = xgb.XGBClassifier(
    objective='binary:logistic',  # da nur 2 Klassen nach dem Mapping
    n_estimators=150,
    max_depth=5,
    learning_rate=0.25,
    subsample=0.6,
    colsample_bytree=0.7,
    use_label_encoder=False,
    eval_metric='auc',
    random_state=42
)
joblib.dump(xgb_clf_optimmized, "/kaggle/working/opt_xgb_stress_model.pkl")

['/kaggle/working/opt_xgb_stress_model.pkl']

## Testen mit S8 als LOSO Verfahren

In [14]:
X_test = X_S12[mask_test]
y_test = y_S12[mask_test]

X_train, X_test, y_train, y_test = train_test_split(
    X_train_filtered, y_train_filtered, 
    test_size=0.2,     # 20% Testdaten
    random_state=42,   # Reproduzierbarkeit
)
param_grid = {
    "n_estimators": [10, 20, 30, 40, 50, 60,70, 80, 100, 150,175, 200,225, 250, 300],
    "max_depth": [3, 5, 7, 9, 11, 13, 15],
    "learning_rate": [0.05, 0.1,0.15,0.2, 0.25, 0.30],
    "subsample": [0.6,0.7,0.8,0.9, 1.0],
    "colsample_bytree": [0.6, 0.7,0.75, 0.8,0.9,1.0]
}
print("Beste Parameter mit LOSO")

beste_params, beste_acc, beste_f1 = finde_beste_xgb_parameter(
    X_train, X_test, y_train, y_test, param_grid
)


Beste Parameter mit LOSO

➡️ Beste Parameter: {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 225, 'subsample': 1.0}
   Accuracy=0.968, F1=0.965


In [15]:
import xgboost as xgb
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.metrics import classification_report
import numpy as np

# ✅ Filter: Nur Labels 1 (entspannt) & 2 (gestresst)
mask_train = (y == 1) | (y == 2)
mask_test = (y_S12 == 1) | (y_S12 == 2)

X_train_filtered = X[mask_train]
y_train_filtered = y[mask_train]

X_test_filtered = X_S12[mask_test]
y_test_filtered = y_S12[mask_test]


# Mapping: 1 → 0 (Entspannt), 2 → 1 (Stress)
y_train_remap = (y_train_filtered - 1).astype(int)
y_test_remap = (y_test_filtered - 1).astype(int)




xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',  # da nur 2 Klassen nach dem Mapping
    n_estimators=150,
    max_depth=5,
    learning_rate=0.25,
    subsample=0.6,
    colsample_bytree=0.7,
    use_label_encoder=False,
    eval_metric='auc',
    random_state=42
)

xgb_clf_optimmized.fit(X_train_filtered, y_train_remap)
y_pred = xgb_clf_optimmized.predict(X_test_filtered)

# Evaluation wieder mit Original-Labels:
from sklearn.metrics import classification_report
print(classification_report(y_test_remap, y_pred, digits=3))

# Speichere 

import joblib

# Modell speichern
joblib.dump(xgb_clf, "/kaggle/working/opt_xgb_stress_model.pkl")

              precision    recall  f1-score   support

           0      0.647     1.000     0.786        11
           1      0.000     0.000     0.000         6

    accuracy                          0.647        17
   macro avg      0.324     0.500     0.393        17
weighted avg      0.419     0.647     0.508        17



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['/kaggle/working/opt_xgb_stress_model.pkl']

In [16]:
#Abspeichern der Werte als csv

import pandas as pd
import numpy as np
from pathlib import Path

# Arrays sichern
X_train_arr = np.asarray(X_train_filtered)
print(len(X_train_arr))
y_train_arr = np.asarray(y_train_filtered).reshape(-1)
print(len(y_train_filtered))


# Längen abgleichen
n = min(X_train_arr.shape[0], y_train_arr.shape[0])
if X_train_arr.shape[0] != y_train_arr.shape[0]:
    print(f"⚠️ Unterschiedliche Längen: X_test={X_test_arr.shape[0]}, y_train={y_train_arr.shape[0]} -> kürze auf n={n}")

# DataFrame bauen
df = pd.DataFrame(X_train_arr[:n], columns=[f"f{i}" for i in range(X_train_arr.shape[1])])
df["label"] = np.asarray(y_train_remap).reshape(-1)
  # alternativ: (y_train_remap[:n]) für binäre 0/1

# Export
out_path = Path("X_test_filtered__y_train_filtered.csv")
df.to_csv(out_path, index=False)
print("Gespeichert:", out_path, "->", df.shape)


786
786
Gespeichert: X_test_filtered__y_train_filtered.csv -> (786, 11)


## SVM ("Support Vector Maschines")


In [17]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import numpy as np

# 🔄 Nur Labels 1 und 2 (wie bei XGBoost)
mask_train = (y == 1) | (y == 2)
mask_test = (y_S12 == 1) | (y_S12 == 2)

X_train_filtered = X[mask_train]
y_train_filtered = y[mask_train]
X_test_filtered = X_S12[mask_test]
y_test_filtered = y_S12[mask_test]

# 🔄 Labels ummappen: 1 → 0, 2 → 1
y_train_remap = (y_train_filtered - 1).astype(int)
y_test_remap = (y_test_filtered - 1).astype(int)

# 🔄 Features skalieren (wichtig für SVM!)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_filtered)
X_test_scaled = scaler.transform(X_test_filtered)

from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [10,11,12,13,14,15],
    'gamma': [0.01,0.02, 0.03, 0.04, 0.05,0.06, 0.07, 0.08,0.09, 0.1,0.15,0.2,0.3,0.4, 1, 'scale', 'auto'],
    'kernel': ['rbf','linear','polynomial']
}

grid = GridSearchCV(SVC(), param_grid, cv=3, scoring='f1')
grid.fit(X_train_scaled, y_train_remap)

print("Beste Parameter:", grid.best_params_)
print("")

# ✅ SVM mit RBF-Kernel
svm_clf = SVC(kernel='rbf', C=10, gamma=0.1, random_state=42)
svm_clf.fit(X_train_scaled, y_train_remap)

# 🧪 Vorhersage & Bewertung
y_pred = svm_clf.predict(X_test_scaled)
print(classification_report(y_test_remap, y_pred, digits=3))


Beste Parameter: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

              precision    recall  f1-score   support

           0      0.500     0.545     0.522        11
           1      0.000     0.000     0.000         6

    accuracy                          0.353        17
   macro avg      0.250     0.273     0.261        17
weighted avg      0.324     0.353     0.338        17



306 fits failed out of a total of 918.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
306 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/svm/_base.py", line 180, in fit
    self._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, make_scorer

def optimiere_svm(X_train, y_train, cv_splits=5, class_imbalance=True):
    """
    Sucht SVM-Hyperparameter mit CV, optimiert primär F1 (weighted),
    nutzt Accuracy als Tie-Breaker und gibt bestes Modell + Ergebnisse zurück.
    """
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(probability=True, class_weight="balanced" if class_imbalance else None))
    ])

    zwischenwerte = [round(x, 2) for x in np.arange(5, 6.01, 0.01)]
    svc_gamma_2 = [round(x, 2) for x in np.arange(0.1, 0.2, 0.001)]
    svc_gamma_2.extend(["scale", "auto"])

    
    param_grid = [
        {"svc__kernel": ["rbf"],
         "svc__C" :zwischenwerte,
         #"svc__C": [0.1, 1, 3,5,5.5, 5.8, 6,6.5, 6.7, 7,7.5, 8, 9, 10,11,13],
         # "svc__gamma": ["scale", "auto", 0.01, 0.03,0.04,0.06,0.08,0.09, 0.1, 0.11, 0.12,0.13]},
         "svc__gamma": svc_gamma_2,
        },
        {"svc__kernel": ["linear"],
         "svc__C": [0.1, 0.01, 0.03,0.04,0.06,0.08,0.09, 0.1, 0.11, 0.12,0.13, 1, 3, 10]},                 # gamma wird ignoriert
        {"svc__kernel": ["poly"],
         "svc__degree": [2, 3],
         "svc__coef0": [0.0, 0.5, 1.0],
         "svc__C": [0.1, 1, 3, 10],
         "svc__gamma": ["scale", "auto", 0.01, 0.03,0.04,0.06,0.08,0.09, 0.1, 0.11, 0.12,0.13]},
    ]

    # Mehrere Metriken loggen; refit = F1 (weighted)
    scorers = {
        "f1": "f1_weighted",
        "accuracy": "accuracy"
    }

    cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)
    grid = GridSearchCV(pipe, param_grid, scoring=scorers, refit="f1",
                        cv=cv, n_jobs=-1, verbose=0, return_train_score=False)
    grid.fit(X_train, y_train)

    # Zusätzlich: Bei gleichem F1 bestes Accuracy-Modell wählen (Tie-Breaker)
    results = grid.cv_results_
    mean_f1 = results["mean_test_f1"]
    mean_acc = results["mean_test_accuracy"]
    best_f1 = mean_f1.max()
    # alle Kandidaten mit maximalem F1 (numerische Toleranz)
    idxs = [i for i, v in enumerate(mean_f1) if abs(v - best_f1) < 1e-9]
    # wähle unter diesen die höchste Accuracy
    best_idx = max(idxs, key=lambda i: mean_acc[i])

    best_params = results["params"][best_idx]
    best_model = grid.best_estimator_ if grid.best_index_ == best_idx else Pipeline([
        ("scaler", StandardScaler())
    ])
    # falls Tie-Breaker anderes Set wählte, neu fitten:
    if grid.best_index_ != best_idx:
        best_model = Pipeline([
            ("scaler", StandardScaler()),
            ("svc", SVC(probability=True,
                       class_weight="balanced" if class_imbalance else None))
        ])
        best_model.set_params(**{k: v for k, v in best_params.items()})
        best_model.fit(X_train, y_train)

    summary = {
        "best_params": best_params,
        "cv_mean_f1": mean_f1[best_idx],
        "cv_mean_accuracy": mean_acc[best_idx]
    }
    return best_model, summary, grid


In [19]:
X_train_filtered = X[mask_train]
y_train_filtered = y[mask_train]
X_test_filtered = X_S12[mask_test]
y_test_filtered = y_S12[mask_test]



best_model, summary, grid = optimiere_svm(X_train_filtered, y_train_filtered, cv_splits=5, class_imbalance=True)
print(summary["best_params"])
print(f"CV F1={summary['cv_mean_f1']:.3f}, CV Acc={summary['cv_mean_accuracy']:.3f}")


{'svc__C': 5.74, 'svc__gamma': 0.2, 'svc__kernel': 'rbf'}
CV F1=0.950, CV Acc=0.949


In [20]:
# Modell speichern
import joblib

# Modell speichern
joblib.dump(svm_clf, "/kaggle/working/svm_clf_stress_model.pkl")

['/kaggle/working/svm_clf_stress_model.pkl']

## CNN 

In [21]:
import numpy as np

def discrete_resample_by_index(labels: np.ndarray, target_len: int) -> np.ndarray:
    """
    Mappt diskrete Labels (z.B. [0,1,2]) ohne Interpolation auf eine neue Länge.
    Verwendet nearest/forward index mapping statt Fourier-Resampling.
    """
    src_len = len(labels)
    if src_len == target_len:
        return labels.astype(int)

    # Indizes im Quellsignal, die jeweils dem EDA-Sample am nächsten sind
    idx = np.floor(np.linspace(0, src_len - 1, target_len)).astype(int)
    return labels[idx].astype(int)

def majority_label_in_window(win_labels: np.ndarray, valid_classes=(1,2), min_valid_ratio=0.8):
    """
    Gibt das Mehrheitslabel im Fenster zurück, wenn genug gültige Labels vorhanden sind,
    sonst None (Fenster verwerfen).
    """
    valid_mask = np.isin(win_labels, valid_classes)
    if valid_mask.mean() < min_valid_ratio:
        return None
    # Nur über gültige Labels die Mehrheit bilden
    counts = np.bincount(win_labels[valid_mask])
    return np.argmax(counts)


In [22]:
import pandas as pd
import numpy as np
from scipy.signal import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

file_paths = [
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S14/S14.pkl',
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S13/S13.pkl',
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S10/S10.pkl',
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S5/S5.pkl',
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S7/S7.pkl',
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S9/S9.pkl',
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S15/S15.pkl',
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S2/S2.pkl',
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S6/S6.pkl',
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S3/S3.pkl',
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S4/S4.pkl',
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S16/S16.pkl',
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S17/S17.pkl'
]

# ChatGPT
X_all_list = []
y_all_list = []

def zscore_subject(X):
    mu = X.mean(axis=(0,1), keepdims=True)
    sd = X.std(axis=(0,1), keepdims=True) + 1e-6
    return (X - mu) / sd

for file in file_paths:
    with open(file, 'rb') as f:
        data = pd.read_pickle(f)

    wrist = data['signal']['wrist']
    eda = wrist['EDA']
    temp = wrist['TEMP']
    acc = wrist['ACC']
    labels = data['label']

    # Resample Labels auf EDA-Länge
    #labels_resampled = resample(labels.astype(float), len(eda))
    #labels_resampled = np.round(labels_resampled).astype(int)
    
    # (Optional) Nicht verwendbare Labelcodes auf 0 setzen
    labels_clean = np.array(labels).astype(int)
    valid_classes = {0, 1}
    labels_clean = np.where(np.isin(labels_clean, list(valid_classes)), labels_clean, 0)
    
    # Diskretes Resampling ohne Mischen
    labels_resampled = discrete_resample_by_index(labels_clean, len(eda))
    
    # ACC resamplen und alle Kanäle kombinieren
    acc_resampled = resample(acc, len(eda))
    X_raw = np.hstack([eda, temp, acc_resampled])  # shape: (n_samples, 5)

    # Sliding Window
    #window_size = 384
    #step_size = 384

    #for start in range(0, len(X_raw) - window_size, step_size):
        #end = start + window_size
        #window = X_raw[start:end]
        #label_window = labels_resampled[start:end]

        #if set(np.unique(label_window)).issubset({1, 2}):
            #X_all_list.append(window)
           # majority_label = np.bincount(label_window).argmax()
            #y_all_list.append(majority_label)

    window_size = 384
    step_size = 384
    
    for start in range(0, len(X_raw) - window_size, step_size):
        end = start + window_size
        window = X_raw[start:end]
        label_window = labels_resampled[start:end]
    
        maj = majority_label_in_window(label_window, valid_classes=(1,2), min_valid_ratio=0.8)
        if maj is None:
            continue  # Fenster verwerfen, zu wenig gültige Labels
        X_all_list.append(window)
        y_all_list.append(maj)


X_all = [zscore_subject(X) for X in X_all_list]  # Liste aller Subjekt-Daten

# In Arrays umwandeln
X_all = np.array(X_all_list, dtype=np.float32)          # shape: (n_windows_total, 384, 5)
y_all = (np.array(y_all_list, dtype=np.int64) - 1)      # [1,2] -> [0,1]


print(f"Datensätze: {X_all.shape}, Klassenverteilung: {np.bincount(y_all)}")


Datensätze: (154, 384, 5), Klassenverteilung: [154]


In [23]:
#y_cnn = (y_all - 1).astype(int)  # [1, 2] → [0, 1]
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, stratify=y_all, random_state=42
)
#X_train

In [24]:
import tensorflow as tf
from tensorflow.keras import layers, models

model = models.Sequential([
    layers.Conv1D(32, kernel_size=6, activation='relu', input_shape=(384, 5)),
    #layers.Conv1D(32, kernel_size=2, activation='relu', input_shape=(384, 5)),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(64, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    #layers.Flatten(),
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')  # binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy'),
                       tf.keras.metrics.Precision(name='precision'),
                       tf.keras.metrics.Recall(name='recall')])
model.summary()


2025-09-30 11:42:41.283827: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759232561.496908      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759232561.560742      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-09-30 11:42:55.568603: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [25]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# WICHTIG: AUC/Precision/Recall als Metriken loggen (für Callbacks + F1)
model.compile(
    optimizer=tf.keras.optimizers.Adam(3e-4),
    loss='binary_crossentropy',
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc')
    ]
)

callbacks = [
    ModelCheckpoint("best.keras", monitor="val_auc", mode="max", save_best_only=True),
    EarlyStopping(monitor="val_auc", mode="max", patience=8, restore_best_weights=True),
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, verbose=1, min_lr=1e-6)
]

# Chat GPT Input
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

history = model.fit(
    X_train, y_train,
    class_weight=class_weights, 
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1
)





# --- Beste Epochen ermitteln (aus der EINEN History) ---
hist = history.history
val_acc = np.array(hist["val_accuracy"])
val_prec = np.array(hist["val_precision"])
val_rec  = np.array(hist["val_recall"])
val_auc  = np.array(hist["val_auc"])

# F1 aus Precision/Recall
val_f1 = 2 * val_prec * val_rec / (val_prec + val_rec + 1e-7)

best_epoch_f1  = int(np.argmax(val_f1)) + 1
best_epoch_acc = int(np.argmax(val_acc)) + 1
best_epoch_auc = int(np.argmax(val_auc)) + 1

print(f"Beste Epoche nach F1:  {best_epoch_f1}  | val_f1={val_f1[best_epoch_f1-1]:.4f}")
print(f"Beste Epoche nach Acc: {best_epoch_acc} | val_accuracy={val_acc[best_epoch_acc-1]:.4f}")
print(f"Beste Epoche nach AUC: {best_epoch_auc} | val_auc={val_auc[best_epoch_auc-1]:.4f}")


Epoch 1/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 341ms/step - accuracy: 0.4101 - auc: 0.0000e+00 - loss: 3.3353 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.7600 - val_auc: 0.0000e+00 - val_loss: 0.9308 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 3.0000e-04
Epoch 2/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.7923 - auc: 0.0000e+00 - loss: 1.0260 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 1.0000 - val_auc: 0.0000e+00 - val_loss: 0.0644 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 3.0000e-04
Epoch 3/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.9442 - auc: 0.0000e+00 - loss: 0.2487 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 1.0000 - val_auc: 0.0000e+00 - val_loss: 0.0017 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 3.0000e-04
Epoch 4/50
[1m4/4[0m 

In [26]:
model.save("/kaggle/working/cnn_model.keras")
model.save("/kaggle/working/cnn_model.h5", save_format="h5")


In [27]:
from tensorflow.keras.models import load_model

# Laden (egal ob .keras oder .h5)
cnn_loaded = load_model("/kaggle/working/cnn_model.keras")

# Test
print(cnn_loaded.predict(X_test[:5]))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[[0.00790432]
 [0.919733  ]
 [0.02802905]
 [0.02029886]
 [0.04023023]]


In [28]:
## Download der Daten
from IPython.display import FileLink

FileLink('/kaggle/working/cnn_model.keras')
FileLink('/kaggle/working/svm_clf_stress_model.pkl')


In [29]:
import os, zipfile, shutil

# 1) Sicherstellen, dass die Modelle da sind
files_to_zip = [
    "/kaggle/working/cnn_model.keras",
    "/kaggle/working/random_forest_model.pkl",
    "/kaggle/working/svm_clf_stress_model.pkl",
]

missing = [p for p in files_to_zip if not os.path.exists(p)]
if missing:
    print("Fehlende Dateien:", missing)
else:
    print("Alle Dateien vorhanden.")

# 2) Sauberen Export-Ordner anlegen
export_dir = "/kaggle/working/export"
os.makedirs(export_dir, exist_ok=True)

# 3) Dateien in den Export-Ordner kopieren (optional, hält’s übersichtlich)
for p in files_to_zip:
    if os.path.exists(p):
        shutil.copy(p, os.path.join(export_dir, os.path.basename(p)))

# 4) Zip nur mit diesen Dateien bauen
zip_path = "/kaggle/working/models.zip"
with zipfile.ZipFile(zip_path, "w") as zf:
    for name in os.listdir(export_dir):
        zf.write(os.path.join(export_dir, name), arcname=name)

# 5) Kontrolle: auflisten und Größen anzeigen
for name in os.listdir("/kaggle/working"):
    full = os.path.join("/kaggle/working", name)
    if os.path.isfile(full):
        print(f"{name}: {os.path.getsize(full)/1024:.1f} KB")

print("\nFERTIG: models.zip liegt in /kaggle/working und erscheint im Output-Tab.")


Alle Dateien vorhanden.
models.zip: 863.1 KB
cnn_model.h5: 173.3 KB
random_forest_model.pkl: 671.6 KB
svm_clf_stress_model.pkl: 18.0 KB
cnn_model.keras: 173.2 KB
__notebook__.ipynb: 114.5 KB
best.keras: 173.2 KB
X_test_filtered__y_train_filtered.csv: 147.4 KB
opt_xgb_stress_model.pkl: 0.8 KB

FERTIG: models.zip liegt in /kaggle/working und erscheint im Output-Tab.


## Ausprobieren CNN für "gelernte Daten" 

In [30]:
from sklearn.metrics import classification_report


from tensorflow.keras.models import load_model

# Laden (egal ob .keras oder .h5)
cnn_loaded = load_model("/kaggle/working/cnn_model.keras")


y_pred_prob = cnn_loaded.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)

print(classification_report(y_test, y_pred, digits=3))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
              precision    recall  f1-score   support

           0      1.000     0.871     0.931        31
           1      0.000     0.000     0.000         0

    accuracy                          0.871        31
   macro avg      0.500     0.435     0.466        31
weighted avg      1.000     0.871     0.931        31



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Ausprobieren CNN für "NEUE Daten" -> 8

-> Variablen ändern für S12 bzw. S8 
-> Ausführen
-> 

In [31]:
import pandas as pd
import numpy as np
from scipy.signal import resample

# ---------- Hilfsfunktionen ----------
def discrete_resample_by_index(labels: np.ndarray, target_len: int) -> np.ndarray:
    """Diskretes 'Resampling' per Index-Mapping (keine Interpolation)."""
    labels = np.asarray(labels).astype(np.int64)
    src_len = len(labels)
    if src_len == target_len:
        return labels
    idx = np.floor(np.linspace(0, src_len - 1, target_len)).astype(int)
    return labels[idx]

def majority_label_in_window(win_labels: np.ndarray, valid_classes=(1,2), min_valid_ratio=0.7):
    """Mehrheitslabel im Fenster, nur über gültige Klassen; sonst None."""
    vl = np.asarray(win_labels).astype(np.int64)
    valid_mask = np.isin(vl, valid_classes)
    if valid_mask.mean() < min_valid_ratio:
        return None
    counts = np.bincount(vl[valid_mask])
    return int(np.argmax(counts))

# ---------- Daten laden ----------
file_paths = [
    '/kaggle/input/wesad-wearable-stress-affect-detection-dataset/WESAD/S8/S8.pkl'
]

X_all_list_S8, y_all_list_S8 = [], []

window_size = 384
step_size   = 384
valid_classes = (1, 2)

for file in file_paths:
    with open(file, 'rb') as f:
        data = pd.read_pickle(f)

    wrist = data['signal']['wrist']
    eda   = np.asarray(wrist['EDA']).reshape(-1, 1)   # (N,1)
    temp  = np.asarray(wrist['TEMP']).reshape(-1, 1)  # (N,1)
    acc   = np.asarray(wrist['ACC'])                  # (N,3)

    labels = np.asarray(data['label']).astype(np.int64)

    # 1) Labels reinigen: nur 1/2 behalten, Rest = 0 (ungültig)
    labels_clean = np.where(np.isin(labels, valid_classes), labels, 0)

    # 2) Labels diskret auf EDA-Länge mappen (kein Fourier)
    labels_resampled = discrete_resample_by_index(labels_clean, len(eda))

    # 3) ACC auf EDA-Länge bringen (hier linear/Fourier ok, da kontinuierlich)
    acc_resampled = resample(acc, len(eda))

    # 4) Feature-Matrix bauen
    X_raw = np.hstack([eda, temp, acc_resampled])  # (N,5)

    # 5) Sliding Windows + Majority Voting
    total_windows, kept = 0, 0
    for start in range(0, len(X_raw) - window_size, step_size):
        end = start + window_size
        total_windows += 1

        window = X_raw[start:end]
        label_window = labels_resampled[start:end]

        maj = majority_label_in_window(label_window, valid_classes=valid_classes, min_valid_ratio=0.7)
        if maj is None:
            continue

        X_all_list_S8.append(window)
        y_all_list_S8.append(maj)
        kept += 1

    print(f"S8: Fenster gesamt={total_windows}, behalten={kept}")

# 6) In Arrays umwandeln, Label-Mapping {1,2}->{0,1} genau einmal
X_S8 = np.array(X_all_list_S8, dtype=np.float32)                 # (n_windows, 384, 5)
y_S8 = (np.array(y_all_list_S8, dtype=np.int64) - 1).astype(int) # (n_windows,)

X_S8 = zscore_subject(X_S8)

print(f"Datensätze: {X_S8.shape}, Klassenverteilung: {np.bincount(y_S8) if len(y_S8)>0 else []}")


S8: Fenster gesamt=56, behalten=19
Datensätze: (19, 384, 5), Klassenverteilung: [12  7]


In [32]:
#y_cnn = (y_S8 - 1).astype(int)  # [1, 2] → [0, 1]

#model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)


from sklearn.metrics import classification_report

from tensorflow.keras.models import load_model

# Laden (egal ob .keras oder .h5)
cnn_loaded = load_model("/kaggle/working/cnn_model.keras")

y_pred_prob = cnn_loaded.predict(X_S8).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)

print(classification_report(y_S8, y_pred, digits=3))
X_raw[8]

import scipy.ndimage as nd
p_test = cnn_loaded.predict(X_S8, verbose=0).ravel()
p_test = nd.uniform_filter1d(p_test, size=5)   # leichte Glättung



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
              precision    recall  f1-score   support

           0      0.222     0.167     0.190        12
           1      0.000     0.000     0.000         7

    accuracy                          0.105        19
   macro avg      0.111     0.083     0.095        19
weighted avg      0.140     0.105     0.120        19

