In [3]:
# ============================================================
# model_retraining_ohe.ipynb
# Purpose: Unified retraining with consistent one-hot encoding
# ============================================================

import pandas as pd
import joblib
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import os
import json
from datetime import datetime

# === 1. Load data ===
# The relative path is resolved against the current working directory.
train_path = "../data/processed/train.csv"
df = pd.read_csv(filepath_or_buffer=train_path)

# Quick sanity check: source path, (rows, columns) and the first three rows.
print(f"✅ Loaded dataset from {train_path}")
print("Shape:", df.shape)
display(df.iloc[:3])

# === 2. Define target variable ===
# "Churn" is the label; every remaining column is used as a feature.
X = df.drop(columns="Churn")
y = df["Churn"]

# === 3. One-hot encoding ===
# drop_first=False keeps an indicator column for every category level.
# The resulting column order is captured in feature_names.
X_encoded = pd.get_dummies(X, drop_first=False)
feature_names = list(X_encoded.columns)
print(f"Encoded features: {len(feature_names)}")

# === 4. Train/test split ===
# 20% of the rows are held out; stratifying on y keeps the class ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# === 5. Train model ===
# Hyperparameters are gathered in one mapping and unpacked into the
# constructor, so the full configuration can be read at a glance.
xgb_params = {
    "n_estimators": 200,
    "max_depth": 5,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
    "random_state": 42,
}
model = XGBClassifier(**xgb_params)

# Fit on the training partition only; the held-out rows stay untouched.
model.fit(X_train, y_train)

# === 6. Evaluation ===
# Predict on the held-out partition and score against the true labels.
y_pred = model.predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

# Headline F1 first, then the per-class precision/recall table.
print("\n📊 Evaluation:")
print("F1-Score:", round(f1, 3))
print(classification_report(y_true=y_test, y_pred=y_pred))

# === 7. Persist model + feature list ===
# NOTE(review): "models" is relative to the working directory, whereas the
# training data is read from "../data" — confirm this is the intended location.
os.makedirs("models", exist_ok=True)

# Serialize the fitted model and the encoded column list side by side.
for artifact, target in (
    (model, "models/xgboost_model.pkl"),
    (feature_names, "models/feature_names.pkl"),
):
    joblib.dump(artifact, target)

# Optional: also store the feature names as JSON for transparency.
with open("models/feature_names.json", "w") as f:
    f.write(json.dumps(feature_names, indent=2))

# === 8. Logging for reproducibility ===
# One record per training run: when it ran, which model, how many encoded
# features it saw, and the F1 score on the held-out split.
log_entry = {
    "timestamp": datetime.now().isoformat(timespec="seconds"),
    "model": "XGBoost",
    "n_features": len(feature_names),
    "f1_score": round(f1, 3)
}

# The log is a JSON array on disk; each run appends one record to it.
log_path = "models/training_log.json"
if os.path.exists(log_path):
    # Read through a context manager so the file handle is closed right away
    # instead of being left open until garbage collection.
    with open(log_path) as f:
        existing = json.load(f)
    # Tolerate a log file whose top-level value is not a list (e.g. a single
    # record): keep its content as the first entry rather than failing on
    # .append() below.
    if not isinstance(existing, list):
        existing = [existing]
else:
    existing = []
existing.append(log_entry)

# Rewrite the whole file with the extended history.
with open(log_path, "w") as f:
    json.dump(existing, f, indent=2)

print("\n✅ Model and feature list saved to /models/")
print(f"Total features: {len(feature_names)}")
print(f"🕓 Training log updated → {log_path}")

✅ Loaded dataset from ../data/processed/train.csv
Shape: (4929, 29)


Unnamed: 0,Churn,Contract,Dependents_Yes,DeviceProtection_No_internet_service,DeviceProtection_Yes,InternetService,MonthlyCharges,MultipleLines_No_phone_service,MultipleLines_Yes,OnlineBackup_No_internet_service,...,SeniorCitizen,StreamingMovies_No_internet_service,StreamingMovies_Yes,StreamingTV_No_internet_service,StreamingTV_Yes,TechSupport_No_internet_service,TechSupport_Yes,TotalCharges,gender_Male,tenure
0,0,24,0.0,0.0,1.0,1,0.416044,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,1.0,0.432157,1.0,0.861111
1,0,1,1.0,0.0,0.0,1,0.312407,1.0,0.0,0.0,...,0,0.0,1.0,0.0,1.0,0.0,0.0,0.353428,0.0,0.833333
2,0,12,0.0,0.0,1.0,1,0.425511,0.0,0.0,0.0,...,0,0.0,1.0,0.0,0.0,0.0,0.0,0.386641,1.0,0.736111


Encoded features: 28

📊 Evaluation:
F1-Score: 0.584
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       724
           1       0.65      0.53      0.58       262

    accuracy                           0.80       986
   macro avg       0.75      0.71      0.73       986
weighted avg       0.79      0.80      0.79       986


✅ Model and feature list saved to /models/
Total features: 28
🕓 Training log updated → models/training_log.json
