In [7]:
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import datetime, sys, sklearn

# Resolve the project root. When the code runs as a script (`__file__` is
# defined) the root is taken two levels above this file; otherwise (e.g. in a
# notebook kernel, where `__file__` is absent) the parent of the current
# working directory is used.
if "__file__" in locals():
    ROOT_DIR = Path(__file__).resolve().parents[1]
else:
    ROOT_DIR = Path.cwd().parents[0]

# Input (encoded splits) and output (scaled splits) folders under <root>/data.
DATA_DIR = ROOT_DIR.joinpath("data", "encoded")
OUT_DIR = ROOT_DIR.joinpath("data", "scaled")

# Create the output folder up front; harmless if it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [8]:
# Read the train / validation / test CSVs from DATA_DIR, in that order.
train, val, test = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ("train_encoded.csv", "val_encoded.csv", "test_encoded.csv")
)

In [9]:
# Columns that get rescaled; all other columns are left as they are.
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]

# Learn the per-column min/max from the training split only, so that no
# statistics from the validation or test splits influence the transform.
scaler = MinMaxScaler()
scaler.fit(train[num_cols])

# Apply the same fitted transform to every split, overwriting the columns.
for _frame in (train, val, test):
    _frame[num_cols] = scaler.transform(_frame[num_cols])

In [10]:
# Write each scaled split to OUT_DIR without the DataFrame index column.
for _frame, _csv_name in (
    (train, "train_scaled.csv"),
    (val, "val_scaled.csv"),
    (test, "test_scaled.csv"),
):
    _frame.to_csv(OUT_DIR / _csv_name, index=False)

In [11]:
# Show the per-column minimum and maximum of the scaled training columns.
_train_ranges = train[num_cols].agg(["min", "max"])
print("Train ranges:\n", _train_ranges)

Train ranges:
      tenure  MonthlyCharges  TotalCharges
min     0.0             0.0           0.0
max     1.0             1.0           1.0


In [12]:
# Sanity check: every scaled training value must lie in [0, 1].
#
# The comparison is done element by element instead of on the column
# min()/max(): those aggregations skip NaN by default, so a column that
# contains missing values could still report min=0 / max=1 and slip through.
# Element-wise comparisons evaluate to False for NaN, so missing values now
# make the check fail. An empty frame is also treated as a failure rather
# than passing vacuously.
_scaled_train = train[num_cols]
_in_unit_range = _scaled_train.ge(0.0) & _scaled_train.le(1.0)
ok = bool(not _in_unit_range.empty and _in_unit_range.all().all())
print("✅ Scaling check passed" if ok else "⚠️ Scaling check failed")

✅ Scaling check passed


In [None]:
# Build and write the Markdown scaling report.
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Derive the feature list and the range table from num_cols so the report
# always describes exactly the columns that were scaled, instead of repeating
# the column names by hand.
applied_to = ", ".join(f"`{col}`" for col in num_cols)
range_rows = "\n".join(
    f"| {col} | {train[col].min():.2f} | {train[col].max():.2f} |"
    for col in num_cols
)

# NOTE(review): the report states "random_state: 42", but none of the visible
# cells use a random seed — presumably a shared report-template field; confirm.
# NOTE(review): the Note section lists XGBoost among scale-sensitive
# algorithms; tree-based models are generally insensitive to monotonic feature
# scaling — confirm the intended wording before publishing.
# Trailing double spaces inside the template are Markdown hard line breaks.
md = f"""# Scaling Report

**Datum/Zeit:** {now}  
**random_state:** 42  

---

## Overview
- Numerical features scaled using **Min-Max Normalization (0–1)**
- Applied to: {applied_to}
- Fitted on training data only, applied to validation/test for consistency
- Ensures feature comparability and stability in distance-based and gradient models

---

## Value Ranges (after scaling)
| Feature | Min | Max |
|----------|----:|----:|
{range_rows}

---

## Output
- `data/scaled/train_scaled.csv`  
- `data/scaled/val_scaled.csv`  
- `data/scaled/test_scaled.csv`

---

## Reproducibility
- Python {sys.version.split()[0]}  
- pandas {pd.__version__}  
- scikit-learn {sklearn.__version__}

---

## Note
Scaling improves numerical stability, speeds up convergence,  
and prevents bias in algorithms sensitive to feature magnitudes (e.g., XGBoost, kNN, logistic regression).
"""

REPORT_DIR = ROOT_DIR / "reports" / "data_preparation"
REPORT_DIR.mkdir(parents=True, exist_ok=True)
# Kept as the last expression so the cell still displays the number of
# characters written.
(REPORT_DIR / "scaling_report.md").write_text(md, encoding="utf-8")

947