In [18]:
%pip install -q pandas scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [19]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
import datetime, sys, sklearn

# Projekt-Root automatisch finden (z. B. wenn das Notebook in /notebooks/ liegt)
ROOT_DIR = Path(__file__).resolve().parents[1] if "__file__" in locals() else Path.cwd().parents[0]

RAW_PATH = ROOT_DIR / "data" / "raw" / "IBMTelco_Datensatz.csv"
OUT_DIR = ROOT_DIR / "data" / "splits"
OUT_DIR.mkdir(parents=True, exist_ok=True)
random_state = 42

In [20]:
df = pd.read_csv(RAW_PATH)
df = df.rename(columns=lambda c: c.strip())
df = df.drop(columns=["customerID"])
df["Churn"] = df["Churn"].map({"Yes":1,"No":0}).astype(int)

In [21]:
y = df["Churn"]
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=random_state)
train_val_idx, test_idx = next(sss1.split(df, y))
train_val = df.iloc[train_val_idx].reset_index(drop=True)
test = df.iloc[test_idx].reset_index(drop=True)

In [22]:
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.1764705882, random_state=random_state)
train_idx, val_idx = next(sss2.split(train_val, train_val["Churn"]))
train = train_val.iloc[train_idx].reset_index(drop=True)
val = train_val.iloc[val_idx].reset_index(drop=True)

In [23]:
train.to_csv(OUT_DIR / "train.csv", index=False)
val.to_csv(OUT_DIR / "val.csv", index=False)
test.to_csv(OUT_DIR / "test.csv", index=False)

In [24]:
def stats(d):
    n = len(d)
    c = d["Churn"].sum()
    r = c / n
    return n, c, r

tot_n, tot_c, tot_r = stats(df)
tr_n, tr_c, tr_r = stats(train)
va_n, va_c, va_r = stats(val)
te_n, te_c, te_r = stats(test)

summary = pd.DataFrame({
    "Split":["Gesamt","Train","Validation","Test"],
    "n":[tot_n,tr_n,va_n,te_n],
    "Churn n":[tot_c,tr_c,va_c,te_c],
    "Churn %":[round(tot_r*100,1),round(tr_r*100,1),round(va_r*100,1),round(te_r*100,1)]
})
summary

Unnamed: 0,Split,n,Churn n,Churn %
0,Gesamt,7043,1869,26.5
1,Train,4929,1308,26.5
2,Validation,1057,281,26.6
3,Test,1057,280,26.5


In [25]:
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

md = f"""# Data Selection

- Datum/Zeit: {now}
- random_state: {random_state}
- Quelle: {RAW_PATH}

## Umfang
| Split | n | Churn n | Churn % |
|---|---:|---:|---:|
| Gesamt | {tot_n} | {tot_c} | {tot_r*100:.1f}% |
| Train | {tr_n} | {tr_c} | {tr_r*100:.1f}% |
| Validation | {va_n} | {va_c} | {va_r*100:.1f}% |
| Test | {te_n} | {te_c} | {te_r*100:.1f}% |

## Vorgehen
- Alle Variablen außer `customerID`
- Stratified 70/15/15 (auf `Churn`)
- Reproduzierbar mit random_state={random_state}

## Reproduzierbarkeit
- Python {sys.version.split()[0]} • pandas {pd.__version__} • scikit-learn {sklearn.__version__}
"""
Path("data_selection.md").write_text(md, encoding="utf-8")

546