In [1]:
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import numpy as np
import datetime, sys, sklearn

# Locate the project root. When executed as a script (``__file__`` is defined)
# the root is the parent of the directory that contains this file; otherwise
# (e.g. in a notebook kernel) it is the parent of the current working directory.
if "__file__" in locals():
    ROOT_DIR = Path(__file__).resolve().parents[1]
else:
    ROOT_DIR = Path.cwd().parents[0]

# Input directory (cleaned splits) and output directory (encoded splits).
DATA_DIR = ROOT_DIR.joinpath("data", "cleaned")
OUT_DIR = ROOT_DIR.joinpath("data", "encoded")

# Create the output directory up front so the later CSV writes cannot fail on a missing folder.
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [2]:
# Read the three cleaned splits from DATA_DIR in train / val / test order.
train, val, test = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ("train_clean.csv", "val_clean.csv", "test_clean.csv")
)

# Show (rows, columns) of every split as a quick sanity check.
tuple(split.shape for split in (train, val, test))

((4929, 20), (1057, 20), (1057, 20))

In [3]:
# Partition the training columns by dtype: object-typed columns are treated as
# categorical, every other column as numeric.
categorical = list(train.select_dtypes(include="object").columns)
numeric = list(train.select_dtypes(exclude="object").columns)

print(f"Categorical: {categorical}")
print(f"Numeric: {numeric}")

Categorical: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Numeric: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [4]:
# Hand-written ordinal codes for the two ordered categorical columns.
mapping_contract = {"Month-to-month":1, "One year":12, "Two year":24}
mapping_internet = {"No":0, "DSL":1, "Fiber optic":2}


def _map_ordinal(frame, column, mapping, split_name):
    """Replace the labels in ``frame[column]`` with their codes from ``mapping``, in place.

    Two safeguards on top of a plain ``Series.map``:

    * Re-running the cell is a no-op: a column whose non-null values are all
      already codes from ``mapping`` is left untouched. A plain ``.map`` would
      turn those codes into NaN because they are not keys of ``mapping``.
    * A non-null label that is not a key of ``mapping`` raises instead of
      silently becoming NaN in the encoded output.

    Parameters
    ----------
    frame : pandas.DataFrame
        Split to modify in place.
    column : str
        Name of the column to encode.
    mapping : dict
        Label -> ordinal code.
    split_name : str
        Name of the split, used only in the error message.

    Raises
    ------
    ValueError
        If the column contains a non-null label that ``mapping`` does not cover.
    """
    values = frame[column]
    present = values.dropna()

    # Already encoded (cell executed before): nothing left to do.
    if len(present) > 0 and present.isin(list(mapping.values())).all():
        return

    unknown = sorted(set(present.unique()) - set(mapping), key=str)
    if unknown:
        raise ValueError(
            f"{split_name}[{column!r}] contains labels missing from the mapping: {unknown}"
        )

    # Missing values pass through unchanged, exactly as with a plain .map().
    frame[column] = values.map(mapping)


# Apply the same mappings to every split so the codes are consistent across them.
for _split_name, _frame in (("train", train), ("val", val), ("test", test)):
    _map_ordinal(_frame, "Contract", mapping_contract, _split_name)
    _map_ordinal(_frame, "InternetService", mapping_internet, _split_name)

In [5]:
# Columns expanded into indicator variables by the one-hot encoder.
onehot_cols = [
    'gender','Partner','Dependents','PhoneService','MultipleLines',
    'OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
    'StreamingTV','StreamingMovies','PaperlessBilling','PaymentMethod'
]

# NOTE(review): per the scikit-learn docs, handle_unknown="ignore" encodes an
# unseen category as all zeros; combined with drop="first" that is the same
# row as the dropped first category - confirm this is acceptable.
encoder = OneHotEncoder(sparse_output=False, drop="first", handle_unknown="ignore")

# The encoder is fitted on the training split only; val and test are
# transformed with the categories learned there.
train_matrix = encoder.fit_transform(train[onehot_cols])
feature_names = encoder.get_feature_names_out(onehot_cols)

# Wrap each dense matrix in a DataFrame that keeps the row index of its source
# split, so it can later be concatenated column-wise with that split.
encoded_train = pd.DataFrame(train_matrix, columns=feature_names, index=train.index)
encoded_val = pd.DataFrame(
    encoder.transform(val[onehot_cols]), columns=feature_names, index=val.index
)
encoded_test = pd.DataFrame(
    encoder.transform(test[onehot_cols]), columns=feature_names, index=test.index
)

In [6]:
def _with_encoded(frame, encoded):
    """Swap the raw one-hot source columns of ``frame`` for their encoded columns.

    The columns listed in ``onehot_cols`` are dropped, ``encoded`` is appended
    column-wise, and spaces and hyphens in the resulting column names are
    replaced with underscores.
    """
    combined = pd.concat([frame.drop(columns=onehot_cols), encoded], axis=1)
    return combined.rename(
        columns=lambda name: name.replace(" ", "_").replace("-", "_")
    )


X_train = _with_encoded(train, encoded_train)
X_val = _with_encoded(val, encoded_val)
X_test = _with_encoded(test, encoded_test)

# Persist each encoded split to OUT_DIR without the row index.
for split_frame, csv_name in (
    (X_train, "train_encoded.csv"),
    (X_val, "val_encoded.csv"),
    (X_test, "test_encoded.csv"),
):
    split_frame.to_csv(OUT_DIR / csv_name, index=False)

In [7]:
# Timestamp for the report header (naive local time, second resolution).
now = f"{datetime.datetime.now():%Y-%m-%d %H:%M:%S}"

# Markdown report describing the encoding step. The trailing double spaces
# inside the template are Markdown hard line breaks and must be kept.
# NOTE(review): the random_state value below is hard-coded; no random_state is
# used in the visible cells - confirm it refers to another pipeline step.
md = f"""# Encoding Strategy

**Datum/Zeit:** {now}  
**random_state:** 42  

---

## Overview
- Nominal variables encoded with **One-Hot-Encoding** (drop='first')
- Ordinal variables encoded via **manual mapping**
- Cleaned column names (no spaces or hyphens)
- Handled unknown categories using `handle_unknown="ignore"`

---

## Encoded Columns Summary
**Original columns:** {len(train.columns)}  
**After encoding:** {len(X_train.columns)}  

---

## Output
- data/encoded/train_encoded.csv  
- data/encoded/val_encoded.csv  
- data/encoded/test_encoded.csv  

---

## Reproducibility
- Python {sys.version.split()[0]}  
- pandas {pd.__version__}  
- scikit-learn {sklearn.__version__}
"""

# Make sure the report folder exists, then write the report. The last
# expression is the number of characters written, shown as the cell output.
REPORT_DIR = ROOT_DIR.joinpath("reports", "data_preparation")
REPORT_DIR.mkdir(parents=True, exist_ok=True)
(REPORT_DIR / "encoding_strategy.md").write_text(md, encoding="utf-8")

611