In [None]:
import pandas as pd
import numpy as np
import os

# Raw dataset path: the Telco churn CSV that the load cell reads.
# (Previously a placeholder name that did not match the file actually loaded,
# so the tuple displayed below misreported the pipeline's input.)
raw_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Staged output folder; created up front (exist_ok makes re-runs safe) so the
# final export does not fail on a missing directory.
staged_dir = "./data/staged"
os.makedirs(staged_dir, exist_ok=True)

# Destination of the transformed dataset.
output_path = os.path.join(staged_dir, "telecom_transformed.csv")

# Display both paths as the cell output for a quick sanity check.
raw_path, output_path


In [None]:
# Load the raw Telco customer-churn CSV from the working directory.
# The filename is hard-coded here; keep it in sync with raw_path in the setup cell.
df = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Preview the first five rows.
df.head(n=5)


In [None]:
# TotalCharges is parsed to numbers; anything unparseable becomes NaN
# (errors="coerce") so it can be imputed together with the other numerics.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Numeric columns: replace missing values with each column's own median.
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
df = df.fillna({name: df[name].median() for name in num_cols})

# Object-typed (categorical) columns: replace missing values with "Unknown".
cat_cols = df.select_dtypes(include="object").columns
df = df.fillna({name: "Unknown" for name in cat_cols})

df.head()


In [None]:
# Feature engineering: derive five columns from the cleaned frame.

# 1) tenure_group — bucket the tenure column into four labelled ranges.
#    The last edge is open-ended (np.inf) instead of df["tenure"].max():
#    pd.cut requires increasing, unique edges, so a data-derived upper edge
#    raises ValueError whenever the observed maximum is <= 60.
df["tenure_group"] = pd.cut(
    df["tenure"],
    bins=[0, 12, 36, 60, np.inf],
    labels=["New", "Regular", "Loyal", "Champion"],
    include_lowest=True,  # keep tenure == 0 inside the first bucket
)

# 2) monthly_charge_segment — bucket MonthlyCharges into three labelled ranges.
#    Open-ended last edge for the same reason as above (a maximum <= 70 would
#    otherwise break the bin definition).
df["monthly_charge_segment"] = pd.cut(
    df["MonthlyCharges"],
    bins=[0, 30, 70, np.inf],
    labels=["Low", "Medium", "High"],
    include_lowest=True,  # keep a charge of exactly 0 inside "Low"
)

# 3) has_internet_service — 1 for DSL / Fiber optic, 0 for "No".
#    Any value outside the mapping is treated as 0; the cast keeps the column
#    integer-typed even when such unmapped values were present.
df["has_internet_service"] = (
    df["InternetService"]
    .map({"DSL": 1, "Fiber optic": 1, "No": 0})
    .fillna(0)
    .astype(int)
)

# 4) is_multi_line_user — 1 only when MultipleLines is exactly "Yes"
#    (vectorised comparison instead of a row-wise apply).
df["is_multi_line_user"] = (df["MultipleLines"] == "Yes").astype(int)

# 5) contract_type_code — ordinal code for the contract type.
#    NOTE(review): unmapped contract values fall back to 0, the same code as
#    "Month-to-month" — confirm this default is intended.
df["contract_type_code"] = (
    df["Contract"]
    .map({"Month-to-month": 0, "One year": 1, "Two year": 2})
    .fillna(0)
    .astype(int)
)

df.head()


In [None]:
# Remove the customerID and gender columns before the frame is written out.
# The result is rebound rather than mutated with inplace=True, which pandas
# discourages; errors="ignore" makes re-running this cell a no-op.
df = df.drop(columns=["customerID", "gender"], errors="ignore")

df.head()


In [None]:
# Write the transformed frame to the staged location, without the index column.
df.to_csv(output_path, index=False)
# Status message; the leading check mark was previously stored as mojibake
# (UTF-8 bytes decoded as cp1252).
print(f"✅ Transform complete!\nSaved at: {output_path}")
