In [None]:
import pandas as pd

df = pd.read_csv("../docs/Sample Data/sample_customer_churn.csv")


print(df.head())

X = df.drop(columns=["churn"])
print(X)


  customer_id  gender  senior_citizen partner dependents  tenure_months  \
0        C001  Female               0     Yes         No             12   
1        C002    Male               0      No         No              3   
2        C003  Female               0     Yes        Yes             48   
3        C004    Male               1      No         No              6   
4        C005  Female               0     Yes        Yes             72   

    contract_type    payment_method  monthly_charges  total_charges  ...  \
0  Month-to-month  Electronic check             70.5          846.0  ...   
1  Month-to-month      Mailed check             89.1          267.3  ...   
2        One year       Credit card             55.2         2649.6  ...   
3  Month-to-month  Electronic check             95.0          570.0  ...   
4        Two year     Bank transfer             45.0         3240.0  ...   

  streaming_tv streaming_movies multiple_lines avg_monthly_usage_gb  \
0          Yes       

In [43]:
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
numeric_features


['senior_citizen',
 'tenure_months',
 'monthly_charges',
 'total_charges',
 'avg_monthly_usage_gb',
 'support_tickets_last_6m',
 'late_payments_last_year']

In [41]:
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()
categorical_features


['customer_id',
 'gender',
 'partner',
 'dependents',
 'contract_type',
 'payment_method',
 'internet_service',
 'online_security',
 'tech_support',
 'paperless_billing',
 'streaming_tv',
 'streaming_movies',
 'multiple_lines',
 'autopay_enabled',
 'billing_cycle',
 'region']

In [45]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric pipeline
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical pipeline
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Full preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [47]:
X = df.drop(columns=["churn"])
y = df["churn"]

X_processed = preprocessor.fit_transform(X)

X_processed.shape


(15, 57)