In [1]:
#Import & load cleaned data

import pandas as pd
import numpy as np

df = pd.read_csv('../data/cleaned/cleaned_telco_customer_churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


## Feature Engineering Plan
Here we will be creating new features that focus on three core factors:
- A: Customer Behavior Features - Demogra

In [2]:
## Binary Encoding
binary_cols = [
    'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling'
]

for col in binary_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0})

In [3]:
## Category encoding with multi-class features
multi_class_cols = [
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod'
]
for col in multi_class_cols:
    df = pd.get_dummies(df, columns=[col], prefix=col, drop_first=True)

In [None]:
## 5.1 Feature Creation: Create AvgMonthlyCharge feature
df['AvgMonthlyCharge'] = df['TotalCharges'] / (df['tenure'] + 1e-5)  # Adding a small constant to avoid division by zero


In [5]:
## 5.2 Customer Tenure Buckets
df['TenureBucket'] = pd.cut(
    df['tenure'],
    bins = [0,6,24,72],
    labels = ['New','Mid-Term','Long-Term'],
    right = False
)

In [6]:
## 6 Create Service Count Feature

service_cols = ['PhoneService','MultipleLines_Yes','OnlineSecurity_Yes','OnlineBackup_Yes',
                'DeviceProtection_Yes','TechSupport_Yes','StreamingTV_Yes','StreamingMovies_Yes']

df['NumServices'] = df[service_cols].sum(axis=1)

In [7]:
## Interaction Features: Tenure * MonthlyCharges
df['MonthlyCharges_Tenure'] = df['MonthlyCharges'] * df['tenure']

## Interaction Features: MonthlyCharges per service
df['MonthlyCharges_per_Service'] = df['MonthlyCharges'] / (df['NumServices'] + 1e-5)  # Avoid division by zero

In [None]:
## 8. Drop CustomerID column
df.drop('customerID', axis=1, inplace=True)

In [9]:
##Save Final Engineered dataset
df.to_csv('../data/cleaned/engineered_telco_customer_churn.csv', index=False)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,...,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,TenureBucket,NumServices,MonthlyCharges_Tenure,MonthlyCharges_per_Service
0,7590-VHVEG,Female,0,1,0,1,0,1,29.85,29.85,...,False,False,False,False,True,False,New,1,29.85,29.849702
1,5575-GNVDE,Male,0,0,0,34,1,0,56.95,1889.5,...,False,True,False,False,False,True,Long-Term,3,1936.3,18.98327
2,3668-QPYBK,Male,0,0,0,2,1,1,53.85,108.15,...,False,False,False,False,False,True,New,3,107.7,17.94994
3,7795-CFOCW,Male,0,0,0,45,0,0,42.3,1840.75,...,False,True,False,False,False,False,Long-Term,3,1903.5,14.099953
4,9237-HQITU,Female,0,0,0,2,1,1,70.7,151.65,...,False,False,False,False,True,False,New,1,141.4,70.699293
