In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [44]:
# Load the raw Telco churn table into `df`, the working frame used by every later cell.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
raw_csv_path = '/Users/limengfeiyang/CMOR438-final-project/data/telco_churn.csv'
df = pd.read_csv(raw_csv_path)

## 2.1 Drop Irrelevant Columns

In [45]:
# Remove the customerID column (treated as irrelevant per the section heading).
# errors="ignore" makes this a no-op when the column is already gone, so the
# cell can be re-run safely.
df = df.drop(columns=["customerID"], errors="ignore")


## 2.2 Convert `TotalCharges` to Numeric

In [46]:
# Parse TotalCharges as numbers. errors="coerce" converts any entry that cannot
# be parsed into NaN instead of raising; those NaNs are counted in the next step.
df = df.assign(TotalCharges=pd.to_numeric(df["TotalCharges"], errors="coerce"))


## 2.3 Handle Missing `TotalCharges`

In [47]:
# Count how many TotalCharges values are NaN before any rows are removed.
total_charges_is_missing = df["TotalCharges"].isna()
missing_before = total_charges_is_missing.sum()
print("Missing TotalCharges before handling:", missing_before)


Missing TotalCharges before handling: 11


### 2.3.1 Drop Rows with Missing `TotalCharges`

In [48]:
# Keep only rows that have a TotalCharges value, then renumber the index 0..n-1
# so positional and label-based indexing agree again.
df = df.dropna(subset=["TotalCharges"]).reset_index(drop=True)

# Sanity check: no NaN should be left in the column.
print("Missing after drop:", df["TotalCharges"].isna().sum())


Missing after drop: 0


## 2.4 Cast Numeric Columns

In [49]:
# Columns that should be stored as floating point.
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]

# Cast each listed column to float; all other columns keep their dtype.
df = df.astype({name: float for name in num_cols})

# Show the resulting dtypes to confirm the cast.
print(df[num_cols].dtypes)


tenure            float64
MonthlyCharges    float64
TotalCharges      float64
dtype: object


## 2.5 Encode Binary Categorical Features

In [50]:
# Encode two-level categorical columns as 0/1 integers:
#   gender                -> Female = 1, Male = 0
#   all other columns     -> Yes = 1, No = 0
yes_no_codes = {"Yes": 1, "No": 0}
binary_maps = {"gender": {"Female": 1, "Male": 0}}
binary_maps.update(
    {col: yes_no_codes for col in ["Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn"]}
)

# Series.map() turns every value that is missing from the mapping into NaN
# without raising. That silently destroys a column if this cell is re-run on
# already-encoded data, or if the raw file contains an unexpected label.
# Validate every column first and only then write the results back, so a
# failure never leaves `df` half-encoded.
encoded_columns = {}
for col, codes in binary_maps.items():
    encoded = df[col].map(codes)
    # Values that were present before mapping but are NaN afterwards had no
    # entry in `codes`.
    unexpected = df.loc[df[col].notna() & encoded.isna(), col].unique()
    if len(unexpected) > 0:
        raise ValueError(
            f"Column {col!r} contains values outside {sorted(codes)}: "
            f"{unexpected.tolist()}. If this cell was already run, reload the "
            "data before encoding again."
        )
    encoded_columns[col] = encoded

for col, encoded in encoded_columns.items():
    df[col] = encoded

# Preview the encoded columns.
df[list(binary_maps)].head()


Unnamed: 0,gender,Partner,Dependents,PhoneService,PaperlessBilling,Churn
0,1,1,0,0,1,0
1,0,0,0,1,0,0
2,0,0,0,1,1,1
3,0,0,0,0,0,0
4,1,0,0,1,1,1


## 2.6 One-Hot Encode Multi-Category Features

In [51]:
# Categorical features that are expanded into indicator (dummy) columns.
multi_cat = [
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaymentMethod",
]

# Replace each listed column with its dummy columns. drop_first=True omits the
# first level of every feature, so k categories produce k-1 columns.
df = pd.get_dummies(data=df, columns=multi_cat, drop_first=True)

# Report the width of the frame after encoding.
print("Total columns now:", df.shape[1])


Total columns now: 31


## 2.7 Save Cleaned Data

In [52]:
# Write the cleaned frame to disk without the index column, then report where
# it went. The path is stored once so the message always matches the file.
# NOTE(review): this is an absolute, machine-specific path.
clean_csv_path = "/Users/limengfeiyang/CMOR438-final-project/data/telco_churn_clean.csv"
df.to_csv(clean_csv_path, index=False)
print("Clean file written:", clean_csv_path)


Clean file written: /Users/limengfeiyang/CMOR438-final-project/data/telco_churn_clean.csv
