In [2]:
# Act3: Data Cleaning
import pandas as pd
import numpy as np

# adjust path if your CSV name differs
df = pd.read_csv("customer_churn.csv")  # or the original filename you have
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Convert TotalCharges to numeric then fill
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
print("Missing TotalCharges before fill:", df["TotalCharges"].isnull().sum())

# For new customers tenure==0, fill 0
df["TotalCharges"] = df["TotalCharges"].fillna(0)
print("Missing TotalCharges after fill:", df["TotalCharges"].isnull().sum())


Missing TotalCharges before fill: 11
Missing TotalCharges after fill: 0


In [4]:
# Drop customerID if present
if "customerID" in df.columns:
    df = df.drop("customerID", axis=1)

# Remove exact duplicates
df = df.drop_duplicates()
print("Shape after drop:", df.shape)


Shape after drop: (7021, 20)


In [5]:
# Standardize and encode Yes/No columns
yes_no_cols = [
    "Partner","Dependents","PhoneService","PaperlessBilling",
    "OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport",
    "StreamingTV","StreamingMovies","Churn"
]

for col in yes_no_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip().replace({
            "Yes": 1,
            "No": 0,
            "No internet service": 0,
            "No phone service": 0
        })

# Quick check
print(df[yes_no_cols].head())
print(df[yes_no_cols].isnull().sum())


   Partner  Dependents  PhoneService  PaperlessBilling  OnlineSecurity  \
0        1           0             0                 1               0   
1        0           0             1                 0               1   
2        0           0             1                 1               1   
3        0           0             0                 0               1   
4        0           0             1                 1               0   

   OnlineBackup  DeviceProtection  TechSupport  StreamingTV  StreamingMovies  \
0             1                 0            0            0                0   
1             0                 1            0            0                0   
2             1                 0            0            0                0   
3             0                 1            1            0                0   
4             0                 0            0            0                0   

   Churn  
0      0  
1      0  
2      1  
3      0  
4      1  
Partner 

  df[col] = df[col].astype(str).str.strip().replace({
  df[col] = df[col].astype(str).str.strip().replace({
  df[col] = df[col].astype(str).str.strip().replace({
  df[col] = df[col].astype(str).str.strip().replace({
  df[col] = df[col].astype(str).str.strip().replace({
  df[col] = df[col].astype(str).str.strip().replace({
  df[col] = df[col].astype(str).str.strip().replace({
  df[col] = df[col].astype(str).str.strip().replace({
  df[col] = df[col].astype(str).str.strip().replace({
  df[col] = df[col].astype(str).str.strip().replace({
  df[col] = df[col].astype(str).str.strip().replace({


In [6]:
# Save cleaned dataset for modeling and other notebooks
df.to_csv("customer_churn_cleaned.csv", index=False)
print("Saved customer_churn_cleaned.csv")


Saved customer_churn_cleaned.csv
