# Data Cleaning

## Dataset: Telco_Customer_Churn.csv

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [3]:
# Filesystem layout: the project root is the parent of the current working directory
PROJECT_ROOT = Path("..").resolve()
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_DATA_PATH = DATA_DIR.joinpath("Telco_Customer_Churn.csv")
CLEAN_DATA_PATH = DATA_DIR.joinpath("cleaned_data.csv")

# Echo every resolved location so a wrong working directory is easy to spot
for label, location in (
    ("Root Directory: ", PROJECT_ROOT),
    ("Data Directory: ", DATA_DIR),
    ("Raw Data Path: ", RAW_DATA_PATH),
    ("Cleaned Data Path: ", CLEAN_DATA_PATH),
):
    print(label, location)

Root Directory:  D:\study_material\ProjectFinal\churn_prediction
Data Directory:  D:\study_material\ProjectFinal\churn_prediction\data
Raw Data Path:  D:\study_material\ProjectFinal\churn_prediction\data\Telco_Customer_Churn.csv
Cleaned Data Path:  D:\study_material\ProjectFinal\churn_prediction\data\cleaned_data.csv


In [4]:
# Read the raw CSV located at RAW_DATA_PATH into a DataFrame
data = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)


In [5]:
# Work on a deep copy so the freshly loaded `data` frame stays unmodified
df = data.copy(deep=True)

In [6]:
# Dataset overview: dimensions, column names, duplicate rows, per-column null counts
n_rows, n_cols = df.shape
print("No. of rows: ", n_rows)
print("No. of columns: ", n_cols)
print("All columns: \n", list(df.columns))
print("Duplicate Count: ", df.duplicated().sum())
print("Null Values: \n", df.isna().sum())
print("\nInfo:\n")
# info() writes the index range, dtypes and non-null counts straight to stdout
df.info()


No. of rows:  7043
No. of columns:  21
All columns: 
 ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
Duplicate Count:  0
Null Values: 
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns

In [7]:
# Parse "TotalCharges" as numbers; entries that cannot be parsed become NaN (errors="coerce")
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges
print("Data Type of TotalCharges: ", df["TotalCharges"].dtype)


Data Type of TotalCharges:  float64


In [8]:
# Report how many TotalCharges values are NaN, then discard those rows
n_missing = df["TotalCharges"].isna().sum()
print("Missing TotalCharges: ", n_missing)
df = df.dropna(axis=0, subset=["TotalCharges"])


Missing TotalCharges:  11


In [9]:
# Drop the customer identifier column.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell re-runnable: executing it a second time no longer raises KeyError once
# "customerID" has already been removed.
df = df.drop(columns=["customerID"], errors="ignore")


In [10]:
# Collapse the "No internet service" / "No phone service" placeholders into a plain "No"
replace_cols = [
    "MultipleLines",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]
# Same mapping is applied to every listed column; other values are left as they are
no_service_map = {"No internet service": "No", "No phone service": "No"}
for column in replace_cols:
    df[column] = df[column].replace(no_service_map)


In [11]:
# Remove rows that are exact copies of an earlier row (first occurrence is kept)
# NOTE(review): this runs after "customerID" was dropped. The recorded outputs show
# 0 duplicates in the raw data, yet the row count falls from 7032 (7043 - 11) to 7010
# here, so the removed rows appear to be different customers with identical
# attributes -- confirm that discarding them is intended.
df = df.drop_duplicates(keep="first")

# Post-cleaning overview: remaining columns, final shape, target class balance
final_columns = list(df.columns)
churn_counts = df["Churn"].value_counts()
print("\nFinal list of columns: \n", final_columns)
print("\nCleaned dataset shape: ", df.shape)
print("\nChurn distribution:\n", churn_counts)



Final list of columns: 
 ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Cleaned dataset shape:  (7010, 20)

Churn distribution:
 Churn
No     5153
Yes    1857
Name: count, dtype: int64


In [12]:
# Persist the cleaned frame as CSV (row index omitted) and report the destination
df.to_csv(path_or_buf=CLEAN_DATA_PATH, index=False)
print("Cleaned dataset saved at:", CLEAN_DATA_PATH)


Cleaned dataset saved at: D:\study_material\ProjectFinal\churn_prediction\data\cleaned_data.csv
