## Data Cleaning

In [251]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [252]:
# Load the raw customer churn CSV and preview its first five rows
df = pd.read_csv(filepath_or_buffer='DATA/raw/customer_churn_data.csv')
df.head(n=5)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,CUST0000,Male,0,No,Yes,23,No,No phone service,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Bank transfer,49.85,1146.55,No
1,CUST0001,Female,0,Yes,No,43,No,No phone service,DSL,Yes,...,Yes,No,Yes,No,Month-to-month,No,Mailed check,100.7,4330.1,Yes
2,CUST0002,Male,1,No,No,51,Yes,No,DSL,No,...,Yes,Yes,No,No,One year,No,Electronic check,97.33,4963.83,Yes
3,CUST0003,Male,1,No,No,72,Yes,Yes,DSL,Yes,...,Yes,No,No,No,Month-to-month,No,Credit card,101.38,7299.36,No
4,CUST0004,Male,1,No,No,25,Yes,Yes,DSL,No,...,No,Yes,No,Yes,Month-to-month,No,Electronic check,52.22,1305.5,Yes


In [253]:
# (rows, columns) of the raw frame
df.shape

(5880, 21)

In [254]:
# Per-column non-null counts and dtypes of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5880 entries, 0 to 5879
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        5880 non-null   object 
 1   gender            5880 non-null   object 
 2   SeniorCitizen     5880 non-null   int64  
 3   Partner           5880 non-null   object 
 4   Dependents        5880 non-null   object 
 5   tenure            5880 non-null   int64  
 6   PhoneService      5880 non-null   object 
 7   MultipleLines     5880 non-null   object 
 8   InternetService   5880 non-null   object 
 9   OnlineSecurity    5880 non-null   object 
 10  OnlineBackup      5880 non-null   object 
 11  DeviceProtection  5880 non-null   object 
 12  TechSupport       5880 non-null   object 
 13  StreamingTV       5880 non-null   object 
 14  StreamingMovies   5880 non-null   object 
 15  Contract          5880 non-null   object 
 16  PaperlessBilling  5880 non-null   object 


In [255]:
# Work on an independent copy so the raw frame `df` is left untouched by later processing
churn_pred_copy = df.copy(deep=True)
churn_pred_copy.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,CUST0000,Male,0,No,Yes,23,No,No phone service,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Bank transfer,49.85,1146.55,No
1,CUST0001,Female,0,Yes,No,43,No,No phone service,DSL,Yes,...,Yes,No,Yes,No,Month-to-month,No,Mailed check,100.7,4330.1,Yes
2,CUST0002,Male,1,No,No,51,Yes,No,DSL,No,...,Yes,Yes,No,No,One year,No,Electronic check,97.33,4963.83,Yes
3,CUST0003,Male,1,No,No,72,Yes,Yes,DSL,Yes,...,Yes,No,No,No,Month-to-month,No,Credit card,101.38,7299.36,No
4,CUST0004,Male,1,No,No,25,Yes,Yes,DSL,No,...,No,Yes,No,Yes,Month-to-month,No,Electronic check,52.22,1305.5,Yes


In [256]:
# Coerce TotalCharges to a numeric dtype; errors="coerce" turns unparseable entries into NaN.
churn_pred_copy['TotalCharges'] = pd.to_numeric(
    churn_pred_copy['TotalCharges'],
    errors='coerce',
)

# Number of NaN values left in the column after the conversion
churn_pred_copy['TotalCharges'].isnull().sum()

np.int64(0)

In [257]:
# Encode the working frame for modelling:
#   1. drop the identifier column,
#   2. map the 'Yes'/'No' columns to 1/0,
#   3. one-hot encode every remaining object-dtype column as 0/1 integers.

# customerID is dropped because it is not required for prediction.
# A rebinding drop is used instead of inplace=True (no benefit, hinders chaining).
churn_pred_copy = churn_pred_copy.drop(columns='customerID')

# binary encoding for 'Yes'/'No' columns
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
yes_no_codes = {'Yes': 1, 'No': 0}

for col in binary_cols:
    encoded = churn_pred_copy[col].map(yes_no_codes)
    # Series.map turns any value missing from the mapping into NaN; fail loudly
    # rather than letting silent NaNs leak into the features and the target.
    if encoded.isna().any():
        unexpected = churn_pred_copy.loc[encoded.isna(), col].unique().tolist()
        raise ValueError(
            f"Column {col!r} contains values other than 'Yes'/'No': {unexpected}"
        )
    churn_pred_copy[col] = encoded

# columns with multiple categories are one-hot encoded; the first category of each
# is dropped to avoid the dummy variable trap
multi_cat_cols = churn_pred_copy.select_dtypes(include="object").columns.tolist()

# dtype="int64" makes get_dummies emit 0/1 integers directly, so no element-wise
# True/False -> 1/0 pass over the whole frame is needed afterwards.
churn_pred_copy = pd.get_dummies(
    churn_pred_copy,
    columns=multi_cat_cols,
    drop_first=True,
    dtype="int64",
)
print(churn_pred_copy.shape)

(5880, 31)


In [258]:
# Preview the first 10 rows of the working frame
churn_pred_copy.head(10)

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,gender_Male,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card,PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,23,0,1,49.85,1146.55,0,1,...,0,1,0,1,0,0,0,0,0,0
1,0,1,0,43,0,0,100.7,4330.1,1,0,...,0,0,1,0,0,0,0,0,0,1
2,1,0,0,51,1,0,97.33,4963.83,1,1,...,1,0,0,0,0,1,0,0,1,0
3,1,0,0,72,1,0,101.38,7299.36,0,1,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,25,1,0,52.22,1305.5,1,1,...,1,0,0,0,1,0,0,0,1,0
5,0,1,0,35,1,0,116.96,4093.6,0,0,...,1,0,1,0,1,1,0,1,0,0
6,0,1,0,17,0,1,91.53,1556.01,1,1,...,1,0,0,0,1,1,0,0,0,0
7,0,1,1,18,1,0,26.52,477.36,1,1,...,1,0,0,0,1,1,0,0,0,1
8,0,0,0,27,0,0,67.77,1829.79,1,1,...,1,0,0,0,0,1,0,0,0,1
9,0,0,0,15,0,0,86.45,1296.75,1,0,...,0,1,0,1,0,1,0,0,1,0


In [259]:
# Preview the last 10 rows of the working frame
churn_pred_copy.tail(10)

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,gender_Male,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card,PaymentMethod_Electronic check,PaymentMethod_Mailed check
5870,1,0,1,36,0,0,40.28,1450.08,1,1,...,0,1,0,1,0,1,0,0,1,0
5871,0,0,0,2,1,0,107.55,215.1,1,0,...,1,0,0,0,1,1,0,0,0,0
5872,0,1,0,34,1,1,98.1,3335.4,1,1,...,0,0,0,0,1,1,0,0,1,0
5873,1,1,0,27,0,1,34.06,919.62,1,0,...,1,0,0,0,1,0,1,0,1,0
5874,0,0,1,37,1,0,56.74,2099.38,1,1,...,0,1,0,1,0,0,0,0,1,0
5875,0,1,1,71,1,1,74.21,5268.91,0,1,...,1,0,0,0,0,0,0,0,0,1
5876,0,0,0,22,1,0,65.43,1439.46,1,1,...,1,0,0,0,1,1,0,0,0,1
5877,0,0,0,68,0,1,59.78,4065.04,0,0,...,1,0,1,0,0,1,0,0,0,0
5878,0,1,1,14,0,1,91.88,1286.32,0,0,...,1,0,1,0,1,0,0,0,0,1
5879,1,1,0,23,1,0,25.45,585.35,1,0,...,0,1,0,1,0,0,0,0,0,1


In [260]:
# Print the dtype of every column in the working frame
print(churn_pred_copy.dtypes)

SeniorCitizen                             int64
Partner                                   int64
Dependents                                int64
tenure                                    int64
PhoneService                              int64
PaperlessBilling                          int64
MonthlyCharges                          float64
TotalCharges                            float64
Churn                                     int64
gender_Male                               int64
MultipleLines_No phone service            int64
MultipleLines_Yes                         int64
InternetService_Fiber optic               int64
InternetService_No                        int64
OnlineSecurity_No internet service        int64
OnlineSecurity_Yes                        int64
OnlineBackup_No internet service          int64
OnlineBackup_Yes                          int64
DeviceProtection_No internet service      int64
DeviceProtection_Yes                      int64
TechSupport_No internet service         

In [261]:
# Summary statistics for every column; include="all" asks pandas to describe all dtypes
churn_pred_copy.describe(include="all")

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,gender_Male,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card,PaymentMethod_Electronic check,PaymentMethod_Mailed check
count,5880.0,5880.0,5880.0,5880.0,5880.0,5880.0,5880.0,5880.0,5880.0,5880.0,...,5880.0,5880.0,5880.0,5880.0,5880.0,5880.0,5880.0,5880.0,5880.0,5880.0
mean,0.50068,0.509354,0.489116,36.54915,0.50017,0.491497,70.157779,2566.813165,0.492857,0.501701,...,0.326361,0.345068,0.330272,0.345068,0.328401,0.332313,0.32415,0.255612,0.253061,0.248639
std,0.500042,0.499955,0.499924,20.909674,0.500042,0.49997,28.804615,1910.017743,0.499991,0.50004,...,0.468921,0.475431,0.470351,0.475431,0.469672,0.471083,0.468096,0.436242,0.434803,0.432261
min,0.0,0.0,0.0,1.0,0.0,0.0,20.0,20.03,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,18.0,0.0,0.0,45.7175,1020.2175,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,0.0,37.0,1.0,0.0,70.155,2136.445,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,55.0,1.0,1.0,95.4575,3767.665,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
max,1.0,1.0,1.0,72.0,1.0,1.0,119.99,8589.6,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
