In [50]:
import pandas as pd
import numpy as np

In [51]:
df = pd.read_csv('Telco_Customer_Churn.csv')

In [52]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [53]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [55]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [56]:
df.duplicated().sum()

np.int64(0)

In [57]:
df['Contract'].value_counts()

Contract
Month-to-month    3875
Two year          1695
One year          1473
Name: count, dtype: int64

In [58]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [59]:
from sklearn.preprocessing import LabelEncoder

In [60]:
label_cols = ['gender', 'Partner', 'Dependents', 'PhoneService',
               'PaperlessBilling', 'Churn']
le = LabelEncoder()
df[label_cols] = df[label_cols].apply(le.fit_transform)

In [61]:
df['TechSupport'].value_counts()

TechSupport
No                     3473
Yes                    2044
No internet service    1526
Name: count, dtype: int64

In [62]:
onehot_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 
               'OnlineBackup', 'DeviceProtection', 'TechSupport', 
               'StreamingTV', 'StreamingMovies', 'PaymentMethod']

df = pd.get_dummies(df, columns=onehot_cols, drop_first=True, dtype=int)

In [63]:
df['Contract'] = le.fit_transform(df['Contract'])

In [64]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,Contract,PaperlessBilling,MonthlyCharges,...,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,0,1,0,1,0,0,1,29.85,...,0,0,0,0,0,0,0,0,1,0
1,5575-GNVDE,1,0,0,0,34,1,1,0,56.95,...,1,0,0,0,0,0,0,0,0,1
2,3668-QPYBK,1,0,0,0,2,1,0,1,53.85,...,0,0,0,0,0,0,0,0,0,1
3,7795-CFOCW,1,0,0,0,45,0,1,0,42.3,...,1,0,1,0,0,0,0,0,0,0
4,9237-HQITU,0,0,0,0,2,1,0,1,70.7,...,0,0,0,0,0,0,0,0,1,0


In [65]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [66]:
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']


In [67]:
df[numeric_cols] = sc.fit_transform(df[numeric_cols])

In [68]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,Contract,PaperlessBilling,MonthlyCharges,...,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,0,1,0,-1.277445,0,0,1,-1.160323,...,0,0,0,0,0,0,0,0,1,0
1,5575-GNVDE,1,0,0,0,0.066327,1,1,0,-0.259629,...,1,0,0,0,0,0,0,0,0,1
2,3668-QPYBK,1,0,0,0,-1.236724,1,0,1,-0.36266,...,0,0,0,0,0,0,0,0,0,1
3,7795-CFOCW,1,0,0,0,0.514251,0,1,0,-0.746535,...,1,0,1,0,0,0,0,0,0,0
4,9237-HQITU,0,0,0,0,-1.236724,1,0,1,0.197365,...,0,0,0,0,0,0,0,0,1,0


In [69]:
cleaned_df = df.drop(['customerID'], axis=1)

In [70]:
cleaned_df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,Contract,PaperlessBilling,MonthlyCharges,TotalCharges,...,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,-1.277445,0,0,1,-1.160323,-0.994194,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0.066327,1,1,0,-0.259629,-0.17374,...,1,0,0,0,0,0,0,0,0,1
2,1,0,0,0,-1.236724,1,0,1,-0.36266,-0.959649,...,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0.514251,0,1,0,-0.746535,-0.195248,...,1,0,1,0,0,0,0,0,0,0
4,0,0,0,0,-1.236724,1,0,1,0.197365,-0.940457,...,0,0,0,0,0,0,0,0,1,0


In [72]:
cleaned_df.to_csv('cleaned_telco_customer_churn.csv', index=False)