In [89]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder



#load dataset
data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [90]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [91]:
data['TotalCharges'] = data['TotalCharges'].astype(str)
data['TotalCharges'] = data['TotalCharges'].str.replace('...','')

#remove non numeric values
data['TotalCharges'] = data['TotalCharges'].str.replace(r'[^0-9.]', '')

missing_values = data['TotalCharges'].isnull().sum()
# If there are missing values, replace them with NaN
data['TotalCharges'] = data['TotalCharges'].str.strip()
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
# Convert the TotalCharges column to float
data['TotalCharges'] = data['TotalCharges'].astype(float)
data['TotalCharges'] = data['TotalCharges'].fillna(0)
data['TotalCharges'].isna().sum()


0

In [92]:
data['Churn'] = data['Churn'].astype('category')
data['Churn'] = data['Churn'].cat.codes


In [93]:
numerical = data[['tenure', 'MonthlyCharges', 'TotalCharges']]
categorical = data[['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',

               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',

               'Contract', 'PaperlessBilling', 'PaymentMethod']]

scaler = StandardScaler()
scaler.fit_transform(numerical)


array([[-1.27744458, -1.16032292, -0.99261052],
       [ 0.06632742, -0.25962894, -0.17216471],
       [-1.23672422, -0.36266036, -0.9580659 ],
       ...,
       [-0.87024095, -1.1686319 , -0.85293201],
       [-1.15528349,  0.32033821, -0.87051315],
       [ 1.36937906,  1.35896134,  2.01389665]])

In [98]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([('one-hot-encoder', OneHotEncoder(), ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',

               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',

               'Contract', 'PaperlessBilling', 'PaymentMethod'])], remainder='passthrough')
#
# For OneHotEncoder with drop='first', the code would look like the following
#
ct2 = ColumnTransformer([('one-hot-encoder', OneHotEncoder(drop='first'), ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',

               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',

               'Contract', 'PaperlessBilling', 'PaymentMethod'])], remainder='passthrough')
#
# Execute Fit_Transform
#
encoded = ct.fit_transform(categorical)

In [None]:
one_hot_fruit_features = ct.named_transformers_['one_hot_fruit'].get_feature_names_out(['gender'])
one_hot_color_features = ct.named_transformers_['one_hot_color'].get_feature_names_out(['color'])

In [101]:
encoded_df = pd.get_dummies(categorical, columns=['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',

               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',

               'Contract', 'PaperlessBilling', 'PaymentMethod'], drop_first=True)
encoded_df

Unnamed: 0,gender_Male,SeniorCitizen_1,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
1,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
2,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
3,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,False,False,False,False,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,True,False,True,True,True,False,True,False,False,False,...,False,True,False,True,True,False,True,False,False,True
7039,False,False,True,True,True,False,True,True,False,False,...,False,True,False,True,True,False,True,True,False,False
7040,False,False,True,True,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
7041,True,True,True,False,True,False,True,True,False,False,...,False,False,False,False,False,False,True,False,False,True


In [84]:
encoded_df = pd.DataFrame(encoded_cat)
encoded_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
