In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
def data_overview():
    print("Rows :  " , df.shape[0])
    print("Columns:  " , df.shape[1] )
    print()
    print("Feature types:  ")
    print(df.dtypes)
    print()
    print('Missing values : ' , df.isnull().sum().values.sum()) 
    print()
    print('Unique values:')
    print(df.nunique())
data_overview()

Rows :   7043
Columns:   21

Feature types:  
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

Missing values :  0

Unique values:
customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport    

In [4]:
df['TotalCharges']=pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

In [5]:
del_cols = ['customerID', 'gender', 'SeniorCitizen', 'Partner',
            'Dependents', 'PhoneService', 'MultipleLines', 'InternetService']
for col_name in del_cols:
    df = df.drop(col_name, axis=1)

In [6]:
data_overview()

Rows :   7043
Columns:   13

Feature types:  
tenure                int64
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

Missing values :  0

Unique values:
tenure                73
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64


In [7]:
cat_cols=[]
num_cols=[]
for col_name in df.columns:
    if df[col_name].dtypes == 'object':
        cat_cols.append(col_name)
    else:
        num_cols.append(col_name)

num_cols

['tenure', 'MonthlyCharges', 'TotalCharges']

In [8]:
bin_cols = df.nunique()[df.nunique()==2].keys().tolist()
bin_cols  # binary categorical features

['PaperlessBilling', 'Churn']

In [9]:
multi_cols = [col for col in cat_cols if col not in bin_cols]
multi_cols  # multiple categorical features

['OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaymentMethod']

In [10]:
ss = StandardScaler()
df_num = ss.fit_transform(df[num_cols])
df_num = pd.DataFrame(df_num,columns=num_cols)

In [11]:
df_multi = pd.get_dummies(data = df[multi_cols], columns=multi_cols)

In [12]:
le = LabelEncoder()
df_bin = df[bin_cols]
for col in bin_cols:
    df_bin[col] = le.fit_transform(df_bin[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [13]:
new_df = pd.concat([df_bin,df_multi,df_num],axis=1)
new_df.head()

Unnamed: 0,PaperlessBilling,Churn,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,...,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges
0,1,0,1,0,0,0,0,1,1,0,...,1,0,0,0,0,1,0,-1.277445,-1.160323,-0.992611
1,0,0,0,0,1,1,0,0,0,0,...,0,1,0,0,0,0,1,0.066327,-0.259629,-0.172165
2,1,1,0,0,1,0,0,1,1,0,...,1,0,0,0,0,0,1,-1.236724,-0.36266,-0.958066
3,0,0,0,0,1,1,0,0,0,0,...,0,1,0,1,0,0,0,0.514251,-0.746535,-0.193672
4,1,1,1,0,0,1,0,0,1,0,...,1,0,0,0,0,1,0,-1.236724,0.197365,-0.938874
