In [1]:
#Imports

import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime

In [2]:
# Read the processed Telco CSV into a DataFrame and preview its first rows.

df = pd.read_csv(filepath_or_buffer='../data/processed/binary_Telco_Data.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,1,0,1,0,No phone service,DSL,0,...,0,0,0,0,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,0,0,34,1,0,DSL,1,...,1,0,0,0,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,0,0,2,1,0,DSL,1,...,0,0,0,0,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,0,0,45,0,No phone service,DSL,1,...,1,1,0,0,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,0,0,2,1,0,Fiber optic,0,...,0,0,0,0,Month-to-month,1,Electronic check,70.7,151.65,1


In [3]:
# Summarise each column's dtype and non-null count for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   int64  
 4   Dependents        7032 non-null   int64  
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   int64  
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   int64  


In [4]:
# Cast the string-valued (object) columns to pandas' categorical dtype.
columns_to_process = [
    'gender',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

# A single astype call with a column -> dtype mapping converts every listed column.
df = df.astype({name: 'category' for name in columns_to_process})

In [5]:
# Re-inspect the dtypes to verify that the former object columns are now 'category'.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   customerID        7032 non-null   object  
 1   gender            7032 non-null   category
 2   SeniorCitizen     7032 non-null   int64   
 3   Partner           7032 non-null   int64   
 4   Dependents        7032 non-null   int64   
 5   tenure            7032 non-null   int64   
 6   PhoneService      7032 non-null   int64   
 7   MultipleLines     7032 non-null   category
 8   InternetService   7032 non-null   category
 9   OnlineSecurity    7032 non-null   category
 10  OnlineBackup      7032 non-null   category
 11  DeviceProtection  7032 non-null   category
 12  TechSupport       7032 non-null   category
 13  StreamingTV       7032 non-null   category
 14  StreamingMovies   7032 non-null   category
 15  Contract          7032 non-null   category
 16  PaperlessBilling  7032 n

All my relevant features are now either 'categories' or 'continuous' (floats and ints). I will first drop customerID since this feature has no bearing on my modeling.

In [6]:
# Drop the 'customerID' identifier column, then confirm it is gone.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution is a no-op instead of a KeyError.

df = df.drop(columns=['customerID'], errors='ignore')
assert 'customerID' not in df.columns, "customerID should have been removed"
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,1,0,1,0,No phone service,DSL,0,1,0,0,0,0,Month-to-month,1,Electronic check,29.85,29.85,0
1,Male,0,0,0,34,1,0,DSL,1,0,1,0,0,0,One year,0,Mailed check,56.95,1889.5,0
2,Male,0,0,0,2,1,0,DSL,1,1,0,0,0,0,Month-to-month,1,Mailed check,53.85,108.15,1
3,Male,0,0,0,45,0,No phone service,DSL,1,0,1,1,0,0,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,0,0,2,1,0,Fiber optic,0,0,0,0,0,0,Month-to-month,1,Electronic check,70.7,151.65,1


CustomerID is gone.  Now I will proceed with the preprocessing.  First, I will scale the numerical values, since the non-binary numbers are vastly greater in magnitude than the binary ones.  Then I will encode my categorical variables.  Lastly, I will perform a train-test split.

In [7]:
# Scaling numerical variables with Min-Max scaling so that every continuous
# feature lies between 0 (column minimum) and 1 (column maximum).
# NOTE(review): the scaler is fitted on the full dataset, before the train-test
# split performed later in this notebook, so test-set minima/maxima influence
# the scaling. Consider fitting on the training rows only.

bignumbers = ['tenure', 'MonthlyCharges', 'TotalCharges']

# MinMaxScaler scales each column independently, so a single fit over all three
# columns yields the same values as fitting one column at a time, and leaves a
# scaler that knows the range of every scaled feature.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[bignumbers])

# Store each scaled column next to the original as 'normalized_<name>'.
for position, column in enumerate(bignumbers):
    df[f'normalized_{column}'] = scaled_values[:, position]

df.head()


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,normalized_tenure,normalized_MonthlyCharges,normalized_TotalCharges
0,Female,0,1,0,1,0,No phone service,DSL,0,1,...,0,Month-to-month,1,Electronic check,29.85,29.85,0,0.0,0.115423,0.001275
1,Male,0,0,0,34,1,0,DSL,1,0,...,0,One year,0,Mailed check,56.95,1889.5,0,0.464789,0.385075,0.215867
2,Male,0,0,0,2,1,0,DSL,1,1,...,0,Month-to-month,1,Mailed check,53.85,108.15,1,0.014085,0.354229,0.01031
3,Male,0,0,0,45,0,No phone service,DSL,1,0,...,0,One year,0,Bank transfer (automatic),42.3,1840.75,0,0.619718,0.239303,0.210241
4,Female,0,0,0,2,1,0,Fiber optic,0,0,...,0,Month-to-month,1,Electronic check,70.7,151.65,1,0.014085,0.521891,0.01533


In [8]:
# Remove the raw numeric columns now that their normalized counterparts exist,
# so the model does not receive redundant features.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution is a no-op instead of a KeyError.

df = df.drop(columns=['tenure', 'MonthlyCharges', 'TotalCharges'], errors='ignore')
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn,normalized_tenure,normalized_MonthlyCharges,normalized_TotalCharges
0,Female,0,1,0,0,No phone service,DSL,0,1,0,0,0,0,Month-to-month,1,Electronic check,0,0.0,0.115423,0.001275
1,Male,0,0,0,1,0,DSL,1,0,1,0,0,0,One year,0,Mailed check,0,0.464789,0.385075,0.215867
2,Male,0,0,0,1,0,DSL,1,1,0,0,0,0,Month-to-month,1,Mailed check,1,0.014085,0.354229,0.01031
3,Male,0,0,0,0,No phone service,DSL,1,0,1,1,0,0,One year,0,Bank transfer (automatic),0,0.619718,0.239303,0.210241
4,Female,0,0,0,1,0,Fiber optic,0,0,0,0,0,0,Month-to-month,1,Electronic check,1,0.014085,0.521891,0.01533


Now I will shift focus to my categorical variables.  I must perform 'Dummy Encoding' in order to get n-1 features for n categories.  In other words, each categorical column is replaced by indicator columns, and the first category of each is dropped to serve as the baseline.

In [9]:
# List the categorical columns that still have to be encoded.

features2encode = df.select_dtypes(include='category')
print(features2encode.columns)
                                   

Index(['gender', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaymentMethod'],
      dtype='object')


In [11]:
# Dummy encoding: replace every categorical column with 0/1 indicator columns.
# drop_first=True drops the first level of each category, giving n-1 indicator
# columns for n categories.

# Derive the list from the dtype instead of hard-coding it, so it cannot drift
# out of sync with the columns that were cast to 'category' earlier.
categorical_columns = df.select_dtypes(include='category').columns.tolist()

df_dummies = pd.get_dummies(df, columns=categorical_columns, drop_first=True, dtype=int)
df_dummies.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,Churn,normalized_tenure,normalized_MonthlyCharges,normalized_TotalCharges,gender_Male,...,TechSupport_No internet service,StreamingTV_1,StreamingTV_No internet service,StreamingMovies_1,StreamingMovies_No internet service,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,0,1,0,0.0,0.115423,0.001275,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0.464789,0.385075,0.215867,1,...,0,0,0,0,0,1,0,0,0,1
2,0,0,0,1,1,1,0.014085,0.354229,0.01031,1,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0.619718,0.239303,0.210241,1,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,1,1,1,0.014085,0.521891,0.01533,0,...,0,0,0,0,0,0,0,0,1,0


As seen above, all categorical variables have now been encoded.  I can now move on to splitting the data into a training set and a testing set.

In [12]:
# Train-test split, step 1: list the encoded frame's columns so the feature
# columns and the target column can be identified.
print(df_dummies.columns)

Index(['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
       'PaperlessBilling', 'Churn', 'normalized_tenure',
       'normalized_MonthlyCharges', 'normalized_TotalCharges', 'gender_Male',
       'MultipleLines_1', 'MultipleLines_No phone service',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'OnlineSecurity_1', 'OnlineSecurity_No internet service',
       'OnlineBackup_1', 'OnlineBackup_No internet service',
       'DeviceProtection_1', 'DeviceProtection_No internet service',
       'TechSupport_1', 'TechSupport_No internet service', 'StreamingTV_1',
       'StreamingTV_No internet service', 'StreamingMovies_1',
       'StreamingMovies_No internet service', 'Contract_One year',
       'Contract_Two year', 'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')


In [14]:
# Train-test split (cont.): define X (features) and y (target 'Churn').
# X is every column of the encoded frame except the target. Dropping 'Churn'
# keeps the remaining columns in their existing order and avoids a hand-written
# column list that would go stale if the encoding step changes.

X = df_dummies.drop(columns=['Churn']).values
y = df_dummies['Churn'].values

# Stratify on y so both splits keep the same class proportions as the original
# dataset, which is particularly useful when the classes are imbalanced.
# 30% of the rows are held out for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21, stratify=y
)


In [15]:
# Persist the encoded frame as CSV (row index omitted) under a relative file name.
filename = 'preprocessed_telco_data.csv'
df_dummies.to_csv(path_or_buf=filename, index=False)