# Telco Churn Classification Project

### Kwame Taylor, Darden Cohort
#### Sept. 2020

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

from acquire import get_telco_data, new_telco_data
from prepare import prep_telco

import warnings
# Ignore every warning from here on (presumably to keep notebook output uncluttered).
# NOTE(review): this also hides deprecation warnings from pandas/sklearn.
warnings.filterwarnings("ignore")

## acquire.py

In [30]:
# Acquire the Telco data with get_telco_data(), imported from acquire.py.
# To replace my cached/local csv file with a fresh copy from SQL, I can call
# new_telco_data() instead.

df = get_telco_data()
# Preview the first three rows.
df.head(3)

Unnamed: 0,customer_id,contract_type_id,phone_service,internet_service_type_id,gender,senior_citizen,partner,dependents,tenure,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,monthly_charges,total_charges,churn
0,0002-ORFBO,2,Yes,1,Female,0,Yes,Yes,9,No,Yes,No,Yes,Yes,No,65.6,593.3,No
1,0003-MKNFE,1,Yes,1,Male,0,No,No,9,No,No,No,No,No,Yes,59.9,542.4,No
2,0004-TLHLJ,1,Yes,2,Male,0,No,No,4,No,No,Yes,No,No,No,73.9,280.85,Yes


In [31]:
# These are the columns I decided were relevant to acquire from the SQL data for exploration.
# Any that turn out to be unnecessary can be dropped when the time comes.

df.columns

Index(['customer_id', 'contract_type_id', 'phone_service',
       'internet_service_type_id', 'gender', 'senior_citizen', 'partner',
       'dependents', 'tenure', 'online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
       'monthly_charges', 'total_charges', 'churn'],
      dtype='object')

## prepare.py

In [None]:
# Prepare the data with prep_telco(), imported from prepare.py.
# NOTE(review): this cell has no execution count, and the cells below apply the
# preparation steps by hand to the acquired df -- presumably prep_telco() wraps
# those same steps; confirm before running both in one session.

df = prep_telco()
df.head(3)

In [32]:
# Make customer_id the row index, so it is no longer one of the regular columns.
df = df.set_index('customer_id')
df.head(3)

Unnamed: 0_level_0,contract_type_id,phone_service,internet_service_type_id,gender,senior_citizen,partner,dependents,tenure,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,monthly_charges,total_charges,churn
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0002-ORFBO,2,Yes,1,Female,0,Yes,Yes,9,No,Yes,No,Yes,Yes,No,65.6,593.3,No
0003-MKNFE,1,Yes,1,Male,0,No,No,9,No,No,No,No,No,Yes,59.9,542.4,No
0004-TLHLJ,1,Yes,2,Male,0,No,No,4,No,No,Yes,No,No,No,73.9,280.85,Yes


In [33]:
# Reduce the complexity of the data by collapsing the add-on package columns
# into a single count, num_add_ons. Step one: encode every add-on column as
# 1 (has the add-on) or 0 (does not, or has no internet service).
# NOTE: Series.map turns any value missing from the mapping into NaN, so
# re-running this cell on already-encoded columns would blank them out.

add_on_encoding = {'Yes': 1, 'No': 0, 'No internet service': 0}
for add_on in ('online_security', 'online_backup', 'device_protection',
               'tech_support', 'streaming_tv', 'streaming_movies'):
    df[add_on] = df[add_on].map(add_on_encoding)
df.head(3)

Unnamed: 0_level_0,contract_type_id,phone_service,internet_service_type_id,gender,senior_citizen,partner,dependents,tenure,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,monthly_charges,total_charges,churn
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0002-ORFBO,2,Yes,1,Female,0,Yes,Yes,9,0,1,0,1,1,0,65.6,593.3,No
0003-MKNFE,1,Yes,1,Male,0,No,No,9,0,0,0,0,0,1,59.9,542.4,No
0004-TLHLJ,1,Yes,2,Male,0,No,No,4,0,0,1,0,0,0,73.9,280.85,Yes


In [34]:
# Step two: count each customer's add-ons by summing the six 0/1 indicator
# columns row-wise. skipna=False keeps a missing indicator as NaN in the
# total, the same way chained `+` would.
add_on_columns = ['online_security', 'online_backup', 'device_protection',
                  'tech_support', 'streaming_tv', 'streaming_movies']
df['num_add_ons'] = df[add_on_columns].sum(axis=1, skipna=False)
df.head(3)

Unnamed: 0_level_0,contract_type_id,phone_service,internet_service_type_id,gender,senior_citizen,partner,dependents,tenure,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,monthly_charges,total_charges,churn,num_add_ons
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0002-ORFBO,2,Yes,1,Female,0,Yes,Yes,9,0,1,0,1,1,0,65.6,593.3,No,3
0003-MKNFE,1,Yes,1,Male,0,No,No,9,0,0,0,0,0,1,59.9,542.4,No,1
0004-TLHLJ,1,Yes,2,Male,0,No,No,4,0,0,1,0,0,0,73.9,280.85,Yes,1


In [35]:
# The individual add-on indicators are now summarized by num_add_ons,
# so the original columns are no longer needed.

redundant_add_ons = ['online_security', 'online_backup', 'device_protection',
                     'tech_support', 'streaming_tv', 'streaming_movies']
df = df.drop(columns=redundant_add_ons)
df.head(3)

Unnamed: 0_level_0,contract_type_id,phone_service,internet_service_type_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,churn,num_add_ons
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0002-ORFBO,2,Yes,1,Female,0,Yes,Yes,9,65.6,593.3,No,3
0003-MKNFE,1,Yes,1,Male,0,No,No,9,59.9,542.4,No,1
0004-TLHLJ,1,Yes,2,Male,0,No,No,4,73.9,280.85,Yes,1


In [36]:
# Encode phone_service as 1 (Yes) / 0 (No).
phone_encoding = {'Yes': 1, 'No': 0}
df['phone_service'] = df['phone_service'].map(phone_encoding)
df.head(3)

Unnamed: 0_level_0,contract_type_id,phone_service,internet_service_type_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,churn,num_add_ons
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0002-ORFBO,2,1,1,Female,0,Yes,Yes,9,65.6,593.3,No,3
0003-MKNFE,1,1,1,Male,0,No,No,9,59.9,542.4,No,1
0004-TLHLJ,1,1,2,Male,0,No,No,4,73.9,280.85,Yes,1


In [38]:
# Add a numeric gender flag: 1 for Male, 0 for Female.
gender_to_flag = {'Male': 1, 'Female': 0}
df['is_male'] = df['gender'].map(gender_to_flag)
df.head(3)

Unnamed: 0_level_0,contract_type_id,phone_service,internet_service_type_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,churn,num_add_ons,is_male
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0002-ORFBO,2,1,1,Female,0,Yes,Yes,9,65.6,593.3,No,3,0
0003-MKNFE,1,1,1,Male,0,No,No,9,59.9,542.4,No,1,1
0004-TLHLJ,1,1,2,Male,0,No,No,4,73.9,280.85,Yes,1,1


In [39]:
# gender is now captured by is_male, so drop the original text column.

df = df.drop(columns='gender')
df.head(3)

Unnamed: 0_level_0,contract_type_id,phone_service,internet_service_type_id,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,churn,num_add_ons,is_male
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0002-ORFBO,2,1,1,0,Yes,Yes,9,65.6,593.3,No,3,0
0003-MKNFE,1,1,1,0,No,No,9,59.9,542.4,No,1,1
0004-TLHLJ,1,1,2,0,No,No,4,73.9,280.85,Yes,1,1


In [40]:
# Encode the remaining Yes/No columns as 1s and 0s.
# NOTE: values outside the mapping become NaN under Series.map, so this cell
# should only be run once per freshly loaded df.

yes_no_to_int = {'Yes': 1, 'No': 0}
for column in ('partner', 'dependents', 'churn'):
    df[column] = df[column].map(yes_no_to_int)
df.head(3)

Unnamed: 0_level_0,contract_type_id,phone_service,internet_service_type_id,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,churn,num_add_ons,is_male
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0002-ORFBO,2,1,1,0,1,1,9,65.6,593.3,0,3,0
0003-MKNFE,1,1,1,0,0,0,9,59.9,542.4,0,1,1
0004-TLHLJ,1,1,2,0,0,0,4,73.9,280.85,1,1,1


In [41]:
# Give the longer column names shorter equivalents (old name --> new name).

shorter_names = {
    "contract_type_id": "contract_type",
    "phone_service": "phone",
    "internet_service_type_id": "internet_type",
    "senior_citizen": "senior",
    "dependents": "depend",
}
df = df.rename(columns=shorter_names)
df.head(3)

Unnamed: 0_level_0,contract_type,phone,internet_type,senior,partner,depend,tenure,monthly_charges,total_charges,churn,num_add_ons,is_male
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0002-ORFBO,2,1,1,0,1,1,9,65.6,593.3,0,3,0
0003-MKNFE,1,1,1,0,0,0,9,59.9,542.4,0,1,1
0004-TLHLJ,1,1,2,0,0,0,4,73.9,280.85,1,1,1


In [42]:
# Data is now tidy and ready to split into train, validate, and test.

In [43]:
# Next, I will move all of these preparation steps into my prepare.py file so I can import them and apply them to the data right after acquire.py.