In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from prepare import telco_work, telco_pipeline
from acquire import get_telco_data

In [3]:
df = telco_work()
df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,Male,0,No,No,9,Yes,Yes,No,No,No,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,Male,0,No,No,4,Yes,No,No,No,Yes,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,Female,1,Yes,No,3,Yes,No,No,No,No,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [4]:
print(df.dtypes)

gender                    object
senior_citizen             int64
partner                   object
dependents                object
tenure                     int64
phone_service             object
multiple_lines            object
online_security           object
online_backup             object
device_protection         object
tech_support              object
streaming_tv              object
streaming_movies          object
paperless_billing         object
monthly_charges          float64
total_charges             object
churn                     object
contract_type             object
internet_service_type     object
payment_type              object
dtype: object


It seems total charges should be a float

In [5]:
#conveert blanks to 0 and change type to float
df['total_charges'] = df['total_charges'].replace(' ', 0).astype(float)

In [6]:
print(df.dtypes)

gender                    object
senior_citizen             int64
partner                   object
dependents                object
tenure                     int64
phone_service             object
multiple_lines            object
online_security           object
online_backup             object
device_protection         object
tech_support              object
streaming_tv              object
streaming_movies          object
paperless_billing         object
monthly_charges          float64
total_charges            float64
churn                     object
contract_type             object
internet_service_type     object
payment_type              object
dtype: object


In [7]:
#find any nulls in the data
df.isna().sum()

gender                      0
senior_citizen              0
partner                     0
dependents                  0
tenure                      0
phone_service               0
multiple_lines              0
online_security             0
online_backup               0
device_protection           0
tech_support                0
streaming_tv                0
streaming_movies            0
paperless_billing           0
monthly_charges             0
total_charges               0
churn                       0
contract_type               0
internet_service_type    1526
payment_type                0
dtype: int64

Seems like internet sercice has 1500 nulls. 
lets investigate why

In [8]:
df[df.internet_service_type.isna()].head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
20,Female,1,Yes,No,50,Yes,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,25.2,1306.3,No,One year,,Electronic check
23,Female,0,No,No,3,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,19.85,57.2,No,Month-to-month,,Mailed check
24,Female,0,Yes,Yes,4,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,20.35,76.35,Yes,Month-to-month,,Mailed check
27,Male,0,Yes,Yes,54,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,20.4,1090.6,No,Two year,,Credit card (automatic)
28,Male,0,No,No,26,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,19.6,471.85,No,One year,,Bank transfer (automatic)


It was null because the customer has no internet service.
Lets replace the null values with no internet service

In [9]:
df['internet_service_type'] = df['internet_service_type'].fillna("No internet service")

In [10]:
#confirm no nulls, and categorical options
print(df['internet_service_type'].isna().sum())
print(df['internet_service_type'].unique())

0
['DSL' 'Fiber optic' 'No internet service']


Lets check the unique values of evertying else

In [None]:
df.columns

In [None]:
print(df[['gender', 'senior_citizen', 'partner', 'dependents', 'tenure',
          'phone_service', 'multiple_lines', 'online_security', 'online_backup',
          'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
          'paperless_billing', 'monthly_charges', 'total_charges', 'churn',
          'contract_type', 'internet_service_type', 'payment_type']].unique())


In [13]:
columns_to_check = ['gender', 'senior_citizen', 'partner', 'dependents', 'tenure',
                    'phone_service', 'multiple_lines', 'online_security', 'online_backup',
                    'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
                    'paperless_billing', 'churn',
                    'contract_type', 'internet_service_type', 'payment_type']

# Loop through the columns and print unique values
for column in columns_to_check:
    unique_values = df[column].unique()
    print(f"Unique in {column}: {unique_values}")

Unique in gender: ['Female' 'Male']
Unique in senior_citizen: [0 1]
Unique in partner: ['Yes' 'No']
Unique in dependents: ['Yes' 'No']
Unique in tenure: [ 9  4 13  3 71 63  7 65 54 72  5 56 34  1 45 50 23 55 26 69 37 49 66 67
 20 43 59 12 27  2 25 29 14 35 64 39 40 11  6 30 70 57 58 16 32 33 10 21
 61 15 44 22 24 19 47 62 46 52  8 60 48 28 41 53 68 31 36 17 18 51 38 42
  0]
Unique in phone_service: ['Yes' 'No']
Unique in multiple_lines: ['No' 'Yes' 'No phone service']
Unique in online_security: ['No' 'Yes' 'No internet service']
Unique in online_backup: ['Yes' 'No' 'No internet service']
Unique in device_protection: ['No' 'Yes' 'No internet service']
Unique in tech_support: ['Yes' 'No' 'No internet service']
Unique in streaming_tv: ['Yes' 'No' 'No internet service']
Unique in streaming_movies: ['No' 'Yes' 'No internet service']
Unique in paperless_billing: ['Yes' 'No']
Unique in churn: ['No' 'Yes']
Unique in contract_type: ['One year' 'Month-to-month' 'Two year']
Unique in internet_ser

lets create another DF to create the dummies so we have a place to revert to

In [36]:
dfdummy = df

In [30]:
dfdummy.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,Male,0,No,No,9,Yes,Yes,No,No,No,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,Male,0,No,No,4,Yes,No,No,No,Yes,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,Female,1,Yes,No,3,Yes,No,No,No,No,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [37]:
dfdummy = pd.get_dummies(dfdummy)

In [32]:
dfdummy.columns

Index(['senior_citizen', 'tenure', 'monthly_charges', 'total_charges',
       'gender_Female', 'gender_Male', 'partner_No', 'partner_Yes',
       'dependents_No', 'dependents_Yes', 'phone_service_No',
       'phone_service_Yes', 'multiple_lines_No',
       'multiple_lines_No phone service', 'multiple_lines_Yes',
       'online_security_No', 'online_security_No internet service',
       'online_security_Yes', 'online_backup_No',
       'online_backup_No internet service', 'online_backup_Yes',
       'device_protection_No', 'device_protection_No internet service',
       'device_protection_Yes', 'tech_support_No',
       'tech_support_No internet service', 'tech_support_Yes',
       'streaming_tv_No', 'streaming_tv_No internet service',
       'streaming_tv_Yes', 'streaming_movies_No',
       'streaming_movies_No internet service', 'streaming_movies_Yes',
       'paperless_billing_No', 'paperless_billing_Yes', 'churn_No',
       'churn_Yes', 'contract_type_Month-to-month', 'contract_type

In [33]:
dfdummy = dfdummy.drop(columns= [ 'gender_Female', 'partner_No', 'dependents_No', 'phone_service_No', 'multiple_lines_No', 'multiple_lines_No phone service', 'online_security_No', 'online_security_No internet service', 'online_backup_No', 'online_backup_No internet service', 'device_protection_No', 'device_protection_No internet service', 'tech_support_No', 'tech_support_No internet service', 'streaming_tv_No', 'streaming_tv_No internet service', 'streaming_movies_No', 'streaming_movies_No internet service', 'paperless_billing_No', 'churn_No', 'internet_service_type_No internet service'])

In [38]:
dfdummy = dfdummy.drop(
                        columns= ['gender_Female', 'partner_No', 'dependents_No', 'phone_service_No', 'multiple_lines_No', 
                                    'multiple_lines_No phone service', 'online_security_No', 'online_security_No internet service', 
                                    'online_backup_No', 'online_backup_No internet service', 'device_protection_No', 'device_protection_No internet service', 
                                    'tech_support_No', 'tech_support_No internet service', 'streaming_tv_No', 'streaming_tv_No internet service', 
                                    'streaming_movies_No', 'streaming_movies_No internet service', 'paperless_billing_No', 'churn_No', 
                                    'internet_service_type_No internet service'])

In [39]:
dfdummy.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,multiple_lines_Yes,online_security_Yes,...,churn_Yes,contract_type_Month-to-month,contract_type_One year,contract_type_Two year,internet_service_type_DSL,internet_service_type_Fiber optic,payment_type_Bank transfer (automatic),payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,0,9,65.6,593.3,False,True,True,True,False,False,...,False,False,True,False,True,False,False,False,False,True
1,0,9,59.9,542.4,True,False,False,True,True,False,...,False,True,False,False,True,False,False,False,False,True
2,0,4,73.9,280.85,True,False,False,True,False,False,...,True,True,False,False,False,True,False,False,True,False
3,1,13,98.0,1237.85,True,True,False,True,False,False,...,True,True,False,False,False,True,False,False,True,False
4,1,3,83.9,267.4,False,True,False,True,False,False,...,True,True,False,False,False,True,False,False,False,True


In [35]:
column_mapping = {
    'senior_citizen': 'senior_citizen',
    'tenure': 'tenure',
    'monthly_charges': 'monthly_charges',
    'total_charges': 'total_charges',
    'gender_Male': 'Male',
    'partner_Yes': 'partner',
    'dependents_Yes': 'dependents',
    'phone_service_Yes': 'phone_service',
    'multiple_lines_Yes': 'multiple_lines',
    'online_security_Yes': 'online_security',
    'online_backup_Yes': 'online_backup',
    'device_protection_Yes': 'device_protection',
    'tech_support_Yes': 'tech_support',
    'streaming_tv_Yes': 'streaming_tv',
    'streaming_movies_Yes': 'streaming_movies',
    'paperless_billing_Yes': 'paperless_billing',
    'churn_Yes': 'churn',
    'contract_type_Month-to-month': 'Contract Month',
    'contract_type_One year': 'Contract One year',
    'contract_type_Two year': 'Contract Two year',
    'internet_service_type_DSL': 'internet DSL',
    'internet_service_type_Fiber optic': 'internet Fiber optic',
    'payment_type_Bank transfer (automatic)': 'payment Bank transfer',
    'payment_type_Credit card (automatic)': 'payment Credit card',
    'payment_type_Electronic check': 'payment Electronic check',
    'payment_type_Mailed check': 'payment Mailed check'
}

# Rename the columns using the mapping
df.rename(columns=column_mapping, inplace=True)

# Print the new column names
print(df.columns)

Index(['senior_citizen', 'tenure', 'monthly_charges', 'total_charges',
       'gender_Male', 'partner_Yes', 'dependents_Yes', 'phone_service_Yes',
       'multiple_lines_Yes', 'online_security_Yes', 'online_backup_Yes',
       'device_protection_Yes', 'tech_support_Yes', 'streaming_tv_Yes',
       'streaming_movies_Yes', 'paperless_billing_Yes', 'churn_Yes',
       'contract_type_Month-to-month', 'contract_type_One year',
       'contract_type_Two year', 'internet_service_type_DSL',
       'internet_service_type_Fiber optic',
       'payment_type_Bank transfer (automatic)',
       'payment_type_Credit card (automatic)', 'payment_type_Electronic check',
       'payment_type_Mailed check'],
      dtype='object')

In [None]:
pd.set_option('display.max_columns', None)

In [42]:
train, val, test = telco_pipeline()

In [43]:
train.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
5609,Male,0,No,No,14,Yes,No,No,No,Yes,No,No,No,No,76.45,1117.55,No,Month-to-month,Fiber optic,Electronic check
2209,Male,0,No,No,5,Yes,No,No,No,Yes,No,Yes,Yes,Yes,70.0,347.4,Yes,One year,DSL,Mailed check
6919,Male,0,Yes,No,35,Yes,Yes,No,No,No,No,No,No,Yes,75.2,2576.2,Yes,Month-to-month,Fiber optic,Electronic check
2284,Male,0,Yes,No,58,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,86.1,4890.5,No,Two year,DSL,Electronic check
845,Female,0,No,No,2,Yes,No,No,Yes,No,No,No,No,Yes,49.6,114.7,Yes,Month-to-month,DSL,Mailed check
