In [1]:
# IMPORTS

import os
import env

import pandas as pd
from pydataset import data

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

from math import sqrt
from scipy import stats

# import our own project modules: acquire (data retrieval) and prepare (data prep)
import acquire
import prepare


# import splitting and imputing functions
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from prepare import my_train_test_split

# turn off pink warning boxes
# NOTE(review): "ignore" with no category silences every warning for the rest of
# the session (deprecation notices included) — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Obtain the prepared Telco dataset in two steps:
#   1. acquire.get_telco_data is called with env.sql_connexion and returns the raw frame
#   2. prepare.prep_telco turns that raw frame into the tidy frame explored below
# NOTE(review): presumably env.sql_connexion carries the database connection details — confirm in env.py

telco_df = acquire.get_telco_data(env.sql_connexion)
tidy_telco = prepare.prep_telco(telco_df)


In [9]:
tidy_telco.info()

# The output below reports 7043 entries (rows) and no null values in the listed columns.
# Presumably each row is one unique customer — TODO confirm that customer_id is unique.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   customer_id            7043 non-null   object 
 1   gender                 7043 non-null   object 
 2   senior_citizen         7043 non-null   int64  
 3   partner                7043 non-null   object 
 4   dependents             7043 non-null   object 
 5   tenure                 7043 non-null   int64  
 6   phone_service          7043 non-null   object 
 7   multiple_lines         7043 non-null   object 
 8   online_security        7043 non-null   object 
 9   online_backup          7043 non-null   object 
 10  device_protection      7043 non-null   object 
 11  tech_support           7043 non-null   object 
 12  streaming_tv           7043 non-null   object 
 13  streaming_movies       7043 non-null   object 
 14  paperless_billing      7043 non-null   object 
 15  mont

In [6]:
# Count customers by the senior_citizen flag.
# Result at time of writing: 0 (not a senior citizen) = 5901 ; 1 (senior citizen) = 1142

tidy_telco["senior_citizen"].value_counts()


0    5901
1    1142
Name: senior_citizen, dtype: int64

**The majority of customers are NOT senior citizens (5901 are not vs. 1142 who are).**

In [7]:
# Count customers by whether they have phone service.
# Result at time of writing: Yes (has phone service) = 6361 ; No (no phone service) = 682

tidy_telco["phone_service"].value_counts()

Yes    6361
No      682
Name: phone_service, dtype: int64

In [8]:
# Count customers by their online_security status.
# Result at time of writing: No = 3498 ; Yes = 2019 ; No internet service = 1526

tidy_telco["online_security"].value_counts()

No                     3498
Yes                    2019
No internet service    1526
Name: online_security, dtype: int64

In [11]:
# Count customers by their online_backup status.
# Result at time of writing: No = 3088 ; Yes = 2429 ; No internet service = 1526

tidy_telco["online_backup"].value_counts()

# The 'No internet service' count (1526) matches the one seen for online_security.

No                     3088
Yes                    2429
No internet service    1526
Name: online_backup, dtype: int64

In [12]:
# Count customers by their device_protection status.
# Result at time of writing: No = 3095 ; Yes = 2422 ; No internet service = 1526

tidy_telco["device_protection"].value_counts()


No                     3095
Yes                    2422
No internet service    1526
Name: device_protection, dtype: int64

In [13]:
# Count customers by their tech_support status.
# Result at time of writing: No = 3473 ; Yes = 2044 ; No internet service = 1526

tidy_telco["tech_support"].value_counts()

No                     3473
Yes                    2044
No internet service    1526
Name: tech_support, dtype: int64

**The optional add-ons online_security, online_backup, device_protection and tech_support each show a similar split between customers who have them and customers who do not.  
These may come as a package, but, clearly, not all customers are on board.  
In all cases, more customers decline these options than purchase them.**