In [1]:
import acquire
import pandas as pd
import numpy as np

In [4]:
# Load the Telco dataset through the project-local `acquire` module and keep
# the result as `raw_telco`.
# NOTE(review): presumably returns a pandas DataFrame -- later cells use
# .columns, .reindex and .dtypes on it; confirm against acquire.py.

raw_telco = acquire.acquire_telco_data()

In [17]:
# Alphabetically ordered column labels as a plain Python list
# (same builtin-sorted approach the reindex cell below relies on).

columns = sorted(raw_telco.columns)
columns

['churn',
 'contract_type',
 'contract_type_id',
 'customer_id',
 'dependents',
 'device_protection',
 'gender',
 'internet_service_type',
 'internet_service_type_id',
 'monthly_charges',
 'multiple_lines',
 'online_backup',
 'online_security',
 'paperless_billing',
 'partner',
 'payment_type',
 'payment_type_id',
 'phone_service',
 'senior_citizen',
 'streaming_movies',
 'streaming_tv',
 'tech_support',
 'tenure',
 'total_charges']

In [21]:
# Rebuild the frame with its columns in alphabetical order.
# Re-running this cell is harmless: an already-sorted frame stays the same.

raw_telco = raw_telco.reindex(columns=sorted(raw_telco.columns))

In [35]:
# Inspect the dtype pandas assigned to each column.
# NOTE(review): total_charges is reported as object while monthly_charges is
# float64 -- presumably some total_charges entries are not numeric strings;
# verify before converting.

raw_telco.dtypes

churn                        object
contract_type                object
contract_type_id              int64
customer_id                  object
dependents                   object
device_protection            object
gender                       object
internet_service_type        object
internet_service_type_id      int64
monthly_charges             float64
multiple_lines               object
online_backup                object
online_security              object
paperless_billing            object
partner                      object
payment_type                 object
payment_type_id               int64
phone_service                object
senior_citizen                int64
streaming_movies             object
streaming_tv                 object
tech_support                 object
tenure                        int64
total_charges                object
dtype: object

In [32]:
# Column labels whose dtype is object, obtained by letting pandas filter the
# frame by dtype and then reading the surviving labels.

raw_telco.select_dtypes(include=['object']).columns

Index(['churn', 'contract_type', 'customer_id', 'dependents',
       'device_protection', 'gender', 'internet_service_type',
       'multiple_lines', 'online_backup', 'online_security',
       'paperless_billing', 'partner', 'payment_type', 'phone_service',
       'streaming_movies', 'streaming_tv', 'tech_support', 'total_charges'],
      dtype='object')

In [26]:
# Same result via a boolean mask: flag every column whose dtype is object,
# then index the column labels with that mask.

raw_telco.columns[(raw_telco.dtypes == 'object').to_numpy()]

Index(['churn', 'contract_type', 'customer_id', 'dependents',
       'device_protection', 'gender', 'internet_service_type',
       'multiple_lines', 'online_backup', 'online_security',
       'paperless_billing', 'partner', 'payment_type', 'phone_service',
       'streaming_movies', 'streaming_tv', 'tech_support', 'total_charges'],
      dtype='object')

In [42]:
# For every column, print how many distinct values it holds and, only when
# at least one null is present, how many nulls it has.
for col in raw_telco.columns:
    col_values = raw_telco[col]
    n_missing = col_values.isnull().sum()  # computed once, reused below
    print(f'{col} has {col_values.nunique()} unique values')
    if n_missing > 0:
        print(f'{col} has {n_missing} null values')
    print('--------------')

churn has 2 unique values
--------------
contract_type has 3 unique values
--------------
contract_type_id has 3 unique values
--------------
customer_id has 7043 unique values
--------------
dependents has 2 unique values
--------------
device_protection has 3 unique values
--------------
gender has 2 unique values
--------------
internet_service_type has 3 unique values
--------------
internet_service_type_id has 3 unique values
--------------
monthly_charges has 1585 unique values
--------------
multiple_lines has 3 unique values
--------------
online_backup has 3 unique values
--------------
online_security has 3 unique values
--------------
paperless_billing has 2 unique values
--------------
partner has 2 unique values
--------------
payment_type has 4 unique values
--------------
payment_type_id has 4 unique values
--------------
phone_service has 2 unique values
--------------
senior_citizen has 2 unique values
--------------
streaming_movies has 3 unique values
--------------


In [43]:
# Walk the columns and print, for each one, its name, the frequency of every
# distinct value, and a separator line.

for col in raw_telco.columns:
    print(col, raw_telco[col].value_counts(), '----------------', sep='\n')

churn
No     5174
Yes    1869
Name: churn, dtype: int64
----------------
contract_type
Month-to-month    3875
Two year          1695
One year          1473
Name: contract_type, dtype: int64
----------------
contract_type_id
1    3875
3    1695
2    1473
Name: contract_type_id, dtype: int64
----------------
customer_id
7009-PCARS    1
8132-YPVBX    1
9611-CTWIH    1
6959-UWKHF    1
6650-BWFRT    1
             ..
8263-JQAIK    1
2107-FBPTK    1
6838-YAUVY    1
2073-QBVBI    1
2207-NHRJK    1
Name: customer_id, Length: 7043, dtype: int64
----------------
dependents
No     4933
Yes    2110
Name: dependents, dtype: int64
----------------
device_protection
No                     3095
Yes                    2422
No internet service    1526
Name: device_protection, dtype: int64
----------------
gender
Male      3555
Female    3488
Name: gender, dtype: int64
----------------
internet_service_type
Fiber optic    3096
DSL            2421
None           1526
Name: internet_service_type, dtype: in

In [47]:
# Frequency of each payment type before any recoding.
raw_telco['payment_type'].value_counts()

Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: payment_type, dtype: int64

In [48]:
# Cross-tabulate the payment type label against its numeric id to see how
# the two columns line up.
pd.crosstab(index=raw_telco['payment_type'], columns=raw_telco['payment_type_id'])

payment_type_id,1,2,3,4
payment_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bank transfer (automatic),0,0,1544,0
Credit card (automatic),0,0,0,1522
Electronic check,2365,0,0,0
Mailed check,0,1612,0,0


In [57]:
# Recode 'Electronic check' as 1, the id the crosstab above pairs with it.
# The result is assigned back to the column rather than calling
# .replace(..., inplace=True) on `raw_telco.payment_type`: that form mutates an
# intermediate Series (chained assignment), which pandas warns about and which
# does not update raw_telco when Copy-on-Write is enabled.
# NOTE: only this one category is recoded, so the column holds mixed int/str
# values afterwards.
# TODO: if a full recode is wanted, the crosstab above gives the remaining
# codes ('Mailed check' -> 2, 'Bank transfer (automatic)' -> 3,
# 'Credit card (automatic)' -> 4); payment_type_id already carries them.
raw_telco['payment_type'] = raw_telco['payment_type'].replace('Electronic check', 1)

In [58]:
# Frequency of each payment type value after the replace step.
raw_telco['payment_type'].value_counts()

1                            2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: payment_type, dtype: int64