In [1]:
# imports
import pandas as pd
import numpy as np
import os
from scipy import stats

# visualize
import seaborn as sns
import matplotlib.pyplot as plt
# Notebook-wide matplotlib defaults: 11x9 figure size and a base font size of 13.
plt.rc('figure', figsize=(11, 9))
plt.rc('font', size=13)


# turn off pink warning boxes
# NOTE(review): the "ignore" filter is installed with no category or module restriction,
# so it also hides deprecation and convergence warnings from the libraries used below.
import warnings
warnings.filterwarnings("ignore")


# acquire
from env import host, user, password
from pydataset import data


# sklearn imports
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text, export_graphviz
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression





# docs
import acquire
import prepare


from io import StringIO


# Plan

- Describe the project and goals.

- Task out how you will work through the pipeline in as much detail as you need to keep on track.

- Include a data dictionary to provide context for and explain your data.

- Clearly state your starting hypotheses (and add the testing of these to your task list).

- Create README.md with data dictionary, project and business goals, come up with initial hypotheses.
- Acquire data from the Codeup Database and create a function to automate this process. Save the function in an acquire.py file to import into the Final Report Notebook.
- Clean and prepare data for the first iteration through the pipeline, MVP preparation. Create a function to automate the process, store the function in a prepare.py module, and prepare data in Final Report Notebook by importing and using the function.
- Clearly define two hypotheses, set an alpha, run the statistical tests needed, reject or fail to reject the Null Hypothesis, and document findings and takeaways.
- Establish a baseline accuracy and document well.
- Train three different classification models.
- Evaluate models on train and validate datasets.
- Choose the model that performs the best and evaluate that single model on the test dataset.
- Create csv file with the measurement id, the probability of the target values, and the model's prediction for each observation in my test dataset.
- Document conclusions, takeaways, and next steps in the Final Report Notebook.

# Acquire

In [2]:
# Acquire the raw telco data through the project's acquire module.
# The original note says the data comes from the SQL server; the helper is defined in
# acquire.py, outside this notebook, so its caching and query details are not visible here.

telco = acquire.get_telco_data()

In [3]:
telco.head() # Preview the first five rows of the raw dataframe

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,3,0016-QLJIS,Female,0,Yes,Yes,65,Yes,...,Yes,Yes,Yes,Yes,90.45,5957.9,No,Two year,DSL,Mailed check
1,4,1,3,0017-DINOC,Male,0,No,No,54,No,...,Yes,Yes,No,No,45.2,2460.55,No,Two year,DSL,Credit card (automatic)
2,3,1,3,0019-GFNTW,Female,0,No,No,56,No,...,Yes,No,No,No,45.05,2560.1,No,Two year,DSL,Bank transfer (automatic)
3,4,1,3,0056-EPFBG,Male,0,Yes,Yes,20,No,...,Yes,No,No,Yes,39.4,825.4,No,Two year,DSL,Credit card (automatic)
4,3,1,3,0078-XZMHT,Male,0,Yes,No,72,Yes,...,Yes,Yes,Yes,Yes,85.15,6316.2,No,Two year,DSL,Bank transfer (automatic)


In [4]:
# Number of rows and columns, as a (rows, columns) tuple
telco.shape

(7043, 24)

In [5]:
telco.info() # Column dtypes and non-null counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

In [6]:
# total_charges does not appear in the output below, i.e. it is not yet a numeric column.
telco.describe() # Summary statistics for the numeric columns of the raw dataframe

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,senior_citizen,tenure,monthly_charges
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,2.315633,1.872923,1.690473,0.162147,32.371149,64.761692
std,1.148907,0.737796,0.833755,0.368612,24.559481,30.090047
min,1.0,1.0,1.0,0.0,0.0,18.25
25%,1.0,1.0,1.0,0.0,9.0,35.5
50%,2.0,2.0,1.0,0.0,29.0,70.35
75%,3.0,2.0,2.0,0.0,55.0,89.85
max,4.0,3.0,3.0,1.0,72.0,118.75


In [7]:
telco.columns.tolist() # Full list of column names (head() elides the middle ones)

['payment_type_id',
 'internet_service_type_id',
 'contract_type_id',
 'customer_id',
 'gender',
 'senior_citizen',
 'partner',
 'dependents',
 'tenure',
 'phone_service',
 'multiple_lines',
 'online_security',
 'online_backup',
 'device_protection',
 'tech_support',
 'streaming_tv',
 'streaming_movies',
 'paperless_billing',
 'monthly_charges',
 'total_charges',
 'churn',
 'contract_type',
 'internet_service_type',
 'payment_type']

# Prepare

In [8]:
telco[telco['total_charges'] == ' '] # Rows whose total_charges is a single blank space (every row shown has tenure 0)

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
85,4,1,3,1371-DWPAZ,Female,0,Yes,Yes,0,No,...,Yes,Yes,No,No,56.05,,No,Two year,DSL,Credit card (automatic)
156,3,1,3,2775-SEFEE,Male,0,No,Yes,0,Yes,...,Yes,No,No,Yes,61.9,,No,Two year,DSL,Bank transfer (automatic)
236,2,1,3,4075-WKNIU,Female,0,Yes,Yes,0,Yes,...,Yes,Yes,No,No,73.35,,No,Two year,DSL,Mailed check
255,3,1,3,4472-LVYGI,Female,0,Yes,Yes,0,No,...,Yes,Yes,No,Yes,52.55,,No,Two year,DSL,Bank transfer (automatic)
339,2,1,3,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,...,No,Yes,Yes,No,80.85,,No,Two year,DSL,Mailed check
5681,2,3,3,2520-SGTTA,Female,0,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No,20.0,,No,Two year,,Mailed check
5717,2,3,3,3115-CZMZD,Male,0,No,Yes,0,Yes,...,No internet service,No internet service,No internet service,No,20.25,,No,Two year,,Mailed check
5727,2,3,3,3213-VVOLG,Male,0,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No,25.35,,No,Two year,,Mailed check
5798,2,3,3,4367-NUYAO,Male,0,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No,25.75,,No,Two year,,Mailed check
6007,2,3,3,7644-OMVMY,Male,0,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No,19.85,,No,Two year,,Mailed check


In [9]:
# Clean the total_charges column with clean_data from prepare.py.
# Per the original author's note, the helper fills the blank values with zero and casts
# the column to float; it is defined outside this notebook — confirm against prepare.py.
clean_df= prepare.clean_data(telco) 

In [10]:
# Preview the dataframe before applying the prep_telco_data function from prepare.py
clean_df.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,3,0016-QLJIS,Female,0,Yes,Yes,65,Yes,...,Yes,Yes,Yes,Yes,90.45,5957.9,No,Two year,DSL,Mailed check
1,4,1,3,0017-DINOC,Male,0,No,No,54,No,...,Yes,Yes,No,No,45.2,2460.55,No,Two year,DSL,Credit card (automatic)
2,3,1,3,0019-GFNTW,Female,0,No,No,56,No,...,Yes,No,No,No,45.05,2560.1,No,Two year,DSL,Bank transfer (automatic)
3,4,1,3,0056-EPFBG,Male,0,Yes,Yes,20,No,...,Yes,No,No,Yes,39.4,825.4,No,Two year,DSL,Credit card (automatic)
4,3,1,3,0078-XZMHT,Male,0,Yes,No,72,Yes,...,Yes,Yes,Yes,Yes,85.15,6316.2,No,Two year,DSL,Bank transfer (automatic)


In [11]:
# Print the value counts of every object-dtype column except the first one
# (customer_id, per the .info() listing above).
# The column list is built once, outside the loop; select_dtypes keeps the original
# column order, so the printout order is unchanged.
obj_list = list(telco.select_dtypes('object').columns)[1:]
for col in obj_list:
    print(f'\n {col}')
    print(telco[col].value_counts())
    


 gender
Male      3555
Female    3488
Name: gender, dtype: int64

 partner
No     3641
Yes    3402
Name: partner, dtype: int64

 dependents
No     4933
Yes    2110
Name: dependents, dtype: int64

 phone_service
Yes    6361
No      682
Name: phone_service, dtype: int64

 multiple_lines
No                  3390
Yes                 2971
No phone service     682
Name: multiple_lines, dtype: int64

 online_security
No                     3498
Yes                    2019
No internet service    1526
Name: online_security, dtype: int64

 online_backup
No                     3088
Yes                    2429
No internet service    1526
Name: online_backup, dtype: int64

 device_protection
No                     3095
Yes                    2422
No internet service    1526
Name: device_protection, dtype: int64

 tech_support
No                     3473
Yes                    2044
No internet service    1526
Name: tech_support, dtype: int64

 streaming_tv
No                     2810
Yes           

In [12]:
# Encode the categorical columns with prep_telco_data from prepare.py (defined outside this notebook).
# NOTE(review): clean_df is overwritten with a transform of itself, so re-running only this
# cell feeds already-prepared data back into the helper — confirm it tolerates that, or
# re-run from the clean_data cell.
clean_df= prepare.prep_telco_data(clean_df)

In [13]:
# Preview the first 5 rows of the prepared dataframe: the original string columns are still
# present and 0/1 indicator columns (e.g. contract_type_Two year) have been added.
clean_df.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,contract_type_Month-to-month,contract_type_One year,contract_type_Two year,internet_service_type_DSL,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Bank transfer (automatic),payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,2,1,3,0016-QLJIS,Female,0,Yes,Yes,65,Yes,...,0,0,1,1,0,0,0,0,0,1
1,4,1,3,0017-DINOC,Male,0,No,No,54,No,...,0,0,1,1,0,0,0,1,0,0
2,3,1,3,0019-GFNTW,Female,0,No,No,56,No,...,0,0,1,1,0,0,1,0,0,0
3,4,1,3,0056-EPFBG,Male,0,Yes,Yes,20,No,...,0,0,1,1,0,0,0,1,0,0
4,3,1,3,0078-XZMHT,Male,0,Yes,No,72,Yes,...,0,0,1,1,0,0,1,0,0,0


# Explore

In [14]:
# Correlation table from corr_telco_data in prepare.py: one row per feature, with its
# correlation against churn_No and churn_Yes.
# NOTE(review): the output also carries columns labeled 81 and 82 that repeat the
# churn_No / churn_Yes values — likely an artifact of the helper; check prepare.py.
prepare.corr_telco_data(clean_df)

Unnamed: 0,index,churn_No,churn_Yes,81,82
1,payment_type_id,0.262818,-0.262818,0.262818,-0.262818
2,internet_service_type_id,0.0472914,-0.0472914,0.0472914,-0.0472914
3,contract_type_id,0.396713,-0.396713,0.396713,-0.396713
4,senior_citizen,-0.150889,0.150889,-0.150889,0.150889
5,tenure,0.352229,-0.352229,0.352229,-0.352229
6,monthly_charges,-0.193356,0.193356,-0.193356,0.193356
7,total_charges,0.0987746,-0.0987746,0.0987746,-0.0987746
8,gender_Female,-0.0086121,0.0086121,-0.0086121,0.0086121
9,gender_Male,0.0086121,-0.0086121,0.0086121,-0.0086121
10,partner_No,-0.150448,0.150448,-0.150448,0.150448


## Stats test

In [15]:
alpha = .05 # Significance level that the p-value is compared against below

## Hypothesis:
- H0: A month-to-month contract has no effect on churn (they are independent)
    
- Ha: A month-to-month contract has an effect on churn (they are dependent)

In [16]:
clean_df['contract_type_Month-to-month'].value_counts() # Sanity check: the two counts should sum to the full row count, so no rows were lost

1    3875
0    3168
Name: contract_type_Month-to-month, dtype: int64

In [19]:
# Build the contingency table of observed counts for the chi^2 test:
# rows = month-to-month contract flag, columns = churn flag.
contract_m2m_crosstab= pd.crosstab(clean_df['contract_type_Month-to-month'], clean_df['churn_Yes'])

contract_m2m_crosstab

churn_Yes,0,1
contract_type_Month-to-month,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2954,214
1,2220,1655


In [24]:
# Run the chi^2 test of independence on the observed table; unpacks the test statistic,
# p-value, degrees of freedom and the expected-frequency array.
# NOTE(review): scipy applies Yates' continuity correction by default when there is one
# degree of freedom, as with this 2x2 table — confirm that is intended.
chi, p, degf, expected= stats.chi2_contingency(contract_m2m_crosstab)

In [25]:
# Display the test statistic, p-value, degrees of freedom and expected counts.
# These are the names assigned by the chi2_contingency call above; the previous
# chi3 / p3 / degf3 were never defined in this notebook and raised NameError on a
# fresh kernel (Restart & Run All).
chi, p, degf, expected

(1153.9716611093477,
 6.147585925549194e-253,
 1,
 array([[2327.30824933,  840.69175067],
        [2846.69175067, 1028.30824933]]))

In [26]:
# Side-by-side report of observed vs. expected counts, followed by the test results.
# Reads chi and p as assigned by the chi2_contingency call; the previous chi3 / p3
# were leftover kernel state and are undefined when the notebook is run top to bottom.
print('Observed\n')
print(contract_m2m_crosstab.values)
print('---\nExpected\n')
# astype(int) truncates the expected frequencies toward zero for a compact display
print(expected.astype(int))
print('---\n')
print(f'chi^2 = {chi:.4f}')
# With four fixed decimals a very small p-value prints as 0.0000
print(f'p     = {p:.4f}')

Observed

[[2954  214]
 [2220 1655]]
---
Expected

[[2327  840]
 [2846 1028]]
---

chi^2 = 1153.9717
p     = 0.0000


In [27]:
# Statement of the null hypothesis under test (kept for reference; it is not printed).
null_hypothesis = "Contract month to month makes no difference in churn (they are independent)"

# Reject H0 only when the p-value falls strictly below the chosen significance level.
print(
    "We reject the null hypothesis"
    if p < alpha
    else "We fail to reject the null hypothesis"
)

We reject the null hypothesis


# Model and Evaluate

# Deliver