# Telco Churn Draft Notebook

In [1]:
# Importing neccessary libraries and modules
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix
from acquire import get_telco_data, summarize_df
from prepare import num_distributions, prep_telco
import graphviz
from graphviz import Graph
import warnings
warnings.filterwarnings('ignore')
import explore

## Hypotheses and Hypothesis Testing
### Initial Hypothesis
- $H_{i}$: Fiber optic customers whos tenure is less than one year 

## Acquisition
- Acquire uncleaned `telco_churn` dataset
- Garner basic understanding of dataset using functions from `acquire.py` such as shape, info, describe, etc.
- Create basic distributions of numeric columns

In [2]:
# Pull df from acquire.py and look at the first few entries using df.head()
unclean_telco = get_telco_data()
unclean_telco.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
customer_id,0002-ORFBO,0003-MKNFE,0004-TLHLJ,0011-IGKFF,0013-EXCHZ,0013-MHZWF,0013-SMEOE,0014-BMAQU,0015-UOCOJ,0016-QLJIS
gender,Female,Male,Male,Male,Female,Female,Female,Male,Female,Female
senior_citizen,0,0,0,1,1,0,1,0,1,0
partner,Yes,No,No,Yes,Yes,No,Yes,Yes,No,Yes
dependents,Yes,No,No,No,No,Yes,No,No,No,Yes
tenure,9,9,4,13,3,9,71,63,7,65
phone_service,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes
multiple_lines,No,Yes,No,No,No,No,No,Yes,No,Yes
internet_service_type_id,1,1,2,2,2,1,2,2,1,1
online_security,No,No,No,No,No,No,Yes,Yes,Yes,Yes


In [3]:
# High level summary of data including value_counts, dtypes, 
summarize_df(unclean_telco)

This dataframe has 7043 rows and 21 columns.
------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               7043 non-null   object 
 1   gender                    7043 non-null   object 
 2   senior_citizen            7043 non-null   int64  
 3   partner                   7043 non-null   object 
 4   dependents                7043 non-null   object 
 5   tenure                    7043 non-null   int64  
 6   phone_service             7043 non-null   object 
 7   multiple_lines            7043 non-null   object 
 8   internet_service_type_id  7043 non-null   int64  
 9   online_security           7043 non-null   object 
 10  online_backup             7043 non-null   object 
 11  device_protection         7043 non-null   object 
 12  tech_support              7043 non-null   objec

### Pre-cleaning extrapolations
- Lots of variables need encoding.
- Unneccesary columns such as `customer_id`
- See <a href="placeholder.com">README.md</a> for data dictionary that explains each variable
- Non-normal distributions for numeric categories.

## Preparation
- Clean data using the <a href="placeholder.com">prepare.py</a> functions `prep_telco()`
- Split data into train, validate, and test sets
- 

In [4]:
train, validate, test = prep_telco(get_telco_data())
train.shape, validate.shape, test.shape

((3937, 20), (1688, 20), (1407, 20))

In [6]:
train

Unnamed: 0,is_male,senior_citizen,has_partner,has_dependents,tenure,has_phone,has_multi_line,internet_service_type_id,has_onl_sec,has_backup,has_dev_pro,has_tech_supp,has_tv_strm,has_mv_strm,contract_type_id,has_pprless_bill,payment_type_id,monthly_charges,total_charges,has_churned
5919,0,0,0,0,58,1,1,1,1,1,1,1,0,0,2,1,4,71.10,4299.2,0
1915,1,0,0,1,71,1,1,2,1,1,0,0,0,0,2,1,4,85.45,6028.95,0
5054,0,0,1,1,35,1,1,3,0,0,0,0,0,0,1,1,1,25.75,882.55,0
2355,1,0,1,1,1,1,0,3,0,0,0,0,0,0,1,0,2,19.20,19.2,0
6279,1,1,0,0,20,1,0,1,1,0,0,1,0,0,2,0,2,54.00,1055.9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,0,0,0,0,24,1,1,2,0,0,1,0,1,0,1,1,4,88.95,2072.75,0
3763,0,0,1,1,10,1,1,2,1,0,0,0,1,1,1,1,1,100.25,1064.65,1
3444,0,0,1,1,22,1,1,3,0,0,0,0,0,0,3,0,4,25.25,555.4,0
1002,1,0,0,1,22,1,1,2,0,1,0,0,1,0,1,1,4,89.10,1949.4,0


In [12]:
import itertools

features = ["monthly_charges", "tech_support", "internet_service_type_id", "tenure"]
            
possible_combinations = []
            
for i in range(2, len(features) + 1):
    possible_combinations.extend(list(itertools.combinations(features, i)))

possible_combinations

[('monthly_charges', 'tech_support'),
 ('monthly_charges', 'internet_service_type_id'),
 ('monthly_charges', 'tenure'),
 ('tech_support', 'internet_service_type_id'),
 ('tech_support', 'tenure'),
 ('internet_service_type_id', 'tenure'),
 ('monthly_charges', 'tech_support', 'internet_service_type_id'),
 ('monthly_charges', 'tech_support', 'tenure'),
 ('monthly_charges', 'internet_service_type_id', 'tenure'),
 ('tech_support', 'internet_service_type_id', 'tenure'),
 ('monthly_charges', 'tech_support', 'internet_service_type_id', 'tenure')]

In [None]:
for combination in possible_combinations:
    features = list(combination)
    model = model.fit(X_train[[features]])
    # then continue on with your model evaluation