# Telco Churn Draft Notebook

In [3]:
# Importing necessary libraries and modules
import itertools
import os
import random
import warnings

import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from graphviz import Graph
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

from acquire import get_telco_data, summarize_df
from prepare import num_distributions, prep_telco

# The warning filter is installed before `explore` is imported, as in the original ordering.
warnings.filterwarnings('ignore')
import explore

## Hypotheses and Hypothesis Testing
### Initial Hypothesis
- $H_{i}$: Fiber optic customers whose tenure is less than one year are more likely to churn than other customers. (**TODO:** draft wording — confirm.)

## Acquisition
- Acquire uncleaned `telco_churn` dataset
- Garner basic understanding of dataset using functions from `acquire.py` such as shape, info, describe, etc.
- Create basic distributions of numeric columns

In [2]:
# Load the uncleaned telco data through acquire.get_telco_data(), then preview the
# first ten records transposed so that each column name reads as a row label.
unclean_telco = get_telco_data()
unclean_telco.head(n=10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
customer_id,0002-ORFBO,0003-MKNFE,0004-TLHLJ,0011-IGKFF,0013-EXCHZ,0013-MHZWF,0013-SMEOE,0014-BMAQU,0015-UOCOJ,0016-QLJIS
gender,Female,Male,Male,Male,Female,Female,Female,Male,Female,Female
senior_citizen,0,0,0,1,1,0,1,0,1,0
partner,Yes,No,No,Yes,Yes,No,Yes,Yes,No,Yes
dependents,Yes,No,No,No,No,Yes,No,No,No,Yes
tenure,9,9,4,13,3,9,71,63,7,65
phone_service,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes
multiple_lines,No,Yes,No,No,No,No,No,Yes,No,Yes
internet_service_type_id,1,1,2,2,2,1,2,2,1,1
online_security,No,No,No,No,No,No,Yes,Yes,Yes,Yes


In [3]:
# High-level summary of the uncleaned data: the printed output starts with the row and
# column counts, followed by the DataFrame.info() listing (non-null counts and dtypes).
# NOTE(review): value_counts and further sections are presumably printed too — confirm in acquire.py.
summarize_df(unclean_telco)

This dataframe has 7043 rows and 21 columns.
------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               7043 non-null   object 
 1   gender                    7043 non-null   object 
 2   senior_citizen            7043 non-null   int64  
 3   partner                   7043 non-null   object 
 4   dependents                7043 non-null   object 
 5   tenure                    7043 non-null   int64  
 6   phone_service             7043 non-null   object 
 7   multiple_lines            7043 non-null   object 
 8   internet_service_type_id  7043 non-null   int64  
 9   online_security           7043 non-null   object 
 10  online_backup             7043 non-null   object 
 11  device_protection         7043 non-null   object 
 12  tech_support              7043 non-null   objec

#### Pre-cleaning extrapolations
- Lots of variables need encoding and renaming
- Unnecessary columns such as `customer_id`
- See <a href="https://github.com/Marley-C-Robinson-99/Classification-Project#readme">README.md</a> for data dictionary that explains each variable
- Non-normal distributions for numeric categories.

## Preparation
- Clean data using the <a href="https://github.com/Marley-C-Robinson-99/Classification-Project/blob/main/prepare.py">prepare.py</a> function `prep_telco()`
    - Renamed columns to increase understandability
    - Removed duplicates and entries with empty (null) `total_charges`
    - Created dummies for `internet_service_type_id`, `payment_type_id`, and `contract_type_id`
- Split data into train, validate, and test sets

In [4]:
# Run a fresh pull of the telco data through prep_telco(), which hands back the
# train / validate / test frames in that order.
train, validate, test = prep_telco(get_telco_data())

# (rows, columns) of each split, in train / validate / test order.
tuple(split.shape for split in (train, validate, test))

((3937, 29), (1688, 29), (1407, 29))

In [5]:
# Preview the first rows of the training split, transposed so each column is a row.
train.head().transpose()

Unnamed: 0,5919,1915,5054,2355,6279
is_male,0.0,1.0,0.0,1.0,1.0
is_senior,0.0,0.0,0.0,0.0,1.0
has_partner,0.0,0.0,1.0,1.0,0.0
has_dependents,0.0,1.0,1.0,1.0,0.0
tenure,58.0,71.0,35.0,1.0,20.0
has_phone,1.0,1.0,1.0,1.0,1.0
has_multi_line,1.0,1.0,1.0,0.0,0.0
has_onl_sec,1.0,1.0,0.0,0.0,1.0
has_backup,1.0,1.0,0.0,0.0,0.0
has_dev_pro,1.0,0.0,0.0,0.0,0.0


## Exploration
- Test hypotheses
- Run statistical tests to evaluate any potential correlations
- Utilize visualization to illustrate how features relate to churn (**TODO:** draft wording — confirm)

In [6]:
# Creating a list of tuples containing possible combinations of categorical features

# Upper bound on the number of features in one combination. Enumerating every size up to
# len(features) produces 2**n - n - 1 tuples, which is impractical to hold in a list or to
# fit one model per entry; set this to len(features) to get the exhaustive enumeration back.
MAX_COMBO_SIZE = 3

# Columns with at most two distinct values, which filters out any continuous vars.
# NOTE(review): a binary churn label would presumably pass this filter as well — confirm
# and exclude it before feeding these combinations to a model.
features = [col for col in train.columns if train[col].nunique() <= 2]

largest_size = min(MAX_COMBO_SIZE, len(features))

# chain.from_iterable preserves the ascending-size ordering: all pairs, then all triples, ...
possible_combinations = list(
    itertools.chain.from_iterable(
        itertools.combinations(features, size)
        for size in range(2, largest_size + 1)
    )
)

possible_combinations[0:5]

[('is_male', 'is_senior'),
 ('is_male', 'has_partner'),
 ('is_male', 'has_dependents'),
 ('is_male', 'has_phone'),
 ('is_male', 'has_multi_line')]

In [None]:
# Pick one feature combination at random and pull those columns out of `train` to explore.
# A dedicated, seeded generator keeps the pick identical on every Restart & Run All and
# leaves the global `random` state untouched.
EXPLORE_SEED = 123
chosen_combination = random.Random(EXPLORE_SEED).choice(possible_combinations)

# list(...) is required: each combination is a tuple, and a tuple used as a key is looked
# up as one single column label instead of as several columns.
explr = train[list(chosen_combination)]

## Modeling
- Establish a baseline accuracy to compare models to
- Create a combination of features to feed models
- Train three different classification models
    - 1
    - 2
    - 3
- Compare accuracy to baseline accuracy
- Compare model accuracy for in-set to out-of-set accuracy

In [None]:
# Fit one decision tree per candidate feature combination and record its in-sample (train)
# and out-of-sample (validate) accuracy so the combinations can be compared.

TARGET = 'has_churned'  # NOTE(review): assumed name of the churn label after prep_telco() — confirm
MAX_FEATURES_PER_MODEL = 3  # bounds the number of fits; larger combinations are skipped
MODEL_SEED = 123  # fixes the tree's tie-breaking so scores are reproducible

if TARGET not in train.columns:
    raise KeyError(f"{TARGET!r} is not a column of train; set TARGET to the churn label column")

score_records = []
for combination in possible_combinations:
    # Never train on the label itself, and keep the search to a tractable size.
    if TARGET in combination or len(combination) > MAX_FEATURES_PER_MODEL:
        continue

    # A separate name keeps the candidate list `features` from being overwritten.
    combo_features = list(combination)

    # A fresh estimator per combination, so no fitted state carries over between fits.
    model = DecisionTreeClassifier(max_depth=3, random_state=MODEL_SEED)
    model = model.fit(train[combo_features], train[TARGET])

    score_records.append({
        'features': combination,
        'train_accuracy': model.score(train[combo_features], train[TARGET]),
        'validate_accuracy': model.score(validate[combo_features], validate[TARGET]),
    })

# Build the frame once from the collected records, then show the best validate scores.
model_scores = pd.DataFrame(
    score_records, columns=['features', 'train_accuracy', 'validate_accuracy']
)
model_scores.sort_values('validate_accuracy', ascending=False).head()