# Churn - Hyper-Parameters

## Load Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
plt.style.use("seaborn-darkgrid")
pd.set_option('display.max_columns', None)  

DEBUG = True
SEED = 76

In [2]:
import os
for d in ['src','data','output']: os.makedirs(d, exist_ok=True)

In [3]:
from IPython.display import Markdown, display

In [4]:
# utility funciton for typesetting percentages
display_fraction = lambda n,d: (n/d*100, n, d)

## Load and Prepare the Data 

I have made a slight change of naming convention which will simply code below - and also avoid mistakes in lab sessions when I rerun cells out of order to demo parts of the code.   Rather than using __df__ to store the full dataset I will use __df_all__, and use __df__ as an alias for various dataset as needed - see [Feature Engineering](#Feature_Engineering). So will try to follow naming convention:

 * __df__ alias for various datasets (trwated link a tmp variable, more later). 
 * __df_all__ full dataset after loading and prepped (columns renamed, value recoded).
 * __df_model__ dataset with target and a subset of the original attributes that may appear in model or be used to construct other attributes.
 * __df_train__ dataset 
 * __df_test__ dataset 

In [5]:
df_churn = pd.read_csv("data/churn.csv")
print("Churn", df_churn.shape)
df_states = pd.read_csv("data/states.csv")
print("States", df_states.shape)

df_all = df_churn.merge(df_states, on="State")

message = (" * Data set consists of %d cases (rows) with %s attributes (cols) and a single target."  
% (df_all.shape[0], df_all.shape[1]-1))
Markdown(message)

Churn (3333, 20)
States (52, 4)


 * Data set consists of 3333 cases (rows) with 22 attributes (cols) and a single target.

## Pre-Processing Data

 * Filter features - for simplicity doing next to nothing here, and getting of state information

In [6]:
target = "Churn"

attributes = df_all.columns.tolist()
attributes.remove(target)
for c in ["Churn", "State", "Name", "Longitude", "Latitude"]: 
    if c in attributes: attributes.remove(c)

df_model = df_all.loc[:, attributes + [target]]

## Feature Engineering

 * To keep a level playing field here, we are not going to perform any feature engineering steps.

## Model Building

### Train-Test Split

In [7]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_model, stratify=df_model[target], test_size=.40, random_state=SEED)
print(df_train.shape, df_test.shape)

(1999, 19) (1334, 19)


In [8]:
df_train.head(1)

Unnamed: 0,Account_Length,Area_Code,Intl_Plan,VMail_Plan,VMail_Message,Day_Mins,Day_Calls,Day_Charge,Eve_Mins,Eve_Calls,Eve_Charge,Night_Mins,Night_Calls,Night_Charge,Intl_Mins,Intl_Calls,Intl_Charge,CustServ_Calls,Churn
3219,106,510,0,1,33,81.6,120,13.87,235.6,85,20.03,150.9,113,6.79,9.9,4,2.67,1,0


### Data normalizing and scaling

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(df_train.loc[:,attributes].astype(float))
y_train = df_train[target].values

X_test = scaler.transform(df_test.loc[:,attributes].astype(float))
y_test = df_test[target].values

### DIY BAGGING

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [11]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9077961019490255


In [12]:
def generate_sample(X_train, y_train):
    n = X_train.shape[0]
    sample_idx = np.random.choice(n, size=n,replace=True)
    
    return X_train[sample_idx], y_train[sample_idx]

In [13]:
n = 10000
sample_idx = np.random.choice(n,size=n,replace=True)
len(set(sample_idx)) / n

0.6361

In [14]:
M = 44
models = [None] * M
for m in range(M):
    models[m] = DecisionTreeClassifier()
    X_train_sample, y_train_sample = generate_sample(X_train,y_train)
    models[m].fit( X_train_sample, y_train_sample)

In [15]:
models

[DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 

In [16]:
y_vote = np.zeros( (M, y_test.shape[0]) )
for m in range(M):
    y_vote[m] = models[m].predict(X_test)
y_pred = (y_vote.sum(axis=0) > M//2).astype(int)

In [17]:
print(accuracy_score(y_test,y_pred))

0.95952023988006


### Example use of Pipelines

In [18]:
df_all.columns

Index(['State', 'Account_Length', 'Area_Code', 'Intl_Plan', 'VMail_Plan',
       'VMail_Message', 'Day_Mins', 'Day_Calls', 'Day_Charge', 'Eve_Mins',
       'Eve_Calls', 'Eve_Charge', 'Night_Mins', 'Night_Calls', 'Night_Charge',
       'Intl_Mins', 'Intl_Calls', 'Intl_Charge', 'CustServ_Calls', 'Churn',
       'Latitude', 'Longitude', 'Name'],
      dtype='object')

In [26]:
numeric_features_1 = ["Account_Length","VMail_Message","Day_Mins","Day_Calls","Day_Charge","Eve_Mins","Eve_Calls","Eve_Charge","Night_Mins","Night_Calls","Night_Charge","Intl_Mins","Intl_Calls","Intl_Charge","CustServ_Calls"]
numeric_features_2 = ['Account_Length']

categorical_features = ['Area_Code',"Intl_Plan","VMail_Plan"]
features =  numeric_features_1 + categorical_features
target = 'Churn'

In [27]:
X_train,X_test,y_train,y_test = train_test_split(df_all[features], df_all[target],train_size=0.60)

In [28]:
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
from sklearn import impute
from sklearn.pipeline import Pipeline

In [29]:
column_transformer = ColumnTransformer([
    ('numeric_features_1',preprocessing.StandardScaler(),numeric_features_1),
    ('numeric_features_2',preprocessing.StandardScaler(),numeric_features_2),
    ('categorica_featutes',preprocessing.OneHotEncoder(),categorical_features)
])

In [30]:
pipeline = Pipeline([
    ('column_transformer',column_transformer ),
    ('model', DecisionTreeClassifier())
])

In [31]:
pipeline.fit(X_train,y_train);

In [32]:
y_pred = pipeline.predict(X_test)
accuracy_score(y_test,y_pred)

0.8973013493253373

### Evaluation (Using Test)

 * Using best classifier found above with best hyper-parameters fit to data and evaluate against `test` data.

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
model = LogisticRegression()

In [None]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

#### Accuracy

In [None]:
accuracy_score(y_test, y_pred)

#### Confusion Matrix

In [None]:
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
import seaborn as sns
sns.heatmap(cm/ np.sum(cm), annot=True, fmt=".2%", cmap="Blues");
plt.savefig("confusion_matrix.png", bbox_inches="tight")

#### Classification Report

In [None]:
print(classification_report(y_test, y_pred,  digits=4))