# Churn - Hyper-Parameters

## Load Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
plt.style.use("seaborn-darkgrid")
pd.set_option('display.max_columns', None)  

DEBUG = True
SEED = 76

In [2]:
import os
for d in ['src','data','output']: os.makedirs(d, exist_ok=True)

In [3]:
from IPython.display import Markdown, display

In [4]:
# utility funciton for typesetting percentages
display_fraction = lambda n,d: (n/d*100, n, d)

## Load and Prepare the Data 

I have made a slight change of naming convention which will simply code below - and also avoid mistakes in lab sessions when I rerun cells out of order to demo parts of the code.   Rather than using __df__ to store the full dataset I will use __df_all__, and use __df__ as an alias for various dataset as needed - see [Feature Engineering](#Feature_Engineering). So will try to follow naming convention:

 * __df__ alias for various datasets (trwated link a tmp variable, more later). 
 * __df_all__ full dataset after loading and prepped (columns renamed, value recoded).
 * __df_model__ dataset with target and a subset of the original attributes that may appear in model or be used to construct other attributes.
 * __df_train__ dataset 
 * __df_test__ dataset 

In [5]:
df_churn = pd.read_csv("data/churn.csv")
print("Churn", df_churn.shape)
df_states = pd.read_csv("data/states.csv")
print("States", df_states.shape)

df_all = df_churn.merge(df_states, on="State")

message = (" * Data set consists of %d cases (rows) with %s attributes (cols) and a single target."  
% (df_all.shape[0], df_all.shape[1]-1))
Markdown(message)

Churn (3333, 20)
States (52, 4)


 * Data set consists of 3333 cases (rows) with 22 attributes (cols) and a single target.

## Pre-Processing Data

 * Filter features - for simplicity doing next to nothing here, and getting of state information

In [6]:
target = "Churn"

attributes = df_all.columns.tolist()
attributes.remove(target)
for c in ["Churn", "State", "Name", "Longitude", "Latitude"]: 
    if c in attributes: attributes.remove(c)

df_model = df_all.loc[:, attributes + [target]]

## Feature Engineering

 * To keep a level playing field here, we are not going to perform any feature engineering steps.

## Model Building

### Train-Test Split

In [7]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_model, stratify=df_model[target], test_size=.40, random_state=SEED)
print(df_train.shape, df_test.shape)

(1999, 19) (1334, 19)


In [8]:
df_train.head(1)

Unnamed: 0,Account_Length,Area_Code,Intl_Plan,VMail_Plan,VMail_Message,Day_Mins,Day_Calls,Day_Charge,Eve_Mins,Eve_Calls,Eve_Charge,Night_Mins,Night_Calls,Night_Charge,Intl_Mins,Intl_Calls,Intl_Charge,CustServ_Calls,Churn
3219,106,510,0,1,33,81.6,120,13.87,235.6,85,20.03,150.9,113,6.79,9.9,4,2.67,1,0


### Data normalizing and scaling

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(df_train.loc[:,attributes].astype(float))
y_train = df_train[target].values

X_test = scaler.transform(df_test.loc[:,attributes].astype(float))
y_test = df_test[target].values

---
### Training &mdash; TODO

 * Pick any classifier from sklearn (EXCEPT SVC and neural networks for time reasons).
 * Select hyper-parameters to tune.
 * Generate parameter search space.
 * Perform search.
 
Thing I'm looking for:

 * (Obviously) best score (accuracy)
 * Mixture of grid and random search
 * Optional - using hyperopt

In [10]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

#### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [12]:
param_range = np.logspace(-4, 4, 9) 

param_grid = [{
    'solver': ['newton-cg', 'lbfgs', 'saga'],
     'C': param_range, 
    'penalty': ['l1', 'none']
}]

In [13]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
gs = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)

gs_result = gs.fit(X_train, y_train)



In [14]:
print("Best: %f using %s" % (gs_result.best_score_, gs_result.best_params_))

Best: 0.857762 using {'C': 10.0, 'penalty': 'l1', 'solver': 'saga'}


In [15]:
means = gs_result.cv_results_['mean_test_score']
stds = gs_result.cv_results_['std_test_score']
params = gs_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.000000 (0.000000) with: {'C': 0.0001, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 0.0001, 'penalty': 'l1', 'solver': 'lbfgs'}
0.854927 (0.000219) with: {'C': 0.0001, 'penalty': 'l1', 'solver': 'saga'}
0.857095 (0.017569) with: {'C': 0.0001, 'penalty': 'none', 'solver': 'newton-cg'}
0.857262 (0.018311) with: {'C': 0.0001, 'penalty': 'none', 'solver': 'lbfgs'}
0.857595 (0.019673) with: {'C': 0.0001, 'penalty': 'none', 'solver': 'saga'}
0.000000 (0.000000) with: {'C': 0.001, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 0.001, 'penalty': 'l1', 'solver': 'lbfgs'}
0.854927 (0.000219) with: {'C': 0.001, 'penalty': 'l1', 'solver': 'saga'}
0.857095 (0.017569) with: {'C': 0.001, 'penalty': 'none', 'solver': 'newton-cg'}
0.857262 (0.018311) with: {'C': 0.001, 'penalty': 'none', 'solver': 'lbfgs'}
0.857595 (0.019673) with: {'C': 0.001, 'penalty': 'none', 'solver': 'saga'}
0.000000 (0.000000) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'newt

#### GaussianNB

In [16]:
from sklearn.naive_bayes import GaussianNB as GNB
model = GNB()
model.get_params().keys()

param_grid = [{
    "priors" : [None],
    "var_smoothing" : np.logspace(0,15, num=250),
}]


In [17]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
gs = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)

gs_result = gs.fit(X_train, y_train)

In [18]:
print("Best: %f using %s" % (gs_result.best_score_, gs_result.best_params_))

Best: 0.861261 using {'priors': None, 'var_smoothing': 1.3197203930613752}


#### GaussianProcessClassifier

In [19]:
from sklearn.gaussian_process import GaussianProcessClassifier as GPC
from sklearn.gaussian_process.kernels import RBF
model = GPC()
GPC().get_params().keys()

kernel = 1.0 * RBF(1.0)

param_grid = [{
#     'kernel': [kernel], do not uncomment unless you've got 
    'n_restarts_optimizer':[3],
    'random_state':[3],
    'multi_class':['one_vs_rest']
    }]


In [20]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
gs = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)

gs_result = gs.fit(X_train, y_train)

In [21]:
print("Best: %f using %s" % (gs_result.best_score_, gs_result.best_params_))

Best: 0.886947 using {'multi_class': 'one_vs_rest', 'n_restarts_optimizer': 3, 'random_state': 3}


#### AdaBoostClassifier

In [22]:
from sklearn.ensemble import AdaBoostClassifier as ABC

model = ABC()
ABC().get_params().keys()

param_grid = [{
    'algorithm': ["SAMME.R"],
    'random_state': [3],
    }]


In [23]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
gs = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)

gs_result = gs.fit(X_train, y_train)

In [24]:
print("Best: %f using %s" % (gs_result.best_score_, gs_result.best_params_))

Best: 0.875106 using {'algorithm': 'SAMME.R', 'random_state': 3}


#### KNeighborsClassifier

In [25]:
from sklearn.neighbors import KNeighborsClassifier as KNN
KNN().get_params().keys()

model = KNN()

param_grid = [{
    'n_neighbors': [5],
    'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
    'weights':['uniform']
}]



In [26]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
gs = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)

gs_result = gs.fit(X_train, y_train)

In [27]:
print("Best: %f using %s" % (gs_result.best_score_, gs_result.best_params_))

Best: 0.889117 using {'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'uniform'}


#### DecisionTreeClassifier

In [28]:
from sklearn.tree import DecisionTreeClassifier as DTC
model = DTC()
DTC().get_params().keys()


param_grid = [{
    'criterion':["entropy","gini"],
    'max_features':[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15],
    'splitter':["best","random"],
    'random_state':[125,100,75,50],
    

}]


In [29]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
gs = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)

gs_result = gs.fit(X_train, y_train)

In [30]:
print("Best: %f using %s" % (gs_result.best_score_, gs_result.best_params_))

Best: 0.915455 using {'criterion': 'entropy', 'max_features': 10, 'random_state': 50, 'splitter': 'best'}


#### RandomForestClassifier

In [31]:
from sklearn.ensemble import RandomForestClassifier as RFC
model = RFC()
RFC().get_params().keys()

param_grid = [{
    'max_features':[9,12,17],
    'n_jobs':[-1],
    'criterion':["gini","entropy"],
    'random_state':[3,6]
#     'max_samples_split':[1,2,3,4],
# #     'min_samples_leaf':[1,2,3,4],
}]

In [32]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
gs = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)

gs_result = gs.fit(X_train, y_train)

In [33]:
print("Best: %f using %s" % (gs_result.best_score_, gs_result.best_params_))

Best: 0.951309 using {'criterion': 'entropy', 'max_features': 9, 'n_jobs': -1, 'random_state': 3}


#### HyperOpt

In [35]:
from hpsklearn import HyperoptEstimator as HOE
from hpsklearn import random_forest, any_classifier, any_preprocessing, any_sparse_classifier, tfidf
from hyperopt import hp, tpe
# model=HOE(classifier="clf")


estim = HOE(classifier=any_classifier('my_clf'), 
                            preprocessing=any_preprocessing('tfidf'),
                            algo=tpe.suggest,max_evals=20,trial_timeout=300)





WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


In [None]:
estim.fit(X_train, y_train)

100%|██████████| 1/1 [00:00<00:00,  7.66trial/s, best loss: 0.17000000000000004]
100%|██████████| 2/2 [00:00<00:00,  1.36trial/s, best loss: 0.15749999999999997]
100%|██████████| 3/3 [05:00<00:00, 300.16s/trial, best loss: 0.15749999999999997]
 75%|███████▌  | 3/4 [00:00<?, ?trial/s, best loss=?]

In [None]:
print(estim.score(X_test, y_test))

In [None]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
gs = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)

gs_result = gs.fit(X_train, y_train)

In [None]:
print("Best: %f using %s" % (gs_result.best_score_, gs_result.best_params_))

### Evaluation (Using Test)

 * Using best classifier found above with best hyper-parameters fit to data and evaluate against `test` data.

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# model = LogisticRegression()
# model = GNB(priors= None, var_smoothing= 1.3197203930613752)
# model = GPC(multi_class= "one_vs_rest", n_restarts_optimizer= 2, random_state= 1)
# model = KNN(algorithm='auto', n_neighbors= 5, weights='uniform')
model = RFC(criterion='entropy', max_features=9, n_jobs=-1,random_state=3,)

In [None]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

#### Accuracy

In [None]:
accuracy_score(y_test, y_pred)

#### Confusion Matrix

In [None]:
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
import seaborn as sns
sns.heatmap(cm/ np.sum(cm), annot=True, fmt=".2%", cmap="Blues");
plt.savefig("confusion_matrix.png", bbox_inches="tight")

#### Classification Report

In [None]:
print(classification_report(y_test, y_pred,  digits=4))