# Churn - 03 - Model Building

## Setup

In [158]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml

# Notebook-wide display settings: show every column and allow wide rows so
# that DataFrame previews are not truncated.
pd.options.display.max_columns = None
pd.options.display.width = 1000
# pd.options.display.max_rows = None  # uncomment to also show every row

sns.set_style(style="darkgrid")

from pprint import pprint
from IPython.display import Markdown, display

# Global switches shared by the cells below.
SEED = 666    # random_state used for the train/test split
DEBUG = True  # not read by any cell shown in this part of the notebook

In [159]:
DATASET = "[Dataset name]"

import os, sys

# True when the notebook is running inside Google Colab.
COLAB = 'google.colab' in sys.modules
# Base directory for all data paths. It must end with "/" because the other
# cells build paths by plain concatenation (ROOT + "data/...").
ROOT = "./"

if COLAB:
  from google.colab import drive
  # Mount Google Drive only if it is not mounted yet in this session.
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  # os.makedirs also creates the intermediate "datasets" folder, so no
  # separate step is needed for it. (The previous version prefixed the
  # absolute Drive path with "./", which created a stray relative
  # "content/gdrive/..." tree instead.)
  os.makedirs(ROOT, exist_ok=True)


def makedirs(d):
  """Create the directory ``ROOT + d`` (and any missing parents).

  Args:
    d: Directory path relative to ROOT, e.g. "data".

  Calling it again for an existing directory is a no-op.
  """
  # exist_ok=True replaces the separate isdir() check, which left a window
  # between the check and the creation. 0o777 is os.makedirs' default mode,
  # so the same call serves both the Colab and the local case.
  os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

# Working folders used by the notebook.
for d in ['orig','data','output']: makedirs(d)

## Load data

In [160]:
# NOTE: unpickling can execute arbitrary code, so only load a data.pkl that
# comes from a trusted source.
df = pd.read_pickle(f"{ROOT}data/data.pkl")
print(df.shape)
df.head(5)

(7043, 20)


Unnamed: 0,Gender,Seniorcitizen,Partner,Dependents,Tenure,Phoneservice,Multiplelines,Internetservice,Onlinesecurity,Onlinebackup,Deviceprotection,Techsupport,Streamingtv,Streamingmovies,Contract,Paperlessbilling,Paymentmethod,Monthlycharges,Totalcharges,Churn
0,Female,Yes,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,Yes,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,Yes,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,Yes,No,No,45,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,Yes,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Preprocessing

+ Impute missing values
+ Identify target and feature groups (cat and num)
+ train(+validation)+test split
+ preprocessing cat features
+ preprocessing num features

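### Impute missing values

Imputation is done further down, in the numeric-feature preprocessing step, after the train/test split, so that the fill value is computed from the training data only.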

### Identify target and feature groups (cat and num)

In [161]:
target = "Churn"

# Every column except the target is a candidate feature.
features = [col for col in df.columns if col != target]

# Columns stored with the pandas 'category' dtype are the categorical
# features; everything else is treated as numeric.
category_columns = set(df.select_dtypes(include='category').columns)
cat_features = [f for f in features if f in category_columns]
num_features = [f for f in features if f not in cat_features]

for label, group in (("target", target),
                     ("cat_features", cat_features),
                     ("num_features", num_features)):
  print(f"{label}: {group} \n")

target: Churn 

cat_features: ['Gender', 'Seniorcitizen', 'Partner', 'Dependents', 'Phoneservice', 'Multiplelines', 'Internetservice', 'Onlinesecurity', 'Onlinebackup', 'Deviceprotection', 'Techsupport', 'Streamingtv', 'Streamingmovies', 'Contract', 'Paperlessbilling', 'Paymentmethod'] 

num_features: ['Tenure', 'Monthlycharges', 'Totalcharges'] 


### train(+validation)+test split

In [162]:
from sklearn.model_selection import train_test_split

# 70/30 split, stratified on the target so both parts keep the same churn
# ratio; SEED makes the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.30,
    stratify=df[target],
    random_state=SEED,
)

### Encode Target

In [163]:
from sklearn.preprocessing import LabelEncoder
# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
le_target = LabelEncoder().fit(df_train[target])
y_train, y_test = (le_target.transform(part[target]) for part in (df_train, df_test))

le_target.classes_

# --- Equivalent ---

# y_train = le_target.fit_transform(df_train[target])
# y_test = le_target.transform(df_test[target])

# --- Never do this ---
# fit_transform on the test labels fits the encoder a second time, so the
# test mapping is learned independently of the training mapping.

# y_train = le_target.fit_transform(df_train[target])
# y_test = le_target.fit_transform(df_test[target])

array(['No', 'Yes'], dtype=object)

In [164]:
from sklearn.metrics import confusion_matrix, classification_report

# Baseline: predict class 0 for every training row, to see what a model that
# never predicts class 1 would score.
y_pred = np.zeros_like(y_train)

print(confusion_matrix(y_true=y_train, y_pred=y_pred))
# zero_division=1: a class that is never predicted gets precision 1.0
# instead of triggering a warning.
print(classification_report(y_true=y_train, y_pred=y_pred, zero_division=1))

[[3622    0]
 [1308    0]]
              precision    recall  f1-score   support

           0       0.73      1.00      0.85      3622
           1       1.00      0.00      0.00      1308

    accuracy                           0.73      4930
   macro avg       0.87      0.50      0.42      4930
weighted avg       0.81      0.73      0.62      4930


### preprocessing cat features

+ One hot encoding on all cat features

In [165]:
from sklearn.preprocessing import OneHotEncoder

# handle_unknown="ignore": a category that was not seen during fit is encoded
# as all zeros at transform time instead of raising an error. The encoded
# output is unchanged when no unseen category occurs.
ohe = OneHotEncoder(handle_unknown="ignore")

# Fit on the training split only; the test split is transformed with the
# categories learned from train.
ohe.fit(df_train[cat_features])

X_cat_train = ohe.transform(df_train[cat_features])
X_cat_test = ohe.transform(df_test[cat_features])

In [166]:
# The encoder returns sparse matrices; densify them and label the columns
# with the generated one-hot feature names.
ohe_columns = ohe.get_feature_names_out()
df_cat_train = pd.DataFrame(X_cat_train.toarray(), columns=ohe_columns)
df_cat_test = pd.DataFrame(X_cat_test.toarray(), columns=ohe_columns)
print(df_cat_train.shape, df_cat_test.shape)

(4930, 36) (2113, 36)


### preprocessing num features

+ Impute missing values
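    + Totalcharges has missing values; we fill them with the training-set mean (the column is right-skewed, so the median would be a more robust alternative)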
+ Standardize all num features

In [167]:
# Number of missing values per numeric feature in the training split.
df_train.loc[:, num_features].isnull().sum()

Tenure            0
Monthlycharges    0
Totalcharges      8
dtype: int64

In [168]:
# The fill value is computed from the training split only and reused for the
# test split, so no test-set statistic enters the preprocessing.
value = df_train["Totalcharges"].mean()

# .assign builds new frames instead of writing a column into the objects
# returned by train_test_split; this avoids pandas' SettingWithCopyWarning and
# keeps the cell safe to re-run.
df_train = df_train.assign(Totalcharges=df_train["Totalcharges"].fillna(value))
df_test = df_test.assign(Totalcharges=df_test["Totalcharges"].fillna(value))

In [169]:
# Summary statistics of the numeric features after imputation.
df_train.loc[:, num_features].describe()

Unnamed: 0,Tenure,Monthlycharges,Totalcharges
count,4930.0,4930.0,4930.0
mean,32.541379,64.85783,2306.749553
std,24.69541,30.285885,2287.807302
min,0.0,18.4,18.85
25%,9.0,35.5,389.8375
50%,29.0,70.35,1408.575
75%,56.0,89.9375,3886.75
max,72.0,118.75,8672.45


In [170]:
from sklearn.preprocessing import StandardScaler

# Scaling parameters are learned from the training split only and then
# applied unchanged to the test split.
ss = StandardScaler().fit(df_train[num_features])

X_num_train = ss.transform(df_train[num_features])
X_num_test = ss.transform(df_test[num_features])

In [171]:
# Wrap the scaled arrays back into DataFrames with the original column names.
df_num_train, df_num_test = (
    pd.DataFrame(scaled, columns=num_features)
    for scaled in (X_num_train, X_num_test)
)
print(df_num_train.shape, df_num_test.shape)

(4930, 3) (2113, 3)


In [172]:
# Put the encoded categorical and the scaled numeric features side by side.
# Both frames of each pair were built with a fresh default index, so the rows
# line up by position.
train_parts = [df_cat_train, df_num_train]
test_parts = [df_cat_test, df_num_test]
df_model_train = pd.concat(train_parts, axis="columns")
df_model_test = pd.concat(test_parts, axis="columns")
print(df_model_train.shape, df_model_test.shape)

(4930, 39) (2113, 39)


## Model Building

In [177]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Candidate models, keyed by the short name used in the result tables.
# The tree-based and boosting models are randomized, so they receive the
# notebook-wide SEED to make the reported scores repeatable across runs.
classifiers = {
    # KNN variants are disabled: they did not work with sklearn in the
    # author's environment (cause not recorded here).
    # "KNN": KNeighborsClassifier(),
    # "KNN(3)": KNeighborsClassifier(n_neighbors=3),
    # "KNN(7)": KNeighborsClassifier(n_neighbors=7),
    "LR": LogisticRegression(solver='lbfgs', max_iter=10000),
    "DT": DecisionTreeClassifier(random_state=SEED),
    "DT(max_depth=5)": DecisionTreeClassifier(max_depth=5, random_state=SEED),
    "RF": RandomForestClassifier(random_state=SEED),
    "AdaB": AdaBoostClassifier(algorithm="SAMME", random_state=SEED),
}

In [178]:
from sklearn.metrics import accuracy_score

# Fit every candidate on the training data and compare accuracy on the data
# it was fitted on with accuracy on the held-out test data; a large gap
# between the two indicates overfitting.
for name, model in classifiers.items():
  model.fit(df_model_train, y_train)

  accuracy_train = accuracy_score(y_train, model.predict(df_model_train))

  # y_pred is left holding the test-set predictions of the current model.
  y_pred = model.predict(df_model_test)
  accuracy_test = accuracy_score(y_test, y_pred)

  print(f"{name:20s} accuracy_train={accuracy_train:.2%}\taccuracy_test={accuracy_test:.2%}")

LR                   accuracy_train=80.67%	accuracy_test=80.22%
DT                   accuracy_train=99.74%	accuracy_test=71.37%
DT(max_depth=5)      accuracy_train=80.43%	accuracy_test=77.99%
RF                   accuracy_train=99.74%	accuracy_test=78.85%
AdaB                 accuracy_train=79.90%	accuracy_test=78.85%


In [182]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training data. Despite the "accuracy_train"
# label, the printed value is the mean accuracy over the held-out folds.
# NOTE: the encoder and scaler were fitted on the whole training split before
# this step, so each fold's held-out rows already influenced the preprocessing.
for name, model in classifiers.items():
  scores = cross_val_score(estimator=model, X=df_model_train, y=y_train, cv=10)
  mean_score, std_score = scores.mean(), scores.std()
  print(f"{name:20s} accuracy_train={mean_score:.2%} +/- {std_score:.2%}")

LR                   accuracy_train=80.41% +/- 1.55%
DT                   accuracy_train=73.14% +/- 1.31%
DT(max_depth=5)      accuracy_train=78.62% +/- 1.80%
RF                   accuracy_train=79.11% +/- 1.23%
AdaB                 accuracy_train=79.74% +/- 2.08%


## Model Evaluation 