# Churn &mdash; Baseline Model

## Setup

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
plt.style.use("seaborn-darkgrid")
pd.set_option('display.max_columns', None)  

import sys, os, yaml

DATASET = "Churn"

COLAB = 'google.colab' in sys.modules
if COLAB:
    ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
else:
    ROOT = "./"

DEBUG = False
SEED = 666

  plt.style.use("seaborn-darkgrid")


In [61]:
if COLAB:
    from google.colab import drive
    if not os.path.isdir("/content/gdrive"):
        drive.mount("/content/gdrive")

## Load Dataset

In [62]:
df = pd.read_pickle(f"{ROOT}/data/data.pkl")
print(df.shape)
df.head(1)

(7032, 20)


Unnamed: 0,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No


In [63]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, train_size=0.60, random_state=SEED, stratify=df.Churn)
print(df_train.shape, df_test.shape)

(4219, 20) (2813, 20)


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Gender            7032 non-null   category
 1   SeniorCitizen     7032 non-null   category
 2   Partner           7032 non-null   category
 3   Dependents        7032 non-null   category
 4   Tenure            7032 non-null   int64   
 5   PhoneService      7032 non-null   category
 6   MultipleLines     7032 non-null   category
 7   InternetService   7032 non-null   category
 8   OnlineSecurity    7032 non-null   category
 9   OnlineBackup      7032 non-null   category
 10  DeviceProtection  7032 non-null   category
 11  TechSupport       7032 non-null   category
 12  StreamingTV       7032 non-null   category
 13  StreamingMovies   7032 non-null   category
 14  Contract          7032 non-null   category
 15  PaperlessBilling  7032 non-null   category
 16  PaymentMethod     7032 n

## Preprocessing Dataset

### Identify target and features ###

In [65]:
target = "Churn"
print(f"target : {target}")

cat_features = [c for c in df.select_dtypes("category").columns if c != target]
print(f"Categorical Features ({len(cat_features)}) : {cat_features}")

num_features = [c for c in df.select_dtypes(["int", "float"]).columns if c != target]
print(f"Numerical Features ({len(num_features)}) : {num_features}")

target : Churn
Categorical Features (16) : ['Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Numerical Features (3) : ['Tenure', 'MonthlyCharges', 'TotalCharges']


### Train/test split ###

In [66]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.40, stratify=df[target], random_state=SEED)
print(df_train.shape, df_test.shape)

(4219, 20) (2813, 20)


### Encode target ###

In [67]:
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
labelEncoder.fit(df_train[target])
y_train = labelEncoder.transform(df_train[target])
y_test = labelEncoder.transform(df_test[target])

In [68]:
df_train[target].head()

193      No
6108     No
3580    Yes
2195     No
3809     No
Name: Churn, dtype: category
Categories (2, object): ['No', 'Yes']

In [69]:
y_train[:5]

array([0, 0, 1, 0, 0])

In [70]:
labelEncoder.inverse_transform(y_train[:5])

array(['No', 'No', 'Yes', 'No', 'No'], dtype=object)

### Encode categorical features ###

In [71]:
from sklearn.preprocessing import OneHotEncoder

oneHotEncoder = OneHotEncoder()
oneHotEncoder.fit(df_train[cat_features])
x_cat_train = oneHotEncoder.transform(df_train[cat_features])
x_cat_test = oneHotEncoder.transform(df_test[cat_features])

In [72]:
df_cat_train = pd.DataFrame(x_cat_train.toarray(), columns=oneHotEncoder.get_feature_names_out())
df_cat_test = pd.DataFrame(x_cat_test.toarray(), columns=oneHotEncoder.get_feature_names_out())
print(df_cat_train.shape, df_cat_test.shape)
df_cat_train.head(1)

(4219, 36) (2813, 36)


Unnamed: 0,Gender_Female,Gender_Male,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_Yes,TechSupport_No,TechSupport_Yes,StreamingTV_No,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


### Scale / transform numerical features

In [73]:
from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()

standardScaler.fit(df_train[num_features])
x_num_train = standardScaler.transform(df_train[num_features])
x_num_test = standardScaler.transform(df_test[num_features])

In [74]:
df_num_train = pd.DataFrame(x_num_train, columns=standardScaler.get_feature_names_out())
df_num_test = pd.DataFrame(x_num_test, columns=standardScaler.get_feature_names_out())
print(df_num_train.shape, df_num_test.shape)
df_num_train.head(1)

(4219, 3) (2813, 3)


Unnamed: 0,Tenure,MonthlyCharges,TotalCharges
0,1.597275,1.487874,2.557942


### Construct dataframe for model features ###

In [75]:
df_model_train = pd.concat([df_cat_train, df_num_train], axis=1)
df_model_test = pd.concat([df_cat_test, df_num_test], axis=1)
print(df_model_train.shape, df_model_test.shape)

(4219, 39) (2813, 39)


## Model selection ##

In [76]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

classifiers = {
    "KNN" : KNeighborsClassifier(),
    "KNN (3)" : KNeighborsClassifier(3),
    "DT" : DecisionTreeClassifier(),
    "DT (max_depth=5" : DecisionTreeClassifier(max_depth=5),
    "LR" : LogisticRegression(max_iter=1000),
    "RF" : RandomForestClassifier(),
    "AdaBoost" : AdaBoostClassifier()
}

In [77]:
from sklearn.metrics import accuracy_score
for name, model in classifiers.items():
    model.fit(df_model_train, y_train)

    # Scoring on SEEN data - effectively "useless"
    y_pred = model.predict(df_model_train)
    train_accuracy = accuracy_score(y_train, y_pred)

    # Scoring on UNSEEN data - important
    y_pred = model.predict(df_model_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"{name:20s} accuracy: train = {train_accuracy:.2%} test = {test_accuracy:.2%}")

KNN                  accuracy: train = 84.14% test = 75.72%
KNN (3)              accuracy: train = 87.08% test = 74.76%
DT                   accuracy: train = 99.79% test = 71.99%
DT (max_depth=5      accuracy: train = 80.28% test = 78.74%
LR                   accuracy: train = 80.47% test = 80.38%
RF                   accuracy: train = 99.76% test = 78.78%
AdaBoost             accuracy: train = 80.80% test = 79.88%


In [78]:
from sklearn.model_selection import cross_val_score
for name, model in classifiers.items():

    scores = cross_val_score(model, df_model_train, y_train, cv=10)

    print(f"{name:20s} accuracy CV = {scores.mean():.2%} std = {scores.std():.2%}")

KNN                  accuracy CV = 76.61% std = 1.64%
KNN (3)              accuracy CV = 74.99% std = 1.96%
DT                   accuracy CV = 71.75% std = 2.00%
DT (max_depth=5      accuracy CV = 78.38% std = 1.22%
LR                   accuracy CV = 80.21% std = 1.58%
RF                   accuracy CV = 78.55% std = 1.30%
AdaBoost             accuracy CV = 79.85% std = 1.60%


In [79]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
metrics = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1": f1_score,
    "roc_auc": roc_auc_score
}

In [80]:
data = []
for name, model in classifiers.items():
    row = {"model": name }
    model.fit(df_model_train, y_train)
    for key, metric in metrics.items():

        # Scoring on SEEN data - effectively "useless"
        y_pred = model.predict(df_model_train)
        #train_accuracy = metric(y_train, y_pred)
        row["train_" + key] = metric(y_train, y_pred)

        # Scoring on UNSEEN data - important
        y_pred = model.predict(df_model_test)
        #test_accuracy = metric(y_test, y_pred)
        row["test_" + key] = metric(y_test, y_pred)
        #print(f"{name} : {key:20s} train = {train_accuracy:.2%} test = {test_accuracy:.2%}")
    data.append(row)
df_results = pd.DataFrame(data)

In [81]:
df_results

Unnamed: 0,model,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc
0,KNN,0.841432,0.757199,0.720273,0.543276,0.659233,0.545455,0.688402,0.544363,0.783296,0.689676
1,KNN (3),0.870822,0.7476,0.779612,0.525333,0.716325,0.526738,0.746629,0.526035,0.821526,0.67717
2,DT,0.997867,0.718095,0.999102,0.471627,0.992864,0.5,0.995973,0.485399,0.99627,0.648547
3,DT (max_depth=5,0.802797,0.787416,0.622561,0.589286,0.654773,0.661765,0.638261,0.623426,0.755566,0.747347
4,LR,0.804693,0.803768,0.660888,0.649847,0.544157,0.568182,0.596869,0.606277,0.721562,0.728643
5,RF,0.997867,0.786349,0.995544,0.625641,0.996432,0.489305,0.995988,0.549137,0.997409,0.691626
6,AdaBoost,0.808011,0.798791,0.672204,0.646302,0.541481,0.537433,0.599802,0.586861,0.722968,0.715448


In [None]:
def highlight_col(x):
    r = 'background-color: red'
    df1 = pd.DataFrame('', index=x.index, columns=x.columns)
    df1.iloc[:, 0] = r
    return df1#%% md
# Churn &mdash; Baseline Model

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
plt.style.use("seaborn-darkgrid")
pd.set_option('display.max_columns', None)

import sys, os, yaml

DATASET = "Churn"

COLAB = 'google.colab' in sys.modules
if COLAB:
    ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
else:
    ROOT = "./"

DEBUG = False
SEED = 666

In [None]:
if COLAB:
    from google.colab import drive
    if not os.path.isdir("/content/gdrive"):
        drive.mount("/content/gdrive")

## Load Dataset

In [None]:
df = pd.read_pickle(f"{ROOT}/data/data.pkl")
print(df.shape)
df.head(1)

In [None]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, train_size=0.60, random_state=SEED, stratify=df.Churn)
print(df_train.shape, df_test.shape)

In [None]:
df.info()

## Preprocessing Dataset

### Identify target and features ###

In [None]:
target = "Churn"
print(f"target : {target}")

cat_features = [c for c in df.select_dtypes("category").columns if c != target]
print(f"Categorical Features ({len(cat_features)}) : {cat_features}")

num_features = [c for c in df.select_dtypes(["int", "float"]).columns if c != target]
print(f"Numerical Features ({len(num_features)}) : {num_features}")

### Train/test split ###

In [None]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.40, stratify=df[target], random_state=SEED)
print(df_train.shape, df_test.shape)

### Encode target ###

In [None]:
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
labelEncoder.fit(df_train[target])
y_train = labelEncoder.transform(df_train[target])
y_test = labelEncoder.transform(df_test[target])

In [None]:
df_train[target].head()

In [None]:
y_train[:5]

In [None]:
labelEncoder.inverse_transform(y_train[:5])

### Encode categorical features ###

In [None]:
from sklearn.preprocessing import OneHotEncoder

oneHotEncoder = OneHotEncoder()
oneHotEncoder.fit(df_train[cat_features])
x_cat_train = oneHotEncoder.transform(df_train[cat_features])
x_cat_test = oneHotEncoder.transform(df_test[cat_features])

In [None]:
df_cat_train = pd.DataFrame(x_cat_train.toarray(), columns=oneHotEncoder.get_feature_names_out())
df_cat_test = pd.DataFrame(x_cat_test.toarray(), columns=oneHotEncoder.get_feature_names_out())
print(df_cat_train.shape, df_cat_test.shape)
df_cat_train.head(1)

### Scale / transform numerical features

In [None]:
from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()

standardScaler.fit(df_train[num_features])
x_num_train = standardScaler.transform(df_train[num_features])
x_num_test = standardScaler.transform(df_test[num_features])

In [None]:
df_num_train = pd.DataFrame(x_num_train, columns=standardScaler.get_feature_names_out())
df_num_test = pd.DataFrame(x_num_test, columns=standardScaler.get_feature_names_out())
print(df_num_train.shape, df_num_test.shape)
df_num_train.head(1)

### Construct dataframe for model features ###

In [None]:
df_model_train = pd.concat([df_cat_train, df_num_train], axis=1)
df_model_test = pd.concat([df_cat_test, df_num_test], axis=1)
print(df_model_train.shape, df_model_test.shape)

## Model selection ##

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

classifiers = {
    "KNN" : KNeighborsClassifier(),
    "KNN (3)" : KNeighborsClassifier(3),
    "DT" : DecisionTreeClassifier(),
    "DT (max_depth=5" : DecisionTreeClassifier(max_depth=5),
    "LR" : LogisticRegression(max_iter=1000),
    "RF" : RandomForestClassifier(),
    "AdaBoost" : AdaBoostClassifier()
}

In [None]:
from sklearn.metrics import accuracy_score
for name, model in classifiers.items():
    model.fit(df_model_train, y_train)

    # Scoring on SEEN data - effectively "useless"
    y_pred = model.predict(df_model_train)
    train_accuracy = accuracy_score(y_train, y_pred)

    # Scoring on UNSEEN data - important
    y_pred = model.predict(df_model_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"{name:20s} accuracy: train = {train_accuracy:.2%} test = {test_accuracy:.2%}")

In [None]:
from sklearn.model_selection import cross_val_score
for name, model in classifiers.items():

    scores = cross_val_score(model, df_model_train, y_train, cv=10)

    print(f"{name:20s} accuracy CV = {scores.mean():.2%} std = {scores.std():.2%}")

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
metrics = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1": f1_score,
    "roc_auc": roc_auc_score
}

In [184]:
from sklearn.metrics import confusion_matrix
def business_benefit_score(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    # TN, FP, FN, TP
    # false positive : 60% of population
    # true positive: 0.63% * 975 & 0.27 * -25
    # false positive:
    benefits = [
        1000 * cm[0][0],
        (1000 * (cm[0][1] * 0.4)) + (975 * (cm[0][1] * 0.6)),
        0,
        (975 * (cm[1][1] * (0.9 * 0.7))) - (25 * (cm[1][1] * (0.9 * 0.3)))
    ]
    return sum(benefits) / cm.sum()
#.sum() / cm.sum()
#(benefits * cm)

In [None]:
data = []
for name, model in classifiers.items():
    row = {"model": name }
    model.fit(df_model_train, y_train)
    for key, metric in metrics.items():

        # Scoring on SEEN data - effectively "useless"
        y_pred = model.predict(df_model_train)
        #train_accuracy = metric(y_train, y_pred)
        row["train_" + key] = metric(y_train, y_pred)

        # Scoring on UNSEEN data - important
        y_pred = model.predict(df_model_test)
        #test_accuracy = metric(y_test, y_pred)
        row["test_" + key] = metric(y_test, y_pred)
        #print(f"{name} : {key:20s} train = {train_accuracy:.2%} test = {test_accuracy:.2%}")
    data.append(row)
df_results = pd.DataFrame(data)

In [84]:
df_results

Unnamed: 0,model,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc
0,KNN,0.841432,0.757199,0.720273,0.543276,0.659233,0.545455,0.688402,0.544363,0.783296,0.689676
1,KNN (3),0.870822,0.7476,0.779612,0.525333,0.716325,0.526738,0.746629,0.526035,0.821526,0.67717
2,DT,0.997867,0.718095,0.999102,0.471627,0.992864,0.5,0.995973,0.485399,0.99627,0.648547
3,DT (max_depth=5,0.802797,0.787416,0.622561,0.589286,0.654773,0.661765,0.638261,0.623426,0.755566,0.747347
4,LR,0.804693,0.803768,0.660888,0.649847,0.544157,0.568182,0.596869,0.606277,0.721562,0.728643
5,RF,0.997867,0.786349,0.995544,0.625641,0.996432,0.489305,0.995988,0.549137,0.997409,0.691626
6,AdaBoost,0.808011,0.798791,0.672204,0.646302,0.541481,0.537433,0.599802,0.586861,0.722968,0.715448


In [129]:
def highlight_col(x):
    green = 'background-color: darkgreen'
    blue = "background-color: darkblue"
    yellow = "background-color: darkred"
    df1 = pd.DataFrame('', index=x.index, columns=x.columns)
    df1.iloc[:, 0] = green
    df1.iloc[:, [1, 2, 5, 6, 9, 10]] = blue
    df1.iloc[:, [3, 4, 7, 8]] = yellow
    return df1

In [130]:
df_results.style.apply(highlight_col, axis=None)

Unnamed: 0,model,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc
0,KNN,0.841432,0.757199,0.720273,0.543276,0.659233,0.545455,0.688402,0.544363,0.783296,0.689676
1,KNN (3),0.870822,0.7476,0.779612,0.525333,0.716325,0.526738,0.746629,0.526035,0.821526,0.67717
2,DT,0.997867,0.718095,0.999102,0.471627,0.992864,0.5,0.995973,0.485399,0.99627,0.648547
3,DT (max_depth=5,0.802797,0.787416,0.622561,0.589286,0.654773,0.661765,0.638261,0.623426,0.755566,0.747347
4,LR,0.804693,0.803768,0.660888,0.649847,0.544157,0.568182,0.596869,0.606277,0.721562,0.728643
5,RF,0.997867,0.786349,0.995544,0.625641,0.996432,0.489305,0.995988,0.549137,0.997409,0.691626
6,AdaBoost,0.808011,0.798791,0.672204,0.646302,0.541481,0.537433,0.599802,0.586861,0.722968,0.715448


In [182]:
confusion_matrix(y_test, y_pred)

array([[1845,  220],
       [ 346,  402]], dtype=int64)

In [183]:
business_benefit_score(y_test, y_pred)

819.7351581940989

In [177]:
216700

216700