# Assignment 3
Churn is a major problem in Telecom industry. Being able to understand who is likely to churn based on some indicators can help firms focus on those individuals and stop them from going over to competitors. 
A dataset of customer with their usage patter in available in Assignment03_ Telco-Customer-Churn.csv file. 

https://drive.google.com/drive/folders/1Jl8iDu7nGmrqCECbrLqmVafgwE5PYfiU

Build a decision tree classifier to predict which customers are likely to churn.Use 10-Fold cross validation to report your results.
    
    1) Tree with no pruning
    2) Tree with pre-pruning
    3) Tree with post-pruning
With the best model above - make the following changes:
    
    4) Up-sample the majority class and fit the model with best result
    5) Weight the class and fit the model with best result

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.utils import resample

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('Assignment03_Telco-Customer-Churn.csv')
df.head(2)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure(month),PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [3]:
cat_var_list = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
                'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 
                'PaperlessBilling', 'PaymentMethod'
               ]
num_var_list = ['tenure(month)', 'MonthlyCharges', 'TotalCharges']
df_dummy = pd.get_dummies(df, columns=cat_var_list, prefix_sep="_")
for vrb in num_var_list:
    df_dummy[vrb] = pd.to_numeric(df_dummy[vrb], errors="coerce")

df_dummy['target'] = np.where(df['Churn'] == "Yes", 1, 0)
df_dummy.drop(columns=['Churn', 'customerID'], inplace=True)
df_dummy.dropna(inplace=True)

In [4]:
#df_dummy.info()

In [5]:
x_cols = list(df_dummy.columns)
x_cols.remove('target')
#x_cols

In [6]:
x_train, x_test, y_train, y_test = train_test_split(df_dummy[x_cols], df_dummy['target'], test_size=0.2, random_state=0, 
                                                    stratify=df_dummy['target'])

# 1) Tree with no pruning

In [7]:
tree_classifier_no_pruning = DecisionTreeClassifier(random_state=0)
tree_classifier_no_pruning.fit(x_train, y_train)

y_pred_train = tree_classifier_no_pruning.predict(x_train)
y_pred_test = tree_classifier_no_pruning.predict(x_test) 
train_accuracy = np.round(accuracy_score(y_pred_train, y_train) * 100, 2)
test_accuracy = np.round(accuracy_score(y_pred_test, y_test) * 100, 2)

# roc-auc score
y_prob_train = tree_classifier_no_pruning.predict_proba(x_train)[:, 1]
y_prob_test = tree_classifier_no_pruning.predict_proba(x_test)[:, 1]
roc_train = np.round(roc_auc_score(y_train, y_prob_train), 2)
roc_test  = np.round(roc_auc_score(y_test, y_prob_test), 2)

print(f'Train score with no pruning: {train_accuracy}%')
print(f'Test  score with no pruning: {test_accuracy}%')
print(f'Train roc value with no pruning: {roc_train}%')
print(f'Test  roc vlaue with no pruning: {roc_test}%')

Train score with no pruning: 99.77%
Test  score with no pruning: 71.93%
Train roc value with no pruning: 1.0%
Test  roc vlaue with no pruning: 0.64%


# 2) Tree with pre-pruning

In [8]:
# Grid paramter
params = {'max_depth': [2, 4, 6, 8, 10, 12],
          'min_samples_split': [2, 3, 4, 5, 10, 15, 20, 30],
          'min_samples_leaf': [1, 2, 5, 10, 20, 30]
         }

tree_classifier_pre_pruning = DecisionTreeClassifier(random_state=0)
tree_classifier_pre_pruning = GridSearchCV(estimator=tree_classifier_pre_pruning, param_grid=params, cv=10)
tree_classifier_pre_pruning.fit(x_train,y_train)
print(tree_classifier_pre_pruning.best_estimator_)
print("")

y_pred_train = tree_classifier_pre_pruning.predict(x_train)
y_pred_test = tree_classifier_pre_pruning.predict(x_test) 
train_accuracy = np.round(accuracy_score(y_pred_train, y_train) * 100, 2)
test_accuracy = np.round(accuracy_score(y_pred_test, y_test) * 100, 2)

# roc-auc score
y_prob_train = tree_classifier_pre_pruning.predict_proba(x_train)[:, 1]
y_prob_test = tree_classifier_pre_pruning.predict_proba(x_test)[:, 1]
roc_train = np.round(roc_auc_score(y_train, y_prob_train), 2)
roc_test  = np.round(roc_auc_score(y_test, y_prob_test), 2)

print(f'Train score with pre-pruning: {train_accuracy}%')
print(f'Test  score with pre-pruning: {test_accuracy}%')
print(f'Train with pre-pruning, roc value: {roc_train}%')
print(f'Test  with pre-pruning, roc vlaue: {roc_test}%')

DecisionTreeClassifier(max_depth=6, min_samples_leaf=20, random_state=0)

Train score with pre-pruning: 80.76%
Test  score with pre-pruning: 78.61%
Train with pre-pruning, roc value: 0.86%
Test  with pre-pruning, roc vlaue: 0.83%


# 3) Tree with post-pruning

In [9]:
# Grid paramter
params = {'ccp_alpha':[0.0, 0.01, 0.02, 0.04, 0.07, 0.10, 0.15, 0.20, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.995]
         }

tree_classifier_post_pruning = DecisionTreeClassifier(random_state=0)
tree_classifier_post_pruning = GridSearchCV(estimator=tree_classifier_post_pruning, param_grid=params, cv=10)
tree_classifier_post_pruning.fit(x_train,y_train)
print(tree_classifier_post_pruning.best_estimator_)
print("")

y_pred_train = tree_classifier_post_pruning.predict(x_train)
y_pred_test = tree_classifier_post_pruning.predict(x_test) 
train_accuracy = np.round(accuracy_score(y_pred_train, y_train) * 100, 2)
test_accuracy = np.round(accuracy_score(y_pred_test, y_test) * 100, 2)

# roc-auc score
y_prob_train = tree_classifier_post_pruning.predict_proba(x_train)[:, 1]
y_prob_test = tree_classifier_post_pruning.predict_proba(x_test)[:, 1]
roc_train = np.round(roc_auc_score(y_train, y_prob_train), 2)
roc_test  = np.round(roc_auc_score(y_test, y_prob_test), 2)

print(f'Train score with post-pruning: {train_accuracy}%')
print(f'Test  score with post-pruning: {test_accuracy}%')
print(f'Train with post-pruning, roc value: {roc_train}%')
print(f'Test  with post-pruning, roc vlaue: {roc_test}%')

DecisionTreeClassifier(ccp_alpha=0.01, random_state=0)

Train score with post-pruning: 78.99%
Test  score with post-pruning: 79.46%
Train with post-pruning, roc value: 0.8%
Test  with post-pruning, roc vlaue: 0.79%


## Best model
Best model, based on accuracy and roc value, we see that model with pre-pruning is best

DecisionTreeClassifier(max_depth=6, min_samples_leaf=20, random_state=0)

# 4) Up-sample the majority class and fit the model with best result

In [10]:
class_zero_size = y_train.value_counts()[0]
class_one_size = y_train.value_counts()[1]

df_train = x_train.copy()
df_train['target'] = y_train

df_majority = df_train[df_train.target == 0]
df_minority = df_train[df_train.target == 1]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,                 # sample with replacement
                                 n_samples=class_zero_size,    # to match majority class
                                 random_state=123)             # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.target.value_counts()

0    4130
1    4130
Name: target, dtype: int64

In [11]:
y_train_upsample = df_upsampled['target']
x_train_upsample = df_upsampled.drop(columns=['target'])

In [12]:
# Grid paramter
params = {'max_depth': [6],
          'min_samples_split': [2, 3, 4, 5, 10, 15, 20, 30],
          'min_samples_leaf': [20]
         }

tree_classifier_pre_pruning_up_sampling = DecisionTreeClassifier(random_state=0)
tree_classifier_pre_pruning_up_sampling = GridSearchCV(estimator=tree_classifier_pre_pruning_up_sampling, 
                                                       param_grid=params, cv=10)
tree_classifier_pre_pruning_up_sampling.fit(x_train_upsample, y_train_upsample)
print(tree_classifier_pre_pruning_up_sampling.best_estimator_)
print("")

y_pred_train = tree_classifier_pre_pruning_up_sampling.predict(x_train_upsample)
y_pred_test = tree_classifier_pre_pruning_up_sampling.predict(x_test) 
train_accuracy = np.round(accuracy_score(y_pred_train, y_train_upsample) * 100, 2)
test_accuracy = np.round(accuracy_score(y_pred_test, y_test) * 100, 2)
# roc-auc score
y_prob_train = tree_classifier_pre_pruning_up_sampling.predict_proba(x_train_upsample)[:, 1]
y_prob_test = tree_classifier_pre_pruning_up_sampling.predict_proba(x_test)[:, 1]
roc_train = np.round(roc_auc_score(y_train_upsample, y_prob_train), 3)
roc_test  = np.round(roc_auc_score(y_test, y_prob_test), 3)

print(f'Train score with pre-pruning and upsampling: {train_accuracy}%')
print(f'Test  score with pre-pruning and upsampling: {test_accuracy}%')
print(f'Train with pre-pruning and upsampling, roc value: {roc_train}%')
print(f'Test  with pre-pruning and upsampling, roc vlaue: {roc_test}%')

DecisionTreeClassifier(max_depth=6, min_samples_leaf=20, random_state=0)

Train score with pre-pruning and upsampling: 78.79%
Test  score with pre-pruning and upsampling: 72.92%
Train with pre-pruning and upsampling, roc value: 0.866%
Test  with pre-pruning and upsampling, roc vlaue: 0.825%


# 5) Weight the class and fit the model with best result

In [13]:
# Grid paramter
params = {'max_depth': [6],
          'min_samples_split': [2, 3, 4, 5, 10, 15, 20, 30],
          'min_samples_leaf': [20]
         }

tree_classifier_pre_pruning_weight = DecisionTreeClassifier(random_state=0, class_weight="balanced")
tree_classifier_pre_pruning_weight = GridSearchCV(estimator=tree_classifier_pre_pruning_weight, 
                                                  param_grid=params, cv=10)
tree_classifier_pre_pruning_weight.fit(x_train, y_train)

#---------------------------------------------------------------------
print(tree_classifier_pre_pruning_weight.best_estimator_)
print("")

y_pred_train = tree_classifier_pre_pruning_weight.predict(x_train)
y_pred_test = tree_classifier_pre_pruning_weight.predict(x_test) 
train_accuracy = np.round(accuracy_score(y_pred_train, y_train) * 100, 2)
test_accuracy = np.round(accuracy_score(y_pred_test, y_test) * 100, 2)
# roc-auc score
y_prob_train = tree_classifier_pre_pruning_weight.predict_proba(x_train)[:, 1]
y_prob_test = tree_classifier_pre_pruning_weight.predict_proba(x_test)[:, 1]
roc_train = np.round(roc_auc_score(y_train, y_prob_train), 3)
roc_test  = np.round(roc_auc_score(y_test, y_prob_test), 3)

print(f'Train score with pre-pruning and upsampling: {train_accuracy}%')
print(f'Test  score with pre-pruning and upsampling: {test_accuracy}%')
print(f'Train with pre-pruning and upsampling, roc value: {roc_train}%')
print(f'Test  with pre-pruning and upsampling, roc vlaue: {roc_test}%')

DecisionTreeClassifier(class_weight='balanced', max_depth=6,
                       min_samples_leaf=20, random_state=0)

Train score with pre-pruning and upsampling: 75.68%
Test  score with pre-pruning and upsampling: 73.7%
Train with pre-pruning and upsampling, roc value: 0.865%
Test  with pre-pruning and upsampling, roc vlaue: 0.834%
