In [1]:
# setup
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image

# file
import os


# folder
folder_input = '02_input'


# Note about package and environment:

# 1. Environment: 
# Choose myenv as kernel. This was set up following guideline from vs code. A command line was typed in minicoda, enabling vs code to use packages from miniconda.
# Ref: https://code.visualstudio.com/docs/datascience/data-science-tutorial#_prerequisites

# 2. Add packages to the environment already created:
# Open the minicoda > anaconda prompt > type 'conda install -n <env_name> <package>' > then the package can be used in myenv




In [2]:
# original data
org = pd.read_csv(os.path.join('..', folder_input, 'WA_Fn-UseC_-Telco-Customer-Churn.csv'))

org.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [3]:
# data cleansing

# target
cleaned_data = org.copy()
cleaned_data['Churn'] = cleaned_data['Churn'].str.replace("Yes", "1", case=False, regex=False)
cleaned_data['Churn'] = cleaned_data['Churn'].str.replace("No", "0", case=False, regex=False)
cleaned_data = cleaned_data.astype({'Churn': 'int64'})


# # Change column type to category for columns: 'gender', 'SeniorCitizen' and 15 other columns
# cleaned_data = org.astype({
#     'gender': 'category', 
#     'SeniorCitizen': 'category', 
#     'Partner': 'category', 
#     'Dependents': 'category', 
#     'PhoneService': 'category', 
#     'MultipleLines': 'category', 
#     'InternetService': 'category', 
#     'OnlineSecurity': 'category', 
#     'OnlineBackup': 'category', 
#     'DeviceProtection': 'category', 
#     'TechSupport': 'category', 
#     'StreamingTV': 'category', 
#     'StreamingMovies': 'category', 
#     'Contract': 'category', 
#     'PaperlessBilling': 'category', 
#     'PaymentMethod': 'category', 
#     'Churn': 'category'
#     })


# drop useless cols
cleaned_data = cleaned_data.drop(
    labels=['TotalCharges', 'customerID'],
    axis=1
)

# inspect data structure
cleaned_data.head()
cleaned_data.describe()
cleaned_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [4]:
# A. random forest

from sklearn.model_selection import train_test_split

# seperate target from predictors
X = cleaned_data.copy()            # use copy, not to affect original dataset
y = X.pop('Churn')        # y = "churn", and delete churn from X

# train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [5]:
# define preprocessing steps

# select categorical & numericial columns
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]


# pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
# numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        #('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [6]:
# define model

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=0)

# To be read
# rf ref: https://www.analyticsvidhya.com/blog/2021/06/understanding-random-forest/
# kaggle ref for classification: https://www.kaggle.com/code/prashant111/random-forest-classifier-tutorial


In [8]:
# create and evaluate pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', rf_model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_test)

# # check the unique value of the output - 1, 0, so it's classification output
# np.unique(preds)

# Evaluate the model
# PS: Accuracy score = % of accuracy in prediction,
#PS: accuracy is not a great measure of classifier performance when the classes are imbalanced
accuracy_score(y_test, preds)



0.7518455423055083

In [9]:
# confusion matrix
# Get and reshape confusion matrix data
confusion_matrix(y_test, preds)


array([[1120,  178],
       [ 259,  204]], dtype=int64)

In [29]:
# classification report
# View the classification report for test data and predictions
print(classification_report(y_test, preds))

# Precision = acurracy of the positive prediction (ie TP/(TP + FP))
# Recall = % of positive being correctly identified (ie TP/(TP+FN))
# percent of positive predictions were correct (ie 2*(Recall * Precision) / (Recall + Precision))


              precision    recall  f1-score   support

           0       0.82      0.89      0.85      1298
           1       0.59      0.46      0.52       463

    accuracy                           0.77      1761
   macro avg       0.71      0.67      0.69      1761
weighted avg       0.76      0.77      0.76      1761



In [None]:
# run model without pipeline to get feature importance


In [54]:
# view the feature scores
# TBC: Seem no easy way to get feature importance from pipeline
my_pipeline.steps[1][1].feature_importances_

# ref: https://stackoverflow.com/questions/38787612/how-to-extract-feature-importances-from-an-sklearn-pipeline#comment91594552_38788087

array([0.03350508, 0.03325889, 0.02679477, 0.02645615, 0.02269356,
       0.02207425, 0.00495815, 0.0055726 , 0.03050596, 0.00550736,
       0.0298771 , 0.01883244, 0.04029614, 0.00263534, 0.04351384,
       0.0024463 , 0.02119201, 0.03063093, 0.00389934, 0.02379819,
       0.03293993, 0.00542439, 0.02635651, 0.04270106, 0.00159131,
       0.0212643 , 0.02796514, 0.0027891 , 0.02840908, 0.02722069,
       0.00212311, 0.02765094, 0.10296754, 0.01969152, 0.02738032,
       0.02643403, 0.02603606, 0.02426284, 0.02672168, 0.04991253,
       0.02170955])

In [73]:
# cannot figure out how to get the feature importance from pipeline yet

feature_importances = my_pipeline.named_steps['model'].feature_importances_

# Get the selected feature indices from SelectKBest
selected_feature_indices = my_pipeline.named_steps['preprocessor'].
print(selected_feature_indices)


# # Get the names of the selected features
# selected_feature_names = [feature_names[i] for i in selected_feature_indices]

# # Combine feature names and their importances into a dictionary
# feature_importance_dict = dict(zip(selected_feature_names, feature_importances))

ColumnTransformer(transformers=[('cat',
                                 Pipeline(steps=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['gender', 'Partner', 'Dependents',
                                  'PhoneService', 'MultipleLines',
                                  'InternetService', 'OnlineSecurity',
                                  'OnlineBackup', 'DeviceProtection',
                                  'TechSupport', 'StreamingTV',
                                  'StreamingMovies', 'Contract',
                                  'PaperlessBilling', 'PaymentMethod'])])
