In [1]:
#Read in libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [2]:
# Display preferences for the whole notebook.
# pandas: show up to 100 columns when rendering a frame.
pd.options.display.max_columns = 100

# matplotlib: use the 'ggplot' style sheet for every figure.
plt.style.use('ggplot')

In [3]:
#Import raw data
import os

# Directory holding the cleaned data set. The TELCO_DATA_DIR environment
# variable (an absolute path) overrides the original author's local default,
# so the notebook can run on another machine without editing this cell.
DATA_DIR= os.path.abspath(os.path.expanduser(os.environ.get(
    'TELCO_DATA_DIR',
    '/Users/ksharma/Documents/Data Science/Projects/TelcoCustomerChurn/Data/CleanedData/')))

# The working directory is still switched, as before, in case other cells
# rely on relative paths; the CSV itself is read through an explicit path.
os.chdir(DATA_DIR)

# The first CSV column is used as the row index.
df= pd.read_csv(os.path.join(DATA_DIR, 'cleanedData.csv'), index_col= 0)

In [4]:
# Quick sanity check of the loaded frame: dimensions, class balance of the
# target, and the first few rows.
display(
    df.shape,
    df['Churn'].value_counts(),
    df.head(),
)

(7043, 20)

No     5174
Yes    1869
Name: Churn, dtype: int64

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Preprocessing for Machine Learning

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder

#Subset col names for oneHotEncoding
# Every object-dtype column is one-hot encoded except the target. 'Churn' is
# excluded by name rather than by position so a change in column order cannot
# silently drop a feature or leak the target into the feature matrix.
oneHotEncodeCols= [col for col in df.select_dtypes(include='object').columns
                   if col != 'Churn']

In [7]:
#Encode data and split into X and y
# Features are all columns except the target, selected by name so the cell
# does not depend on 'Churn' being the last column. The matrix is cast to
# float explicitly: depending on the pandas version the dummy columns may be
# bool, in which case .values alone yields an object array.
X= pd.get_dummies(df.drop(columns='Churn'), columns= oneHotEncodeCols,
                  drop_first=False).values.astype(float)
# LabelEncoder maps the sorted class labels to integers ('No' -> 0, 'Yes' -> 1).
y= LabelEncoder().fit_transform(df['Churn'].values)

#Split data into training and testing data
# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=24)

display(X_train.shape, y_train.shape)
display(X_test.shape, y_test.shape)

(5634, 46)

(5634,)

(1409, 46)

(1409,)

In [8]:
#Standardize training data
from sklearn.preprocessing import StandardScaler

# Only the first three columns are standardised -- presumably the numeric
# features that get_dummies leaves in front of the dummy columns (verify if
# the column layout changes). The scaler is fitted on the training split only
# and then applied to both splits, overwriting those columns in place.
scaler= StandardScaler().fit(X_train[:,:3])
X_train[:,:3]= scaler.transform(X_train[:,:3])
X_test[:,:3]= scaler.transform(X_test[:,:3])

### Base Model for Churn Classification - Logistic Regression

In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold, cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score

In [56]:
# Scorers handed to cross_validate. Each key becomes a 'test_<key>' entry in
# the results dict that evaluate_classifier reads.
scoring = {
    metricName: make_scorer(metricFn)
    for metricName, metricFn in [
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    ]
}

In [58]:
# 5-fold splitter shared by every model evaluation. Rows are not shuffled, so
# the folds are consecutive blocks and no random seed is involved.
kfold= KFold(n_splits=5, shuffle=False, random_state=None)

In [67]:
def evaluate_classifier(estimator, kfold, scoring, trainingData, target, modelName=None):
    """Cross-validate a classifier and print its mean F1, precision and recall.

    Parameters
    ----------
    estimator : object
        Model handed to ``cross_validate``.
    kfold : object
        Cross-validation splitter, passed to ``cross_validate`` as ``cv``.
    scoring : dict
        Metric name -> scorer. Must contain the keys 'f1_score', 'precision'
        and 'recall', because the results are read back as 'test_<key>'.
    trainingData, target : array-like
        Feature matrix and labels passed as ``X`` and ``y``.
    modelName : str, optional
        Label printed in the report header. Defaults to the estimator's class
        name, so each model is reported under its own name instead of a
        hard-coded 'Logistic Regression'.

    Returns
    -------
    None
        The report is printed; nothing is returned.
    """
    results= cross_validate(estimator= estimator, X= trainingData, y= target,
                            cv= kfold, n_jobs=-1, scoring=scoring)

    if modelName is None:
        modelName= type(estimator).__name__

    print('%s Metrics:\n==============================' % modelName)
    # One line per metric: mean and standard deviation across folds, 4 decimals.
    for label, key in (('F1', 'test_f1_score'),
                       ('Precision', 'test_precision'),
                       ('Recall', 'test_recall')):
        foldScores= results[key]
        print('Avg %s Score: %s SD: %s' % (label,
                                           np.round(np.mean(foldScores),4),
                                           np.round(np.std(foldScores),4)))

In [68]:
# Baseline model: L2-penalised logistic regression with C=1.0 and a fixed seed.
logiReg= LogisticRegression(
    penalty= 'l2',
    C= 1.0,
    random_state= 24,
    n_jobs= -1,
)

# 5-fold cross-validated report on the training split only.
evaluate_classifier(logiReg, kfold, scoring, X_train, y_train)

Logistic Regression Metrics:
Avg F1 Score: 0.5962 SD: 0.0078
Avg Precision Score: 0.6534 SD: 0.0192
Avg Recall Score: 0.5488 SD: 0.0138


### Base Model for Churn Classification - Linear SVC

In [70]:
from sklearn.svm import LinearSVC

# Second baseline: linear support-vector classifier with the same L2 penalty,
# C value and seed as the logistic regression model.
linSVC= LinearSVC(
    penalty= 'l2',
    C= 1.0,
    random_state= 24,
)

# Same cross-validation protocol and metrics as the logistic regression run.
evaluate_classifier(linSVC, kfold, scoring, X_train, y_train)

Logistic Regression Metrics:
Avg F1 Score: 0.5885 SD: 0.0077
Avg Precision Score: 0.6552 SD: 0.0196
Avg Recall Score: 0.5352 SD: 0.0204


K-Nearest-Neighbors

Random Forest