In [90]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# import pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Import roc_auc_score
from sklearn.metrics import roc_auc_score
import pickle

In [2]:
pd.set_option("display.max_columns", None)

In [15]:
# to check the current directory
import os
cwd = os.getcwd()
cwd

'/Users/macbook/Documents/Data_Science/DataScience_Project/churn-prediction/notebooks'

In [103]:
input_directory = r'/Users/macbook/Documents/Data_Science/DataScience_Project/churn-prediction/data/'
output_directory = r'/Users/macbook/Documents/Data_Science/DataScience_Project/churn-prediction/models/'
telco = pd.read_csv(input_directory + "telco_customer_churn.csv")

telco.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Preprocessing

In [52]:
# dropping missing value 
telco = telco.dropna()

In [53]:
# Store customerID and Churn column names
custid = ['customerID']
target = ['Churn']

# Store categorical column names
categorical = telco.nunique()[telco.nunique() < 5].keys().tolist()

# Remove target from the list of categorical variables
categorical.remove(target[0])

# Store numerical column names
numerical = [x for x in telco.columns if x not in custid + target + categorical]

In [55]:
# Perform one-hot encoding to categorical variables 
telco = pd.get_dummies(data = telco, columns = categorical, drop_first=True)

# Replace 'no' with 0 and 'yes' with 1 in 'Churn'
telco['Churn'] = telco['Churn'].replace({'No':0,'Yes':1})

telco = telco.drop('customerID', axis=1)

telco.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,SeniorCitizen_1,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,29.85,29.85,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0
1,34,56.95,1889.5,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
2,2,53.85,108.15,1,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1
3,45,42.3,1840.75,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0
4,2,70.7,151.65,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0


In [56]:
telco_df = telco[['tenure', 'MonthlyCharges','InternetService_Fiber optic',
            'PaymentMethod_Electronic check','gender_Male','Contract_Two year',
           'Contract_One year', 'PaperlessBilling_Yes','Partner_Yes',
           'OnlineSecurity_Yes','OnlineBackup_Yes','TechSupport_Yes',
            'Churn']]

telco_df.shape

(7032, 13)

In [68]:
## Divide the df into df_train and df_test
df_train_all, df_test = train_test_split(telco_df, test_size = 0.3, random_state = 0)

## Divide the df_train_all into train and validation dataset
df_train, df_val = train_test_split(df_train_all, test_size = 0.3, random_state = 0)

In [69]:
# Create feature variable
x_train =  df_train.drop('Churn', axis=1).values
x_val =  df_val.drop('Churn', axis=1).values
x_test =  df_test.drop('Churn', axis=1).values

# Create target variable
y_train = df_train.Churn.values
y_val = df_val.Churn.values
y_test = df_test.Churn.values

## Logistic Regression

In [92]:
# initiate Pipeline
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# Setup the hyperparameter grid
param_grid = {'logisticregression__C': [0.1, 1, 10, 100]}

# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(pipe, param_grid, cv=5) 

# Fit it to the data
logreg_cv.fit(x_train, y_train)

y_pred = logreg_cv.predict(x_val)

logreg_best_score = logreg_cv.best_score_

print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) 
print("Validate-set score: {:.3f}".format(logreg_cv.score(x_val, y_val)))
print("Best score is {:.3f}".format(logreg_best_score))

Tuned Logistic Regression Parameters: {'logisticregression__C': 1}
Validate-set score: 0.793
Best score is 0.800


In [97]:
# Compute predicted probabilities: y_pred_prob
y_pred_prob = Pipeline.predict_proba(x_val)[:,1]

# calcuate roc score
auc = roc_auc_score(y_val, y_pred)

print(f'The validation score is: {auc:0.3f}')

The validation score is: 0.701


## Random Forest

In [98]:
# initiate Pipeline

Pipeline = make_pipeline(
      StandardScaler()
    , RandomForestClassifier(n_estimators = 100, random_state = 22)
    )

# Fit it to the data
clf_pipeline = Pipeline.fit(x_train, y_train)

y_pred = clf_pipeline.predict(x_val)

print("Validate-set score: {:.3f}".format(clf_pipeline.score(x_val, y_val)))

Validate-set score: 0.776


In [99]:
# Compute predicted probabilities: y_pred_prob
y_pred_prob = Pipeline.predict_proba(x_val)[:,1]

# calcuate roc score
auc = roc_auc_score(y_val, y_pred)

print(f'The validation score is: {auc:0.3f}')

The validation score is: 0.676


### Saved Models

In [106]:
filename1 = output_directory +'log_regression_model.pkl'

# save the logistic regression model
with open (filename, 'wb' ) as f_out: 
    pickle.dump(logreg_cv, f_out)

In [107]:
filename2 = output_directory +'randomforest_model.pkl'

# save the random forest model
with open (filename2, 'wb' ) as f_out: 
    pickle.dump(clf_pipeline, f_out)