In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from tensorflow import keras
from keras.regularizers import l1, l2
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Load data
data = shuffle(pd.read_csv('dataset/data_churn.csv').drop("Unnamed: 0", axis=1))
x_variables = ['dateCreated', 'page_turn_count', 'view_duration','InstallDate']
X = data[x_variables]
y = data['churn'].values

# One-hot encoder for churn variable (3 classes --> low, mid, and high risk)
ohe = OneHotEncoder(sparse=False)
y_encoded = ohe.fit_transform(y.reshape(-1, 1))

# data split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=50)
# Standardizing data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)     
X_test = scaler.transform(X_test) 
# Further splitting test data into validation and evaluation data
X_validation, X_train_split = np.split(X_train,[int(0.1*len(X_train))])                # 10 % training data for validation, Rest is used for traininig.
y_validation, y_train_split = np.split(y_train,[int(0.1*len(y_train))])         



### NN

In [3]:
# Building and compiling NN from keras
# Initializing the Neural Network
nn = keras.models.Sequential()
# Adding input layer
nn.add(keras.layers.Input(shape=(X_train.shape[1],)))               # Adding number of neurons based on selected relevant variables)
# Adding Hidden layers
nn.add(keras.layers.Dense(units =50 , activation = 'PReLU', kernel_regularizer = l1(0.01)))
nn.add(keras.layers.Dense(units =50 , activation = 'PReLU', kernel_regularizer = l2(0.01)))
nn.add(keras.layers.Dense(units = 3 , activation = 'softmax'))      # 3 categories of risk based on churn prediction (High, mid, and low risk)
nn.compile(optimizer='adam',loss='categorical_crossentropy', metrics='accuracy')

In [4]:
# Training NN
# Adding early stopping
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 10)
# Setting custom weights, to increase training prirority for page turn counts and view duration based on data analysis.
# custom_weights = np.array([1.0, 1.5, 1.5, 1.0])
# X_train_weighted = X_train * custom_weights
nn.fit(X_train_split, y_train_split, batch_size=32, epochs = 15 ,verbose = 1, validation_data = [X_validation, y_validation], callbacks = [early_stopping])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x19dd1e6b710>

In [5]:
# Evaluate the model
nn_loss, nn_accuracy = nn.evaluate(X_test, y_test)
nn_pred = nn.predict(X_test)
nn_pred_classes = np.argmax(nn_pred, axis=1)




### Random_Forest

In [6]:
# Building a random forest
rf = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=50,n_jobs=10,verbose=1)
# Training random forest
rf.fit(X_train, y_train)

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    4.4s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:   14.7s finished


In [7]:
rf_pred = rf.predict(X_test)
rf_pred_classes = np.argmax(rf_pred, axis=1)
rf_accuracy = accuracy_score(y_test, rf_pred)

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.4s finished


### SVM

In [8]:
# # Reversing one hot encoder transformation
y_train_original = ohe.inverse_transform(y_train).ravel()
y_test_original = ohe.inverse_transform(y_test).ravel()

In [9]:
svm = SVC(kernel='linear', C=1.0, decision_function_shape='ovr', random_state=50)
# Training svm
svm.fit(X_train, y_train_original)

In [10]:
svm_pred = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test_original, svm_pred)

#### Model accuracy score, classification report, and confusion matrix

In [11]:
#  Accuracy score
print(f"NN loss: {nn_loss:.4f}, NN accuracy: {nn_accuracy:.4f}")
print(f"RF accuracy: {rf_accuracy:.4f}")
print(f"SVM accuracy: {svm_accuracy:.4f}")

NN loss: 0.0427, NN accuracy: 0.9947
RF accuracy: 0.9933
SVM accuracy: 0.9995


In [12]:
# Generate classification report
nn_report = classification_report(nn_pred_classes, y_test_original, target_names = ["low Risk", "Mid Risk", "High Risk"])
rf_report = classification_report(rf_pred_classes, y_test_original, target_names = ["low Risk", "Mid Risk", "High Risk"])
svm_report = classification_report(svm_pred, y_test_original, target_names = ["low Risk", "Mid Risk", "High Risk"])
print("Neural Network classification report: \n\n {} \n\n Random Forest classification report: \n\n {} \n\n SVM classification report: \n\n {}".format(nn_report, rf_report, svm_report))

Neural Network classification report: 

               precision    recall  f1-score   support

    low Risk       0.99      1.00      0.99      9036
    Mid Risk       1.00      0.99      1.00     19794
   High Risk       0.99      1.00      0.99     22181

    accuracy                           0.99     51011
   macro avg       0.99      1.00      0.99     51011
weighted avg       0.99      0.99      0.99     51011
 

 Random Forest classification report: 

               precision    recall  f1-score   support

    low Risk       0.98      0.98      0.98      9124
    Mid Risk       1.00      1.00      1.00     19633
   High Risk       1.00      1.00      1.00     22254

    accuracy                           0.99     51011
   macro avg       0.99      0.99      0.99     51011
weighted avg       0.99      0.99      0.99     51011
 

 SVM classification report: 

               precision    recall  f1-score   support

    low Risk       1.00      1.00      1.00      9101
    Mid Risk

In [13]:
nn_cm = confusion_matrix (nn_pred_classes, y_test_original)
rf_cm = confusion_matrix (rf_pred_classes, y_test_original)
svm_cm = confusion_matrix (svm_pred, y_test_original)
print("Neural Network confusion matrix: \n\n {} \n\n Random Forest confusion matrix: \n\n {} \n\n SVM confusion matrix: \n\n {}".format(nn_cm, rf_cm, svm_cm))

Neural Network confusion matrix: 

 [[ 9011    18     7]
 [   24 19621   149]
 [   68     3 22110]] 

 Random Forest confusion matrix: 

 [[ 8963    70    91]
 [   63 19559    11]
 [   77    13 22164]] 

 SVM confusion matrix: 

 [[ 9095     5     1]
 [    0 19629     6]
 [    8     8 22259]]
