In [1]:
# Import Libraries.
import pandas as pd
import numpy as np

# Understanding the Dataset

In [2]:
# Read in the dataset.
df = pd.read_csv('Churn_Data copy.csv')

In [3]:
display(df.shape)
display(df.head(10))

(14000, 13)

Unnamed: 0,CUSTID,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION,REPORTED_USAGE_LEVEL,CONSIDERING_CHANGE_OF_PLAN,LEAVE
0,1,zero,31953,0,6,313378,161,0,4,unsat,little,no,STAY
1,2,one,36147,0,13,800586,244,0,6,unsat,little,considering,STAY
2,3,one,27273,230,0,305049,201,16,15,unsat,very_little,perhaps,STAY
3,5,one,29215,208,85,224784,241,21,1,very_unsat,little,never_thought,STAY
4,6,zero,133728,64,48,632969,626,3,2,unsat,high,no,STAY
5,7,zero,42052,224,0,697949,191,10,5,very_unsat,little,actively_looking_into_it,STAY
6,9,zero,38171,0,7,274218,190,0,5,very_sat,little,actively_looking_into_it,STAY
7,10,zero,105824,174,18,153560,687,25,4,very_sat,little,never_thought,LEAVE
8,11,zero,20120,43,0,623166,209,5,8,very_sat,little,never_thought,STAY
9,12,one,50939,76,13,587207,336,3,5,avg,little,considering,STAY


In [4]:
df['LEAVE'].value_counts()

STAY     7104
LEAVE    6896
Name: LEAVE, dtype: int64

# Data Preparation and Transformation

#### Target Variable

In [5]:
# Use LabelEncoder to convert target variable values to numerical values (0 or 1).
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['LEAVE'] = le.fit_transform(df['LEAVE'])

# Create dataset to be used for modeling. 
df_clean = pd.DataFrame(df[['LEAVE', 'CUSTID']])

In [6]:
#display(df.head())
#display(df_clean.head())

#### Boolean Variables

In [7]:
df['COLLEGE'] = df['COLLEGE'].map({'one': 1, 'zero': 0})
df_clean = df_clean.join(df['COLLEGE'])

In [8]:
#display(df.head())
#display(df_clean.head())

#### Categorical Variables

In [9]:
# Create dummy variables for categorical features.
catVals = ['REPORTED_SATISFACTION', 'REPORTED_USAGE_LEVEL', 'CONSIDERING_CHANGE_OF_PLAN']
dummies = pd.get_dummies(df[catVals], drop_first=True)

df_clean = df_clean.join(dummies)

In [10]:
#display(df.head())
#display(df_clean.head())
#display(df_clean.shape)

#### Numerical Variables

In [11]:
# StandardScaler() computes Z-scores. Transform data such that its distribution will have a mean of 0 and standard 
# deviation of 1.
# Apply to all numerical columns.
from sklearn.preprocessing import StandardScaler

col_names = df.columns[2:9]
numVals = df.loc[:,'INCOME':'AVERAGE_CALL_DURATION']
scaler = StandardScaler()
stdVals = pd.DataFrame(scaler.fit_transform(numVals))
stdVals.columns = col_names

In [12]:
df_clean = df_clean.join(stdVals)

In [13]:
display(df_clean.head())
display(df_clean.shape)
display(df_clean.columns)

Unnamed: 0,LEAVE,CUSTID,COLLEGE,REPORTED_SATISFACTION_sat,REPORTED_SATISFACTION_unsat,REPORTED_SATISFACTION_very_sat,REPORTED_SATISFACTION_very_unsat,REPORTED_USAGE_LEVEL_high,REPORTED_USAGE_LEVEL_little,REPORTED_USAGE_LEVEL_very_high,...,CONSIDERING_CHANGE_OF_PLAN_never_thought,CONSIDERING_CHANGE_OF_PLAN_no,CONSIDERING_CHANGE_OF_PLAN_perhaps,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION
0,1,1,0,0,1,0,0,0,1,0,...,0,1,0,-1.155877,-0.998969,-0.668289,-0.715088,-1.066728,-0.894694,-0.457388
1,1,2,1,0,1,0,0,0,1,0,...,0,0,0,-1.055255,-0.998969,-0.407784,1.210287,-0.679422,-0.894694,-0.004817
2,1,3,1,0,1,0,0,0,0,0,...,0,0,1,-1.26816,1.672415,-0.891579,-0.748003,-0.880074,0.89246,2.031755
3,1,5,1,0,0,0,1,0,1,0,...,1,0,0,-1.221568,1.416891,2.271697,-1.065198,-0.693421,1.450945,-1.136245
4,1,6,0,0,1,0,0,1,0,0,...,0,1,0,1.285912,-0.255627,0.894742,0.547889,1.103122,-0.559603,-0.909959


(14000, 22)

Index(['LEAVE', 'CUSTID', 'COLLEGE', 'REPORTED_SATISFACTION_sat',
       'REPORTED_SATISFACTION_unsat', 'REPORTED_SATISFACTION_very_sat',
       'REPORTED_SATISFACTION_very_unsat', 'REPORTED_USAGE_LEVEL_high',
       'REPORTED_USAGE_LEVEL_little', 'REPORTED_USAGE_LEVEL_very_high',
       'REPORTED_USAGE_LEVEL_very_little',
       'CONSIDERING_CHANGE_OF_PLAN_considering',
       'CONSIDERING_CHANGE_OF_PLAN_never_thought',
       'CONSIDERING_CHANGE_OF_PLAN_no', 'CONSIDERING_CHANGE_OF_PLAN_perhaps',
       'INCOME', 'OVERAGE', 'LEFTOVER', 'HOUSE', 'HANDSET_PRICE',
       'OVER_15MINS_CALLS_PER_MONTH', 'AVERAGE_CALL_DURATION'],
      dtype='object')

# Partitioning the dataset

In [14]:
X, Y = df_clean.loc[:,'COLLEGE':], df_clean.loc[:,'LEAVE']

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 7)
print(X_train.shape)
print(X_test.shape)

(11200, 20)
(2800, 20)


In [16]:
#Y_train

# CART Decision Tree Classifier

In [17]:
# stratify = y led to a lower accuracy rate

In [18]:
from sklearn.tree import DecisionTreeClassifier

# A CART decision tree built using the training data set.
cart01 = DecisionTreeClassifier(criterion = "gini", 
            max_leaf_nodes = 20).fit(X_train, Y_train)

In [19]:
from sklearn.metrics import accuracy_score

# Predict the outcome for the test data.
pred_test = cart01.predict(X_test)

# Compute the accuracy rate. 
accuracy_score(pred_test, Y_test)*100

70.25

In [20]:
# Another method to compute accuracy rate. 
(pred_test == Y_test).mean()*100

70.25

# Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Create the Random Forest model.
rf = RandomForestClassifier(n_estimators = 100, 
                    criterion = 'gini').fit(X_train, Y_train)

In [22]:
# Predict the outcome for the test data.
pred_test1 = rf.predict(X_test)

# Compute the accuracy rate.
accuracy_score(pred_test1, Y_test)*100

70.10714285714286

# MLP (Multi-Layer Perceptron) Classifier

In [23]:
from sklearn.neural_network import MLPClassifier

# Train the MLPClassifier using the training data.
mlp = MLPClassifier(solver = 'lbfgs', alpha = 1e-5, activation = 'logistic', 
                      hidden_layer_sizes = (10,), random_state = 7, 
                      max_iter = 6000)

# Fit the training data to the MLPClassifier.
mlp.fit(X_train, Y_train)

# Predict the outcome for the test data.
pred_test2 = mlp.predict(X_test)

# Compute the accuracy rate. 
accuracy_score(pred_test2, Y_test)*100

68.10714285714286