In [1]:
# Imports and notebook-wide configuration (plot rendering, warning filters, RNG seed).
import pandas as pd
import numpy as np
from sklearn import preprocessing
# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import metrics
import warnings
# Suppress DeprecationWarning messages.
warnings.filterwarnings("ignore", category=DeprecationWarning) 
from sklearn.exceptions import DataConversionWarning
# Suppress scikit-learn's DataConversionWarning messages as well.
warnings.filterwarnings("ignore", category=DataConversionWarning)
from tqdm import tqdm
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

In [2]:
# Download the customer churn CSV into a DataFrame and preview the first rows.
CHURN_DATA_URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/ChurnData.csv"
churn_df = pd.read_csv(CHURN_DATA_URL)

churn_df.head()

Unnamed: 0,tenure,age,address,income,ed,employ,equip,callcard,wireless,longmon,...,pager,internet,callwait,confer,ebill,loglong,logtoll,lninc,custcat,churn
0,11.0,33.0,7.0,136.0,5.0,5.0,0.0,1.0,1.0,4.4,...,1.0,0.0,1.0,1.0,0.0,1.482,3.033,4.913,4.0,1.0
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,9.45,...,0.0,0.0,0.0,0.0,0.0,2.246,3.24,3.497,1.0,1.0
2,23.0,30.0,9.0,30.0,1.0,2.0,0.0,0.0,0.0,6.3,...,0.0,0.0,0.0,1.0,0.0,1.841,3.24,3.401,3.0,0.0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,6.05,...,1.0,1.0,1.0,1.0,1.0,1.8,3.807,4.331,4.0,0.0
4,7.0,35.0,14.0,80.0,2.0,15.0,0.0,1.0,0.0,7.1,...,0.0,0.0,1.0,1.0,0.0,1.96,3.091,4.382,3.0,0.0


In [3]:
#do some preprocessing
# Keep only the nine model features plus the target, and store the target as an integer label.
# Selecting the columns and casting in one expression produces a fresh DataFrame, so no value
# is ever assigned into a column subset of another frame (the pattern that triggers pandas'
# SettingWithCopyWarning).
churn_df = churn_df[
    ['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip', 'callcard', 'wireless', 'churn']
].astype({'churn': 'int'})
churn_df.head()


Unnamed: 0,tenure,age,address,income,ed,employ,equip,callcard,wireless,churn
0,11.0,33.0,7.0,136.0,5.0,5.0,0.0,1.0,1.0,1
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,1
2,23.0,30.0,9.0,30.0,1.0,2.0,0.0,0.0,0.0,0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,0
4,7.0,35.0,14.0,80.0,2.0,15.0,0.0,1.0,0.0,0


In [4]:
# Sanity check on the trimmed frame: (rows, columns) first, then the column labels
# printed on one line separated by spaces.
print(churn_df.shape)
print(*churn_df.columns)

(200, 10)
tenure age address income ed employ equip callcard wireless churn


In [5]:
#helper function for later
def get_accuracy(X_train, X_test, y_train, y_test, model):
    """Score an already-fitted model on the test and training data.

    Args:
        X_train: Features passed to ``model.predict`` for the training score.
        X_test: Features passed to ``model.predict`` for the test score.
        y_train: True labels compared against the training predictions.
        y_test: True labels compared against the test predictions.
        model: Any object exposing ``predict(X)``.

    Returns:
        dict: ``{"test Accuracy": ..., "train Accuracy": ...}`` where each value is
        the result of ``metrics.accuracy_score(true_labels, predictions)``.
    """
    # Test score is computed first, matching the key order of the returned dict.
    test_score = metrics.accuracy_score(y_test, model.predict(X_test))
    train_score = metrics.accuracy_score(y_train, model.predict(X_train))
    return {"test Accuracy": test_score, "train Accuracy": train_score}

In [6]:
#now we set up the data
from sklearn.model_selection import train_test_split

# Feature matrix: the nine predictor columns.
feature_columns = ['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip', 'callcard', 'wireless']
X = churn_df[feature_columns]
# Target: the double brackets keep y as a one-column DataFrame rather than a Series,
# which is why the label shapes printed below are 2-D.
y = churn_df[['churn']]

# Hold out 25% of the rows for testing, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)
print('Train set', X_train.shape, y_train.shape)
print('Test set', X_test.shape, y_test.shape)

Train set (150, 9) (150, 1)
Test set (50, 9) (50, 1)


In [7]:
#first we will test the decision tree
from sklearn.tree import DecisionTreeClassifier as dtc

# Depth cap for the decision tree; the same value is reused when the tree is retrained
# later in the notebook.
max_depth = 5

# Recreate the 75/25 split (same arguments as the data set-up cell) so this cell gives
# the same result even if X_train/X_test/y_train/y_test were rebound by another cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)

# `tree` is the classifier instance itself, not the sklearn.tree module.
tree = dtc(criterion="entropy", max_depth = max_depth, random_state = 10)
#train the tree
tree.fit(X_train, y_train)

#I personally care far more about the testing accuracy.
#If our decision tree comes out pretty close to one of our other models, then I think it
#would be worth exploring the tree later.

#now we find the training and testing accuracy
get_accuracy(X_train, X_test, y_train, y_test, tree)

{'test Accuracy': 0.72, 'train Accuracy': 0.9}

In [8]:
#now lets test with random forest
from sklearn.ensemble import RandomForestClassifier as rf

# Only the split criterion is set explicitly; no random_state is passed to the forest.
# fit() returns the estimator, so construction and training are chained.
forest = rf(criterion="gini").fit(X_train, y_train)
get_accuracy(X_train, X_test, y_train, y_test, forest)


{'test Accuracy': 0.7, 'train Accuracy': 1.0}

In [9]:
#next, bagging
from sklearn.ensemble import BaggingClassifier as bc

# Bagging ensemble built with no arguments, i.e. whatever the library defaults are.
# fit() returns the estimator, so construction and training are chained.
bag = bc().fit(X_train, y_train)
get_accuracy(X_train, X_test, y_train, y_test, bag)


{'test Accuracy': 0.66, 'train Accuracy': 0.9733333333333334}

In [10]:
#also lets do svc
from sklearn.svm import SVC

# Support vector classifier with a linear kernel.
# NOTE(review): gamma is documented as a coefficient for the 'rbf'/'poly'/'sigmoid'
# kernels, so it likely has no effect with kernel='linear' - confirm before removing.
sv = SVC(kernel='linear', gamma='scale').fit(X_train, y_train)
get_accuracy(X_train, X_test, y_train, y_test, sv)


{'test Accuracy': 0.76, 'train Accuracy': 0.78}

In [11]:
"""it was around here where I started to think that the 75/25 training/test split was too much 
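training and the models were becoming a bit overfit.  So I decided to retrain the models on a 
more modest 70/30 split.  Note that this split also uses a different random_state (0 instead
of 10), so differences from the earlier scores are not due to the split ratio alone.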
"""
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

# Rebuild every classifier with the same settings used for the 75/25 run. The names
# tree / forest / bag / sv are rebound to the new, retrained models.
tree = dtc(criterion="entropy", max_depth = max_depth, random_state = 10)
forest = rf(criterion="gini")
bag = bc()
sv = SVC(kernel='linear', gamma='scale')

# Fit and report the models one at a time, in the order:
# decision tree, random forest, bagging, support vector.
for heading, clf in (
    ("Accuracy for decision tree:", tree),
    ("Accuracy for random forest:", forest),
    ("Accuracy for bagging:", bag),
    ("Accuracy for Support Vector:", sv),
):
    clf.fit(X_train, y_train)
    print(heading)
    print(get_accuracy(X_train, X_test, y_train, y_test, clf))
    print()


Accuracy for decision tree:
{'test Accuracy': 0.7333333333333333, 'train Accuracy': 0.8642857142857143}

Accuracy for random forest:
{'test Accuracy': 0.6833333333333333, 'train Accuracy': 1.0}

Accuracy for bagging:
{'test Accuracy': 0.7, 'train Accuracy': 0.9642857142857143}

Accuracy for Support Vector:
{'test Accuracy': 0.7666666666666667, 'train Accuracy': 0.7857142857142857}

