# Read the dataset CustomerChurn

In [1]:
import pandas as pd
import numpy as np

OriginalCustomerChurn = pd.read_csv("CustomerChurn.csv")
OriginalCustomerChurn.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


# Preprocess Data with encoding

## Function GetEncodeData

In [2]:
from sklearn.preprocessing import LabelEncoder
import copy

def getEncodeData(DataFrame, Columns):
    labelEncoder = LabelEncoder()
    encodedData = copy.deepcopy(DataFrame)
    for column in Columns:
        encodedData[column] = labelEncoder.fit_transform(DataFrame[column])
    return encodedData

## Preprocess Data

In [3]:
encodedData = getEncodeData(OriginalCustomerChurn, ['State', 'International plan', 'Voice mail plan'])
encodedData.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,16,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,35,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,31,137,415,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,35,84,408,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,36,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


# Create train data and test data

## SplitDataFrameToTrainAndTest function

In [4]:
def SplitDataFrameToTrainAndTest(DataFrame, TrainDataRate, TargetAtt):
    training = DataFrame.sample(frac=TrainDataRate, random_state=1)
    testing = DataFrame.loc[~DataFrame.index.isin(training.index)]
    x_train = training.drop(TargetAtt, 1)
    y_train = training[TargetAtt]
    x_test = testing.drop(TargetAtt, 1)
    y_test = testing[TargetAtt]
    return x_train, y_train, x_test, y_test

## Train data and test data

In [5]:
x_train, y_train, x_test, y_test = SplitDataFrameToTrainAndTest(DataFrame=encodedData, TrainDataRate=0.6, TargetAtt='Churn')

# NaiveBayes Training

## Declaration

In [6]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

def NaiveBayesLearning(DataTrain, TargetTrain):
    return GaussianNB().fit(DataTrain, TargetTrain.ravel())

def NaiveBayesTesting(NBModel, DataTest, TargetTest):
    PredictTest = NBModel.predict(DataTest)
    Accuracy = accuracy_score(TargetTest, PredictTest)
    return Accuracy, PredictTest

## Training and testing

In [7]:
model = NaiveBayesLearning(x_train, y_train)
accuracy, predictTest = NaiveBayesTesting(model, x_test, y_test)
print('Accuracy: ', accuracy)
print('Predict: ', predictTest)

Accuracy:  0.8657164291072769
Predict:  [False  True  True ...  True  True False]


# DecisionTree Training

## Declaration

In [8]:
from sklearn import tree

def DecisionTreeLearning(DataTrain, TargetTrain):
    if (type(DataTrain) == bool):
        return tree.DecisionTreeRegressor().fit(DataTrain, TargetTrain)
    else:
        return tree.DecisionTreeClassifier().fit(DataTrain, TargetTrain)

def DecisionTreeTesting(DTModel, DataTest, TargetTest):
    PredictTest = DTModel.predict(DataTest)
    Accuracy = accuracy_score(TargetTest, PredictTest)
    return Accuracy, PredictTest

## Training and testing

In [9]:
model = DecisionTreeLearning(x_train, y_train)
accuracy, predictTest = DecisionTreeTesting(model, x_test, y_test)
print('Accuracy: ', accuracy)
print('Predict: ', predictTest)

Accuracy:  0.9099774943735934
Predict:  [False False False ... False  True False]


## Graph the decision tree and export to pdf

In [10]:
import graphviz

dot_data = tree.export_graphviz(model, out_file=None, 
                                feature_names=x_train.columns,
                                filled=True, 
                                rounded=True, 
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph.render('BeforeMined')

'BeforeMined.pdf'

# Learning and data mining

## Function DetectOutlierByIQR

In [11]:
def DetectOutlierByIQR(DataFrame):
    Q1 = DataFrame.quantile(0.25)
    Q3 = DataFrame.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - (1.5 * IQR) 
    upper = Q3 + (1.5 * IQR) 
    outlier_Df = (DataFrame >= lower) & (DataFrame <= upper)
    return outlier_Df.loc[outlier_Df[outlier_Df.columns[0]] == False]

## Function DetectOutlierByLOF

In [12]:
from sklearn.neighbors import LocalOutlierFactor
def DetectOutlierByLOF(DataFrame):
    lof = LocalOutlierFactor(contamination='auto')
    y_pred = lof.fit_predict(DataFrame)
    return y_pred

## Function RemoveRowFromDataFrame

In [13]:
def RemoveRowFromDataFrame(DataFrame, EliminateListIndex):
    return DataFrame.loc[~DataFrame.index.isin(EliminateListIndex)]

## Get List of Outliers with IQR and LOF

In [14]:
outliersListIndex = set()
checkListLOF = [['Total day minutes', 'Total day calls', 'Total day charge'], ['Total eve minutes', 'Total eve calls', 'Total eve charge']]
checkListIQR = [['Account length'], ['Number vmail messages'], ['Customer service calls']]
for lofList in checkListLOF:
    checkOutlierAtt = encodedData[lofList]
    acceptedList = DetectOutlierByLOF(checkOutlierAtt)
    for i, value in enumerate(acceptedList):
        if (value != 1):
            outliersListIndex.add(i)

for iqrList in checkListIQR:
    checkOutlierAtt = encodedData[iqrList]
    resultList = DetectOutlierByIQR(checkOutlierAtt)
    for i, row in resultList.iterrows():
         outliersListIndex.add(i)

## Mined Encoded Data

In [15]:
minedEncodedData = encodedData.loc[~encodedData.index.isin(outliersListIndex)]
display(minedEncodedData)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,16,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False
1,35,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False
2,31,137,415,0,0,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False
3,35,84,408,1,0,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,36,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
5,1,118,510,1,0,0,223.4,98,37.98,220.6,101,18.75,203.9,118,9.18,6.3,6,1.70,0,False
6,19,121,510,0,1,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,False
7,24,147,415,1,0,0,157.0,79,26.69,103.1,94,8.76,211.8,96,9.53,7.1,6,1.92,0,False
8,18,117,408,0,0,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False
9,49,141,415,1,1,37,258.6,84,43.96,222.0,111,18.87,326.4,97,14.69,11.2,5,3.02,0,False


# Train Again

## Get Train and Test

In [16]:
x_train, y_train, x_test, y_test = SplitDataFrameToTrainAndTest(DataFrame=minedEncodedData, TrainDataRate=0.6, TargetAtt='Churn')

## NaiveBayes Train & Test

In [17]:
model = NaiveBayesLearning(x_train, y_train)
accuracy, predictTest = NaiveBayesTesting(model, x_test, y_test)
print('Accuracy: ', accuracy)
print('Predict: ', predictTest)

Accuracy:  0.8951342281879194
Predict:  [False  True  True ...  True False False]


## Decision Tree Train & Test

In [18]:
model = DecisionTreeLearning(x_train, y_train)
accuracy, predictTest = DecisionTreeTesting(model, x_test, y_test)
print('Accuracy: ', accuracy)
print('Predict: ', predictTest)

Accuracy:  0.9110738255033557
Predict:  [False False  True ... False False False]


## Graph the Decision Tree

In [19]:
import graphviz

dot_data = tree.export_graphviz(model, out_file=None, 
                                feature_names=x_train.columns,
                                filled=True, 
                                rounded=True, 
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph.render('AfterMined')

'AfterMined.pdf'