In [772]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve
from sklearn.feature_selection import RFE

In [773]:
lead_df = pd.read_csv("Data/processed_lead_ml.csv")

In [774]:
# Putting predictor variables to X
X = lead_df.drop('Converted', axis=1)

# Putting Target variables to y
y = lead_df["Converted"]

# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

In [775]:
# Lets use RFE to reduce variables 
dt_class = DecisionTreeClassifier()
rfe = RFE(dt_class, n_features_to_select=15)            
rfe = rfe.fit(X_train, y_train)

In [776]:
# instead of using as zip using this for more readablility

# Check the top 15 features chosen by RFE
top15=pd.DataFrame()
top15['features']=X_train.columns
top15['Feature Chosen'] = rfe.support_
top15['Ranking']=rfe.ranking_
top15.sort_values(by='Ranking')

Unnamed: 0,features,Feature Chosen,Ranking
0,Unnamed: 0,True,1
49,Tags_Will revert after reading the email,True,1
44,Tags_Ringing,True,1
40,Tags_Lost to EINS,True,1
32,Tags_Closed by Horizzon,True,1
31,Tags_Busy,True,1
54,Tags_switched off,True,1
25,Last Activity_SMS Sent,True,1
17,Lead Source_Welingak Website,True,1
11,Lead Source_Google,True,1


In [777]:
# columns which are selected by RFE
rfe_col = X_train.columns[rfe.support_]
rfe_col

Index(['Unnamed: 0', 'TotalVisits', 'Total Time Spent on Website',
       'Page Views Per Visit', 'Lead Origin_Landing Page Submission',
       'Lead Source_Google', 'Lead Source_Welingak Website',
       'Last Activity_SMS Sent', 'Specialization_Management Specializations',
       'Tags_Busy', 'Tags_Closed by Horizzon', 'Tags_Lost to EINS',
       'Tags_Ringing', 'Tags_Will revert after reading the email',
       'Tags_switched off'],
      dtype='object')

In [781]:
# Building model using statsmodels, for the detailed statistics

# columns selected by RFE to be used for this model 
rfe_col=X_train.columns[rfe.support_]

# Creating X_train dataframe with variables selected by RFE
X_train_rfe = X_train[rfe_col]

# Adding a constant variable 
dt_model = dt_class.fit(X_train_rfe,y_train)


In [782]:
# Getting the predicted values on the train set
y_train_pred = dt_model.predict(X_train_rfe)           # giving prob. of getting 1

y_train_pred[:10]

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 1], dtype=int64)

In [786]:
# for array
y_train_pred = y_train_pred.reshape(-1)
y_train_pred[:10]


array([0, 0, 0, 0, 0, 1, 0, 1, 0, 1], dtype=int64)

In [787]:
from sklearn import metrics

In [789]:
confusion = metrics.confusion_matrix(y_train, y_train_pred)
print(confusion)

[[4002    0]
 [   0 2466]]


In [790]:
X_test = X_test[rfe_col]

In [791]:
y_test_pred = dt_model.predict(X_test)

In [792]:
confusion_matrix = metrics.confusion_matrix(y_test,y_test_pred)

In [793]:
confusion_matrix

array([[1493,  184],
       [ 156,  939]], dtype=int64)

In [794]:
def logreg_all_metrics(confusion_matrix):
    TN =confusion_matrix[0,0]
    TP =confusion_matrix[1,1]
    FP =confusion_matrix[0,1]
    FN =confusion_matrix[1,0]
    
    accuracy = (TN+TP)/(TN+TP+FN+FP)
    sensi = TP/(TP+FN)
    speci = TN/(TN+FP)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    TPR = TP/(TP + FN)
    TNR = TN/(TN + FP)
    
    #Calculate false postive rate - predicting conversion when customer does not have converted
    FPR = FP/(FP + TN)     
    FNR = FN/(FN +TP)
    
    print ("True Negative                    : ", TN)
    print ("True Positive                    : ", TP)
    print ("False Negative                   : ", FN)
    print ("False Positve                    : ", FP) 
    
    print ("Model Accuracy                   : ", round(accuracy,4))
    print ("Model Sensitivity                : ", round(sensi,4))
    print ("Model Specificity                : ", round(speci,4))
    print ("Model Precision                  : ", round(precision,4))
    print ("Model Recall                     : ", round(recall,4))
    print ("Model True Positive Rate (TPR)   : ", round(TPR,4))
    print ("Model False Positive Rate (FPR)  : ", round(FPR,4))

In [795]:
logreg_all_metrics(confusion_matrix)

True Negative                    :  1493
True Positive                    :  939
False Negative                   :  156
False Positve                    :  184
Model Accuracy                   :  0.8773
Model Sensitivity                :  0.8575
Model Specificity                :  0.8903
Model Precision                  :  0.8362
Model Recall                     :  0.8575
Model True Positive Rate (TPR)   :  0.8575
Model False Positive Rate (FPR)  :  0.1097


In [796]:
metrics.accuracy_score(y_test,y_test_pred)

0.8773448773448773