# Activity 6: Explore predictive accuracy

In [48]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix as cm
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
# Columns used from the German credit data set:
#   VAR19 - Telephone: A191 = none, A192 = yes (registered under the customer's name)
#   VAR21 - Status (target): 1 = Good, 2 = Bad
columns_to_load = ['VAR19', 'VAR21']

# Read only the predictor and the target; the other columns are not needed here
xy = pd.read_csv('german.csv', usecols=columns_to_load)

# Preview the first ten rows
xy.head(10)

Unnamed: 0,VAR19,VAR21
0,A192,1
1,A191,2
2,A191,1
3,A191,1
4,A191,2
5,A192,1
6,A191,1
7,A192,1
8,A191,1
9,A191,2


In [5]:
# scikit-learn can handle three types of data/distributions:
# Bernoulli variables (yes/no), multinomial (categorical), Gaussian (numeric).
# For categorical data we still need quantitative inputs, so we convert the
# categories into dummies; the target variable should also be coded as 0/1.

xy_dum = pd.get_dummies(xy, prefix='Phone')

# VAR21 is coded as a number, so get_dummies ignores it; we use rename/replace instead.
# We will model Probability of Default, so the target is named 'Bad' (equals 1).
# Renaming the phone dummies is optional, but helps interpretation.
xy_dum = xy_dum.rename(
    columns={'VAR21': 'Bad', 'Phone_A191': 'Phone_no', 'Phone_A192': 'Phone_yes'}
)

# Recode the target from (1: Good, 2: Bad) to (0: Good, 1: Bad).
# Assign the result back explicitly: calling .replace(..., inplace=True) on the
# intermediate object returned by .loc[:, 'Bad'] is a chained in-place update,
# which does not modify xy_dum under pandas Copy-on-Write and emits warnings
# in recent pandas versions.
xy_dum['Bad'] = xy_dum['Bad'].replace({1: 0, 2: 1})

xy_dum.head()

Unnamed: 0,Bad,Phone_no,Phone_yes
0,0,0,1
1,1,1,0
2,0,1,0
3,0,1,0
4,1,1,0


In [21]:
# Separate the 0/1 target from the predictor columns (the phone dummies)
Y = xy_dum['Bad']
X = xy_dum.drop(columns='Bad')

# Fit a multinomial naive Bayes classifier on the whole sample.
# sklearn requires the target as a 1d array, which is what ravel() delivers
# (Y = xy_dum[['Bad']] also works)
model1 = MultinomialNB().fit(X, Y.values.ravel())

# Predicted classes for the same observations the model was fitted on
Y_pred = model1.predict(X)

print("Confusion matrix: \n" + str(cm(Y, Y_pred)))

Confusion matrix: 
[[700   0]
 [300   0]]


In [37]:
# Everyone was classified as 'Good' above: this is the effect of the imbalanced
# classes discussed before. From a business point of view it means we would
# accept everyone, and our PD would stay as before: 300/1000 = 0.3

# Predicted probabilities: column 0 is for class 0 (Good), column 1 for class 1 (Bad)
Y_prob = model1.predict_proba(X)
p_bad = Y_prob[:, 1]

# Classify as Bad (1) whenever the predicted PD reaches the 0.3 cut-off, else Good (0)
Y_pred1 = np.where(p_bad >= 0.3, 1, 0)
print('Confusion matrix: \n' + str(cm(Y, Y_pred1)))

# Collect actual class, predicted PD and predicted class side by side
Y_pr = pd.DataFrame({'Bad': Y, 'P_Bad': p_bad, 'Bad_hat': Y_pred1})

# Cross-tabulate the actual class against the distinct predicted PD values, with totals
phone_cross = pd.crosstab(Y_pr['Bad'], Y_pr['P_Bad'], margins=True)

phone_cross

Confusion matrix: 
[[291 409]
 [113 187]]


P_Bad,0.28002314048044485,0.31356461080822495,All
Bad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,291,409,700
1,113,187,300
All,404,596,1000


In [38]:
# Accuracy is lower now: we mis-classify 409+113 customers, compared to 300 in the
# previous step. From a business point of view, however, we only accept those
# predicted Good (404, or c.40%), and the PD among the accepted is 0.28 -
# slightly lower than 0.3, at the price of rejecting 60% of applicants.

# Area under the ROC curve, computed from the predicted probability of Bad
bad_scores = Y_prob[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y, bad_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC:' + str(roc_auc))

AUC:0.5195238095238095


In [49]:
# Hold out 30% of the observations for testing and train on the remaining 70%.
# random_state fixes the split, so that models we want to compare are
# compared on the same sample.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=2,
)

# Bernoulli naive Bayes fitted on the training part only
model2 = BernoulliNB().fit(X_train, Y_train.values.ravel())

# Predicted probabilities on the held-out test sample
Y_prob2 = model2.predict_proba(X_test)
test_p_bad = Y_prob2[:, 1]

# Cut-off at PD = 0.3 instead of 0.5
Y_pred2 = np.where(test_p_bad >= 0.3, 1, 0)
print("Confusion matrix: \n" + str(cm(Y_test, Y_pred2)))

# Model AUC on the test sample
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, test_p_bad)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC:' + str(roc_auc))

Confusion matrix: 
[[ 82 119]
 [ 40  59]]
AUC:0.5019598974822855


As expected, the measures of predictive accuracy on the test sample are worse than those on the training/whole sample. The deviation of AUC from 0.5 is small anyway, and the estimated PD among applicants predicted as Good is now about 0.33 (40 out of 122). Therefore, 'Telephone' is perhaps not a very strong predictor.