# Assessment: Coding for Naive Bayes

VAR7 (Present employment since). The categories are:

A71 : unemployed;

A72 : ... < 1 year;

A73 : 1 <= ... < 4 years;

A74 : 4 <= ... < 7 years;

A75 : .. >= 7 years.

VAR14 (Other installment plans). Installment plans are fixed-term loans that require fixed repayments (installments) every month. The categories are:

A141 : banks;

A142 : stores;

A143 : none.

VAR21: Status (target) : (1: Good, 2: Bad)

In [15]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split

# Read in the data

yx = pd.read_csv('german.csv', usecols=['VAR7','VAR14', 'VAR21'])
yx.head(10)

Unnamed: 0,VAR7,VAR14,VAR21
0,A75,A143,1
1,A73,A143,2
2,A74,A143,1
3,A74,A143,1
4,A73,A143,2
5,A73,A143,1
6,A75,A143,1
7,A73,A143,1
8,A74,A143,1
9,A71,A143,2


In [17]:
# Calculate frequency distributions/tables for VAR7 and VAR14 on the whole dataset
fre_tables_VAR7 = pd.crosstab(index=yx['VAR21'], columns=[yx['VAR7']],margins = True)
print(fre_tables_VAR7)

VAR7   A71  A72  A73  A74  A75   All
VAR21                               
1       39  102  235  135  189   700
2       23   70  104   39   64   300
All     62  172  339  174  253  1000


In [18]:
fre_tables_VAR14 = pd.crosstab(index=yx['VAR21'], columns=[yx['VAR14']],margins = True)
print(fre_tables_VAR14)

VAR14  A141  A142  A143   All
VAR21                        
1        82    28   590   700
2        57    19   224   300
All     139    47   814  1000


In [19]:
# Calculate Probability of Default/Bad (PD) for VAR7 and VAR14 on the whole dataset
print(fre_tables_VAR7.loc[(2),('A71')]/fre_tables_VAR7.loc[('All'),('A71')])
print(fre_tables_VAR7.loc[(2),('A72')]/fre_tables_VAR7.loc[('All'),('A72')])
print(fre_tables_VAR7.loc[(2),('A73')]/fre_tables_VAR7.loc[('All'),('A73')])
print(fre_tables_VAR7.loc[(2),('A74')]/fre_tables_VAR7.loc[('All'),('A74')])
print(fre_tables_VAR7.loc[(2),('A75')]/fre_tables_VAR7.loc[('All'),('A75')])

print('####')
print(fre_tables_VAR14.loc[(2),('A141')]/fre_tables_VAR14.loc[('All'),('A141')])
print(fre_tables_VAR14.loc[(2),('A142')]/fre_tables_VAR14.loc[('All'),('A142')])
print(fre_tables_VAR14.loc[(2),('A143')]/fre_tables_VAR14.loc[('All'),('A143')])


0.3709677419354839
0.4069767441860465
0.30678466076696165
0.22413793103448276
0.25296442687747034
####
0.41007194244604317
0.40425531914893614
0.2751842751842752


In [20]:
emp_dum=pd.get_dummies(yx['VAR7'],prefix='emp')
inst_dum=pd.get_dummies(yx['VAR14'],prefix='inst')
yx_dum=pd.concat([yx['VAR21'],emp_dum,inst_dum],axis=1)
yx_dum.loc[:,'VAR21'].replace(([1,2]),[0,1],inplace=True)

In [22]:
# Prepare the data for NB and split the data into train (50%)/test (50%) sets using random_state =5
X_emp = yx_dum[['emp_A71','emp_A72','emp_A73','emp_A74','emp_A75']]
Y_emp = yx_dum['VAR21']
X_train_emp, X_test_emp, Y_train_emp, Y_test_emp = train_test_split(X_emp,Y_emp,test_size=0.5, random_state=2)

X_inst = yx_dum[['inst_A141','inst_A142','inst_A143']]
Y_inst = yx_dum['VAR21']
X_train_inst, X_test_inst, Y_train_inst, Y_test_inst = train_test_split(X_inst,Y_inst,test_size=0.5, random_state=2)

X_ei = yx_dum[['emp_A71','emp_A72','emp_A73','emp_A74','emp_A75','inst_A141','inst_A142','inst_A143']]
Y_ei = yx_dum['VAR21']
X_train_ei, X_test_ei, Y_train_ei, Y_test_ei = train_test_split(X_ei,Y_ei,test_size=0.5, random_state=2)


In [32]:
# build Naive Bayes classifier using VAR7 on the train set and calculate AUC on the test set
model1 = MultinomialNB()
model1.fit(X_train_emp, Y_train_emp.values.ravel())
Y_prob = model1.predict_proba(X_test_emp)

false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test_emp, Y_prob[:,1])
roc_auc=auc(false_positive_rate, true_positive_rate)
print('AUC:'+str(roc_auc))

AUC:0.5776982011992006


In [33]:
# build Naive Bayes classifier using VAR14 on the train set and calculate AUC on the test set
model2 = MultinomialNB()
model2.fit(X_train_inst, Y_train_inst.values.ravel())
Y_prob = model2.predict_proba(X_test_inst)

false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test_inst, Y_prob[:,1])
roc_auc=auc(false_positive_rate, true_positive_rate)
print('AUC:'+str(roc_auc))

AUC:0.508364793841143
