In [1]:
import pandas as pd

## Reading Data
# Load UniversalBank.csv and discard the ID and ZIP Code columns,
# which are not used in this analysis. The result is stored in bank_df.
bank_df = (
    pd.read_csv("UniversalBank.csv")
    .drop(columns=['ID', 'ZIP Code'])
)

# Preview the first rows of bank_df
bank_df.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [2]:
# Treat Education as a categorical variable rather than a number
bank_df = bank_df.assign(Education=bank_df['Education'].astype('category'))

#---------------
# Q1. Transform 'Personal Loan' into a factor variable
#
# Your Code here
# (the column name contains a space, so it is passed via a dict)
bank_df = bank_df.assign(
    **{'Personal Loan': bank_df['Personal Loan'].astype('category')}
)
#---------------

# Confirm that both columns now report the 'category' dtype
bank_df.dtypes

Age                      int64
Experience               int64
Income                   int64
Family                   int64
CCAvg                  float64
Education             category
Mortgage                 int64
Personal Loan         category
Securities Account       int64
CD Account               int64
Online                   int64
CreditCard               int64
dtype: object

In [3]:
# Convert the categorical Education column into dummy (indicator) variables.
# The dtype is pinned so the dummies are 0/1 integers on every pandas version;
# without it, pandas >= 2.0 produces True/False columns instead.
bank_df = pd.get_dummies(bank_df, columns=['Education'], dtype='uint8')

# Drop some variables for the sake of simplicity
# -- Age, Experience, Mortgage, Securities Account
# -- Education_3 (Advanced/Professional); only the Education_1 and
#    Education_2 dummies are kept
# 'Personal Loan' is NOT dropped here.
bank_df = bank_df.drop(
    columns=['Age', 'Experience', 'Mortgage', 'Securities Account', 'Education_3']
)
bank_df


Unnamed: 0,Income,Family,CCAvg,Personal Loan,CD Account,Online,CreditCard,Education_1,Education_2
0,49,4,1.6,0,0,0,0,1,0
1,34,3,1.5,0,0,0,0,1,0
2,11,1,1.0,0,0,0,0,1,0
3,100,1,2.7,0,0,0,0,0,1
4,45,4,1.0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...
4995,40,1,1.9,0,0,1,0,0,0
4996,15,4,0.4,0,0,1,0,1,0
4997,24,2,0.3,0,0,0,0,0,0
4998,49,3,0.5,0,0,1,0,0,1


In [4]:
# Give the Education dummy columns descriptive names
dummy_labels = {
    "Education_1": "Undergrad",
    "Education_2": "Graduate",
}
bank_df = bank_df.rename(columns=dummy_labels)

bank_df.head()

Unnamed: 0,Income,Family,CCAvg,Personal Loan,CD Account,Online,CreditCard,Undergrad,Graduate
0,49,4,1.6,0,0,0,0,1,0
1,34,3,1.5,0,0,0,0,1,0
2,11,1,1.0,0,0,0,0,1,0
3,100,1,2.7,0,0,0,0,0,1
4,45,4,1.0,0,0,0,1,0,1


In [5]:
from sklearn.model_selection import train_test_split

# Name of the target column, kept as a list so that y stays a DataFrame
outcome = ['Personal Loan']

# Every column that is not the outcome is used as a predictor
predictors = [col for col in bank_df.columns if col not in outcome]
X = bank_df[predictors]
y = bank_df[outcome]

# 60% training / 40% validation split, seeded so it is reproducible
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1234,
)


In [6]:
from sklearn.neural_network import MLPClassifier
from dmba import classificationSummary


In [7]:

#---------------
# Q2. Run MLPClassifier() with 10 hidden nodes, tanh activation function,
# adam solver, max iteration 1000, alpha 0.05
#
# Your Code here
#
# One hidden layer with 10 nodes. The trailing comma matters: (10) is just the
# integer 10, while (10,) is the one-element tuple that hidden_layer_sizes
# is documented to take.
clf = MLPClassifier(
    hidden_layer_sizes=(10,),
    activation='tanh',
    solver='adam',
    alpha=0.05,
    max_iter=1000,
    random_state=1,  # fixed seed so the fitted weights are reproducible
)
#---------------

# NOTE(review): the predictors are passed in unscaled; standardizing them
# before fitting may help the tanh network -- confirm whether that is wanted.
# ravel() flattens the label values into the 1-D array that fit() expects.
clf.fit(train_X, train_y.values.ravel())

# Network structure
print('Intercepts')
print(clf.intercepts_)
print('Weights')
print(clf.coefs_)



Intercepts
[array([ 0.60797931, -1.81366275,  2.63183156, -0.53820527,  2.39816614,
        3.25494451, -2.81950443, -0.57644468,  0.4910639 ,  0.03696653]), array([-0.27475772])]
Weights
[array([[ 4.78297094e-02,  1.57585824e-02, -1.41411213e-02,
        -9.52516745e-02, -1.22380239e-02, -3.77745674e-02,
        -4.88776040e-02, -9.26187821e-02,  1.03216942e-01,
        -4.84236353e-02],
       [ 6.49114216e-02, -1.57902064e-01, -7.16960118e-01,
        -2.26386413e-02, -7.80120641e-01,  4.93506615e-01,
        -7.48042175e-03, -2.15537015e-02, -5.23926595e-02,
        -2.95684618e-01],
       [ 3.59577102e-01,  6.75866446e-02,  4.36278661e-02,
        -3.47048390e-03,  4.06551273e-02, -4.55404659e-01,
        -1.44606501e-03, -3.29172009e-03, -6.14126819e-02,
         1.60478717e-01],
       [ 2.87987576e-03,  1.38738722e+00, -5.12994419e-01,
        -5.66925792e-04, -7.52375016e-01, -2.41769124e-01,
        -2.12257426e-04, -4.50089482e-04,  2.59215850e-03,
        -7.80642129e-04],

In [8]:
# training performance
classificationSummary(train_y, clf.predict(train_X))

#---------------
# Q3. Create a confusion matrix for the test set.
#
# Your Code here
#
# Predict once and reuse the predictions for the summary and the rates below.
valid_pred = clf.predict(valid_X)
classificationSummary(valid_y, valid_pred)
#---------------


#---------------
# Q4. What is the error rate, false positive rate,
# and false negative rate from the confusion matrix of test set?
# Is there any sign of overfitting?
#
# Your answer here
# The accuracy gap between the train set and the test set is small,
# so there does not appear to be an overfitting problem.
#
# The confusion-matrix cells are counted from the predictions themselves
# rather than typed in by hand, so the rates stay correct if the data split,
# the seed, or the model settings change.
valid_actual = valid_y.values.ravel()
tn = int(((valid_actual == 0) & (valid_pred == 0)).sum())  # actual 0, predicted 0
fp = int(((valid_actual == 0) & (valid_pred == 1)).sum())  # actual 0, predicted 1
fn = int(((valid_actual == 1) & (valid_pred == 0)).sum())  # actual 1, predicted 0
tp = int(((valid_actual == 1) & (valid_pred == 1)).sum())  # actual 1, predicted 1

error_rate = (fp + fn) / (tn + fp + fn + tp)
print('Error rate of test set :{}'.format(error_rate))
FPR = fp / (tn + fp)
print('FPR of test set :{}'.format(FPR))
FNR = fn / (fn + tp)
print('FNR of test set :{}'.format(FNR))
#---------------



Confusion Matrix (Accuracy 0.9780)

       Prediction
Actual    0    1
     0 2726    4
     1   62  208
Confusion Matrix (Accuracy 0.9700)

       Prediction
Actual    0    1
     0 1782    8
     1   52  158
FPR of test set :0.004469273743016759
FNR of test set :0.24761904761904763
