In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pylab as plot

from dmba import classificationSummary

# Load the Data
bank_df = pd.read_csv('/Users/kathrynschoos/Desktop/Data Sets/UniversalBank.csv')

no display found. Using non-interactive Agg backend


In [2]:
# Drop ID and Zipcode
bank_df = bank_df.drop(columns=['ID', 'ZIP Code'])
columns = list(bank_df.columns)
columns.remove('Personal Loan')

modelResults = [] # store model results

bank_df.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [3]:
X = bank_df.drop(columns='Personal Loan')
y = bank_df['Personal Loan']

train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = .4, random_state = 1)
print('Training X:', train_X.shape, 'Valid_X:', valid_X.shape)

Training X: (3000, 11) Valid_X: (2000, 11)


### K -nearest neighbor model

In [4]:
# normalize data
scaler = preprocessing.StandardScaler()

scaler.fit(train_X)

train_norm_X = pd.DataFrame(scaler.transform(train_X),
                           index = train_X.index, columns = train_X.columns)
valid_norm_X = pd.DataFrame(scaler.transform(valid_X),
                           index = valid_X.index, columns = valid_X.columns)

In [5]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(train_norm_X, train_y)

knnProb = knn.predict_proba(valid_norm_X)[:,1]
knnPred = knn.predict(valid_norm_X)
classificationSummary(valid_y, knnPred)

modelResults.append({
    'Model' : 'Knn',
    'Accuracy' : accuracy_score(valid_y, knnPred)
})

knnPred[:50]

Confusion Matrix (Accuracy 0.9545)

       Prediction
Actual    0    1
     0 1793   14
     1   77  116


array([1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0])

### Naive-Bayes

In [6]:
# Reload the Data
bank_df = pd.read_csv('/Users/kathrynschoos/Desktop/Data Sets/UniversalBank.csv')

# Drop ID and zip code columns
bank_df.drop(columns=['ID', 'ZIP Code'], inplace=True)

# We convert several of the columns to categorical data
for column in ('Family', 'Education'):
    bank_df[column] = bank_df[column].astype('category')

# The remaining columns (Age, Experience, Income, Mortgage and CCAvg) will be binned
bank_df['Age'] = pd.cut(bank_df['Age'], 5, labels=range(1, 6)).astype('category')
bank_df['Experience'] = pd.cut(bank_df['Experience'], 10, labels=range(1, 11)).astype('category')
bank_df['Income'] = pd.cut(bank_df['Income'], 5, labels=range(1, 6)).astype('category')
bank_df['CCAvg'] = pd.cut(bank_df['CCAvg'], 6, labels=range(1, 7)).astype('category')
bank_df['Mortgage'] = pd.cut(bank_df['Mortgage'], 10, labels=range(1, 11)).astype('category')

X = pd.get_dummies(bank_df.drop(columns = ['Personal Loan']))
y = bank_df['Personal Loan']
nbColumns = list(X.columns)

train_X, valid_X, train_y, valid_y = train_test_split( X, y, test_size = 0.4, random_state = 1)

nb = MultinomialNB(alpha = 0.01)
nb.fit(train_X, train_y)

nbProb = nb.predict_proba(valid_X)[:,1]
nbPred = nb.predict(valid_X)

classificationSummary(valid_y, nbPred)
modelResults.append({
    'Model' : "Naive-Bayes",
    'Accuracy' : accuracy_score(valid_y, nbPred)
})

nbPred[:50]

Confusion Matrix (Accuracy 0.9275)

       Prediction
Actual    0    1
     0 1741   66
     1   79  114


array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [7]:
# Reload the Data
bank_df = pd.read_csv('/Users/kathrynschoos/Desktop/Data Sets/UniversalBank.csv')

# Drop ID and zip code columns
bank_df.drop(columns=['ID', 'ZIP Code'], inplace=True)

X = bank_df.drop(columns = ['Personal Loan'])
y = bank_df['Personal Loan']

train_X, valid_X, train_y, valid_y = train_test_split( X, y, test_size = 0.4, random_state = 1)

In [8]:
# user grid search to find optimized tree
param_grid = {
    'max_depth': [5, 10, 15, 20, 25], 
    'min_impurity_decrease': [0, 0.001, 0.005, 0.01], 
    'min_samples_split': [10, 20, 30, 40, 50], 
}
gridSearch = GridSearchCV(DecisionTreeClassifier(random_state=1), param_grid, cv=5, n_jobs=-1)
gridSearch.fit(train_X, train_y)
print('Initial parameters: ', gridSearch.best_params_)

param_grid = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8], 
    'min_impurity_decrease': [0, 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007], 
    'min_samples_split': [8, 9, 10, 11, 12, 13, 14, 15], 
}
gridSearch = GridSearchCV(DecisionTreeClassifier(random_state=1), param_grid, cv=5, n_jobs=-1)
gridSearch.fit(train_X, train_y)
print('Improved parameters: ', gridSearch.best_params_)

classTree = gridSearch.best_estimator_

Initial parameters:  {'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 20}
Improved parameters:  {'max_depth': 5, 'min_impurity_decrease': 0.001, 'min_samples_split': 13}


In [9]:
treeProb = classTree.predict_proba(valid_X)[:,1]
treePred = classTree.predict(valid_X)

classificationSummary(valid_y, treePred)
modelResults.append({
    'Model': 'Decision Tree', 
    'Accuracy': accuracy_score(valid_y, treePred)
})

Confusion Matrix (Accuracy 0.9825)

       Prediction
Actual    0    1
     0 1793   14
     1   21  172


In [10]:
res = pd.DataFrame({'Personal Loan': valid_y,
                    'knnProb': knnProb, 'knnPred': knnPred,
                    'nbProb': nbProb, 'nbPred': nbPred,
                    'treeProb': treeProb, 'treePred': treePred,
                   })
res.head(10)

Unnamed: 0,Personal Loan,knnProb,knnPred,nbProb,nbPred,treeProb,treePred
2764,0,0.666667,1,0.003757642,0,0.000907,0
4767,0,0.0,0,6.958332e-07,0,0.000907,0
3814,0,0.0,0,1.364776e-06,0,0.000907,0
3499,0,0.0,0,0.04312439,0,0.0,0
2735,0,0.0,0,0.01180696,0,0.000907,0
3922,0,0.0,0,2.619459e-06,0,0.000907,0
2701,0,0.0,0,0.002339459,0,0.000907,0
1179,0,0.0,0,0.1505303,0,0.000907,0
932,0,0.0,0,0.7756032,1,0.189189,0
792,0,0.666667,1,0.6376769,1,0.75,1


In [11]:
predColumns = ['knnPred', 'nbPred', 'treePred']
probColumns = ['knnProb', 'nbProb', 'treeProb']
res['majority'] = [1 if p > 0.5 else 0 for p in res[predColumns].mean(axis=1)]
res['avg'] = res[probColumns].mean(axis=1)
print(res.head())

classificationSummary(res['Personal Loan'], res['majority'])
modelResults.append({'Model': 'Ensemble majority', 
                     'Accuracy': accuracy_score(res['Personal Loan'], 
                                                res['majority'])})

      Personal Loan   knnProb  knnPred        nbProb  nbPred  treeProb  \
2764              0  0.666667        1  3.757642e-03       0  0.000907   
4767              0  0.000000        0  6.958332e-07       0  0.000907   
3814              0  0.000000        0  1.364776e-06       0  0.000907   
3499              0  0.000000        0  4.312439e-02       0  0.000000   
2735              0  0.000000        0  1.180696e-02       0  0.000907   

      treePred  majority       avg  
2764         0         0  0.223777  
4767         0         0  0.000303  
3814         0         0  0.000303  
3499         0         0  0.014375  
2735         0         0  0.004238  
Confusion Matrix (Accuracy 0.9680)

       Prediction
Actual    0    1
     0 1795   12
     1   52  141


In [12]:
classificationSummary(res['Personal Loan'], [1 if p > 0.5 else 0 for p in res['avg']])
modelResults.append({'Model': 'Ensemble average', 
                     'Accuracy': accuracy_score(res['Personal Loan'], [1 if p > 0.5 else 0 for p in res['avg']])})

Confusion Matrix (Accuracy 0.9730)

       Prediction
Actual    0    1
     0 1797   10
     1   44  149


In [13]:
pd.DataFrame(modelResults)

Unnamed: 0,Model,Accuracy
0,Knn,0.9545
1,Naive-Bayes,0.9275
2,Decision Tree,0.9825
3,Ensemble majority,0.968
4,Ensemble average,0.973


#### Both ensemble methods out perform the knn and naive bayes prediction. The classifcation tree yeilded the most accurate results out of the 5 deployed models.

In [14]:
bagging = BaggingClassifier(classTree, 
                            max_samples=0.5, max_features=0.5, random_state=1)
bagging.fit(train_X, train_y)
pred = bagging.predict(valid_X)

classificationSummary(valid_y, pred)
modelResults.append({
    'Model': 'Bagging', 
    'Accuracy': accuracy_score(valid_y, pred),
})

Confusion Matrix (Accuracy 0.9260)

       Prediction
Actual    0    1
     0 1807    0
     1  148   45


In [15]:
boost = AdaBoostClassifier(n_estimators=100, base_estimator=classTree, random_state=1)
boost.fit(train_X, train_y)
pred = boost.predict(valid_X)

classificationSummary(valid_y, pred)
modelResults.append({'Model': 'Boosting', 
                     'Accuracy': accuracy_score(valid_y, pred)})

Confusion Matrix (Accuracy 0.9845)

       Prediction
Actual    0    1
     0 1802    5
     1   26  167


In [16]:
pd.DataFrame(modelResults)

Unnamed: 0,Model,Accuracy
0,Knn,0.9545
1,Naive-Bayes,0.9275
2,Decision Tree,0.9825
3,Ensemble majority,0.968
4,Ensemble average,0.973
5,Bagging,0.926
6,Boosting,0.9845


#### Boosting is a more effective model than bagging.

In [17]:
newCustomer = pd.DataFrame([
    {'Age': 40, 'Experience': 10, 'Income': 84, 'Family': 2, 'CCAvg': 2, 'Education': 2, 
     'Mortgage': 0, 'Securities Account': 0, 'CD Account': 0, 'Online': 1, 'CreditCard': 1},
    {'Age': 25, 'Experience': 6, 'Income': 50, 'Family': 1, 'CCAvg': 1.8, 'Education': 1, 
     'Mortgage': 1, 'Securities Account': 0, 'CD Account': 0, 'Online': 1, 'CreditCard': 1},
    {'Age': 59, 'Experience': 30, 'Income': 120, 'Family': 3, 'CCAvg': 1.9, 'Education': 3, 
     'Mortgage': 0, 'Securities Account': 0, 'CD Account': 1, 'Online': 1, 'CreditCard': 0},
])
newCustomer = newCustomer[columns]
newCustomer

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
0,40,10,84,2,2.0,2,0,0,0,1,1
1,25,6,50,1,1.8,1,1,0,0,1,1
2,59,30,120,3,1.9,3,0,0,1,1,0


In [18]:
print(pd.cut(bank_df['Mortgage'], 10)[:1])
newCustomer_nb = pd.DataFrame([
#           Age        Experience           Income     Family   CCAvg        Education Mortgage
  [0,0,0,1, 0,1,0,0,0, 0,0,1,0,0,0,0,0,0,0, 0,1,0,0,0, 0,1,0,0, 0,1,0,0,0,0, 0,1,0, 1,0,0,0,0,0,0,0,0,0],
  [0,0,0,1, 1,0,0,0,0, 0,1,0,0,0,0,0,0,0,0, 1,0,0,0,0, 1,0,0,0, 0,1,0,0,0,0, 1,0,0, 1,0,0,0,0,0,0,0,0,0],
  [0,1,1,0, 0,0,0,0,1, 0,0,0,0,0,0,0,1,0,0, 0,0,1,0,0, 0,0,1,0, 0,1,0,0,0,0, 0,0,1, 1,0,0,0,0,0,0,0,0,0],
], columns=nbColumns)

newCustomer_norm = pd.DataFrame(scaler.transform(newCustomer), columns=newCustomer.columns)

print('kNN\n', knn.predict_proba(newCustomer_norm))
print('Naive Bayes\n', nb.predict_proba(newCustomer_nb))
print('Decision Tree\n', classTree.predict_proba(newCustomer))

0    (-0.635, 63.5]
Name: Mortgage, dtype: category
Categories (10, interval[float64]): [(-0.635, 63.5] < (63.5, 127.0] < (127.0, 190.5] < (190.5, 254.0] ... (381.0, 444.5] < (444.5, 508.0] < (508.0, 571.5] < (571.5, 635.0]]
kNN
 [[1. 0.]
 [1. 0.]
 [0. 1.]]
Naive Bayes
 [[9.82157006e-01 1.78429945e-02]
 [9.99998693e-01 1.30668500e-06]
 [2.73464312e-01 7.26535688e-01]]
Decision Tree
 [[9.99092559e-01 9.07441016e-04]
 [9.99092559e-01 9.07441016e-04]
 [0.00000000e+00 1.00000000e+00]]


In [19]:
pd.DataFrame({
    'kNN': knn.predict(newCustomer_norm),
    'Naive Bayes': nb.predict(newCustomer_nb),
    'Decision Tree': classTree.predict(newCustomer),
    'Ensemble majority': [0, 0, 1],
    'Ensemble average': [0, 0, 1],
    'bagging': bagging.predict(newCustomer),
    'boosting': boost.predict(newCustomer),
}, columns=['kNN', 'Naive Bayes', 'Decision Tree', 'bagging', 'boosting']).transpose()

Unnamed: 0,0,1,2
kNN,0,0,1
Naive Bayes,0,0,1
Decision Tree,0,0,1
bagging,0,0,1
boosting,0,0,1


#### All models predict the same outcome. The first two customers would not accept the loan, but the third customer would.