In [1]:
# Supress unnecessary warnings so that presentation looks clean
import warnings
warnings.filterwarnings('ignore')

# Import needed packages needed for EDA
import pandas as pd
import numpy as np

#Print all rows and columns. Dont hide any
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

bankingcalldata = pd.read_csv('data/bank-additional-full.csv', sep=';')

print('Full dataset shape: ')
print(bankingcalldata.shape)

Full dataset shape: 
(41188, 21)


In [2]:
bankingcalldata.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
from datetime import datetime
from sklearn.metrics import mean_absolute_error, accuracy_score, average_precision_score, recall_score, confusion_matrix
from scipy.stats import skew, boxcox
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

import itertools

if bankingcalldata.isnull().values.any() == True:
    print('There are missing values in the dataset.')
else:
    print('There are no missing values in the dataset.')

columns = list(bankingcalldata.columns)
    
for column in columns:
    if bankingcalldata[column].isnull().values.any() == True:
        print('There are missing values in the column ' + column)

There are no missing values in the dataset.


In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

ONE_HOT = False

#Variable to hold the list of variables for an attribute in the train and test data
labels = []
to_be_encoded = ['job','marital','education','default','housing','loan','contact','month','day_of_week','previous',
                 'poutcome']

print('Encoding categorical columns..')

for i in bankingcalldata.columns.values:
    if bankingcalldata[i].dtype == object:
        lbl = preprocessing.LabelEncoder()
        bankingcalldata[i] = lbl.fit_transform(bankingcalldata[i])

for i in range(len(to_be_encoded)):
    labels.append(list(bankingcalldata[to_be_encoded[i]].unique()))
    
#One hot encode all categorical attributes
cats = []
encoded_data = bankingcalldata.drop('y', axis=1)
        
#One hot encode
if ONE_HOT:
    for i in range(len(to_be_encoded)):
        feature = encoded_data[to_be_encoded[i]]
        feature = feature.reshape(encoded_data.shape[0], 1)
        onehot_encoder = OneHotEncoder(sparse=False,n_values=len(labels[i]))
        feature = onehot_encoder.fit_transform(feature)
        cats.append(feature)

    # Make a 2D array from a list of 1D arrays
    encoded_cats = np.column_stack(cats)

    # Print the shape of the encoded data
    print(encoded_cats.shape)

    #Concatenate encoded attributes with continuous attributes
    bankingcalldata_encoded = encoded_data.drop(to_be_encoded, axis=1)
    bankingcalldata_encoded = np.concatenate((encoded_cats,bankingcalldata_encoded),axis=1)
    
    bankingcalldata_encoded = pd.DataFrame(bankingcalldata_encoded)
    
    print('Checking datatypes..')
    tmp = 0
    for i in bankingcalldata_encoded.columns.values:
        if bankingcalldata_encoded[i].dtype == object:
            tmp = tmp + 1
    if tmp == 0:
        print('All columns are encoded.')
    else:
        print('Not all columns are encoded')
    
else:
    print('Checking datatypes..')
    tmp = 0
    for i in bankingcalldata.columns.values:
        if bankingcalldata[i].dtype == object:
            tmp = tmp + 1
    if tmp == 0:
        print('All columns are encoded.')
    else:
        print('Not all columns are encoded')

print('Finished.')

Encoding categorical columns..
Checking datatypes..
All columns are encoded.
Finished.


In [5]:
if ONE_HOT:
    X_full = bankingcalldata_encoded
else:
    X_full = bankingcalldata.drop('y', axis=1) 
    
y_full = bankingcalldata['y']
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.40, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

print(y_full.value_counts())


(24712, 20)
(16476, 20)
(24712,)
(16476,)
0    36548
1     4640
Name: y, dtype: int64


In [6]:
def accuracy(array):
    TN = array[0,0]
    TP = array[1,1]
    FN = array[1,0]
    FP = array[0,1]
    
    return((TP + TN) / (TN + TP + FN + FP))
def precision(array):
    TN = array[0,0]
    TP = array[1,1]
    FN = array[1,0]
    FP = array[0,1]
    
    return(TP / (TP + FP))
def recall(array):
    TN = array[0,0]
    TP = array[1,1]
    FN = array[1,0]
    FP = array[0,1]
    
    return(TP / (TP + FN))

In [8]:
#Import the library
from sklearn.linear_model import RidgeClassifier

def ridge_model(alpha, X, y):
    
    ridge_model = RidgeClassifier(alpha, fit_intercept= True, normalize=True)
    ridge_model.fit(X, y)
    return ridge_model

n_folds = 10
alpha = 0.1
i = 0

skf = StratifiedKFold(n_splits=n_folds, random_state=123, shuffle=True)

for (train_index, test_index) in skf.split(X_train, y_train):
    # cross-validation randomly splits train data into train and validation data
    print('\n Fold %d' % (i + 1))
    
    X_train_cv, X_val_cv = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[test_index]
 
    # declare your model
    model = ridge_model(alpha, X_train_cv, y_train_cv)
    
    # predict train and validation set accuracy and get eval metrics
    scores_cv = model.predict(X_train_cv)
    scores_val = model.predict(X_val_cv)

    train_confusion_matrix = confusion_matrix(y_train_cv, np.around(scores_cv).astype(int))
    val_confusion_matrix = confusion_matrix(y_val_cv, np.around(scores_val).astype(int))

    train_pc = accuracy(train_confusion_matrix)
    train_pp = precision(train_confusion_matrix)
    train_re = recall(train_confusion_matrix)
    print('\n train-Accuracy: %.6f' % train_pc)
    print(' train-Precision: %.6f' % train_pp)
    print(' train-Recall: %.6f' % train_re)
    
    eval_pc = accuracy(val_confusion_matrix)
    eval_pp = precision(val_confusion_matrix)
    eval_re = recall(val_confusion_matrix)
    print('\n eval-Accuracy: %.6f' % eval_pc)
    print(' eval-Precision: %.6f' % eval_pp)
    print(' eval-Recall: %.6f' % eval_re)

    i = i+1


 Fold 1

 train-Accuracy: 0.904991
 train-Precision: 0.703399
 train-Recall: 0.272329

 eval-Accuracy: 0.906553
 eval-Precision: 0.700000
 eval-Recall: 0.301075

 Fold 2

 train-Accuracy: 0.905351
 train-Precision: 0.702513
 train-Recall: 0.278708

 eval-Accuracy: 0.904935
 eval-Precision: 0.720000
 eval-Recall: 0.258065

 Fold 3

 train-Accuracy: 0.905081
 train-Precision: 0.699899
 train-Recall: 0.277113

 eval-Accuracy: 0.903317
 eval-Precision: 0.717391
 eval-Recall: 0.236559

 Fold 4

 train-Accuracy: 0.905306
 train-Precision: 0.700599
 train-Recall: 0.279904

 eval-Accuracy: 0.902508
 eval-Precision: 0.686275
 eval-Recall: 0.250896

 Fold 5

 train-Accuracy: 0.904182
 train-Precision: 0.693333
 train-Recall: 0.269537

 eval-Accuracy: 0.911812
 eval-Precision: 0.774775
 eval-Recall: 0.308244

 Fold 6

 train-Accuracy: 0.904905
 train-Precision: 0.699898
 train-Recall: 0.274322

 eval-Accuracy: 0.904087
 eval-Precision: 0.698113
 eval-Recall: 0.265233

 Fold 7

 train-Accuracy: 0

In [12]:
# divide predictions and CV-sum by number of folds to get mean of all folds
final_train_pred_ridge = model.predict(X_train)
final_test_pred_ridge = model.predict(X_test)
final_full_pred_ridge = model.predict(X_full)

train_confusion_matrix_ridge = confusion_matrix(y_train, np.around(final_train_pred_ridge).astype(int))
test_confusion_matrix_ridge = confusion_matrix(y_test, np.around(final_test_pred_ridge).astype(int))
full_confusion_matrix_ridge = confusion_matrix(y_full, np.around(final_full_pred_ridge).astype(int))

final_train_accuracy_ridge = accuracy(train_confusion_matrix_ridge)
final_train_precision_ridge = precision(train_confusion_matrix_ridge)
final_train_recall_ridge = recall(train_confusion_matrix_ridge)

final_test_accuracy_ridge = accuracy(test_confusion_matrix_ridge)
final_test_precision_ridge = precision(test_confusion_matrix_ridge)
final_test_recall_ridge = recall(test_confusion_matrix_ridge)

final_full_accuracy_ridge = accuracy(full_confusion_matrix_ridge)
final_full_precision_ridge = precision(full_confusion_matrix_ridge)
final_full_recall_ridge = recall(full_confusion_matrix_ridge)

print('\n Ridge full-Accuracy: %.6f' % final_full_accuracy_ridge)
print(' Ridge full-Precision: %.6f' % final_full_precision_ridge)
print(' Ridge full-Recall: %.6f' % final_full_recall_ridge)


 Ridge full-Accuracy: 0.905118
 Ridge full-Precision: 0.696141
 Ridge full-Recall: 0.279957


In [25]:
import xgboost as xgb


def xgb_model(d_train, d_valid):
    
    params = {
        'learning_rate': 0.01,
        'n_estimators': 1000,
        'max_depth': 5,
        'min_child_weight': 1,
        'gamma': 0,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'binary:logistic',
        'nthread': -1,
        'scale_pos_weight': 1,
        'seed': 27,
        'eval_metric': 'logloss',
        #'num_class': 1,
        'silent': 1
    }

    watchlist = [(d_train, 'train'), (d_valid, 'eval')]

    xgb_model = xgb.train(params,
                          d_train,
                          10000, # number of rounds
                          watchlist,
                          early_stopping_rounds = 50,
                          verbose_eval=50
                          )
    return xgb_model

n_folds = 10
i = 0

skf = StratifiedKFold(n_splits=n_folds, random_state=123, shuffle=True)

for (train_index, test_index) in skf.split(X_train, y_train):
    # cross-validation randomly splits train data into train and validation data
    print('\n Fold %d' % (i + 1))

    X_train_cv, X_val_cv = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[test_index]

    d_train = xgb.DMatrix(X_train_cv, label=y_train_cv)
    d_valid = xgb.DMatrix(X_val_cv, label=y_val_cv)
    d_train_full = xgb.DMatrix(X_train)
    d_test = xgb.DMatrix(X_test)
    d_full = xgb.DMatrix(X_full)

    # declare your model
    model = xgb_model(d_train, d_valid)
    
    # predict the train, test and full data and add it to the other predictions
    train_pred = model.predict(d_train_full, ntree_limit=model.best_ntree_limit)  
    test_pred = model.predict(d_test, ntree_limit=model.best_ntree_limit)
    full_pred = model.predict(d_full, ntree_limit=model.best_ntree_limit)
    
    
    
    i = i+1


 Fold 1
[0]	train-logloss:0.685507	eval-logloss:0.685548
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.439211	eval-logloss:0.438961
[100]	train-logloss:0.321104	eval-logloss:0.321167
[150]	train-logloss:0.256851	eval-logloss:0.257696
[200]	train-logloss:0.21937	eval-logloss:0.221249
[250]	train-logloss:0.198041	eval-logloss:0.201109
[300]	train-logloss:0.183968	eval-logloss:0.188478
[350]	train-logloss:0.175379	eval-logloss:0.181201
[400]	train-logloss:0.16951	eval-logloss:0.176779
[450]	train-logloss:0.165116	eval-logloss:0.173973
[500]	train-logloss:0.161871	eval-logloss:0.171966
[550]	train-logloss:0.159499	eval-logloss:0.170799
[600]	train-logloss:0.157419	eval-logloss:0.170042
[650]	train-logloss:0.155745	eval-logloss:0.169549
[700]	train-logloss:0.154111	eval-logloss:0.169042
[750]	train-logloss:0.152798	eval-logloss:0.168672
[800]	train-logloss:0.151366	ev

[1650]	train-logloss:0.13305	eval-logloss:0.162165
[1700]	train-logloss:0.132109	eval-logloss:0.162161
Stopping. Best iteration:
[1690]	train-logloss:0.132293	eval-logloss:0.162087


 Fold 6
[0]	train-logloss:0.685533	eval-logloss:0.685529
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.43751	eval-logloss:0.437848
[100]	train-logloss:0.319623	eval-logloss:0.320283
[150]	train-logloss:0.255765	eval-logloss:0.257142
[200]	train-logloss:0.219407	eval-logloss:0.222014
[250]	train-logloss:0.197423	eval-logloss:0.201443
[300]	train-logloss:0.18386	eval-logloss:0.189209
[350]	train-logloss:0.175179	eval-logloss:0.181533
[400]	train-logloss:0.169282	eval-logloss:0.176847
[450]	train-logloss:0.165258	eval-logloss:0.173821
[500]	train-logloss:0.16231	eval-logloss:0.171956
[550]	train-logloss:0.160038	eval-logloss:0.170644
[600]	train-logloss:0.15825	eval-logloss:0.169846
[650

In [27]:
# divide predictions and CV-sum by number of folds to get mean of all folds
final_train_pred_xgb = model.predict(d_train_full, ntree_limit=model.best_ntree_limit) 
final_test_pred_xgb = model.predict(d_test, ntree_limit=model.best_ntree_limit)
final_full_pred_xgb = model.predict(d_full, ntree_limit=model.best_ntree_limit)

train_confusion_matrix_xgb = confusion_matrix(y_train, np.around(final_train_pred_xgb).astype(int))
test_confusion_matrix_xgb = confusion_matrix(y_test, np.around(final_test_pred_xgb).astype(int))
full_confusion_matrix_xgb = confusion_matrix(y_full, np.around(final_full_pred_xgb).astype(int))

final_train_accuracy_xgb = accuracy(train_confusion_matrix_xgb)
final_train_precision_xgb = precision(train_confusion_matrix_xgb)
final_train_recall_xgb = recall(train_confusion_matrix_xgb)

final_test_accuracy_xgb = accuracy(test_confusion_matrix_xgb)
final_test_precision_xgb = precision(test_confusion_matrix_xgb)
final_test_recall_xgb = recall(test_confusion_matrix_xgb)

final_full_accuracy_xgb = accuracy(full_confusion_matrix_xgb)
final_full_precision_xgb = precision(full_confusion_matrix_xgb)
final_full_recall_xgb = recall(full_confusion_matrix_xgb)

print('\n Average XGB accuracy: %.6f' % final_full_accuracy_xgb)
print('\n Average XGB precision: %.6f' % final_full_precision_xgb)
print('\n Average XGB recall: %.6f' % final_full_recall_xgb)


 Average XGB accuracy: 0.927746

 Average XGB precision: 0.720456

 Average XGB recall: 0.585991
