# A short project introduction
Machine learning algorithms on a dataset about bank term deposits

This is code from my bachelor's thesis, where we tested four different machine learning and prediction models to find the optimal one for predicting customers willing to sign up for a term deposit. This code has been cleaned and restructured later to better fit GitHub, where I mostly post the programming part of major assignments and not the assignment as a whole.

It should be prefaced that this was a group assignment. Although I have gone through and rewritten a lot of the original code, it was a collaborative work. Therefore, I would like to thank Henrik Krantz Knudsen, Jakob Lindstrøm, and Joakim Sælemyr for their contributions to this project.

<h1>1 Data management, cleaning and resampling

<h2>1.1 Cleaning

In [1]:
#Importing libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.colors as colors

import seaborn as sns
from seaborn import heatmap as hm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score as cvs

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.svm import SVC

from sklearn.decomposition import PCA

from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers import Dense

In [2]:
# Importing dataset
df=pd.read_csv("BankData.csv",sep=";")
df.shape

#Converting the binary value objects into int using the LabelEncoder
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

#Convert all strings into numbers
df["Def_Credit"]=le.fit_transform(df.default)
df["House_loan"]=le.fit_transform(df.housing)
df["Pers_loan"]=le.fit_transform(df.loan)
df["Deposit"]=le.fit_transform(df.y)

#Looking at the new dataframe
list=["default","housing","loan","y"]
df=df.drop(columns=list)
df=pd.get_dummies(df, drop_first=True)
df.shape

(45211, 43)

<h2> 1.2 Splitting and resampling

<h3> 1.2.1 Original train set

In [3]:
#Splitting into original train set and universal test size
orgtrain, test = train_test_split(df, test_size=.2, random_state=42)

<h3> 1.2.2 Undersample train set

In [4]:
#Retriving all 1s from original train set
ones = orgtrain.loc[orgtrain.Deposit == 1]

#Retriving random zeros from the original train set
alfa, randomzeros = train_test_split(orgtrain.loc[orgtrain.Deposit==0], test_size=.13255, random_state=42)

# Creating the undersampled training dataset
under = pd.concat([ones, randomzeros], axis=0)

<h3> 1.2.3 Oversample set

In [5]:
#Retrieving all 0s from original train set
allzeros = orgtrain.loc[orgtrain.Deposit == 0]

#Replicating all the ones from original train set seven times,
#and concats it into one df
newone = ones
for i in range(0,6,1):
    newone = pd.concat([newone,ones],axis=0)

#Retrieving random 1s from original train set 
#so the amount of 1s and 0s is matching.
charlie, additionalones = train_test_split(ones, test_size=0.54429, random_state=42)

#adding all the ones to create one large df with only 1s.
totalones = pd.concat([newone,additionalones], axis=0)

#Concating the 0s and 1s into one oversample df.
over = pd.concat([totalones, allzeros],axis=0)

<h3> 1.2.4 Creating csv-files out of our train- and test sets and reloading them

In [6]:
# Creating individual csv-files for each dataset. This made it easier to distribute the same data to the whole group.
under.to_csv('under.csv')
test.to_csv('test.csv')
over.to_csv('over.csv')
orgtrain.to_csv('org.csv')

In [7]:
#Downloading csv-files
test = pd.read_csv('test.csv')
orgtrain = pd.read_csv('org.csv')
over  = pd.read_csv('over.csv')
under = pd.read_csv('under.csv')
del test['Unnamed: 0']
del orgtrain['Unnamed: 0']
del over['Unnamed: 0']
del under['Unnamed: 0']

<h3> 1.2.5 Defining Variables and targets

In [8]:
#Defining columns in a list
columns = ['age', 'balance', 'day',
 'duration', 'campaign', 'pdays','previous', 'Def_Credit', 'House_loan',
 'Pers_loan', 'job_blue-collar', 'job_entrepreneur',
 'job_housemaid', 'job_management',
 'job_retired', 'job_self-employed', 'job_services',
 'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
 'marital_married', 'marital_single', 'education_secondary',
 'education_tertiary', 'education_unknown', 'contact_telephone',
 'contact_unknown','month_aug', 'month_dec','month_feb',
 'month_jan','month_jul','month_jun', 'month_mar',
 'month_may','month_nov','month_oct',
 'month_sep','poutcome_other','poutcome_success','poutcome_unknown']

In [9]:

# Defining test data
df_test = test

# Defining training data
df_org_train = orgtrain
df_under_train = under
df_over_train = over 

# Defining targets and varaibles for all datasets
org_train_target = df_org_train["Deposit"]
org_train_data = df_org_train[columns]

under_train_target = df_under_train["Deposit"]
under_train_data = df_under_train[columns]

over_train_target = df_over_train["Deposit"]
over_train_data = df_over_train[columns]

test_target = df_test[['Deposit']]
test_data = df_test[columns]

# Defining lists of datasets to use in functions
data = [org_train_data, under_train_data, over_train_data]
targets = [org_train_target, under_train_target, over_train_target]

<h1> 2 Logistic Regression

In [10]:
# Making a function looping through all logistic regression scores

def Logistic_Regression():
    model = LogisticRegression(solver='lbfgs',max_iter=95000)

    train_res = []
    test_res = []
    pcl_res = []
    
    for i in range(0,3):
        model = model.fit(data[i], targets[i])

        # Attaining scores and saving them
        score_train = model.score(data[i], targets[i])
        train_res.append(score_train)

        score_test = model.score(test_data, test_target)
        test_res.append(score_test)

        cm = confusion_matrix(model.predict(test_data), test_target)
        cm = cm/np.sum(cm)
        cm = cm[0][1]/(cm[0][1]+cm[0][0])
        pcl_res.append(cm)

    df = pd.DataFrame({"Train": train_res, "Test": test_res, "PCL": pcl_res}, index=["Un-sampled", "Undersampled", "Oversampled"])

    return df

In [11]:
LogReg_results = Logistic_Regression()
LogReg_results

Unnamed: 0,Train,Test,PCL
Un-sampled,0.903063,0.898927,0.083747
Undersampled,0.835586,0.839876,0.027737
Oversampled,0.832404,0.841756,0.027258


<h1> 3 Decision Tree Classsifier

In [12]:
def Decision_Tree_Classifier():
    model = dtc()

    train_res = []
    test_res = []
    pcl_res = []

    ideal_params = []

    for i in range(0,3):
        # Using pruning to itterate through alpha values
        model.fit(data[i], targets[i])
        path = model.cost_complexity_pruning_path(data[i], targets[i])
        ccp_alphas = path.ccp_alphas
        ccp_alphas = ccp_alphas[:-1]
        
        # Finding the optimal alpha value through itteration
        alpha_loop_values = []
        for alpha in ccp_alphas:
            model = dtc(ccp_alpha = alpha, random_state=42)
            scores = cvs(model, data[i], targets[i], cv=5)
            alpha_loop_values.append([alpha, np.mean(scores), np.std(scores)])
        
        alpha_res = pd.DataFrame(alpha_loop_values, columns=["Alpha", "Mean_Accuracy", "STD_Accuracy"])
        ideal_alpha = alpha_res[alpha_res.Mean_Accuracy == max(alpha_res.Mean_Accuracy)].Alpha

        # Incase there are several optimal alpha values
        if len(ideal_alpha) > 1:
            ideal_alpha = ideal_alpha.iloc[-1]
        ideal_alpha = float(ideal_alpha)

        # Fitting the ideal model
        model = dtc(ccp_alpha = ideal_alpha)
        model = model.fit(data[i], targets[i])

        # Attaining scores and saving them
        train_score = model.score(data[i], targets[i])
        train_res.append(train_score)

        test_score = model.score(test_data, test_target)
        test_res.append(test_score)

        cm = confusion_matrix(model.predict(test_data), test_target)
        cm = cm/np.sum(cm)
        cm = cm[0][1]/(cm[0][1]+cm[0][0])
        pcl_res.append(cm)

        ideal_params.append(ideal_alpha)

    df = pd.DataFrame({"Train": train_res, "Test": test_res, "PCL": pcl_res}, index=["Un-sampled", "Undersampled", "Oversampled"])

    return df, ideal_params

In [13]:
DTC_function = Decision_Tree_Classifier()
print("Ideal alpah parameters", DTC_function[1])
DTC_results = DTC_function[0]
DTC_results

Ideal alpah parameters [0.00027570515406692236, 0.00048409972122178107, 1.2450220732038378e-05]


Unnamed: 0,Train,Test,PCL
Un-sampled,0.905607,0.898706,0.072124
Undersampled,0.870673,0.809134,0.020782
Oversampled,0.99989,0.87482,0.077552


<h1> 4 Support Vector Machine


In [14]:
# Scaling variables
org_data_scaled = scale(org_train_data)
under_data_scaled = scale(under_train_data)
over_data_scaled = scale(over_train_data)
test_data_scaled = scale(test_data)

data_svm = [org_data_scaled, under_data_scaled, over_data_scaled]

# Using gridserch and low number of variables to avoid over-processing
grid_params = [{ "C":[1, 5, 10], "gamma":["scale",0.1, 0.01, 0.001], "kernel":["rbf"] }]

In [17]:
def Support_Vector_Machine():
    model = SVC(random_state=42)

    train_res = []
    test_res = []
    pcl_res = []

    ideal_params = []

    for i in range(0,3):
        # Finding optimal gamma and C parameters
        optimal_params = GridSearchCV(SVC(), grid_params, refit=True, cv=3, scoring="accuracy", verbose=0)
        optimal_params = optimal_params.fit(data_svm[i], targets[i])

        optimal_C = optimal_params.best_params_["C"]
        optimal_gamma = optimal_params.best_params_["gamma"]

        # Fitting the ideal model
        model =SVC(random_state=42, C=optimal_C, gamma=optimal_gamma)
        model.fit(data_svm[i], targets[i])

        # Attaining scores and saving them
        train_score = model.score(data_svm[i], targets[i])
        train_res.append(train_score)

        test_score = model.score(test_data_scaled, test_target)
        test_res.append(test_score)

        cm = confusion_matrix(model.predict(test_data_scaled), test_target)
        cm = cm/np.sum(cm)
        cm = cm[0][1]/(cm[0][1]+cm[0][0])
        pcl_res.append(cm)

    ideal_params.append([optimal_C, optimal_gamma])

    df = pd.DataFrame({"Train": train_res, "Test": test_res, "PCL": pcl_res}, index=["Un-sampled", "Undersampled", "Oversampled"])

    return df, ideal_params 

In [18]:
SVM_function = Support_Vector_Machine()
print("Ideal C and gamma parameters", SVM_function[1])
SVM_results = SVM_function[0]
SVM_results

Ideal C and gamma parameters [[10, 0.1]]


Unnamed: 0,Train,Test,PCL
Un-sampled,0.921063,0.902687,0.082132
Undersampled,0.888454,0.69634,0.009238
Oversampled,0.982024,0.824616,0.078634


<h1> 5 Artificial Neural Network

In [19]:
# Scaling the numeric variables
# This changes the original datasets
num_vars = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

for i in data:
  scaler = MinMaxScaler()
  i[num_vars] = scaler.fit_transform(i[num_vars])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i[num_vars] = scaler.fit_transform(i[num_vars])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i[num_vars] = scaler.fit_transform(i[num_vars])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i[num_vars] = scaler.fit_transform(i[num_vars])


In [20]:
def Artificial_Neural_Network():

    train_res = []
    test_res = []
    pcl_res = []


    for i in range(0,3):
        model = Sequential()

         # Build the neural network
        model.add(Dense(32, input_dim = data[i].shape[1], activation="leaky_relu"))
        model.add(Dense(16, activation="leaky_relu"))
        model.add(Dense(1, activation="sigmoid"))

        # Compile the model
        model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

        # Train the model
        model.fit(x=data[i], y=targets[i], epochs=20, batch_size=10, verbose=1, validation_split=.1)

        # Predictions 
        train_predictions = model.predict(x=data[i], batch_size=10, verbose=0)
        test_predictions = model.predict(x=test_data, batch_size=10, verbose=0)

        # Binary predictions
        train_pred = [np.round(num) for num in train_predictions]
        test_pred = [np.round(num) for num in test_predictions]


        cm = confusion_matrix(train_pred, targets[i])
        cm = cm/np.sum(cm)
        train_res.append(cm[0][0]+cm[1][1])

        cm = confusion_matrix(test_pred, test_target)
        cm = cm/np.sum(cm)
        test_res.append(cm[0][0]+cm[1][1])

        cm = cm[0][1]/(cm[0][1]+cm[0][0])
        pcl_res.append(cm)

    df = pd.DataFrame({"Train": train_res, "Test": test_res, "PCL": pcl_res}, index=["Un-sampled", "Undersampled", "Oversampled"])

    return df  

In [21]:
ANN_results = Artificial_Neural_Network()
ANN_results

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Unnamed: 0,Train,Test,PCL
Un-sampled,0.915865,0.301006,0.08072
Undersampled,0.877193,0.125069,0.0
Oversampled,0.891595,0.120978,0.0
