# <span style="color:navy; font-size:40px;">Assignment 2</span>
# Bank Marketing Dataset 

In [92]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Tools for creating iterators for efficient looping
import itertools

# Numpy for numerical operations
import numpy as np
from numpy import mean, std

# Pandas for data manipulation
import pandas as pd

# Scipy for statistical tests
import scipy.stats as stats
from scipy.stats import shapiro, normaltest

# Matplotlib for plotting
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Seaborn for enhanced data visualisation
import seaborn as sns

# IPython display utilities
from IPython.display import Markdown, display

# Category Encoders for encoding categorical variables
import category_encoders as ce

# Scikit-learn for machine learning tasks
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold  # Data splitting and cross-validation
# Model evaluation metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, auc, 
    roc_curve, confusion_matrix, ConfusionMatrixDisplay, 
    classification_report, roc_auc_score
)
from sklearn.preprocessing import StandardScaler, LabelEncoder  # Data preprocessing
from sklearn.neighbors import KNeighborsClassifier  # K-Nearest Neighbors algorithm
from sklearn.tree import export_text  # Export decision tree in text format
from sklearn.neural_network import MLPClassifier  # Multi-layer Perceptron classifier

# Mlxtend for additional plotting utilities
from mlxtend.plotting import plot_decision_regions

# Part A (Predicting Bank Marketing Campaign Outcomes)

## Task 1: Exploratory Data Analysis (EDA)

In [93]:
# Loading Marketing Bank Dataset
mbd = pd.read_csv("bank.csv")

In [94]:
display(Markdown("### Dataset Display and Basic Information"))

display(Markdown("**---First Few Rows---**"))
print(mbd.head())

### Dataset Display and Basic Information

**---First Few Rows---**

  age;"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"
0  30;"unemployed";"married";"primary";"no";1787;...                                                                                                  
1  33;"services";"married";"secondary";"no";4789;...                                                                                                  
2  35;"management";"single";"tertiary";"no";1350;...                                                                                                  
3  30;"management";"married";"tertiary";"no";1476...                                                                                                  
4  59;"blue-collar";"married";"secondary";"no";0;...                                                                                                  


In [99]:
# Reloading the dataset with the correct delimiter and quote character
mbd = pd.read_csv("bank.csv", delimiter=';', quotechar='"')

# Giving appropriate name to target class
mbd.rename(columns={'y': 'deposit permission'}, inplace=True)

display(Markdown("**---Dataset Basic Information---**"))
print(mbd.info())

**---Dataset Basic Information---**

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   age                 4521 non-null   int64 
 1   job                 4521 non-null   object
 2   marital             4521 non-null   object
 3   education           4521 non-null   object
 4   default             4521 non-null   object
 5   balance             4521 non-null   int64 
 6   housing             4521 non-null   object
 7   loan                4521 non-null   object
 8   contact             4521 non-null   object
 9   day                 4521 non-null   int64 
 10  month               4521 non-null   object
 11  duration            4521 non-null   int64 
 12  campaign            4521 non-null   int64 
 13  pdays               4521 non-null   int64 
 14  previous            4521 non-null   int64 
 15  poutcome            4521 non-null   object
 16  deposit permission  4521

In [100]:
display(Markdown("**---Dataset Null Check---**"))
missing_values = mbd.isnull().sum()
print(missing_values)

display(Markdown("**---Dataset Duplicates Check---**"))
duplicates = mbd.duplicated().sum()
duplicates

**---Dataset Null Check---**

age                   0
job                   0
marital               0
education             0
default               0
balance               0
housing               0
loan                  0
contact               0
day                   0
month                 0
duration              0
campaign              0
pdays                 0
previous              0
poutcome              0
deposit permission    0
dtype: int64


**---Dataset Duplicates Check---**

0

In [101]:
display(Markdown("### Dataset Summary Statistics for Numerical"))
mbd.describe()

### Dataset Summary Statistics for Numerical

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


In [103]:
display(Markdown("### Dataset Summary for Categorical"))
mbd.describe(include=['object', 'category'])

### Dataset Summary for Categorical

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,deposit permission
count,4521,4521,4521,4521,4521,4521,4521,4521,4521,4521
unique,12,3,4,2,2,2,3,12,4,2
top,management,married,secondary,no,yes,no,cellular,may,unknown,no
freq,969,2797,2306,4445,2559,3830,2896,1398,3705,4000


In [109]:
numerical_data = mbd.select_dtypes(include=['int64', 'float64']).columns

z_scores = stats.zscore(mbd[numerical_data].dropna())

outlier_threshold = 3
outliers_z_scores = (z_scores > outlier_threshold) | (z_scores < -outlier_threshold)

display(Markdown("### Detecting Outliers using Z Score Method"))
display(Markdown("#### Numerical Data Only:"))
print(outliers_z_scores.sum())

### Detecting Outliers using Z Score Method

#### Numerical Data Only:

age          44
balance      88
day           0
duration     88
campaign     87
pdays       171
previous     99
dtype: int64
