In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import np_utils
import numpy as np
from numpy import dstack

# Data transformation
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler

# Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif

# Classifiers
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasClassifier

# Suppress all Python warnings from this point on.
# NOTE(review): this also hides deprecation notices raised by the libraries
# imported above, so version-related problems will not be visible.
import warnings
warnings.filterwarnings('ignore')

## Dataset Preparation

In [2]:
# Load the CSV; the file is semicolon-delimited.
df = pd.read_csv('Bank-data/bank-additional-full.csv', sep=';')

# Explore the dataset.
# Only the last expression of a cell is rendered automatically, so the
# intermediate previews go through display(); as bare expressions their
# results were silently discarded.
display(df.head())
display(df.tail())
df.info()  # prints its report directly to stdout
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


 Here's a summary of the information we've gathered about the dataset:

**Structure:**

- The dataset has 41,188 entries (rows) and 21 columns (features).
- The data types include integers `int64` for 5 columns, floating-point numbers `float64` for 5 columns, and text data `object` for 11 columns.
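- There are no null values in any of the columns (every column reports 41,188 non-null entries), which simplifies data cleaning. Note, however, that `pdays` equals 999 from the 25th percentile up to the maximum, which looks like a placeholder value rather than a real measurement.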

**Column Data Types:**

- Numerical Features:
   - `age` (int64)
   - `duration` (int64)
   - `campaign` (int64)
   - `pdays` (int64)
   - `previous` (int64)
   - `emp.var.rate` (float64)
   - `cons.price.idx` (float64)
   - `cons.conf.idx` (float64)
   - `euribor3m` (float64)
   - `nr.employed` (float64)

- Categorical Features:
   - `job` (object)
   - `marital` (object)
   - `education` (object)
   - `default` (object)
   - `housing` (object)
   - `loan` (object)
   - `contact` (object)
   - `month` (object)
   - `day_of_week` (object)
   - `poutcome` (object)
   - `y` (object, the target variable — it is the last column of the dataset)


In [3]:
# Transform text columns to pandas' categorical dtype.
# Both 'object' and 'category' are selected so that re-running this cell after
# the conversion still yields the same column list.
col_cat = list(df.select_dtypes(include=['object', 'category']).columns)

# Assign through df[...] rather than df.loc[:, ...]: the .loc form writes the
# values into the existing object-dtype columns, so the category dtype is lost
# and the conversion has no effect.
df[col_cat] = df[col_cat].astype('category')

In [4]:
# Split the frame by position: every column except the last one is an input
# feature (X); the last column is the target variable (y).
target_pos = df.shape[1] - 1
X = df.iloc[:, :target_pos]
y = df.iloc[:, target_pos]

In [5]:
# Show the column names currently held in col_cat.
print(col_cat)

['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'y']


In [6]:
# Encoding

# Remove the target variable from the encoding process by keeping only the
# categorical columns that exist in the feature frame X. Filtering (instead of
# col_cat.pop()) does not rely on the target being the last list entry and
# leaves col_cat unchanged when the cell is run again.
col_cat = [name for name in col_cat if name in X.columns]

# Encode categorical features as ordinal codes
encoder = OrdinalEncoder()
X_cat_enc = pd.DataFrame(encoder.fit_transform(X[col_cat]), columns=col_cat)

In [7]:
# Normalization

# Numeric feature names: every int64/float64 column of the source frame
col_num = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Rescale each numeric feature into the [0, 1] interval
scaler = MinMaxScaler(feature_range=(0, 1))
X_num_enc = pd.DataFrame(scaler.fit_transform(X[col_num]), columns=col_num)

# Place the encoded categorical block and the scaled numeric block side by side
x_enc = pd.concat([X_cat_enc, X_num_enc], axis=1)

In [8]:
# Encoding target variable
le = LabelEncoder()

# Carry the original target name over to the encoded Series. A Series exposes
# its label as `.name`; it has no `.columns`, so assigning to `y_enc.columns`
# only created an unused attribute and left the Series unnamed.
y_enc = pd.Series(le.fit_transform(y), name=y.name)

In [9]:
# Remove correlated fields

# Features to discard.
# NOTE(review): the correlation analysis motivating this choice is not shown
# in this part of the notebook.
cols_to_drop = ['emp.var.rate', 'nr.employed']

# Keep every other column. Filtering (rather than list.remove) makes this cell
# safe to re-run: once the columns are gone it is a no-op instead of raising
# ValueError.
col = [name for name in x_enc.columns if name not in cols_to_drop]

# Filter DataFrame
x_enc = x_enc[col]

In [11]:
# Feature selection: score every feature against the target with chi-square
top_k = 10
bestfeatures = SelectKBest(score_func=chi2, k=top_k)
fit = bestfeatures.fit(x_enc, y_enc)

# Feature Scores Analysis: one row per feature holding its name and its score
dfcolumns = pd.DataFrame(x_enc.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # Naming the dataframe columns

# Narrow x_enc to the top_k highest-scoring features, in descending score order
top_features = featureScores.nlargest(top_k, 'Score')['Specs'].tolist()
x_enc = x_enc[top_features]

# Print results with floats rendered to two decimals
pd.options.display.float_format = '{:.2f}'.format
print(x_enc)
print(y_enc)

       euribor3m  contact  previous  duration  default  education  pdays  \
0           0.96     1.00      0.00      0.05     0.00       0.00   1.00   
1           0.96     1.00      0.00      0.03     1.00       3.00   1.00   
2           0.96     1.00      0.00      0.05     0.00       3.00   1.00   
3           0.96     1.00      0.00      0.03     0.00       1.00   1.00   
4           0.96     1.00      0.00      0.06     0.00       3.00   1.00   
...          ...      ...       ...       ...      ...        ...    ...   
41183       0.09     0.00      0.00      0.07     0.00       5.00   1.00   
41184       0.09     0.00      0.00      0.08     0.00       5.00   1.00   
41185       0.09     0.00      0.00      0.04     0.00       6.00   1.00   
41186       0.09     0.00      0.00      0.09     0.00       5.00   1.00   
41187       0.09     0.00      0.14      0.05     0.00       5.00   1.00   

       poutcome  job  cons.price.idx  
0          1.00 3.00            0.70  
1        

## Classification models