In [2]:
import pandas as pd
import numpy as np
from scipy.stats import trim_mean, kurtosis, skew, boxcox
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest, RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM, SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

import warnings
from time import perf_counter
import joblib

In [6]:
# Load the bank-marketing records from the CSV stored relative to this notebook.
csv_path = "../../Data/Bank_Marketing.csv"
data = pd.read_csv(csv_path)

print("Few rows of the dataset")
# Bare last expression -> rich display of the frame (pandas truncates long frames).
data

Few rows of the dataset


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45202,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45203,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45204,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45205,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [10]:
# Unpack the (rows, columns) tuple once, then report both dimensions.
n_rows, n_cols = data.shape
print(f'The shape of Dataset:\nNumber of rows: {n_rows}\nNumber of columns: {n_cols}')

The shape of Dataset:
Number of rows: 45207
Number of columns: 17


In [11]:
# Print a concise schema summary: each column's name, non-null count and dtype,
# plus the frame's index range and approximate memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45207 entries, 0 to 45206
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45207 non-null  int64 
 1   job        45207 non-null  object
 2   marital    45207 non-null  object
 3   education  45207 non-null  object
 4   default    45207 non-null  object
 5   balance    45207 non-null  int64 
 6   housing    45207 non-null  object
 7   loan       45207 non-null  object
 8   contact    45207 non-null  object
 9   day        45207 non-null  int64 
 10  month      45207 non-null  object
 11  duration   45207 non-null  int64 
 12  campaign   45207 non-null  int64 
 13  pdays      45207 non-null  int64 
 14  previous   45207 non-null  int64 
 15  poutcome   45207 non-null  object
 16  y          45207 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [13]:
# Count the missing (NaN/None) entries per column, then report them.
null_counts = data.isnull().sum()
print(f'Null values in each column:\n\n{null_counts}')

Null values in each column:

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


<p style="color:black; font-family:Times New Roman">
The data.isnull() method indicates that this dataset has no <b>missing values</b>.
However, it is important to investigate the placeholder value <b>unknown</b> that appears in several categorical features.</p>

In [14]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary_stats = data.describe()

print("\nSummary statistics for numerical features:")
summary_stats


Summary statistics for numerical features:


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45207.0,45207.0,45207.0,45207.0,45207.0,45207.0,45207.0
mean,40.936315,1362.246798,15.806357,258.178202,2.763576,40.195656,0.574292
std,10.61894,3044.825898,8.322256,257.530264,3.097614,100.127179,1.907985
min,18.0,-8019.0,1.0,1.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,58.0


<span style = "color:black">Insights from Dataset Description
The dataset description provides valuable insights into the distribution and statistics of the numerical features:  
1. __Age:__
   * The mean age of clients is approximately 41 years.
   * The age of clients ranges from 18 to 95 years.
   * The middle 50% of clients (25th to 75th percentile) are between 33 and 48 years old.

2. __Balance:__
   * The mean account balance is approximately 1362 euros.
   * The account balance varies widely, ranging from -8019 to 102127 euros.
   * The distribution of account balances is right-skewed (the mean of 1362 is well above the median of 448), with a large standard deviation of approximately 3044.83 euros.

3. __Day:__  
   * The mean last contact day of the month is approximately 16.

4. __Duration:__
   * The mean duration of the last contact is approximately 258 seconds.
   * The duration of the last contact ranges from 1 to 4918 seconds.
   * The distribution of contact duration is right-skewed, with a large standard deviation of approximately 257.5 seconds.

5. __Campaign:__
   * The mean number of contacts performed during the current campaign is approximately 3.
   * The number of campaign contacts ranges from 1 to 63.

6. __Pdays (Number of days since the client was last contacted from a previous campaign):__  
   * The mean number of days since the client was last contacted is approximately 40 days.
   * A negative value (-1) indicates that the client was not previously contacted.

7. __Previous (Number of contacts performed before this campaign for the client):__
   * The number of previous contacts ranges from 0 to 58.
   * The majority of clients were not contacted before this campaign, as indicated by the 75th percentile being 0.
</span>

In [18]:
# Select every numeric column. Using 'number' rather than a hard-coded 'int64'
# keeps float (or other-width integer) columns from being silently skipped;
# for this dataset the selection is the same seven integer columns.
num_col = data.select_dtypes(include='number').columns

# Compare the plain mean with a trimmed mean for each numeric column.
# trim_mean(..., proportiontocut=0.1) drops the lowest 10% and the highest 10%
# of the values before averaging, so a large gap between the two figures
# points at heavy tails / outliers in that column.
numeric_data = data[num_col]
mean_vs_trimmed_mean = pd.DataFrame({
    'mean': numeric_data.mean(),
    'trimmed_mean': numeric_data.apply(
        lambda col: trim_mean(col, proportiontocut=0.1)
    ),
})

# Show the comparison table as the cell's output.
mean_vs_trimmed_mean


Unnamed: 0,mean,trimmed_mean
age,40.936315,40.251998
balance,1362.246798,767.225482
day,15.806357,15.687201
duration,258.178202,210.886775
campaign,2.763576,2.119805
pdays,40.195656,11.916913
previous,0.574292,0.129953
