[dataset](https://www.kaggle.com/datasets/janiobachmann/bank-marketing-dataset?utm_source=chatgpt.com)

In [36]:
# Libraries
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [37]:
# Load Dataset
df = pd.read_csv('bank.csv')
print(df.info()
    ,df.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB
None    age         job  marital  education default  balance housing loan  contact  \
0   59      a

In [38]:
# Check null values
print(df.isnull().sum())

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64


In [39]:
# check values to standardization
for column in df.columns:
    print(f"{column}: {set(df[column])}")

age: {18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 92, 93, 95}
job: {'services', 'management', 'blue-collar', 'student', 'admin.', 'self-employed', 'technician', 'entrepreneur', 'housemaid', 'unemployed', 'retired', 'unknown'}
marital: {'single', 'married', 'divorced'}
education: {'secondary', 'tertiary', 'unknown', 'primary'}
default: {'no', 'yes'}
balance: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 8278, 94, 95, 96, 97, 98, 99,

In [40]:
# Imputation
num_cols = df.select_dtypes(include=['int64','float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

imputer_num = SimpleImputer(strategy='median')
df[num_cols] = imputer_num.fit_transform(df[num_cols])

imputer_cat = SimpleImputer(strategy='most_frequent')
df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])


In [41]:
# Standardization
scaler_std = StandardScaler()
df[num_cols] = scaler_std.fit_transform(df[num_cols])


In [42]:
# Codification
encoder = OneHotEncoder(sparse_output=False,drop='first')
encoded = encoder.fit_transform(df[cat_cols])
encoded_df = pd.DataFrame(encoded,columns=encoder.get_feature_names_out(cat_cols))


In [43]:
# Final Dataset
df_final = pd.concat([df[num_cols], encoded_df], axis=1)
df_final.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown,deposit_yes
0,1.491505,0.252525,-1.265746,1.930226,-0.554168,-0.481184,-0.36326,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1.239676,-0.459974,-1.265746,3.154612,-0.554168,-0.481184,-0.36326,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,-0.01947,-0.08016,-1.265746,2.929901,-0.554168,-0.481184,-0.36326,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,1.155733,0.293762,-1.265746,0.596366,-0.554168,-0.481184,-0.36326,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,1.07179,-0.416876,-1.265746,0.867171,-0.186785,-0.481184,-0.36326,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
