In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

In [2]:
# Read the raw bank dataset from disk and preview the first rows.
df = pd.read_csv(filepath_or_buffer='./Dataset/bank.csv')
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [3]:
# Encode the yes/no columns (including the 'deposit' target) as 1/0.
# Series.map turns any value that is not a key of the mapping into NaN.
binary_mapping = {'yes': 1, 'no': 0}
for binary_col in ('default', 'housing', 'loan', 'deposit'):
    df[binary_col] = df[binary_col].map(binary_mapping)

# Translate the three-letter month abbreviation into its calendar number (jan=1 ... dec=12).
month_names = [
    'jan', 'feb', 'mar', 'apr',
    'may', 'jun', 'jul', 'aug',
    'sep', 'oct', 'nov', 'dec',
]
month_mapping = {name: number for number, name in enumerate(month_names, start=1)}
df['month_num'] = df['month'].map(month_mapping)

In [4]:
# Preview the frame after the binary and month encodings.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit,month_num
0,59,admin.,married,secondary,0,2343,1,0,unknown,5,may,1042,1,-1,0,unknown,1,5
1,56,admin.,married,secondary,0,45,0,0,unknown,5,may,1467,1,-1,0,unknown,1,5
2,41,technician,married,secondary,0,1270,1,0,unknown,5,may,1389,1,-1,0,unknown,1,5
3,55,services,married,secondary,0,2476,1,0,unknown,5,may,579,1,-1,0,unknown,1,5
4,54,admin.,married,tertiary,0,184,0,0,unknown,5,may,673,2,-1,0,unknown,1,5


In [5]:
# Bucket account balances into labelled ranges stored in df['balance_bin'].
#
# - pd.cut returns NaN for values outside the outermost edges, so the lowest
#   edge is -inf: negative balances get their own '<0' bucket instead of NaN.
# - right=False makes every interval left-closed, e.g. [0, 500), so the labels
#   name the values each bucket really contains (500 belongs to '500-999').
#   The labels assume whole-number balances -- TODO confirm.
# - The edge/label lists are named so they no longer shadow the builtin `bin`.
balance_bin_edges = [-np.inf, 0, 500, 1000, 2000, 5000, 10000, np.inf]
balance_bin_labels = ['<0', '0-499', '500-999', '1000-1999', '2000-4999', '5000-9999', '10000+']

df['balance_bin'] = pd.cut(
    df['balance'],
    bins=balance_bin_edges,
    labels=balance_bin_labels,
    right=False,
)

In [6]:
# Remove the 'duration' and 'day' columns from the working frame.
df = df.drop(['duration', 'day'], axis=1)

In [7]:
# One-hot encode the nominal string columns.
# handle_unknown='ignore' encodes categories unseen at fit time as all zeros,
# and sparse_output=False makes the encoder return a dense array.
categorical_trans = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]
)
categorical_col = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']

# remainder='drop' means the transformer output holds ONLY the encoded columns;
# every column not listed in categorical_col is left out of after_trans.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_trans, categorical_col)
    ],
    remainder='drop'
)

# Fit the encoder on the frame and transform it in one step.
after_trans = preprocessor.fit_transform(df)

# Wrap the encoded array in a DataFrame with the generated feature names.
# Reusing df.index keeps each encoded row aligned with its source row.
col_names = preprocessor.get_feature_names_out()
after_df = pd.DataFrame(after_trans, columns=col_names, index=df.index)

In [8]:
# Replace the raw categorical columns with their one-hot encoded counterparts.
# NOTE(review): 'balance_bin' is dropped here without having been passed to the
# encoder, so the binned balance never reaches the final frame -- confirm intended.
remaining_part = df.drop(columns=[*categorical_col, "balance_bin"]).reset_index(drop=True)
encoded_part = after_df.reset_index(drop=True)

# Both parts carry a fresh 0..n-1 index, so the column-wise concat pairs rows by position.
df = pd.concat([remaining_part, encoded_part], axis=1)

In [9]:
# Preview the frame after the encoded columns have been merged in.
df.head(n=5)

Unnamed: 0,age,default,balance,housing,loan,campaign,pdays,previous,deposit,month_num,...,categorical__month_jun,categorical__month_mar,categorical__month_may,categorical__month_nov,categorical__month_oct,categorical__month_sep,categorical__poutcome_failure,categorical__poutcome_other,categorical__poutcome_success,categorical__poutcome_unknown
0,59,0,2343,1,0,1,-1,0,1,5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,56,0,45,0,0,1,-1,0,1,5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,41,0,1270,1,0,1,-1,0,1,5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,55,0,2476,1,0,1,-1,0,1,5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,54,0,184,0,0,2,-1,0,1,5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Write the processed frame to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='./Dataset/bank_cleansed.csv', index=False)