In [2]:
import pandas as pd

# load data from csv
data_path = "./Data/bank-full.csv"
df = pd.read_csv(data_path, delimiter=";")
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [14]:
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

def apply_one_hot_encoding(input_df, output_df, feature):
    one_hot_encoder = OneHotEncoder()

    # fit and transform the data, then convert the resulting sparse matrix to a DF
    one_hot_encoded = one_hot_encoder.fit_transform(input_df[[feature]])
    one_hot_encoded_df = pd.DataFrame(one_hot_encoded.toarray(), columns=one_hot_encoder.get_feature_names_out([feature]))

    # concatenate the one-hot encoded DataFrame with the original DataFrame
    output_df = pd.concat([output_df, one_hot_encoded_df], axis=1)
    return output_df

def apply_ordinal_encoding(input_df, output_df, feature, order):
    ord_encoder = OrdinalEncoder(categories=[order])
    output_df[feature] = ord_encoder.fit_transform(input_df[[feature]])
    return output_df

#def apply_label_encoding(input_df, output_df, feature):
#    output_df['education'] = LabelEncoder().fit_transform(input_df[feature])
#    return output_df

In [27]:
# create a new df for encoded data
encoded_df = pd.DataFrame()

# encode age data through binning method ========================================================
discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform', subsample=50000)
encoded_df[['age']] = discretizer.fit_transform(df[['age']])

# encode job ====================================================================================
encoded_df = apply_one_hot_encoding(df, encoded_df, 'job')

# encode marital ================================================================================
encoded_df = apply_one_hot_encoding(df, encoded_df, 'marital')

# encode education ==============================================================================
edu_order = ['unknown', 'primary', 'secondary', 'tertiary']
encoded_df = apply_ordinal_encoding(df, encoded_df, 'education', edu_order)

# encode default ================================================================================
encoded_df = apply_one_hot_encoding(df, encoded_df, 'default')

# encode balance ================================================================================
discretizer = KBinsDiscretizer(n_bins=128, encode='ordinal', strategy='uniform', subsample=50000)
encoded_df[['balance']] = discretizer.fit_transform(df[['balance']])

# encode housing ================================================================================

# encode loan ===================================================================================

# encode contact ================================================================================

# encode day ====================================================================================

# encode month ==================================================================================

# encode duration ===============================================================================

# encode campaign ===============================================================================

# encode pdays ==================================================================================

# encode previous ===============================================================================

# encode poutcome ===============================================================================

# display encoded df
encoded_df


Unnamed: 0,age,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,education,default_no,default_yes,balance
0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0,0.0,11.0
1,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,9.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0,9.0
3,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,11.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0,0.0,10.0
45207,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,11.0
45208,7.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0,15.0
45209,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0,10.0


In [28]:
encoded_df['balance'].describe()

count    45211.000000
mean        10.412532
std          3.542833
min          0.000000
25%          9.000000
50%          9.000000
75%         10.000000
max        127.000000
Name: balance, dtype: float64

In [26]:
filtered_rows = encoded_df[encoded_df['balance'] < 100]

# Print the first row that meets the condition
print(filtered_rows.index[2])

display(df.iloc[filtered_rows.index[0]])
display(encoded_df.iloc[filtered_rows.index[0]])

4


age                  44
job          technician
marital          single
education     secondary
default              no
balance              29
housing             yes
loan                 no
contact         unknown
day                   5
month               may
duration            151
campaign              1
pdays                -1
previous              0
poutcome        unknown
y                    no
Name: 1, dtype: object

age                   3.0
job_admin.            0.0
job_blue-collar       0.0
job_entrepreneur      0.0
job_housemaid         0.0
job_management        0.0
job_retired           0.0
job_self-employed     0.0
job_services          0.0
job_student           0.0
job_technician        1.0
job_unemployed        0.0
job_unknown           0.0
marital_divorced      0.0
marital_married       0.0
marital_single        1.0
education             2.0
default_no            1.0
default_yes           0.0
balance              87.0
Name: 1, dtype: float64