In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from pathlib import Path

In [5]:
# Load the semicolon-delimited raw dataset from the "datasets" folder that sits
# one level above the current working directory.
raw_csv_path = Path(f'{Path.cwd()}/../datasets/bank_marketing_original.csv')
raw_df = pd.read_csv(raw_csv_path, sep=';')

In [None]:
# Number of columns in the raw dataset.
len(raw_df.columns)

In [7]:
# Give the target column a descriptive name: 'y' becomes 'deposit'.
raw_df = raw_df.rename(columns={'y': 'deposit'})

In [None]:
# Print every column name of the raw frame as a quoted bullet, one per line.
for i in list(raw_df):
    print(f"- \"{i}\"")

In [None]:
# Print a concise summary of the raw frame (columns, non-null counts, dtypes).
raw_df.info()

In [None]:
# Report how many rows the dataset holds.
row_count = raw_df.shape[0]

print("Bank marketing dataset consists of {rows} rows.".format(rows=row_count))

In [None]:
# Percentage of missing entries in each column.
missing_values = raw_df.isna().mean().mul(100)

# Sum of the per-column percentages; it is 0 only when no column has missing values.
missing_values.sum()

In [10]:
# Multi-valued categorical features (one-hot encoded further down).
cat_columns = [
    'job',
    'marital',
    'education',
    'contact',
    'month',
    'poutcome',
]

# Numeric features (standard-scaled further down).
numerical_columns = [
    'age',
    'balance',
    'day',
    'duration',
    'pdays',
    'campaign',
    'previous',
]

# Two-valued categorical features (label-encoded further down).
binary_columns = [
    'default',
    'housing',
    'loan',
]

In [12]:
# Encode the target: map the textual deposit labels to binary values
# ('no' -> 0, 'yes' -> 1).
deposit_label_map = {'no': 0, 'yes': 1}

# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on an already-encoded column would wipe out the target.
# Only apply the mapping while the column does not yet consist solely of the
# encoded values.
if not raw_df['deposit'].isin(list(deposit_label_map.values())).all():
    raw_df['deposit'] = raw_df['deposit'].map(deposit_label_map)

In [None]:
# Class distribution of the target column.
deposit_counts = raw_df['deposit'].value_counts()
deposit_counts

In [None]:
# Bar chart of how many rows fall into each deposit class.
deposit_counts = raw_df['deposit'].value_counts()
deposit_counts.plot(kind='bar', title='Deposit value counts')


In [None]:
# One-hot encode the categorical columns.
# Each column is expanded into indicator columns named "<column>__<value>";
# the expansions are collected first and attached in a single concat.
dummy_frames = []

for cat_col in cat_columns:
    print(f"One-hot encoding column {cat_col}")

    encoded_column = pd.get_dummies(raw_df[cat_col], prefix=cat_col, prefix_sep="__")

    print(f"One hot encoded column {cat_col} has {encoded_column.shape[1]} new features")

    dummy_frames.append(encoded_column)

# Remaining (non-categorical) columns first, then the indicator columns in the
# order in which the categorical columns were processed.
raw_one_hot_encoded = pd.concat(
    [raw_df.drop(columns=cat_columns), *dummy_frames],
    axis=1,
)

raw_one_hot_encoded

In [None]:
# Label-encode the binary categorical columns, replacing each column with its
# integer class codes.
for col in binary_columns:
    _encoder = LabelEncoder()
    encoded_values = _encoder.fit_transform(raw_one_hot_encoded[col])

    # Log how many distinct classes the encoder found in this column.
    print(f"Number Unique Classes of LabelEncoding in column {col}: {len(np.unique(_encoder.classes_))}")

    raw_one_hot_encoded[col] = encoded_values

In [17]:
# Standardise each numerical column on its own, using a fresh scaler per column.
for col in numerical_columns:
    _scaler = StandardScaler()
    # Double-bracket selection yields the (n_rows, 1) matrix the scaler expects.
    column_matrix = raw_one_hot_encoded[[col]].to_numpy()
    raw_one_hot_encoded[col] = _scaler.fit_transform(column_matrix)

In [None]:
# Class distribution of the target column in the raw frame, shown again after
# the preprocessing steps.
deposit_counts = raw_df['deposit'].value_counts()
deposit_counts

In [None]:
# Print every column name of the preprocessed frame as a quoted bullet, one per line.
for i in list(raw_one_hot_encoded):
    print(f"- \"{i}\"")

In [None]:
# Write the preprocessed frame, without the index column, to the "datasets"
# folder one level above the current working directory.
preprocessed_csv_path = Path(f'{Path.cwd()}/../datasets/bank_marketing_preprocessed.csv')
raw_one_hot_encoded.to_csv(preprocessed_csv_path, index=False)

In [None]:
# Pairwise correlations between all columns of the preprocessed frame.
corr = raw_one_hot_encoded.corr()

# Boolean mask covering the upper triangle (diagonal included) so that only
# the lower triangle of the matrix is drawn.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure and axes that hold the heatmap.
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap, centred on zero correlation by the heatmap call below.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the masked heatmap with square cells on the axes created above.
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)
sns.heatmap(corr, ax=ax, **heatmap_options)