In [741]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [742]:
# Load the bank dataset from a CSV file in the current working directory.
data = pd.read_csv("new-bank-additional-full.csv")

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [743]:
# Frequency of each label in the 'marital' column (displayed as the cell result).
data['marital'].value_counts()

marital
married     24928
single      11568
divorced     4612
unknown        80
Name: count, dtype: int64

In [744]:
# Frequency of each label in the 'default' column, inspected before it is encoded.
data['default'].value_counts()

default
no         32588
unknown     8597
yes            3
Name: count, dtype: int64

In [745]:
# Numeric codes for 'default': no -> 0, yes -> 1, unknown -> -1.
# Series.map yields NaN for any value that is not a key of the table, so this
# cell must run exactly once on the original string labels.
default_codes = {'no': 0, 'yes': 1, 'unknown': -1}
data['default'] = data['default'].map(default_codes)

In [746]:
# Frequency of each label in the 'housing' column, inspected before it is encoded.
data['housing'].value_counts()

housing
yes        21576
no         18622
unknown      990
Name: count, dtype: int64

In [747]:
# Numeric codes for 'housing': no -> 0, yes -> 1, unknown -> -1.
housing_codes = {'no': 0, 'yes': 1, 'unknown': -1}
data['housing'] = data['housing'].map(housing_codes)

In [748]:
# Frequency of each label in the 'loan' column, inspected before it is encoded.
data['loan'].value_counts()

loan
no         33950
yes         6248
unknown      990
Name: count, dtype: int64

In [749]:
# Numeric codes for 'loan': no -> 0, yes -> 1, unknown -> -1.
loan_codes = {'no': 0, 'yes': 1, 'unknown': -1}
data['loan'] = data['loan'].map(loan_codes)

In [750]:
# Frequency of each label in the 'contact' column, inspected before it is encoded.
data['contact'].value_counts()

contact
cellular     26144
telephone    15044
Name: count, dtype: int64

In [751]:
# Binary code for the contact channel: cellular -> 1, telephone -> 0.
contact_codes = {'cellular': 1, 'telephone': 0}
data['contact'] = data['contact'].map(contact_codes)

In [752]:
# Frequency of each label in the 'month' column, inspected before it is encoded.
data['month'].value_counts()

month
may    13769
jul     7174
aug     6178
jun     5318
nov     4101
apr     2632
oct      718
sep      570
mar      546
dec      182
Name: count, dtype: int64

In [753]:
# Three-letter month abbreviations in calendar order; each is mapped to its
# 1-based position (jan -> 1 ... dec -> 12).
month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
month_mapping = {name: number for number, name in enumerate(month_names, start=1)}

data['month'] = data['month'].map(month_mapping)

In [754]:
# Frequency of each label in the 'day_of_week' column, inspected before it is encoded.
data['day_of_week'].value_counts()

day_of_week
thu    8623
mon    8514
wed    8134
tue    8090
fri    7827
Name: count, dtype: int64

In [755]:
# Weekday abbreviations numbered mon -> 1 through fri -> 5.
weekday_names = ['mon', 'tue', 'wed', 'thu', 'fri']
day_mapping = dict(zip(weekday_names, range(1, len(weekday_names) + 1)))
data['day_of_week'] = data['day_of_week'].map(day_mapping)

In [756]:
# Frequency of each label in the 'poutcome' column, inspected before it is encoded.
data['poutcome'].value_counts()

poutcome
nonexistent    35563
failure         4252
success         1373
Name: count, dtype: int64

In [757]:
# Integer codes for 'poutcome': nonexistent -> 0, failure -> 1, success -> 2.
poutcome_labels = ['nonexistent', 'failure', 'success']
poutcome_mapping = {label: code for code, label in enumerate(poutcome_labels)}
data['poutcome'] = data['poutcome'].map(poutcome_mapping)

In [758]:
# Frequency of each label in the 'education' column, inspected before it is encoded.
data['education'].value_counts()

education
university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
unknown                 1731
illiterate                18
Name: count, dtype: int64

In [759]:
# Education labels listed from lowest to highest; each gets its 0-based rank.
education_levels = [
    'illiterate',
    'basic.4y',
    'basic.6y',
    'basic.9y',
    'high.school',
    'professional.course',
    'university.degree',
]
education_mapping = {level: rank for rank, level in enumerate(education_levels)}

# 'unknown' deliberately shares the code of 'illiterate' (0), so the two
# labels cannot be told apart after encoding.
education_mapping['unknown'] = education_mapping['illiterate']

data['education'] = data['education'].map(education_mapping)

In [760]:
# Frequency of each label in the 'job' column, inspected before rows are filtered.
data['job'].value_counts()

job
admin.           10422
blue-collar       9254
technician        6743
services          3969
management        2924
retired           1720
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: count, dtype: int64

In [761]:
# Rows whose value in any of these columns is the literal string "unknown"
# are removed from the dataset.
columns_to_consider = ["marital", "job"]

# Turn "unknown" into NaN only inside the columns being filtered. A frame-wide
# replace would also blank out "unknown" in any other text column that is not
# yet encoded when this cell runs, and those NaNs would then be silently
# overwritten by the mode imputation performed later in the notebook.
data = data.replace({column: "unknown" for column in columns_to_consider}, np.nan)

# Row count before filtering, reported for traceability.
rows_before = len(data)
print(f"Rows before removing unknown values: {rows_before}")

# Drop every row that has NaN in at least one of the filtered columns.
data = data.dropna(subset=columns_to_consider)

# Row count after filtering.
rows_after = len(data)
print(f"Rows after removing unknown values: {rows_after}")

# Number of rows removed by the filter.
rows_deleted = rows_before - rows_after
print(f"Rows dropped after removing unknown values: {rows_deleted}")

Rows before removing unknown values: 41188
Rows after removing unknown values: 40787
Rows dropped after removing unknown values: 401


In [762]:
# Remove 'duration' from the feature set; it is treated here as leaking
# information about the target.
data = data.drop(columns=["duration"])

In [763]:
# Binary target: "yes" -> 1, "no" -> 0.
target_codes = {"yes": 1, "no": 0}
data['y'] = data['y'].map(target_codes)

In [764]:
# Mode imputation: every remaining NaN is replaced by the most frequent value
# of its own column (first row of DataFrame.mode()).
# NOTE(review): this applies to all columns of `data`, including the encoded
# target 'y' — confirm that filling labels is intended.
column_modes = data.mode().iloc[0]
data = data.fillna(column_modes)

In [765]:
# Hand-rolled indicator columns for 'marital': one flag for 'single', one for
# 'married'. Any other status is represented by 0 in both flags. The source
# column is removed once the flags exist.
marital_status = data['marital']
data = data.assign(
    marital_single=marital_status.eq('single').astype(int),
    marital_married=marital_status.eq('married').astype(int),
).drop(columns=['marital'])

For Neural Networks

In [766]:
# Neural-network variant: expand 'job' into dummy columns. drop_first=True
# omits the dummy of the first category, which becomes the implicit baseline.
job_dummies_frame = pd.get_dummies(data, columns=["job"], drop_first=True)

# Any boolean column (the dummies, on pandas versions that emit bool) is cast
# to integer 1/0.
boolean_columns = job_dummies_frame.select_dtypes(include=[bool]).columns
data_nn = job_dummies_frame.astype({column: int for column in boolean_columns})

# Show the first ten rows as a sanity check.
print(data_nn.head(10))

   age  education  default  housing  loan  contact  month  day_of_week  \
0   56          1        0        0     0        0      5            1   
1   57          4       -1        0     0        0      5            1   
2   37          4        0        1     0        0      5            1   
3   40          2        0        0     0        0      5            1   
4   56          4        0        0     1        0      5            1   
5   45          3       -1        0     0        0      5            1   
6   59          5        0        0     0        0      5            1   
7   41          0       -1        0     0        0      5            1   
8   24          5        0        1     0        0      5            1   
9   25          4        0        1     0        0      5            1   

   campaign  pdays  ...  job_blue-collar  job_entrepreneur  job_housemaid  \
0         1    999  ...                0                 0              1   
1         1    999  ...        

In [767]:
# Coarser job categories: each key is the new label, each list holds the
# original labels merged into it. Labels not listed here are left as they are.
job_groups = {
    'student_unemployed': ['student', 'unemployed'],
    'technician_services': ['technician', 'services'],
    'admin': ['admin.'],
}

# Invert the table into an original-label -> group-label lookup.
job_lookup = {
    original: group
    for group, originals in job_groups.items()
    for original in originals
}
data['job'] = data['job'].replace(job_lookup)

# Category sizes after grouping.
print("\nJob grouping preview:")
print(data['job'].value_counts())


Job grouping preview:
job
technician_services    10694
admin                  10408
blue-collar             9240
management              2921
student_unemployed      1883
retired                 1715
entrepreneur            1453
self-employed           1416
housemaid               1057
Name: count, dtype: int64


For Random Forests

In [768]:
# Random-forest variant: 'job' is replaced by integer codes from LabelEncoder.
# The encoding is written to a separate frame, so `data` keeps its string labels.
le = LabelEncoder()
job_codes = le.fit_transform(data["job"])
data_rf = data.assign(job=job_codes)

# Lookup table pairing each job label with the integer assigned to it.
encoded_classes = le.classes_
job_mapping_df = pd.DataFrame({
    'Job Category': encoded_classes,
    'Encoded Value': le.transform(encoded_classes),
})

# Print the lookup table without the DataFrame index.
print("Job Encoding Mapping (in Table):")
print(job_mapping_df.to_string(index=False))

Job Encoding Mapping (in Table):
       Job Category  Encoded Value
              admin              0
        blue-collar              1
       entrepreneur              2
          housemaid              3
         management              4
            retired              5
      self-employed              6
 student_unemployed              7
technician_services              8


In [769]:
# Heading followed by the first ten rows of the label-encoded frame.
print(
    "\nPreview of first 10 rows after Label Encoding for Random Forest:",
    data_rf.head(10),
    sep="\n",
)


Preview of first 10 rows after Label Encoding for Random Forest:
   age  job  education  default  housing  loan  contact  month  day_of_week  \
0   56    3          1        0        0     0        0      5            1   
1   57    8          4       -1        0     0        0      5            1   
2   37    8          4        0        1     0        0      5            1   
3   40    0          2        0        0     0        0      5            1   
4   56    8          4        0        0     1        0      5            1   
5   45    8          3       -1        0     0        0      5            1   
6   59    0          5        0        0     0        0      5            1   
7   41    1          0       -1        0     0        0      5            1   
8   24    8          5        0        1     0        0      5            1   
9   25    8          4        0        1     0        0      5            1   

   campaign  ...  previous  poutcome  emp.var.rate  cons.price.i

In [114]:
# Write the one-hot encoded frame to CSV, leaving out the index column.
nn_output_path = "processed_bank_data_nn.csv"
data_nn.to_csv(nn_output_path, index=False)

In [113]:
# Write the label-encoded frame to CSV, leaving out the index column.
rf_output_path = "processed_data_for_rf.csv"
data_rf.to_csv(rf_output_path, index=False)