In [1]:
import pandas as pd

# Load the cleaned dataset (post 2.3)
df = pd.read_csv("../data/processed/bank_cleaned.csv")

# Quick check
df.head(5)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  int64 
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  int64 
 7   loan       11162 non-null  int64 
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  int64 
dtypes: int64(11), object(6)
memory usage: 1.4+ MB


In [2]:
def age_group(age):
    if age <= 30:
        return 'Young Adult'
    elif age <= 45:
        return 'Adult'
    elif age <= 60:
        return 'Middle-Aged'
    else:
        return 'Senior'

df['age_group'] = df['age'].apply(age_group)
df['age_group'] = df['age_group'].astype('category')


In [3]:
def balance_category(balance):
    if balance < 1000:
        return 'Low'
    elif balance <= 5000:
        return 'Medium'
    else:
        return 'High'

df['balance_category'] = df['balance'].apply(balance_category)
df['balance_category'] = df['balance_category'].astype('category')


In [4]:
def contact_intensity(campaign):
    if campaign <= 2:
        return 'Low'
    elif campaign <= 5:
        return 'Medium'
    else:
        return 'High'

df['contact_intensity'] = df['campaign'].apply(contact_intensity)
df['contact_intensity'] = df['contact_intensity'].astype('category')


In [5]:
# Quick check of new features
print(df[['age_group','balance_category','contact_intensity']].head(5))

# Value counts
print(df['age_group'].value_counts())
print(df['balance_category'].value_counts())
print(df['contact_intensity'].value_counts())


     age_group balance_category contact_intensity
0  Middle-Aged           Medium               Low
1  Middle-Aged              Low               Low
2        Adult           Medium               Low
3  Middle-Aged           Medium               Low
4  Middle-Aged              Low               Low
age_group
Adult          5522
Middle-Aged    3022
Young Adult    2007
Senior          611
Name: count, dtype: int64
balance_category
Low       7115
Medium    4047
Name: count, dtype: int64
contact_intensity
Low       7826
Medium    2470
High       866
Name: count, dtype: int64


In [6]:
# Save final dataset for EDA & modeling
df.to_csv("../data/processed/bank_final.csv", index=False)


### Feature Engineering

1. **Age Group:** Categorized customers into 'Young Adult (18–30)', 'Adult (31–45)', 'Middle-Aged (46–60)', and 'Senior (60+)' to capture age-based behavior.

2. **Balance Category:** Segmented balances into 'Low', 'Medium', 'High' to reduce skewness and highlight financial capability.

3. **Contact Intensity:** Grouped number of campaign contacts into 'Low', 'Medium', 'High' to model the effect of repeated marketing attempts.

These features enhance interpretability, improve model learning, and provide meaningful segments for analysis.
