# 2) Data Preprocessing - Pipeline

In [None]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Silence every warning for the rest of the session.
# NOTE(review): no category is given, so this blanket filter also hides
# deprecation/future warnings raised by the libraries used below — consider
# narrowing it to the specific warning categories that are actually noisy.
warnings.filterwarnings("ignore")

# 1. Initial Cleaning and Feature Engineering

In [20]:
# Load the raw dataset (semicolon-delimited CSV)
df = pd.read_csv('./data/bank-additional-full.csv', sep=';')

# Standardize column names: '.' and '-' become '_'.
# regex=False forces a literal match; under regex matching '.' is a wildcard
# and every character of every column name would be replaced by '_'.
df.columns = (
    df.columns
    .str.replace('.', '_', regex=False)
    .str.replace('-', '_', regex=False)
)

# Drop the 'duration' column
# NOTE(review): presumably removed because it is not known before a call is
# made and would leak the outcome — confirm against the dataset description.
df = df.drop(columns='duration')

# Impute 'unknown' values for specific columns with the mode.
# The mode is taken over the known values only, so the replacement cannot
# silently become a no-op when 'unknown' happens to be the most frequent value.
# NOTE(review): the mode is computed on the full dataset, i.e. before the
# train/test split performed later in the notebook.
for col in ['default', 'housing', 'loan']:
    known_values = df.loc[df[col] != 'unknown', col]
    mode_val = known_values.mode()[0]
    df[col] = df[col].replace('unknown', mode_val)

# Create a new age group feature.
# Intervals are left-closed ([30, 40) -> '30-39'). The outer edges are
# open-ended so that ages below 18 fall into the youngest group and ages of
# 100 or more into the oldest one, instead of producing NaN.
bins = [-np.inf, 30, 40, 50, 60, np.inf]
labels = ['18-29', '30-39', '40-49', '50-59', '60+']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# Convert 'y' target variable to binary (yes -> 1, no -> 0)
df['y'] = df['y'].map({'yes': 1, 'no': 0})
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y,age_group
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,50-59
1,57,services,married,high.school,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,50-59
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,30-39
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,40-49
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,50-59


In [21]:
# Persist the cleaned frame (without the row index) for use outside this notebook
df.to_csv(path_or_buf='./data/bank_marketing_preprocessed.csv', index=False)

# 2. Define Preprocessing Pipelines

In [15]:
# Split the frame into the binary target (y) and the model inputs (X)
y = df['y']
X = df.drop(columns='y')

# Group the input columns by dtype: numeric vs. string/categorical
numerical_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(include=['object', 'category']).columns)

# Numeric columns: fill missing values with the median, then standardize
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories not seen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline; any column that belongs to
# neither group is passed through unchanged
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

# 3. Split the data and apply preprocessing

In [16]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the preprocessor on the training rows only, then reuse the fitted
# transformation on the test rows
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Report the resulting matrix shapes
print("Shape of preprocessed training data:", X_train_transformed.shape)
print("Shape of preprocessed testing data:", X_test_transformed.shape)

Shape of preprocessed training data: (32950, 64)
Shape of preprocessed testing data: (8238, 64)


In [17]:
# Save the fitted preprocessor pipeline so the same transformation can be reloaded later.
# The target directory is created first: joblib.dump does not create missing
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs('./models', exist_ok=True)
joblib.dump(preprocessor, './models/preprocessor.pkl')
print("\nPreprocessor pipeline saved as 'preprocessor.pkl'")


Preprocessor pipeline saved as 'preprocessor.pkl'
