# Data Preprocessing for Dataset2

Load the Data

In [2]:
import pandas as pd

# Load the dataset
data = pd.read_csv('bank-additional.csv', delimiter=';')

Handle Missing Values; Mean for numerical values and Mode for categorical ones


In [3]:
# Replace "unknown" with NaN
data.replace('unknown', pd.NA, inplace=True)

# Convert yes to 1, no to 0 for the 'y' column
data['y'].replace({'yes': 1, 'no': 0}, inplace=True)

# Identify numeric columns
numeric_columns = data.select_dtypes(include=['number']).columns

# Handle missing values for numeric columns (impute with the mean)
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Identify categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns

# Handle missing values for categorical columns (impute with the most frequent value)
data[categorical_columns] = data[categorical_columns].fillna(data[categorical_columns].mode().iloc[0])

Convert Categorical Features to Numerical (One-Hot Encoding)

In [4]:
# One-hot encode categorical features using pandas get_dummies
data = pd.get_dummies(data, columns=categorical_columns)

# Convert boolean values to integers (1 for True, 0 for False)
data = data.astype(int)

Separate Input from Prediction

In [5]:
# Separate input features and prediction feature
X = data.drop(columns=['y'])
y = data['y']

Split the Data

In [6]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Normalize the data

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Identify numeric columns for normalization
numeric_columns = X_train.select_dtypes(include=['number']).columns

# Create a MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the training data and transform both training and test data
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

print(X_train.head())

           age  duration  campaign  pdays  previous  emp.var.rate  \
324   0.214286  0.063958  0.000000    1.0  0.000000           1.0   
2350  0.214286  0.025254  0.000000    1.0  0.000000           1.0   
1391  0.485714  0.176503  0.035714    1.0  0.000000           1.0   
468   0.200000  0.099643  0.000000    1.0  0.166667           0.5   
561   0.300000  0.046116  0.035714    1.0  0.000000           1.0   

      cons.price.idx  cons.conf.idx  euribor3m  nr.employed  ...  month_oct  \
324              0.5       0.583333        0.8     0.860377  ...        0.0   
2350             0.5       0.583333        0.8     0.860377  ...        0.0   
1391             0.5       0.583333        0.8     0.860377  ...        0.0   
468              0.0       0.166667        0.2     0.513208  ...        0.0   
561              0.5       0.583333        0.8     0.860377  ...        0.0   

      month_sep  day_of_week_fri  day_of_week_mon  day_of_week_thu  \
324         0.0              0.0        