Load the Data

In [36]:
import pandas as pd

# Load the dataset
data = pd.read_csv('bank-additional.csv', delimiter=';')

# Display basic information about the dataset
print(data.info())

# Display the first few rows of the dataset
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             4119 non-null   int64  
 1   job             4119 non-null   object 
 2   marital         4119 non-null   object 
 3   education       4119 non-null   object 
 4   default         4119 non-null   object 
 5   housing         4119 non-null   object 
 6   loan            4119 non-null   object 
 7   contact         4119 non-null   object 
 8   month           4119 non-null   object 
 9   day_of_week     4119 non-null   object 
 10  duration        4119 non-null   int64  
 11  campaign        4119 non-null   int64  
 12  pdays           4119 non-null   int64  
 13  previous        4119 non-null   int64  
 14  poutcome        4119 non-null   object 
 15  emp.var.rate    4119 non-null   float64
 16  cons.price.idx  4119 non-null   float64
 17  cons.conf.idx   4119 non-null   f

Handle Missing Values

In [37]:
# Replace "unknown" with NaN
data.replace('unknown', pd.NA, inplace=True)

# Identify numeric columns
numeric_columns = data.select_dtypes(include=['number']).columns

# Handle missing values for numeric columns (impute with the mean)
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())


Convert Categorical Features to Numerical

In [38]:
# Identify categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns

# One-hot encode categorical features
data = pd.get_dummies(data, columns=categorical_columns)
# Convert boolean values to integers (1 for True, 0 for False)
data = data.astype(int)


Separate Input from Prediction

In [39]:
# Separate input features and prediction feature
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

Display the data

In [40]:
# Display the preprocessed data
print("Preprocessed Data:")
print(X.head())
print(y.head())

#print number of columns
print("Number of columns: ", len(X.columns))


Preprocessed Data:
   age  duration  campaign  pdays  previous  emp.var.rate  cons.price.idx  \
0   30       487         2    999         0            -1              92   
1   39       346         4    999         0             1              93   
2   25       227         1    999         0             1              94   
3   38        17         3    999         0             1              94   
4   47        58         1    999         0             0              93   

   cons.conf.idx  euribor3m  nr.employed  ...  month_sep  day_of_week_fri  \
0            -46          1         5099  ...          0                1   
1            -36          4         5191  ...          0                1   
2            -41          4         5228  ...          0                0   
3            -41          4         5228  ...          0                1   
4            -42          4         5195  ...          0                0   

   day_of_week_mon  day_of_week_thu  day_of_week_tue  d

Split the Data

In [41]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Normalize the data

In [42]:
from sklearn.preprocessing import MinMaxScaler

# Identify numeric columns for normalization
numeric_columns = X_train.select_dtypes(include=['number']).columns

# Create a MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the training data and transform both training and test data
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])
