# Ensemble Model Building
### Random Forest

Importing all required libraries

In [1]:
# import pandas for reading the dataframe
import pandas as pd

# import test train split
from sklearn.model_selection import train_test_split

# import encoders to transform numerical and categorical data
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# import random forest model from ensemble models
from sklearn.ensemble import RandomForestClassifier

# import metrics to see how well the model performed
from sklearn.metrics import classification_report, roc_auc_score

Gather the data

In [2]:
# reading the full csv dataset
df = pd.read_csv("../Data/bank-additional/bank-additional-full.csv", delimiter=";")

### Visualizing and cleaning dataset

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

Dropping duplicates / unused features

In [5]:
# drop duplicate data
df.drop_duplicates(keep='first', inplace=True)

In [6]:
# removing socio-economic factors from dataset
df = df.drop(labels=["emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"], axis=1)

In [7]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

### Splitting the dataset into X and y, with binary mapping of 'y'

In [8]:
X = df.drop('y', axis=1)
y = df['y'].map({'yes': 1, 'no': 0})

## Setting up data pipeline

#### Preprocessing

In [9]:
# Data preprocessing pipeline
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [10]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

Splitting the dataset into training and testing data

In [11]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Building random forest model

In [12]:
# Random Forest pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100))
])

rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

print("Random Forest Results:")
print(classification_report(y_test, y_pred_rf))
print('ROC-AUC Score:', roc_auc_score(y_test, y_pred_rf))

Random Forest Results:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7265
           1       0.65      0.37      0.47       971

    accuracy                           0.90      8236
   macro avg       0.78      0.67      0.71      8236
weighted avg       0.89      0.90      0.89      8236

ROC-AUC Score: 0.673739831578261


- The ROC curve shows the performance of a binary classifier with different decision thresholds. It plots the True Positive rate (TPR) against the False Positive rate (FPR).
- The ROC AUC score sums up how well a model can produce relative scores to discriminate between positive or negative instances across all classification thresholds. 
- The ROC AUC score ranges from 0 to 1, where 0.5 indicates random guessing, and 1 indicates perfect performance.

### Visualising the most imoportant factors influencing the sale of the term deposit

In [13]:
# Feature importance
feature_names = numeric_features.tolist() + \
    rf_pipeline.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_features).tolist()
feature_importances = rf_pipeline.named_steps['classifier'].feature_importances_
important_features = pd.Series(feature_importances, index=feature_names).sort_values(ascending=False)
print("Feature Importances:\n", important_features)

Feature Importances:
 duration                         2.969830e-01
age                              1.090412e-01
campaign                         4.629377e-02
pdays                            4.515205e-02
poutcome_success                 3.404102e-02
previous                         1.839379e-02
month_mar                        1.606221e-02
housing_yes                      1.515182e-02
day_of_week_thu                  1.480222e-02
housing_no                       1.456928e-02
month_oct                        1.421382e-02
day_of_week_mon                  1.404362e-02
education_university.degree      1.369184e-02
month_jun                        1.361858e-02
day_of_week_wed                  1.354237e-02
day_of_week_fri                  1.351417e-02
day_of_week_tue                  1.347399e-02
job_admin.                       1.336237e-02
poutcome_nonexistent             1.254092e-02
education_high.school            1.238637e-02
marital_married                  1.210568e-02
month_may   