<a href="https://colab.research.google.com/github/Jannahan/CM2604-ML-Coursework/blob/Develop/2409089_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import required libraries
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

In [3]:
# Mount Google Drive for dataset access
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the dataset
file_path = '/content/drive/My Drive/ML_Coursework/bank-additional-full.csv'
data = pd.read_csv(file_path, sep=';')

#Dataset

In [5]:
# Display dataset summary
print(data.head())
print(data.info())

   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.idx  euribor3m  nr.employed

#Preprocessing

In [6]:
# Data Preprocessing
print("missing values")
print(data.isnull().sum())

print("unique values in each column")
print(data.nunique())

# Replace 'unknown' values with NaN to mark 'unknown' values as missing data (NaN),
data.replace("unknown", pd.NA, inplace=True)

# One-hot encode categorical variables
data_encoded = pd.get_dummies(data, drop_first=True)

# Encode the target column ('y')
# This step converts categorical target values into numerical format for machine learning models.
data_encoded['y'] = data['y'].map({'yes': 1, 'no': 0})

# Confirm encoding
print(data_encoded.head())

# Separate features and target
X = data_encoded.drop(columns=['y'])
y = data_encoded['y']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("Training and testing data split completed.")

missing values
age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64
unique values in each column
age                 78
job                 12
marital              4
education            8
default              3
housing              3
loan                 3
contact              2
month               10
day_of_week          5
duration          1544
campaign            42
pdays               27
previous             8
poutcome             3
emp.var.rate        10
cons.price.idx      26
cons.conf.idx       26
euribor3m          316
nr.employed         11
y                    2
dtype: int64
   age  duration  campaign 

#Random Forest

In [7]:
# Initialize and train Random Forest Classifier
# Using 100 estimators to ensure robust decision-making.
# Random state is set for reproducibility of results.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate Random Forest
print("Random Forest Classifier Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Feature importance
importances = rf_model.feature_importances_
feature_names = X_train.columns

# Display top 10 important features
importances_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)
print("Top 10 Important Features:")
print(importances_df.head(10))


Random Forest Classifier Results:
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     10968
           1       1.00      1.00      1.00      1389

    accuracy                           1.00     12357
   macro avg       1.00      1.00      1.00     12357
weighted avg       1.00      1.00      1.00     12357

Confusion Matrix:
 [[10968     0]
 [    0  1389]]
Top 10 Important Features:
             Feature  Importance
47             y_yes    0.698003
1           duration    0.090111
9        nr.employed    0.035375
8          euribor3m    0.034701
3              pdays    0.018515
46  poutcome_success    0.016436
0                age    0.013494
5       emp.var.rate    0.013305
7      cons.conf.idx    0.013274
6     cons.price.idx    0.009717


# Random Forest Without duration

In [8]:
# Drop 'duration' from training and testing sets
X_train_rf = X_train.drop(columns=['duration'])
X_test_rf = X_test.drop(columns=['duration'])

# Initialize and train the Random Forest Classifier without 'duration'
rf_model_no_duration = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_no_duration.fit(X_train_rf, y_train)

# Make predictions and evaluate
y_pred_rf_no_duration = rf_model_no_duration.predict(X_test_rf)

# Evaluate the model
print("Random Forest (Without 'duration') Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf_no_duration))
print("Classification Report:\n", classification_report(y_test, y_pred_rf_no_duration))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf_no_duration))

Random Forest (Without 'duration') Results:
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     10968
           1       1.00      1.00      1.00      1389

    accuracy                           1.00     12357
   macro avg       1.00      1.00      1.00     12357
weighted avg       1.00      1.00      1.00     12357

Confusion Matrix:
 [[10968     0]
 [    0  1389]]


# Random Forest without y_yes

In [9]:
# Drop 'y_yes' from training and testing sets
if 'y_yes' in X_train.columns:
    X_train_rf = X_train.drop(columns=['y_yes'])
    X_test_rf = X_test.drop(columns=['y_yes'])

    rf_model_no_y_yes = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model_no_y_yes.fit(X_train_rf, y_train)

    y_pred_rf_no_y_yes = rf_model_no_y_yes.predict(X_test_rf)
    print("Random Forest (Without 'y_yes') Results:")
    print("Accuracy:", accuracy_score(y_test, y_pred_rf_no_y_yes))
    print("Classification Report:\n", classification_report(y_test, y_pred_rf_no_y_yes))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf_no_y_yes))

Random Forest (Without 'y_yes') Results:
Accuracy: 0.9137331067411184
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95     10968
           1       0.66      0.48      0.55      1389

    accuracy                           0.91     12357
   macro avg       0.80      0.72      0.75     12357
weighted avg       0.91      0.91      0.91     12357

Confusion Matrix:
 [[10631   337]
 [  729   660]]
