<a href="https://colab.research.google.com/github/HRashmika/Bank_Prediction/blob/main/ML_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout



In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# File path
file_path = 'bank-additional-full.csv'
data_add = pd.read_csv(file_path, delimiter=';')

# Debugging: print the original data
print("Original DataFrame Shape:", data_add.shape)

# Step 1: Remove the 'duration' column
data_add = data_add.drop('duration', axis=1, errors='ignore')

# Step 2: One-hot encode categorical columns
one_hot_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan',
                   'contact', 'month', 'day_of_week']
data_add = pd.get_dummies(data_add, columns=one_hot_columns, drop_first=True)

# Step 3: Label encode 'poutcome' column
label_columns = ['poutcome']
label_encoder = LabelEncoder()
label_mappings = {}

for col in label_columns:
    data_add[col] = label_encoder.fit_transform(data_add[col].fillna('unknown'))  # Fill NaNs with 'unknown'
    label_mappings[col] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Encode target column 'y'
data_add['y'] = data_add['y'].map({'yes': 1, 'no': 0})

# Print label mappings
print("\nLabel Mappings for 'poutcome':", label_mappings)

# Step 4: Handle duplicates
duplicates = data_add.duplicated().sum()
if duplicates > 0:
    print(f"\n{duplicates} duplicate rows found and removed.")
    data_add = data_add.drop_duplicates()
else:
    print("\nNo duplicate rows found.")

# Step 5: Handle missing values
missing_values = data_add.isnull().sum()
if missing_values.any():
    print("\nMissing Values Found:")
    print(missing_values[missing_values > 0])
    for col in data_add.columns:
        if data_add[col].dtype == 'object':
            data_add[col].fillna('unknown', inplace=True)
        else:
            data_add[col].fillna(data_add[col].median(), inplace=True)
else:
    print("\nNo missing values found.")

# Step 6: Min-Max Normalization
scaler = MinMaxScaler()
features_to_scale = data_add.drop('y', axis=1)
data_add[features_to_scale.columns] = scaler.fit_transform(features_to_scale)

# Step 7: Split data into features (X) and target (y)
X = data_add.drop('y', axis=1)
y = data_add['y']

# Step 8: Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 9: Balance the training dataset using SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Combine balanced data for inspection
train_data_add = pd.DataFrame(X_train_balanced, columns=X.columns)
train_data_add['y'] = y_train_balanced

test_data_add = X_test.copy()
test_data_add['y'] = y_test

# Print dataset shapes and class distributions
print("\nTraining Data Shape (After Balancing):", train_data_add.shape)
print("Testing Data Shape:", test_data_add.shape)
print("\nClass Distribution in Balanced Training Data:")
print(train_data_add['y'].value_counts(normalize=True))

# Final DataFrame Info
print("\nFinal DataFrame Info:")
data_add.info()
print("\nFinal DataFrame Preview:")
print(data_add.head())

# Check class distribution in the target variable ('y')
class_distribution = data_add['y'].value_counts(normalize=True) * 100
print("Class Distribution (%):")
print(class_distribution)


Original DataFrame Shape: (41188, 21)

Label Mappings for 'poutcome': {'poutcome': {'failure': 0, 'nonexistent': 1, 'success': 2}}

1784 duplicate rows found and removed.

No missing values found.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_add[features_to_scale.columns] = scaler.fit_transform(features_to_scale)



Training Data Shape (After Balancing): (55690, 52)
Testing Data Shape: (7881, 52)

Class Distribution in Balanced Training Data:
y
0    0.5
1    0.5
Name: proportion, dtype: float64

Final DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 39404 entries, 0 to 41187
Data columns (total 52 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            39404 non-null  float64
 1   campaign                       39404 non-null  float64
 2   pdays                          39404 non-null  float64
 3   previous                       39404 non-null  float64
 4   poutcome                       39404 non-null  float64
 5   emp.var.rate                   39404 non-null  float64
 6   cons.price.idx                 39404 non-null  float64
 7   cons.conf.idx                  39404 non-null  float64
 8   euribor3m                      39404 non-null  float64
 9   nr.employed              

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Set output directory in Google Drive
output_dir = '/content/drive/My Drive/ML_CW/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Define file paths
transformed_path = os.path.join(output_dir, 'transformed_add.csv')
train_data_path = os.path.join(output_dir, 'train_data_add.csv')
test_data_path = os.path.join(output_dir, 'test_data_add.csv')

try:
    # Save datasets
    data_add.to_csv(transformed_path, index=False)
    train_data_add.to_csv(train_data_path, index=False)
    test_data_add.to_csv(test_data_path, index=False)

    # Confirm saved files
    if os.path.exists(transformed_path):
        print(f"Transformed data saved to: {transformed_path}")
    else:
        print("Error: Transformed data not saved.")

    if os.path.exists(train_data_path):
        print(f"Training data saved to: {train_data_path}")
    else:
        print("Error: Training data not saved.")

    if os.path.exists(test_data_path):
        print(f"Testing data saved to: {test_data_path}")
    else:
        print("Error: Testing data not saved.")
except Exception as e:
    print(f"Error while saving files: {e}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Transformed data saved to: /content/drive/My Drive/ML_CW/transformed_add.csv
Training data saved to: /content/drive/My Drive/ML_CW/train_data_add.csv
Testing data saved to: /content/drive/My Drive/ML_CW/test_data_add.csv


In [9]:

# File paths for training and testing data
train_file_path = '/content/drive/My Drive/ML_CW/train_data_add.csv'
test_file_path = '/content/drive/My Drive/ML_CW/test_data_add.csv'

# Load training and testing data
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

X_train = train_data.drop('y', axis=1)
y_train = train_data['y']

X_test = test_data.drop('y', axis=1)
y_test = test_data['y']

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# fit the classifier
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


# Output to debug
# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature Importances
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf_classifier.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nTop Features:")
print(feature_importances.head())


Accuracy: 0.88

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.94      6961
           1       0.51      0.36      0.42       920

    accuracy                           0.88      7881
   macro avg       0.71      0.66      0.68      7881
weighted avg       0.87      0.88      0.88      7881


Confusion Matrix:
[[6640  321]
 [ 587  333]]

Top Features:
        Feature  Importance
8     euribor3m    0.127059
1      campaign    0.113511
9   nr.employed    0.086621
0           age    0.080504
5  emp.var.rate    0.050869


In [11]:

# Define file paths in Google Drive
train_data_path = '/content/drive/My Drive/ML_CW/train_data_add.csv'
test_data_path = '/content/drive/My Drive/ML_CW/test_data_add.csv'

# Load the datasets
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Split features (X) and target (y)
X_train = train_data.drop('y', axis=1)
y_train = train_data['y']
X_test = test_data.drop('y', axis=1)
y_test = test_data['y']

# Build the Neural Network model
model = Sequential([
    Dense(64, activation='relu', input_dim=X_train.shape[1]),  # Input layer
    Dropout(0.3),  # Regularization
    Dense(32, activation='relu'),  # Hidden layer
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {test_accuracy:.4f}")

# Predict on the test set
y_pred = (model.predict(X_test) > 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

model_save_path = '/content/drive/My Drive/ML_CW/bank_marketing_model.h5'
model.save(model_save_path)
print(f"\nModel saved to: {model_save_path}")


Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1393/1393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.7274 - loss: 0.5641 - val_accuracy: 0.5617 - val_loss: 0.7368
Epoch 2/30
[1m1393/1393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7729 - loss: 0.5076 - val_accuracy: 0.5962 - val_loss: 0.6920
Epoch 3/30
[1m1393/1393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7797 - loss: 0.4974 - val_accuracy: 0.5892 - val_loss: 0.6911
Epoch 4/30
[1m1393/1393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7849 - loss: 0.4847 - val_accuracy: 0.6152 - val_loss: 0.6776
Epoch 5/30
[1m1393/1393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.7844 - loss: 0.4785 - val_accuracy: 0.5864 - val_loss: 0.7379
Epoch 6/30
[1m1393/1393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7891 - loss: 0.4697 - val_accuracy: 0.6099 - val_loss: 0.6705
Epoch 7/30
[1m1393/1393[0



              precision    recall  f1-score   support

           0       0.93      0.93      0.93      6961
           1       0.45      0.45      0.45       920

    accuracy                           0.87      7881
   macro avg       0.69      0.69      0.69      7881
weighted avg       0.87      0.87      0.87      7881


Confusion Matrix:
[[6452  509]
 [ 503  417]]

Model saved to: /content/drive/My Drive/ML_CW/bank_marketing_model.h5
