<a href="https://colab.research.google.com/github/HRashmika/Bank_Prediction/blob/main/ML_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os
from imblearn.over_sampling import SMOTE


In [None]:

# Location of the raw, semicolon-separated CSV
file_path = 'bank-additional-full.csv'
data_add = pd.read_csv(file_path, sep=';')

# Sanity check: show the first rows exactly as loaded
print("Original DataFrame:")
print(data_add.head())

# Remove 'duration' when it is present; the original note gives its high
# correlation with the target variable as the reason
data_add = data_add.drop(columns='duration', errors='ignore')

# Categorical columns expanded into indicator columns; drop_first=True omits
# the first level of each so the indicators are not redundant
one_hot_columns = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week',
]
data_add = pd.get_dummies(data_add, columns=one_hot_columns, drop_first=True)

# Columns turned into integer codes with a LabelEncoder (refitted per column)
label_columns = ['poutcome']
label_encoder = LabelEncoder()
label_mappings = {}

for col in label_columns:
    # NaNs become the literal category 'unknown' before encoding
    filled = data_add[col].fillna('unknown')
    data_add[col] = label_encoder.fit_transform(filled)

    # Remember which integer code each original category received
    classes = label_encoder.classes_
    codes = label_encoder.transform(classes)
    label_mappings[col] = {category: code for category, code in zip(classes, codes)}

# Target column: 'yes' -> 1, 'no' -> 0
target_codes = {'yes': 1, 'no': 0}
data_add['y'] = data_add['y'].map(target_codes)

# Show the category -> code mappings for reference
print("\nLabel Mappings:")
for col in label_mappings:
    mapping = label_mappings[col]
    print(f"{col}: {mapping}")

# Find rows that repeat an earlier row exactly, report them, then drop them
duplicate_mask = data_add.duplicated()
duplicates = data_add[duplicate_mask]
if duplicates.empty:
    print("\nNo duplicate rows found.")
else:
    print("\nDuplicate Rows Found:")
    print(duplicates)
    # Keeps the first occurrence of each repeated row
    data_add = data_add.drop_duplicates()
    print("\nDuplicates removed. Current shape of DataFrame:", data_add.shape)

# Check for missing values and impute them:
#   - numeric columns     -> column median
#   - non-numeric columns -> the literal category 'unknown'
missing_values = data_add.isnull().sum()
if missing_values.any():
    columns_with_missing = missing_values[missing_values > 0]
    print("\nMissing Values Found:")
    print(columns_with_missing)
    # Only touch columns that actually contain NaNs; this avoids computing a
    # median on columns that need no imputation.
    for col in columns_with_missing.index:
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the column selection: the in-place form is
        # chained assignment, which is deprecated and does not update the
        # frame when pandas Copy-on-Write is active.
        if pd.api.types.is_numeric_dtype(data_add[col]):
            data_add[col] = data_add[col].fillna(data_add[col].median())
        else:
            data_add[col] = data_add[col].fillna('unknown')
else:
    print("\nNo missing values found.")

# Separate the features from the target before any scaling
features_to_scale = data_add.drop('y', axis=1)
y = data_add['y']

# Perform the train-test split FIRST so the scaler never sees the held-out
# rows. The partition depends only on the row count, `y` and random_state,
# so the same rows land in each split as when splitting after scaling.
X_train, X_test, y_train, y_test = train_test_split(
    features_to_scale, y, test_size=0.2, random_state=42, stratify=y
)

# Min-Max Normalization: learn each feature's min/max from the training rows
# only (fitting on all rows would leak test-set statistics), then apply that
# transform to every row. Held-out rows may therefore fall slightly outside
# [0, 1].
scaler = MinMaxScaler()
scaler.fit(X_train)
data_add[features_to_scale.columns] = scaler.transform(features_to_scale)

# Rebuild X and the two feature splits from the scaled frame. Selecting by
# index label relies on data_add having unique index labels, which holds for
# the default index produced by read_csv (dropping duplicates only removes
# labels).
X = data_add.drop('y', axis=1)
X_train = X.loc[X_train.index]
X_test = X.loc[X_test.index]


# Balance the training split with SMOTE. X_train_balanced / y_train_balanced
# were previously used without being defined anywhere in the visible cells,
# so this cell failed with NameError on a fresh kernel. Only the training
# data is oversampled; the test split keeps its original class distribution.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Combine X and y for training and testing datasets
train_data_add = pd.DataFrame(X_train_balanced, columns=X.columns)
# Assign by position so the labels cannot be misaligned by index matching
train_data_add['y'] = np.asarray(y_train_balanced)

test_data_add = X_test.copy()
test_data_add['y'] = y_test

# Print dataset shapes
print("\nTraining Data Shape (After Balancing):", train_data_add.shape)
print("Testing Data Shape:", test_data_add.shape)
print("\nClass Distribution in Balanced Training Data:")
print(train_data_add['y'].value_counts(normalize=True))
print("\nFinal DataFrame Info:")
data_add.info()
print("\nFinal DataFrame Preview:")
print(data_add.head())


Original DataFrame:
   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.idx  eu




Training Data Shape (After Balancing): (55690, 52)
Testing Data Shape: (7881, 52)

Class Distribution in Balanced Training Data:
y
0    0.5
1    0.5
Name: proportion, dtype: float64

Final DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 39404 entries, 0 to 41187
Data columns (total 52 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            39404 non-null  float64
 1   campaign                       39404 non-null  float64
 2   pdays                          39404 non-null  float64
 3   previous                       39404 non-null  float64
 4   poutcome                       39404 non-null  float64
 5   emp.var.rate                   39404 non-null  float64
 6   cons.price.idx                 39404 non-null  float64
 7   cons.conf.idx                  39404 non-null  float64
 8   euribor3m                      39404 non-null  float64
 9   nr.employed              

In [None]:
# Mount Google Drive so the exported CSVs persist beyond the Colab session
from google.colab import drive
drive.mount('/content/drive')

# Ensure the output directory exists; exist_ok=True replaces the separate
# exists() check, which could race with the directory being created
output_dir = '/content/drive/My Drive/ML_CW/'
os.makedirs(output_dir, exist_ok=True)

transformed_path = os.path.join(output_dir, 'transformed_add.csv')
train_data_path = os.path.join(output_dir, 'train_data_add.csv')
test_data_path = os.path.join(output_dir, 'test_data_add.csv')

# (label used in messages, frame to write, destination path)
outputs = [
    ("Transformed data", data_add, transformed_path),
    ("Training Data", train_data_add, train_data_path),
    ("Testing Data", test_data_add, test_data_path),
]

try:
    for label, frame, path in outputs:
        frame.to_csv(path, index=False)
except OSError as e:
    print(f"Error saving files: {e}")
    # Re-raise so "Run All" stops here rather than letting the next cell
    # silently train on stale files left over from an earlier run.
    raise

# Report success only after confirming each file is really on disk, and name
# any file that is missing instead of printing a generic error.
missing_paths = []
for label, frame, path in outputs:
    if os.path.exists(path):
        print(f"{label} saved to: {path}")
    else:
        missing_paths.append(path)
        print(f"{label} NOT found after writing: {path}")

if missing_paths:
    print("Error in saving files.")
else:
    print("All files saved successfully.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Transformed data saved to: /content/drive/My Drive/ML_CW/transformed_add.csv
Training Data saved to: /content/drive/My Drive/ML_CW/train_data_add.csv
Testing Data saved to: /content/drive/My Drive/ML_CW/test_data_add.csv
Error in saving files.


In [None]:

# Locations of the saved training and testing CSV files
train_file_path = '/content/drive/My Drive/ML_CW/train_data_add.csv'
test_file_path = '/content/drive/My Drive/ML_CW/test_data_add.csv'

# Read both splits back from disk
train_data, test_data = pd.read_csv(train_file_path), pd.read_csv(test_file_path)

# Separate the predictors from the target column 'y' in each split
X_train, y_train = train_data.drop(columns='y'), train_data['y']
X_test, y_test = test_data.drop(columns='y'), test_data['y']

# Random forest of 100 trees with a fixed seed
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on the training split, then predict labels for the test split
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

# Overall accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Confusion matrix of true labels (first argument) against predictions
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Rank the features by the forest's importance scores, highest first
importance_table = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf_classifier.feature_importances_
})
feature_importances = importance_table.sort_values(by='Importance', ascending=False)

print("\nTop Features:", feature_importances.head(), sep="\n")


Accuracy: 0.88

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.94      6961
           1       0.51      0.36      0.42       920

    accuracy                           0.88      7881
   macro avg       0.71      0.66      0.68      7881
weighted avg       0.87      0.88      0.88      7881


Confusion Matrix:
[[6640  321]
 [ 587  333]]

Top Features:
        Feature  Importance
8     euribor3m    0.127059
1      campaign    0.113511
9   nr.employed    0.086621
0           age    0.080504
5  emp.var.rate    0.050869
