<a href="https://colab.research.google.com/github/HRashmika/Bank_Prediction/blob/main/ML_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
file_path = 'bank.csv'
data_bank = pd.read_csv(file_path, delimiter=';')

# For debugging
print("Original DataFrame:")
print(data_bank.head())

# After performing one-hot encoding, keep 'y' separate for splitting:
# One-hot encoding (without dropping 'y')
binary_columns = ['default', 'housing', 'loan']  # Remove 'y' from binary_columns
data_bank = pd.get_dummies(data_bank, columns=binary_columns, drop_first=True)

# O and 1 for binary columns
binary_cols = [col for col in data_bank.columns if data_bank[col].dtype == 'bool']
for col in binary_cols:
    data_bank[col] = data_bank[col].astype(int)

# Label encoding
label_columns = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
label_encoder = LabelEncoder()
label_mappings = {}

for col in label_columns:
    data_bank[col] = label_encoder.fit_transform(data_bank[col])
    label_mappings[col] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

print("\nLabel Mappings:")
for col, mapping in label_mappings.items():
    print(f"{col}: {mapping}")

print("\nTransformed DataFrame:")
print(data_bank.head())

# Duplicates
duplicates = data_bank[data_bank.duplicated()]
if not duplicates.empty:
    print("\nDuplicate Rows Found:")
    print(duplicates)
    data_bank = data_bank.drop_duplicates()
    print("\nDuplicates removed. Current shape of DataFrame:", data_bank.shape)
else:
    print("\nNo duplicate rows found.")

# Missing values
missing_values = data_bank.isnull().sum()
if missing_values.any():
    print("\nMissing Values Found:")
    print(missing_values[missing_values > 0])

    # Replace all missing values with the string 'non'
    for col in data_bank.columns:
        data_bank[col].fillna("non", inplace=True)
else:
    print("\nNo missing values found.")

# Valid or invalid, checked with label encoding
columns_to_check = ['job', 'marital', 'education', 'contact', 'poutcome']
for col in columns_to_check:
    max_label = len(label_mappings[col]) - 1
    invalid_values = data_bank[col][(data_bank[col] < 0) | (data_bank[col] > max_label)].unique()
    if invalid_values.size > 0:
        print(f"\nInvalid values in column '{col}': {invalid_values}")
        data_bank = data_bank[~data_bank[col].isin(invalid_values)]
    else:
        print(f"\nNo invalid values found in column '{col}'.")


# Split the data into training (80%) and testing (20%) sets
X = data_bank.drop('y', axis=1, errors='ignore')  # Ensure 'y' is ignored if it was dropped
y = data_bank.get('y')  # Extract 'y' from the DataFrame

if y is not None:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    train_data_bank = X_train.copy()
    train_data_bank['y'] = y_train

    test_data_bank = X_test.copy()
    test_data_bank['y'] = y_test


else:
    print("\n'Y' column is missing, cannot split data.")

print("\nFinal DataFrame Info:")
data_bank.info()
print("\nFinal DataFrame Preview:")
print(data_bank.head())


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# File path
file_path = 'bank-additional-full.csv'
data_add = pd.read_csv(file_path, delimiter=';')

# Debugging: print the original data
print("Original DataFrame:")
print(data_add.head())

# Drop the 'duration' column
data_add = data_add.drop('duration', axis=1, errors='ignore')  # Drop column if it exists

# One-hot encoding for binary columns (encoding 'yes'/'no' columns as 1 and 0)
binary_columns = ['default', 'housing', 'loan']
data_add = pd.get_dummies(data_add, columns=binary_columns, drop_first=True)

# Convert boolean columns (if any) to integers
binary_cols = [col for col in data_add.columns if data_add[col].dtype == 'bool']
for col in binary_cols:
    data_add[col] = data_add[col].astype(int)

# Label encoding for categorical columns (job, marital, education, etc.)
label_columns = ['job', 'marital', 'education', 'contact', 'month', 'day_of_week', 'poutcome']
label_encoder = LabelEncoder()
label_mappings = {}

for col in label_columns:
    data_add[col] = label_encoder.fit_transform(data_add[col].fillna('unknown'))  # Filling NaNs with 'unknown' before encoding
    label_mappings[col] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Print label mappings for reference
print("\nLabel Mappings:")
for col, mapping in label_mappings.items():
    print(f"{col}: {mapping}")

# Encode 'y' column as 1 for 'yes' and 0 for 'no'
data_add['y'] = data_add['y'].map({'yes': 1, 'no': 0})

# Check and handle duplicates (rows where all values are identical)
duplicates = data_add[data_add.duplicated()]
if not duplicates.empty:
    print("\nDuplicate Rows Found:")
    print(duplicates)
    data_add = data_add.drop_duplicates()  # Remove duplicates
    print("\nDuplicates removed. Current shape of DataFrame:", data_add.shape)
else:
    print("\nNo duplicate rows found.")

# Check and handle missing values
missing_values = data_add.isnull().sum()
if missing_values.any():
    print("\nMissing Values Found:")
    print(missing_values[missing_values > 0])
    # Fill missing values with 'unknown' for categorical and 'non' for other columns
    for col in data_add.columns:
        if data_add[col].dtype == 'object':
            data_add[col].fillna('unknown', inplace=True)
        else:
            data_add[col].fillna('non', inplace=True)
else:
    print("\nNo missing values found.")

# Check for invalid values in label-encoded columns
columns_to_check = label_columns
for col in columns_to_check:
    max_label = len(label_mappings[col]) - 1
    invalid_values = data_add[col][(data_add[col] < 0) | (data_add[col] > max_label)].unique()
    if invalid_values.size > 0:
        print(f"\nInvalid values in column '{col}': {invalid_values}")
        data_add = data_add[~data_add[col].isin(invalid_values)]  # Remove rows with invalid values
    else:
        print(f"\nNo invalid values found in column '{col}'.")

# Min-Max Normalization (scaling the features to the range [0, 1])
scaler = MinMaxScaler()

# Apply scaling only to numeric features (excluding the target 'y')
features_to_scale = data_add.drop('y', axis=1)  # Excluding 'y' from scaling
data_add[features_to_scale.columns] = scaler.fit_transform(features_to_scale)

# Split the data into features (X) and target (y)
X = data_add.drop('y', axis=1, errors='ignore')  # Ensure 'y' is ignored if it was dropped
y = data_add.get('y')  # Extract 'y' from the DataFrame

# Perform train-test split if the target column 'y' exists
if y is not None:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Combine X and y for training and testing datasets
    train_data_add = X_train.copy()
    train_data_add['y'] = y_train

    test_data_add = X_test.copy()
    test_data_add['y'] = y_test

    print("\nTraining Data Shape:", train_data_add.shape)
    print("Testing Data Shape:", test_data_add.shape)
else:
    print("\n'Y' column is missing, cannot split data.")

# Print final DataFrame information
print("\nFinal DataFrame Info:")
data_add.info()
print("\nFinal DataFrame Preview:")
print(data_add.head())


Original DataFrame:
   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.idx  eu

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_add[features_to_scale.columns] = scaler.fit_transform(features_to_scale)


In [None]:

output_path = '/content/drive/My Drive/ML_CW/transformed_add.csv'
data_add.to_csv(output_path, index=False)

print(f"Transformed data saved to: {output_path}")

train_data_add.to_csv('/content/drive/My Drive/ML_CW/train_data_add.csv', index=False)
test_data_add.to_csv('/content/drive/My Drive/ML_CW/test_data_add.csv', index=False)
print("\nTraining and Testing Data Saved:")
print(f"Training Data: /content/drive/My Drive/ML_CW/train_data_add.csv")
print(f"Testing Data: /content/drive/My Drive/ML_CW/test_data_add.csv")



Transformed data saved to: /content/drive/My Drive/ML_CW/transformed_add.csv

Training and Testing Data Saved:
Training Data: /content/drive/My Drive/ML_CW/train_data_add.csv
Testing Data: /content/drive/My Drive/ML_CW/test_data_add.csv


In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# File paths for training and testing data
train_file_path = '/content/drive/My Drive/ML_CW/train_data_add.csv'
test_file_path = '/content/drive/My Drive/ML_CW/test_data_add.csv'

# Load training and testing data
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

X_train = train_data.drop('y', axis=1)
y_train = train_data['y']

X_test = test_data.drop('y', axis=1)
y_test = test_data['y']

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# fit the classifier
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


# Output to debug
# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature Importances
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf_classifier.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nTop Features:")
print(feature_importances.head())


Accuracy: 0.88

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.96      0.94      6963
           1       0.51      0.28      0.36       918

    accuracy                           0.88      7881
   macro avg       0.71      0.62      0.65      7881
weighted avg       0.86      0.88      0.87      7881


Confusion Matrix:
[[6716  247]
 [ 661  257]]

Top Features:
      Feature  Importance
0         age    0.183528
14  euribor3m    0.141186
7    campaign    0.088027
1         job    0.084328
3   education    0.074619
