<a href="https://colab.research.google.com/github/HRashmika/Bank_Prediction/blob/main/ML_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive. The save cell
# later in this notebook writes its CSV outputs under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Preprocess bank.csv: encode the categorical columns, drop duplicate rows,
# handle missing values and build an 80/20 train/test split.
file_path = 'bank.csv'
data_bank = pd.read_csv(file_path, delimiter=';')

# For debugging
print("Original DataFrame:")
print(data_bank.head())

# One-hot encode the default/housing/loan columns. The target 'y' is
# intentionally left out so it stays a single column for the split below.
binary_columns = ['default', 'housing', 'loan']
data_bank = pd.get_dummies(data_bank, columns=binary_columns, drop_first=True)

# 0 and 1 for binary columns: cast every boolean column to int in one call
# (an empty mapping is a no-op, so this is safe when no bool column exists).
binary_cols = [col for col in data_bank.columns if data_bank[col].dtype == 'bool']
data_bank = data_bank.astype({col: int for col in binary_cols})

# Label encoding
label_columns = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
label_encoder = LabelEncoder()
label_mappings = {}

for col in label_columns:
    # The encoder is re-fitted per column, so the mapping must be captured
    # before the next iteration overwrites classes_.
    data_bank[col] = label_encoder.fit_transform(data_bank[col])
    # A class's integer code is its position in classes_.
    label_mappings[col] = {label: code for code, label in enumerate(label_encoder.classes_)}

print("\nLabel Mappings:")
for col, mapping in label_mappings.items():
    print(f"{col}: {mapping}")

print("\nTransformed DataFrame:")
print(data_bank.head())

# Duplicates
duplicates = data_bank[data_bank.duplicated()]
if not duplicates.empty:
    print("\nDuplicate Rows Found:")
    print(duplicates)
    data_bank = data_bank.drop_duplicates()
    print("\nDuplicates removed. Current shape of DataFrame:", data_bank.shape)
else:
    print("\nNo duplicate rows found.")

# Missing values
missing_values = data_bank.isnull().sum()
if missing_values.any():
    print("\nMissing Values Found:")
    print(missing_values[missing_values > 0])

    # Replace all missing values with the string 'non'.
    # Assign the result back instead of calling
    # data_bank[col].fillna(..., inplace=True): that form operates on a
    # temporary Series and does not update the frame under pandas Copy-on-Write.
    data_bank = data_bank.fillna("non")
else:
    print("\nNo missing values found.")

# Valid or invalid, checked with label encoding: every encoded value must lie
# in [0, number_of_classes - 1]; rows outside that range are dropped.
# NOTE(review): 'month' is label-encoded above but is not checked here.
columns_to_check = ['job', 'marital', 'education', 'contact', 'poutcome']
for col in columns_to_check:
    max_label = len(label_mappings[col]) - 1
    invalid_values = data_bank[col][(data_bank[col] < 0) | (data_bank[col] > max_label)].unique()
    if invalid_values.size > 0:
        print(f"\nInvalid values in column '{col}': {invalid_values}")
        data_bank = data_bank[~data_bank[col].isin(invalid_values)]
    else:
        print(f"\nNo invalid values found in column '{col}'.")


# Split the data into training (80%) and testing (20%) sets
X = data_bank.drop('y', axis=1, errors='ignore')  # errors='ignore' tolerates a missing 'y'
y = data_bank.get('y')  # None when the frame has no 'y' column

if y is not None:
    # random_state is fixed so the split is reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Re-attach the target so each split is a single self-contained frame.
    train_data_bank = X_train.copy()
    train_data_bank['y'] = y_train

    test_data_bank = X_test.copy()
    test_data_bank['y'] = y_test


else:
    print("\n'Y' column is missing, cannot split data.")

print("\nFinal DataFrame Info:")
data_bank.info()
print("\nFinal DataFrame Preview:")
print(data_bank.head())


Original DataFrame:
   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unknown  no  
4   unknown    5   may       226         1     -1         0  unknown  no  

Label Mappings:
job: {'admin.': 0, 'blue-collar': 1, 'entrepreneur': 2, 'housemaid': 3, 'm

In [None]:

# Preprocess bank-additional.csv: encode the categorical columns, drop
# duplicate rows, handle missing values and build an 80/20 train/test split.

# File path
file_path = 'bank-additional.csv'
data_add = pd.read_csv(file_path, delimiter=';')

# Debugging: print the original data
print("Original DataFrame:")
print(data_add.head())

# One-hot encoding for binary columns (the target 'y' is not included, so it
# stays a single column for the split below)
binary_columns = ['default', 'housing', 'loan']
data_add = pd.get_dummies(data_add, columns=binary_columns, drop_first=True)

# Convert boolean columns (if any) to integers in one call; an empty mapping
# is a no-op, so this is safe when no bool column exists.
binary_cols = [col for col in data_add.columns if data_add[col].dtype == 'bool']
data_add = data_add.astype({col: int for col in binary_cols})

# Label encoding for categorical columns
label_columns = ['job', 'marital', 'education', 'contact', 'month', 'day_of_week', 'poutcome']
label_encoder = LabelEncoder()
label_mappings = {}

for col in label_columns:
    # The encoder is re-fitted per column, so the mapping must be captured
    # before the next iteration overwrites classes_.
    data_add[col] = label_encoder.fit_transform(data_add[col])
    # A class's integer code is its position in classes_.
    label_mappings[col] = {label: code for code, label in enumerate(label_encoder.classes_)}

# Print label mappings for reference
print("\nLabel Mappings:")
for col, mapping in label_mappings.items():
    print(f"{col}: {mapping}")

# Check and handle duplicates
duplicates = data_add[data_add.duplicated()]
if not duplicates.empty:
    print("\nDuplicate Rows Found:")
    print(duplicates)
    data_add = data_add.drop_duplicates()
    print("\nDuplicates removed. Current shape of DataFrame:", data_add.shape)
else:
    print("\nNo duplicate rows found.")

# Check and handle missing values
missing_values = data_add.isnull().sum()
if missing_values.any():
    print("\nMissing Values Found:")
    print(missing_values[missing_values > 0])
    # Replace all missing values with the string 'non'.
    # Assign the result back instead of calling
    # data_add[col].fillna(..., inplace=True): that form operates on a
    # temporary Series and does not update the frame under pandas Copy-on-Write.
    data_add = data_add.fillna("non")
else:
    print("\nNo missing values found.")

# Check for invalid values in label-encoded columns: every encoded value must
# lie in [0, number_of_classes - 1]; rows outside that range are dropped.
columns_to_check = label_columns
for col in columns_to_check:
    max_label = len(label_mappings[col]) - 1
    invalid_values = data_add[col][(data_add[col] < 0) | (data_add[col] > max_label)].unique()
    if invalid_values.size > 0:
        print(f"\nInvalid values in column '{col}': {invalid_values}")
        data_add = data_add[~data_add[col].isin(invalid_values)]
    else:
        print(f"\nNo invalid values found in column '{col}'.")

# Split the data into features (X) and target (y)
X = data_add.drop('y', axis=1, errors='ignore')  # errors='ignore' tolerates a missing 'y'
y = data_add.get('y')  # None when the frame has no 'y' column

# Perform train-test split if the target column 'y' exists
if y is not None:
    # random_state is fixed so the split is reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Combine X and y for training and testing datasets
    train_data_add = X_train.copy()
    train_data_add['y'] = y_train

    test_data_add = X_test.copy()
    test_data_add['y'] = y_test

    print("\nTraining Data Shape:", train_data_add.shape)
    print("Testing Data Shape:", test_data_add.shape)
else:
    print("\n'Y' column is missing, cannot split data.")

# Print final DataFrame information
print("\nFinal DataFrame Info:")
data_add.info()
print("\nFinal DataFrame Preview:")
print(data_add.head())

Original DataFrame:
   age          job  marital          education default  housing     loan  \
0   30  blue-collar  married           basic.9y      no      yes       no   
1   39     services   single        high.school      no       no       no   
2   25     services  married        high.school      no      yes       no   
3   38     services  married           basic.9y      no  unknown  unknown   
4   47       admin.  married  university.degree      no      yes       no   

     contact month day_of_week  ...  campaign  pdays  previous     poutcome  \
0   cellular   may         fri  ...         2    999         0  nonexistent   
1  telephone   may         fri  ...         4    999         0  nonexistent   
2  telephone   jun         wed  ...         1    999         0  nonexistent   
3  telephone   jun         fri  ...         3    999         0  nonexistent   
4   cellular   nov         mon  ...         1    999         0  nonexistent   

  emp.var.rate  cons.price.idx  cons.conf.

In [None]:

# Write the transformed bank-additional frame and its train/test splits to
# Google Drive. Each destination is named once so the path passed to to_csv
# and the path reported in the status message cannot drift apart.
output_path = '/content/drive/My Drive/ML_CW/transformed_add.csv'
train_output_path = '/content/drive/My Drive/ML_CW/train_data_add.csv'
test_output_path = '/content/drive/My Drive/ML_CW/test_data_add.csv'

# Full transformed dataset.
data_add.to_csv(output_path, index=False)
print(f"Transformed data saved to: {output_path}")

# Train/test splits, written without the index column.
train_data_add.to_csv(train_output_path, index=False)
test_data_add.to_csv(test_output_path, index=False)

print("\nTraining and Testing Data Saved:")
print(f"Training Data: {train_output_path}")
print(f"Testing Data: {test_output_path}")



Transformed data saved to: /content/drive/My Drive/ML_CW/transformed_add.csv

Training and Testing Data Saved:
Training Data: /content/drive/My Drive/ML_CW/train_data_add.csv
Testing Data: /content/drive/My Drive/ML_CW/test_data_add.csv
