In [2]:
import pandas as pd

# Load the CSV file into a DataFrame
file_path = 'processed_bank_data_final.csv'  # Replace with your CSV file path
df = pd.read_csv(file_path)

# Display initial DataFrame information
print("Original DataFrame:")
print(df.head())  # Display the first few rows
print(f"\nTotal rows in the original DataFrame: {len(df)}")

y
0    36193
1     4594
Name: count, dtype: int64

In [2]:
# Find duplicate rows and mark the first occurrence index for duplicates
duplicates = df[df.duplicated(keep=False)]  # Get all duplicated rows

# Add a new column 'is_duplicate' and 'duplicate_with' to highlight duplicates and show their first occurrence
df['is_duplicate'] = ''
df['duplicate_with'] = ''

# Get the index of the first occurrence of each duplicate
first_occurrence_index = df[df.duplicated(keep='first')].index

# Loop through the DataFrame to mark duplicates and their first occurrence
for idx in first_occurrence_index:
    # Find the first occurrence index
    first_occurrence = df[df.iloc[:, :].eq(df.iloc[idx, :]).all(axis=1)].index[0]

    # Mark the duplicate row and the first occurrence
    df.at[idx, 'is_duplicate'] = 'Duplicate'
    df.at[idx, 'duplicate_with'] = f"Row {first_occurrence}"

In [3]:
# Display the DataFrame with the new 'is_duplicate' and 'duplicate_with' columns
print("\nDataFrame with 'is_duplicate' and 'duplicate_with' columns:")
print(df.head())  # Display the first few rows of the updated DataFrame


DataFrame with 'is_duplicate' and 'duplicate_with' columns:
   age  job_blue-collar  job_entrepreneur  job_housemaid  job_management  \
0   56                0                 0              1               0   
1   57                0                 0              0               0   
2   37                0                 0              0               0   
3   40                0                 0              0               0   
4   56                0                 0              0               0   

   job_retired  job_self-employed  job_services  job_student  job_technician  \
0            0                  0             0            0               0   
1            0                  0             1            0               0   
2            0                  0             1            0               0   
3            0                  0             0            0               0   
4            0                  0             1            0               0   

 

In [4]:
# Count duplicates
duplicate_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")


Number of duplicate rows: 277


In [5]:
# Save the updated DataFrame with the new columns to a CSV file
output_file_path = 'For Reference/highlighted_duplicates_with_row_numbers.csv'
df.to_csv(output_file_path, index=False)
print(f"\nUpdated DataFrame saved to: {output_file_path}")


Updated DataFrame saved to: highlighted_duplicates_with_row_numbers.csv


In [6]:
# Remove the rows that are marked as duplicates
df_filtered = df[df['is_duplicate'] != 'Duplicate']

# Count the total number of rows after removing duplicates
total_rows_after_removal = df_filtered.shape[0]
print(f"\nTotal number of rows after removing duplicates: {total_rows_after_removal}")

# Save the filtered DataFrame (without duplicates) to the original file or a new file
filtered_file_path = 'cleaned_data_without_duplicates.csv'  # Replace with the desired output file path
df_filtered.to_csv(filtered_file_path, index=False)
print(f"\nFiltered DataFrame (without duplicates) saved to: {filtered_file_path}")


Total number of rows after removing duplicates: 38931

Filtered DataFrame (without duplicates) saved to: cleaned data.csv


In [43]:
# Import necessary libraries for Random Forest and Neural Network models
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import pandas as pd

# Load the uploaded CSV file to inspect its structure
file_path = 'cleaned_data_without_duplicates.csv'
bank_data = pd.read_csv(file_path)

# Print the initial row count
initial_row_count = len(bank_data)
print(f"Initial row count: {initial_row_count}")

# Preprocess the data
# Filter invalid rows where pdays = 999 and poutcome != -1
invalid_rows = bank_data[(bank_data['pdays'] == 999) & (bank_data['poutcome'] != -1)]
invalid_count = len(invalid_rows)
total_rows = len(bank_data)
removed_percentage = (invalid_count / total_rows) * 100

# Save the invalid rows to a separate CSV file
invalid_rows.to_csv('invalid_rows_pdays_999.csv', index=False)
print(f"Saved {invalid_count} invalid rows to 'invalid_rows_pdays_999.csv'.")

# Create a cleaned version of the dataset without altering the original dataset
bank_data_cleaned = bank_data[~bank_data.index.isin(invalid_rows.index)]

# Print the updated row count for the cleaned dataset
final_row_count = len(bank_data_cleaned)
print(f"Row count of cleaned dataset: {final_row_count}")

# Train-test split (same for both datasets)
X = bank_data.drop('y', axis=1)  # Assuming 'target' is the target variable
y = bank_data['y']

X_cleaned = bank_data_cleaned.drop('y', axis=1)
y_cleaned = bank_data_cleaned['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(X_cleaned, y_cleaned, test_size=0.3, random_state=42)

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return accuracy, report

# 1. Random Forest Model (on original data)
rf_model = RandomForestClassifier(random_state=42)
accuracy_rf, report_rf = evaluate_model(rf_model, X_train, X_test, y_train, y_test)
print("Random Forest - Original Data:")
print(f"Accuracy: {accuracy_rf:.2f}")
print(report_rf)

# 2. Neural Network Model (on original data)
nn_model = MLPClassifier(random_state=42, max_iter=500)
accuracy_nn, report_nn = evaluate_model(nn_model, X_train, X_test, y_train, y_test)
print("Neural Network - Original Data:")
print(f"Accuracy: {accuracy_nn:.2f}")
print(report_nn)

# 3. Random Forest Model (on cleaned data)
accuracy_rf_cleaned, report_rf_cleaned = evaluate_model(rf_model, X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned)
print("Random Forest - Cleaned Data:")
print(f"Accuracy: {accuracy_rf_cleaned:.2f}")
print(report_rf_cleaned)

# 4. Neural Network Model (on cleaned data)
accuracy_nn_cleaned, report_nn_cleaned = evaluate_model(nn_model, X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned)
print("Neural Network - Cleaned Data:")
print(f"Accuracy: {accuracy_nn_cleaned:.2f}")
print(report_nn_cleaned)

# Summarize the percentage of invalid rows filtered
print(f"Filtered {invalid_count} invalid rows, which is {removed_percentage:.2f}% of the total dataset.")

Initial row count: 40787
Saved 4080 invalid rows to 'invalid_rows_pdays_999.csv'.
Row count of cleaned dataset: 36707
Random Forest - Original Data:
Accuracy: 0.89
              precision    recall  f1-score   support

           0       0.91      0.97      0.94     10853
           1       0.56      0.28      0.38      1384

    accuracy                           0.89     12237
   macro avg       0.74      0.63      0.66     12237
weighted avg       0.87      0.89      0.88     12237

Neural Network - Original Data:
Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.93      0.94      0.93     10853
           1       0.47      0.41      0.44      1384

    accuracy                           0.88     12237
   macro avg       0.70      0.68      0.69     12237
weighted avg       0.87      0.88      0.88     12237

Random Forest - Cleaned Data:
Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.92      0.97  

In [None]:
# Save the cleaned dataset to a separate CSV file
bank_data_cleaned.to_csv('bank_data_cleaned_final.csv', index=False)
print("Saved the cleaned dataset to 'bank_data_cleaned_final.csv'.")

In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load the data
df = pd.read_csv("bank_data_cleaned_final.csv")

# Define a function to train and evaluate models
def train_and_evaluate(X_train, X_test, y_train, y_test):
    results = {}

    # Random Forest Model
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)
    results['Random Forest'] = accuracy_score(y_test, y_pred_rf)

    # Neural Network Model
    nn = MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=500, random_state=42)
    nn.fit(X_train, y_train)
    y_pred_nn = nn.predict(X_test)
    results['Neural Network'] = accuracy_score(y_test, y_pred_nn)

    return results

# 1. Keep a copy of the original dataset
original_df = df.copy()

# 2. Detect outliers in the 'campaign' column (campaign >= 17)
outliers = df[df['campaign'] > 17]

# 3. Remove outliers and create a new dataset
df_no_outliers = df[df['campaign'] <= 17]
df_no_outliers.to_csv('Updated_cleaned_data_no_outliers.csv', index=False)

# 4. Evaluate models on the original dataset
# Prepare the data
X = original_df.drop(columns=['y'])
y = original_df['y']

# One-hot encode categorical variables
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate models
original_results = train_and_evaluate(X_train, X_test, y_train, y_test)
print("Accuracy on the original dataset:")
print(original_results)

# 5. Evaluate models on the dataset without outliers
# Prepare the data
X_no_outliers = df_no_outliers.drop(columns=['y'])
y_no_outliers = df_no_outliers['y']

# One-hot encode categorical variables
X_no_outliers = pd.get_dummies(X_no_outliers, drop_first=True)

# Split the data into training and testing sets
X_train_no_outliers, X_test_no_outliers, y_train_no_outliers, y_test_no_outliers = train_test_split(
    X_no_outliers, y_no_outliers, test_size=0.2, random_state=42
)

# Train and evaluate models
updated_results = train_and_evaluate(X_train_no_outliers, X_test_no_outliers, y_train_no_outliers, y_test_no_outliers)
print("Accuracy on the dataset without outliers:")
print(updated_results)

Accuracy on the original dataset:
{'Random Forest': 0.8910678499856857, 'Neural Network': 0.8866304036644718}
Accuracy on the dataset without outliers:
{'Random Forest': 0.8943499567598732, 'Neural Network': 0.8731622946093975}


In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE

def balance_dataset_with_smote(original_file, target_column, output_file):
    # Step 1: Load the Original Dataset
    data = pd.read_csv(original_file)
    X = data.drop(columns=[target_column])
    y = data[target_column]

    # Step 2: Analyze Class Distribution
    class_counts = y.value_counts()
    count_y0, count_y1 = class_counts[0], class_counts[1]
    rows_to_add_y1 = count_y0 - count_y1

    # Step 3: Check if Resampling is Required
    if rows_to_add_y1 <= 0:
        print("Dataset is already balanced.")
        data.to_csv(output_file, index=False)
        return

    # Step 4: Reapply SMOTE to Generate Synthetic Data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
    resampled_data[target_column] = y_resampled

    # Save the original data
    original_data = data.copy()
    original_data.to_csv(output_file, index=False)

    # Step 5: Filter and Prepare the Minority Class Rows
    resampled_data_y1 = resampled_data[resampled_data[target_column] == 1]

    # Step 6: Check for Duplicates Before Appending
    resampled_and_original_data = pd.read_csv(output_file)

    # Remove duplicates using concat and drop_duplicates
    combined_data = pd.concat([resampled_and_original_data, resampled_data_y1])
    combined_data = combined_data.drop_duplicates(keep=False)

    # Append only the required number of non-duplicate rows
    resampled_data_y1_to_append = combined_data[combined_data[target_column] == 1].head(rows_to_add_y1)
    resampled_and_original_data = pd.concat(
        [resampled_and_original_data, resampled_data_y1_to_append], ignore_index=True
    )
    resampled_and_original_data.to_csv(output_file, index=False)

    # Step 7: Check Final Balance and Reapply SMOTE if Necessary
    while True:
        final_class_counts = resampled_and_original_data[target_column].value_counts()
        count_y0, count_y1 = final_class_counts[0], final_class_counts[1]

        if count_y0 == count_y1:
            break  # Dataset is balanced; no further action required.

        # Reapply SMOTE on the combined dataset
        X = resampled_and_original_data.drop(columns=[target_column])
        y = resampled_and_original_data[target_column]
        X_resampled, y_resampled = smote.fit_resample(X, y)
        resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
        resampled_data[target_column] = y_resampled

        # Filter the newly generated y == 1 rows
        resampled_data_y1 = resampled_data[resampled_data[target_column] == 1]

        # Remove duplicates using concat and drop_duplicates
        combined_data = pd.concat([resampled_and_original_data, resampled_data_y1])
        combined_data = combined_data.drop_duplicates(keep=False)

        # Append only the required number of non-duplicate rows
        rows_to_add_y1 = count_y0 - count_y1
        resampled_data_y1_to_append = combined_data[combined_data[target_column] == 1].head(rows_to_add_y1)
        resampled_and_original_data = pd.concat(
            [resampled_and_original_data, resampled_data_y1_to_append], ignore_index=True
        )
        resampled_and_original_data.to_csv(output_file, index=False)

    # Step 8: Verify Final Class Distribution
    final_class_counts = resampled_and_original_data[target_column].value_counts()
    print(f"Final Class Distribution:\n{final_class_counts}")
    print(f"Balanced dataset saved to '{output_file}'.")

# Input file paths and target column
original_file = "Updated_cleaned_data_no_outliers.csv"
output_file = "resampled_and_original_data.csv"
target_column = "y"

# Balance the dataset
balance_dataset_with_smote(original_file, target_column, output_file)

In [113]:
data = pd.read_csv("resampled_and_original_data.csv")
data['y'].value_counts()

y
0    24527
1    24527
Name: count, dtype: int64

In [16]:
import pandas as pd

# Load the CSV file into a DataFrame
file_path = 'resampled_and_original_data.csv'  # Replace with your CSV file path
df = pd.read_csv(file_path)

# Find duplicate rows
duplicates = df[df.duplicated(keep=False)]  # Get all duplicated rows

# Count duplicates
duplicate_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")


Number of duplicate rows: 0


In [100]:
# import pandas as pd
# from imblearn.over_sampling import SMOTE
# from sklearn.model_selection import train_test_split
#
# # Example: Load your dataset
# data = pd.read_csv("Updated_cleaned_data_no_outliers.csv")
#
# # Split features and target
# X = data.drop(columns=["y"])  # Features
# y = data["y"]                 # Target
#
# # Apply train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
#
# # Visualize original class distribution
# original_counts = y_train.value_counts()
#
# # Apply SMOTE
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
#
# # Convert resampled data to DataFrame
# X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
# y_resampled_df = pd.DataFrame(y_resampled, columns=["y"])
#
# # Combine features and target for resampled data
# resampled_data = pd.concat([X_resampled_df, y_resampled_df], axis=1)
#
# # Filter the rows where y == 1
# resampled_data_y1 = resampled_data[resampled_data["y"] == 1]
#
# # Calculate the number of y == 1 rows to append to make counts equal
# count_y0 = original_counts[0]  # Count of y == 0 in original data
# count_y1 = original_counts[1]  # Count of y == 1 in original data
# rows_to_add_y1 = count_y0 - count_y1  # The difference, i.e., how many y == 1 rows to append
#
# # Take only the necessary number of y == 1 rows from resampled data
# resampled_data_y1_to_append = resampled_data_y1.head(rows_to_add_y1)
#
# # Combine original data with only the necessary y == 1 rows from the resampled data
# original_data = pd.concat([X_train, y_train], axis=1)
# combined_data = pd.concat([original_data, resampled_data_y1_to_append], ignore_index=True)
#
# # Save to CSV
# combined_data.to_csv("resampled_and_original_data.csv", index=False)
#
# print("Resampled and original data saved to 'resampled_and_original_data.csv'")

Resampled and original data saved to 'resampled_and_original_data.csv'


In [2]:
# import pandas as pd
# from imblearn.over_sampling import SMOTE
# from sklearn.model_selection import train_test_split
#
# # Function to resample data until balance is achieved
# def ensure_balanced_data(data, target_column, output_file):
#     smote = SMOTE(random_state=42)
#     max_iterations = 10  # Limit iterations to avoid infinite loops in edge cases
#     iteration = 0
#
#     while iteration < max_iterations:
#         # Check current class distribution
#         class_counts = data[target_column].value_counts()
#         print(f"Iteration {iteration}: Class distribution:\n{class_counts}")
#
#         if class_counts.min() == class_counts.max():
#             print("Balanced dataset achieved.")
#             break
#
#         # Determine the majority and minority class
#         majority_class = class_counts.idxmax()
#         minority_class = class_counts.idxmin()
#
#         # Separate features and target
#         X = data.drop(columns=[target_column])
#         y = data[target_column]
#
#         # Resample to balance classes
#         sampling_strategy = {minority_class: class_counts[majority_class]}
#         X_resampled, y_resampled = SMOTE(random_state=42, sampling_strategy=sampling_strategy).fit_resample(X, y)
#
#         # Combine resampled features and target
#         resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
#         resampled_data[target_column] = y_resampled
#
#         # Remove duplicates
#         resampled_data = resampled_data.drop_duplicates(keep="first")
#
#         # Update the dataset with resampled data
#         data = pd.concat([data, resampled_data], ignore_index=True).drop_duplicates(keep="first")
#         iteration += 1
#
#     # Save the balanced dataset
#     data.to_csv(output_file, index=False)
#     print(f"Final balanced dataset saved to '{output_file}'")
#
#     # Verify final class distribution
#     final_counts = data[target_column].value_counts()
#     print("Final class distribution after balancing:\n", final_counts)
#
# # Load the original cleaned data
# original_data = pd.read_csv("Updated_cleaned_data_no_outliers.csv")
#
# # Split features and target
# X = original_data.drop(columns=["y"])  # Features
# y = original_data["y"]                 # Target
#
# # Apply train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
#
# # Combine training data
# train_data = pd.concat([X_train, y_train], axis=1)
#
# # Save initial combined dataset and ensure balance
# output_file = "resampled_and_original_data1.csv"
# ensure_balanced_data(train_data, target_column="y", output_file=output_file)

Iteration 0: Class distribution:
y
0    24527
1     3222
Name: count, dtype: int64
Iteration 1: Class distribution:
y
0    24527
1    22072
Name: count, dtype: int64
Iteration 2: Class distribution:
y
0    24527
1    24372
Name: count, dtype: int64
Iteration 3: Class distribution:
y
0    24527
1    24510
Name: count, dtype: int64
Iteration 4: Class distribution:
y
0    24527
1    24522
Name: count, dtype: int64
Iteration 5: Class distribution:
y
0    24527
1    24526
Name: count, dtype: int64
Iteration 6: Class distribution:
y
0    24527
1    24527
Name: count, dtype: int64
Balanced dataset achieved.
Final balanced dataset saved to 'resampled_and_original_data1.csv'
Final class distribution after balancing:
 y
0    24527
1    24527
Name: count, dtype: int64


In [110]:
# import pandas as pd
# from imblearn.over_sampling import SMOTE
#
# # Step 1: Load the dataset
# data = pd.read_csv("Updated_cleaned_data_no_outliers.csv")
#
# # Step 2: Separate features and target
# X = data.drop(columns=["y"])  # Features
# y = data["y"]                 # Target (prediction column)
#
# # Step 3: Apply SMOTE
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X, y)
#
# # Step 4: Convert resampled data to a DataFrame
# resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
# resampled_data["y"] = y_resampled
#
# # Step 5: Save only the resampled data
# resampled_data.to_csv("resampled_dataset_testing.csv", index=False)
#
# # Step 6: Verify class balance
# unique_counts = resampled_data["y"].value_counts()
# print("Class distribution in the resampled dataset:")
# print(unique_counts)
#
# # Step 7: Check if the counts are equal
# if len(unique_counts) == 2 and unique_counts[0] == unique_counts[1]:
#     print("Prediction column has equal count for both classes.")
# else:
#     print("Class imbalance remains in the saved dataset.")

Class distribution in the resampled dataset:
y
0    30659
1    30659
Name: count, dtype: int64
Prediction column has equal count for both classes.
