In [1]:
import pandas as pd

# Load the CSV file into a DataFrame
file_path = 'processed_bank_data_final.csv'  # Replace with your CSV file path
df = pd.read_csv(file_path)

# Display initial DataFrame information
print("Original DataFrame:")
print(df.head())  # Display the first few rows
print(f"\nTotal rows in the original DataFrame: {len(df)}")

Original DataFrame:
   age  job_blue-collar  job_entrepreneur  job_housemaid  job_management  \
0   56                0                 0              1               0   
1   57                0                 0              0               0   
2   37                0                 0              0               0   
3   40                0                 0              0               0   
4   56                0                 0              0               0   

   job_retired  job_self-employed  job_services  job_student  job_technician  \
0            0                  0             0            0               0   
1            0                  0             1            0               0   
2            0                  0             1            0               0   
3            0                  0             0            0               0   
4            0                  0             1            0               0   

   ...  campaign  pdays  previous  poutcom

In [2]:
# Find duplicate rows and mark the first occurrence index for duplicates
duplicates = df[df.duplicated(keep=False)]  # Get all duplicated rows

# Add a new column 'is_duplicate' and 'duplicate_with' to highlight duplicates and show their first occurrence
df['is_duplicate'] = ''
df['duplicate_with'] = ''

# Get the index of the first occurrence of each duplicate
first_occurrence_index = df[df.duplicated(keep='first')].index

# Loop through the DataFrame to mark duplicates and their first occurrence
for idx in first_occurrence_index:
    # Find the first occurrence index
    first_occurrence = df[df.iloc[:, :].eq(df.iloc[idx, :]).all(axis=1)].index[0]

    # Mark the duplicate row and the first occurrence
    df.at[idx, 'is_duplicate'] = 'Duplicate'
    df.at[idx, 'duplicate_with'] = f"Row {first_occurrence}"

In [3]:
# Display the DataFrame with the new 'is_duplicate' and 'duplicate_with' columns
print("\nDataFrame with 'is_duplicate' and 'duplicate_with' columns:")
print(df.head())  # Display the first few rows of the updated DataFrame


DataFrame with 'is_duplicate' and 'duplicate_with' columns:
   age  job_blue-collar  job_entrepreneur  job_housemaid  job_management  \
0   56                0                 0              1               0   
1   57                0                 0              0               0   
2   37                0                 0              0               0   
3   40                0                 0              0               0   
4   56                0                 0              0               0   

   job_retired  job_self-employed  job_services  job_student  job_technician  \
0            0                  0             0            0               0   
1            0                  0             1            0               0   
2            0                  0             1            0               0   
3            0                  0             0            0               0   
4            0                  0             1            0               0   

 

In [4]:
# Count duplicates
duplicate_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")


Number of duplicate rows: 277


In [5]:
# Save the updated DataFrame with the new columns to a CSV file
output_file_path = 'highlighted_duplicates_with_row_numbers.csv'  # Replace with your desired output file path
df.to_csv(output_file_path, index=False)
print(f"\nUpdated DataFrame saved to: {output_file_path}")


Updated DataFrame saved to: highlighted_duplicates_with_row_numbers.csv


In [6]:
# Remove the rows that are marked as duplicates
df_filtered = df[df['is_duplicate'] != 'Duplicate']

# Count the total number of rows after removing duplicates
total_rows_after_removal = df_filtered.shape[0]
print(f"\nTotal number of rows after removing duplicates: {total_rows_after_removal}")

# Save the filtered DataFrame (without duplicates) to the original file or a new file
filtered_file_path = 'cleaned data.csv'  # Replace with the desired output file path
df_filtered.to_csv(filtered_file_path, index=False)
print(f"\nFiltered DataFrame (without duplicates) saved to: {filtered_file_path}")


Total number of rows after removing duplicates: 38931

Filtered DataFrame (without duplicates) saved to: cleaned data.csv


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load the cleaned dataset
df = pd.read_csv("cleaned data.csv")

# Separate features (X) and target (y)
X = df.drop(columns=['y'])  # Features
y = df['y']  # Target

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy:.2f}")

# Neural Network Classifier
nn_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300, random_state=42)
nn_model.fit(X_train, y_train)
nn_predictions = nn_model.predict(X_test)
nn_accuracy = accuracy_score(y_test, nn_predictions)
print(f"Neural Network Accuracy: {nn_accuracy:.2f}")

# Assess dataset readiness
if rf_accuracy > 0.75 and nn_accuracy > 0.75:
    print("The dataset is likely ready for prediction.")
else:
    print("Further preprocessing or feature engineering may be required.")

Random Forest Accuracy: 0.89
Neural Network Accuracy: 0.87
The dataset is likely ready for prediction.
