In [1]:
!pip install sentence-transformers
!pip install ipywidgets
!pip install umap-learn matplotlib seaborn



In [2]:
# --- PHASE 3 IMPORTS ---

# For data handling and loading the downloaded CSVs
import pandas as pd
import numpy as np

In [10]:
# Load the two raw CSV files used by the rest of the notebook.
# IMPORTANT: point DATA_DIR at the folder on your machine that contains
# 'enron_data.csv' and 'phishing_test_data.csv'. It is defined once here so the
# location only has to be changed in a single place.
from pathlib import Path

DATA_DIR = Path(r'C:\Users\noaga\OneDrive\Desktop\2025-6\AI Enhanced cyber\Project\Real_data')

# Load the Normal (Training) Dataset.
# dtype=str keeps every column as text so pandas does not guess numeric types.
df_normal = pd.read_csv(DATA_DIR / 'enron_data.csv', dtype=str)

# Load the Phishing (Test) Dataset (same dtype handling as above).
df_phishing_test = pd.read_csv(DATA_DIR / 'phishing_test_data.csv', dtype=str)

print(f"Normal Dataset Size: {df_normal.shape[0]} emails")
print(f"Phishing Dataset Size: {df_phishing_test.shape[0]} emails")

Normal Dataset Size: 447417 emails
Phishing Dataset Size: 29767 emails


In [6]:
# --- 1. PREPARE NORMAL DATA (df_normal) ---
print("Preparing Normal Data...")
# Rename the Enron columns to lowercase standard names, keep only the content
# columns plus the original label (for inspection), and add the new consistent
# 'is_phishing' label (0 = Normal).
# Built as a single chain so the new column is added to a fresh frame instead of
# being assigned onto a column-subset of another frame (the pattern that
# triggers pandas' SettingWithCopyWarning).
df_normal_clean = (
    df_normal
    .rename(columns={
        'Subject': 'subject',
        'Body': 'body',
        'Label': 'old_label',
    })
    .loc[:, ['subject', 'body', 'old_label']]
    .assign(is_phishing=0)
)


# --- 2. PREPARE PHISHING DATA (df_phishing_test) ---
print("Preparing Phishing Test Data...")
# We assume the 'label' column in the phishing set already means 1=Phishing.
# It is renamed to 'is_phishing' for consistency with the normal frame.
df_phishing_test = df_phishing_test.rename(columns={'label': 'is_phishing'})

# The file was loaded with dtype=str, so the labels are converted to int first.
phishing_labels = df_phishing_test['is_phishing'].astype(int)

# Every 0 label is forced to 1 (all rows in this file are treated as attacks).
# Report how many rows are affected so that a file which actually contains
# legitimate (label 0) emails does not get relabelled silently.
n_zero_labels = int((phishing_labels == 0).sum())
if n_zero_labels:
    print(
        f"WARNING: {n_zero_labels} rows in the phishing dataset had label 0 "
        "and are being relabelled as 1 (phishing)."
    )
df_phishing_test['is_phishing'] = phishing_labels.replace(0, 1)

# NOTE(review): the phishing frame is NOT reduced to the content columns here;
# any additional columns it has are carried into df_combined and will be NaN
# for the normal rows after the concat below.


# --- 3. MERGE THE DATASETS ---
df_combined = pd.concat([df_normal_clean, df_phishing_test], ignore_index=True)
print(f"Data combined successfully. Total emails: {len(df_combined)}")

Preparing Normal Data...
Preparing Phishing Test Data...
Data combined successfully. Total emails: 477184


In [7]:
import re # Ensure this is also imported!

def anonymize_and_clean_text(text: str) -> str:
    """
    Mask common PII patterns in an email text field and normalise it.

    Steps, in order:
      1. Missing input (None or a float NaN) yields an empty string; any other
         non-string value is converted with str().
      2. URLs, e-mail addresses and phone-like digit groups are replaced with
         the placeholder tokens <url>, <email> and <phone>.
      3. The text is lower-cased.
      4. Runs of whitespace are collapsed to a single space and the ends trimmed.

    Args:
        text: Raw subject or body text.

    Returns:
        The anonymised, lower-cased, whitespace-normalised string.
    """
    # Guard against missing values so callers never hit AttributeError on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # URLs are masked first so that addresses embedded in a URL
    # (e.g. http://user@host/...) are consumed as part of the URL.
    text = re.sub(r"(?:https?://|www\.)\S+", "<url>", text, flags=re.IGNORECASE)

    # E-mail addresses.
    text = re.sub(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", "<email>", text)

    # Phone-like numbers: optional country code, then 3-3-4 digits with
    # optional space / dot / dash separators and optional parentheses.
    text = re.sub(
        r"(?<!\w)(?:\+?\d{1,3}[\s.-]?)?(?:\(\d{3}\)|\d{3})[\s.-]?\d{3}[\s.-]?\d{4}(?!\w)",
        "<phone>",
        text,
    )

    text = text.lower()

    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so the join collapses everything to single spaces.
    return " ".join(text.split())

# --- 1. FILL MISSING VALUES (GUARANTEE NON-NULL INPUT) ---
print("Filling NaN values in content columns...")
# Fill any missing values in 'subject' and 'body' with an empty string
df_combined['subject'] = df_combined['subject'].fillna('')
df_combined['body'] = df_combined['body'].fillna('')

# Drop rows where both subject and body are empty after filling.
# .copy() gives an independent frame so the column assignment below is safe.
has_content = (df_combined['subject'] != '') | (df_combined['body'] != '')
df_combined = df_combined[has_content].copy()

# --- 2. APPLY CLEANING AND CONCATENATION (GUARANTEE STRING OUTPUT) ---
print("Applying cleaning and PII anonymization...")

# Clean each column with Series.map (one function call per value) instead of a
# row-wise DataFrame.apply(axis=1), which builds a Series object for every row
# and is far slower on a frame of this size. The result is the same:
# "<clean subject> <clean body>".
# .astype(str) guarantees the + operator only ever sees strings.
clean_subject = df_combined['subject'].map(anonymize_and_clean_text).astype(str)
clean_body = df_combined['body'].map(anonymize_and_clean_text).astype(str)
df_combined['clean_text'] = clean_subject + " " + clean_body

# --- 3. FINAL SANITY CHECK ---
# Remove any rows where cleaning resulted in only whitespace
df_combined = df_combined[df_combined['clean_text'].str.strip() != ''].copy()

print(f"Cleaning complete. Final dataset size: {len(df_combined)}")

Filling NaN values in content columns...
Applying cleaning and PII anonymization...
Cleaning complete. Final dataset size: 477162


In [8]:
from sklearn.model_selection import train_test_split
import numpy as np

# Single seed shared by every random step in this cell so that the splits AND
# the final shuffle are reproducible from run to run.
SPLIT_SEED = 42

# --- 1. SEPARATE NORMAL FROM PHISHING ---
# Isolate Normal emails (label 0) for training and validation
df_normal_only = df_combined[df_combined['is_phishing'] == 0].copy()

# Isolate Phishing emails (label 1) for the Test set
df_phishing_only = df_combined[df_combined['is_phishing'] == 1].copy()

print(f"Normal emails isolated: {len(df_normal_only)}")
print(f"Phishing emails isolated: {len(df_phishing_only)}")


# --- 2. CREATE TRAIN / VALIDATION / TEST SPLIT OF THE NORMAL DATA (70/15/15) ---
# First split: 70% of the Normal data for training, 30% held back.
df_train_normal, df_remainder = train_test_split(
    df_normal_only, test_size=0.3, random_state=SPLIT_SEED
)

# Second split: the 30% remainder is halved into Validation and Test subsets
# (15% of the Normal data each).
df_val_normal, df_test_normal_subset = train_test_split(
    df_remainder, test_size=0.5, random_state=SPLIT_SEED
)

# Every Normal email must land in exactly one of the three subsets.
assert (
    len(df_train_normal) + len(df_val_normal) + len(df_test_normal_subset)
    == len(df_normal_only)
), "train/val/test sizes do not add up to the Normal dataset size"


# --- 3. CREATE FINAL TEST SET (NORMAL + ALL PHISHING) ---
# Combine the normal subset with ALL the phishing attacks to create the final test set.
# This is crucial for accurately measuring your Detection Rate (Recall).
df_test_all = pd.concat([df_test_normal_subset, df_phishing_only], ignore_index=True)

# Final step: shuffle the test set so the phishing rows are not all at the end.
# random_state makes the shuffled order (and therefore the saved CSV) reproducible.
df_test_all = df_test_all.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)


# --- FINAL OUTPUT SUMMARY ---
print("\n--- Final Dataset Sizes ---")
print(f"Training Set (100% Normal): {len(df_train_normal)} emails")
print(f"Validation Set (100% Normal): {len(df_val_normal)} emails")
print(f"Test Set (Normal + Phishing): {len(df_test_all)} emails")

# Your data is now ready for modeling!

Normal emails isolated: 447395
Phishing emails isolated: 29767

--- Final Dataset Sizes ---
Training Set (100% Normal): 313176 emails
Validation Set (100% Normal): 67109 emails
Test Set (Normal + Phishing): 96877 emails


In [9]:
# Persist the three splits produced by the splitting cell.
# Each (file name, frame) pair is written in train -> validation -> test order,
# without the row index, into the current working directory.
split_outputs = (
    ('01_train_normal.csv', df_train_normal),
    ('02_val_normal.csv', df_val_normal),
    ('03_test_all.csv', df_test_all),
)
for csv_name, split_frame in split_outputs:
    split_frame.to_csv(csv_name, index=False)

print("Split datasets saved successfully. Ready to start the EDA.")

Split datasets saved successfully. Ready to start the EDA.
