In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Define the dataset folder path
base_path = '/home/kosmas/projects/llm-in-cybersecurity/final-project/datasets/raw/'

# List of datasets to merge
dataset_files = [
    'CEAS_08.csv',
    'Enron.csv',
    'Ling.csv',
    'SpamAssasin.csv',
    'TREC_05.csv',
    'TREC_06.csv',
    'TREC_07.csv'
]

# Function to read and preprocess each dataset
def load_dataset(file_path):
    try:
        # Try to determine the file's format and load accordingly
        df = pd.read_csv(file_path, encoding='utf-8')
        
        # Check if the dataset has the expected structure
        # Most email datasets should have at least 2 columns (text and label)
        if df.shape[1] < 2:
            print(f"Warning: {file_path} has fewer than 2 columns. Skipping.")
            return None
        
        # Add a source column to track which dataset each row came from
        df['source'] = os.path.basename(file_path)
        
        # Some basic preprocessing
        print(f"Loaded {file_path}: {df.shape[0]} rows")
        return df
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None

# Load and merge all datasets
dfs = []
for file in dataset_files:
    file_path = os.path.join(base_path, file)
    if os.path.exists(file_path):
        df = load_dataset(file_path)
        if df is not None:
            dfs.append(df)
    else:
        print(f"File not found: {file_path}")

# Check if we have any dataframes to merge
if not dfs:
    print("No valid datasets found!")
else:
    # Determine common columns for standardization
    print("\nAnalyzing dataset structures...")
    for i, df in enumerate(dfs):
        print(f"Dataset {i} ({dataset_files[i]}): Columns = {df.columns.tolist()}")
    
    # Merge all datasets (this may need customization based on column structure)
    print("\nMerging datasets...")
    # We'll customize this part after examining the actual structure of your datasets
    # For now, we'll assume a simple concat operation
    merged_df = pd.concat(dfs, ignore_index=True, sort=False)
    
    print(f"Merged dataset shape: {merged_df.shape}")
    
    # Save the merged dataset
    merged_path = os.path.join(base_path, '../merged_emails.csv')
    merged_df.to_csv(merged_path, index=False)
    print(f"Merged dataset saved to {merged_path}")
    
    # Split into train, validation, and test sets (70%, 15%, 15%)
    # First split: 70% train, 30% temp
    train_df, temp_df = train_test_split(merged_df, test_size=0.3, random_state=42, stratify=merged_df['label'] if 'label' in merged_df.columns else None)
    
    # Second split: divide the temp into validation and test (50% each of the 30%, so 15% of original dataset)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'] if 'label' in temp_df.columns else None)
    
    print(f"\nSplit sizes:")
    print(f"Training set: {train_df.shape[0]} rows ({train_df.shape[0]/merged_df.shape[0]:.1%})")
    print(f"Validation set: {val_df.shape[0]} rows ({val_df.shape[0]/merged_df.shape[0]:.1%})")
    print(f"Test set: {test_df.shape[0]} rows ({test_df.shape[0]/merged_df.shape[0]:.1%})")
    
    # Save the split datasets
    train_df.to_csv(os.path.join(base_path, '../train_emails.csv'), index=False)
    val_df.to_csv(os.path.join(base_path, '../val_emails.csv'), index=False)
    test_df.to_csv(os.path.join(base_path, '../test_emails.csv'), index=False)
    print("Dataset splits saved successfully.")

Loaded /home/kosmas/projects/llm-in-cybersecurity/final-project/datasets/raw/CEAS_08.csv: 39154 rows
Loaded /home/kosmas/projects/llm-in-cybersecurity/final-project/datasets/raw/Enron.csv: 29767 rows
Loaded /home/kosmas/projects/llm-in-cybersecurity/final-project/datasets/raw/Ling.csv: 2859 rows
Loaded /home/kosmas/projects/llm-in-cybersecurity/final-project/datasets/raw/SpamAssasin.csv: 5809 rows
Error loading /home/kosmas/projects/llm-in-cybersecurity/final-project/datasets/raw/TREC_05.csv: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.

Error loading /home/kosmas/projects/llm-in-cybersecurity/final-project/datasets/raw/TREC_06.csv: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.

Loaded /home/kosmas/projects/llm-in-cybersecurity/final-project/datasets/raw/TREC_07.csv: 53757 rows

Analyzing dataset structures...
Dataset 0 (CEAS_08.csv): Columns = ['sender', 'receiver', 'date', 'subject', 'body', 'label',

In [2]:
# Basic dataset analysis showing samples from each source

# Get the unique sources
sources = merged_df['source'].unique()

# Create a dataframe to store samples from each source
all_samples = pd.DataFrame()

# Get 5 rows from each source
for source in sources:
    sample = merged_df[merged_df['source'] == source].sample(5, random_state=42)
    all_samples = pd.concat([all_samples, sample])

# Reset index for better display
all_samples = all_samples.reset_index(drop=True)

# Display the samples
print(f"Showing 5 random samples from each of the {len(sources)} sources")
display(all_samples)

# Display label distribution by source
label_dist = merged_df.groupby('source')['label'].value_counts(normalize=True).unstack().fillna(0)
label_dist.columns = ['Ham (0)', 'Spam (1)']
label_dist = label_dist * 100  # Convert to percentages

print("\nLabel distribution by source (%):")
display(label_dist)

# Show basic statistics about the dataset
print("\nBasic dataset statistics:")
print(f"Total emails: {len(merged_df)}")
print(f"Ham emails: {len(merged_df[merged_df['label'] == 0])} ({len(merged_df[merged_df['label'] == 0])/len(merged_df):.2%})")
print(f"Spam emails: {len(merged_df[merged_df['label'] == 1])} ({len(merged_df[merged_df['label'] == 1])/len(merged_df):.2%})")

Showing 5 random samples from each of the 5 sources


Unnamed: 0,sender,receiver,date,subject,body,label,urls,source
0,Jeff Chan <ociwu@surbl.org>,zqoqi@spamassassin.apache.org,"Tue, 05 Aug 2008 18:31:40 -0600",Re: what are the criteria for being listed in\...,"Also, the sa-blacklist inclusion policy is at:...",0,1.0,CEAS_08.csv
1,Freeman Cabrera <ChadwickzerothRosario@million...,twanna_patient@gvc.ceas-challenge.cc,"Wed, 06 Aug 2008 04:32:27 -0200",Guaranteed Erection Fast,\nSize DOES matter - change your life today!\n...,1,1.0,CEAS_08.csv
2,Daily Top 10 <notelliu1989@arte-m.de>,user8.2-ext1@gvc.ceas-challenge.cc,"Wed, 06 Aug 2008 15:31:48 +0200",CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1.0,CEAS_08.csv
3,Chris Poteet <jhsluxg@siolon.com>,PownceAPI <zgdjhgesv@googlegroups.com>,"Wed, 06 Aug 2008 08:39:40 -0700",[PownceAPI] Dates in API?,\nLet's say I wanted to return the actual date...,0,1.0,CEAS_08.csv
4,Gene Oneil <auralx8@pinnacleframe.com>,user8.1@gvc.ceas-challenge.cc,"Fri, 08 Aug 2008 11:15:14 +0200",Re:,\nLove is the beginning of all the joy which n...,1,1.0,CEAS_08.csv
5,,,,karthik rajan - interview schedule,attached you will find the interview packet fo...,0,,Enron.csv
6,,,,re : a 3 al ad hoc review team,"in the week of the 25 th . , the 26 th and 27 ...",0,,Enron.csv
7,,,,not for sale in the u . a . e - usb voip hands...,not for sale in the u . a . e\nusb voip handse...,1,,Enron.csv
8,,,,pharma,stop wasting money on prescription drugs . get...,1,,Enron.csv
9,,,,california update - - 10 . 02 . 01 : puc turns...,agreement with dwr\nin what will likely be vie...,0,,Enron.csv



Label distribution by source (%):


Unnamed: 0_level_0,Ham (0),Spam (1)
source,Unnamed: 1_level_1,Unnamed: 2_level_1
CEAS_08.csv,44.21515,55.78485
Enron.csv,53.048678,46.951322
Ling.csv,83.980413,16.019587
SpamAssasin.csv,70.425202,29.574798
TREC_07.csv,45.311308,54.688692



Basic dataset statistics:
Total emails: 131346
Ham emails: 63953 (48.69%)
Spam emails: 67393 (51.31%)
