In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load Dataset
df = pd.read_csv("01_Updated_Dataset.csv")
print(f"Original dataset size: {df.shape[0]} rows")

# Classification task
# Normalize the label column so that case or stray-whitespace variants
# (e.g. "Spam", "ham ") are not discarded by the class filter below.
df['text_type'] = df['text_type'].str.strip().str.lower()
before_filter = df.shape[0]

# Keep only the relevant classes (01_Updated_Dataset.csv is expected to contain
# just ham and spam). The explicit .copy() makes the filtered frame independent
# of the original, so the column assignments that follow cannot trigger
# pandas' SettingWithCopyWarning or write into a temporary slice.
df = df[df['text_type'].isin(['ham', 'spam'])].copy()
after_filter = df.shape[0]
print(f"After filtering text_type: {after_filter} rows ({before_filter - after_filter} rows dropped)")

# Binary classification target: ham -> 0, spam -> 1.
label_map = {'ham': 0, 'spam': 1}
df['label_encoded'] = df['text_type'].map(label_map)

def clean_text(text):
    """Normalize one raw message into a lowercase, punctuation-free string.

    URLs, e-mail addresses and long digit runs are replaced by the
    placeholder tokens ``<URL>``, ``<EMAIL>`` and ``<PHONE>``; every other
    character that is not an ASCII letter, digit or whitespace is replaced
    by a space, and runs of whitespace are collapsed.

    Args:
        text: The raw message. ``None`` and float NaN are treated as an
            empty message; any other non-string value is converted with
            ``str()``.

    Returns:
        The cleaned message as a ``str`` (possibly empty).
    """
    # Missing values would otherwise fail on .lower(); treat them as empty.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower() # lowercase

    # Drop angle brackets that come from the message itself (HTML remnants,
    # "<http://...>" wrappers). The punctuation filter below must keep "<" and
    # ">" for the placeholders, so raw ones have to go before any placeholder
    # is inserted; otherwise they survive cleaning and can fuse with the
    # tokens (e.g. "<<URL>").
    text = re.sub(r"[<>]", " ", text)

    # Normalize phishing features
    text = re.sub(r"http\S+|\bwww\.\S+", "<URL>", text)  # scheme-less www. links too
    text = re.sub(r"\S+@\S+", "<EMAIL>", text)
    text = re.sub(r"\b\d{8,}\b", "<PHONE>", text)  # long numbers, phones

    # Remove other punctuation/noise except placeholders
    text = re.sub(r"[^a-zA-Z0-9\s<>]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Count missing messages on the raw column. This has to happen before the
# string cast: astype(str) turns NaN into the literal text "nan", which would
# hide the gap from any later isna() check and add a fake "nan" token to the
# corpus.
missing_count = df['text'].isna().sum()

# Missing messages become empty strings, then every message is normalized.
# assign() returns a new frame instead of writing into a possibly filtered one.
df = df.assign(cleaned_text=df['text'].fillna('').astype(str).apply(clean_text))

# Check duplicates (the first occurrence of each cleaned message is kept)
before_dup = df.shape[0]
df = df.drop_duplicates(subset='cleaned_text')
after_dup = df.shape[0]
print(f"After dropping duplicates: {after_dup} rows ({before_dup - after_dup} duplicates removed)")

# Check missing text (counted above, on the raw column)
print(f"Missing text before filling: {missing_count}")

# Saved
# NOTE: with pandas' default settings an empty cleaned_text is read back as NaN
# by pd.read_csv; loaders of this file should pass keep_default_na=False or
# call fillna('') on the column.
output_file = "03_Cleaned_Updated_Dataset.csv"
df.to_csv(output_file, index=False)
print(f"Cleaned dataset saved to '{output_file}' with {df.shape[0]} rows")


Original dataset size: 20348 rows
After filtering text_type: 20348 rows (0 rows dropped)
After dropping duplicates: 20104 rows (244 duplicates removed)
Missing text before filling: 0
Cleaned dataset saved to '03_Cleaned_Updated_Dataset.csv' with 20104 rows
