# Data Preprocessing


In [55]:
# Import necessary libraries
import pandas as pd
import re  # Regular expressions
import nltk  # Natural Language Toolkit
import os  # Operating system utilities (for file paths)
from bs4 import BeautifulSoup  # For parsing HTML
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Define the path to the dataset
file_dir = "../DS/enron.csv"

# Load the dataset
df = pd.read_csv(f"{file_dir}")

# Display the first 5 rows to understand the data
df.head()

Unnamed: 0,email_subject,email_body,is_spam
0,ena sales on hpl,just to update you on this project ' s status ...,0
1,98 - 6736 & 98 - 9638 for 1997 ( ua 4 issues ),the above referenced meters need to be placed ...,0
2,"hpl nominations for december 28 , 1999",( see attached file : hpll 228 . xls )\n- hpll...,0
3,revised nom - kcs resources,"daren ,\nit ' s in .\nbob\n- - - - - - - - - -...",0
4,new production - sitara deals needed,"daren ,\nfyi .\nbob\n- - - - - - - - - - - - -...",0


## Part 1: Basic Text Preprocessing

- Only keep colums for the email's body text and the label
- Drop rows with duplicated or missing values
- Clean email body text by 
    - Extracting plain text
    - Normalizing whitespace
    - Removing emails, URLs, and punctuations
    - Lowercasing words and stripping leading/trailing spaces

In [56]:
# If changing dataset, check which column has the email body 
# and and which column has label to keep them

# Keep only the 'body' (email text) and 'label' (spam/not spam)
df = df[['email_body', 'is_spam']]

# Rename columns to body and label
df.columns = ['body', 'label']
df.head()

Unnamed: 0,body,label
0,just to update you on this project ' s status ...,0
1,the above referenced meters need to be placed ...,0
2,( see attached file : hpll 228 . xls )\n- hpll...,0
3,"daren ,\nit ' s in .\nbob\n- - - - - - - - - -...",0
4,"daren ,\nfyi .\nbob\n- - - - - - - - - - - - -...",0


### Data Integrity Checks

In [57]:
# 1. Remove rows with missing values (NaN)
df = df.dropna()
print("Missing values after dropna():")
print(df.isnull().sum())

# 2. Remove rows with duplicated email bodies
df = df.drop_duplicates(subset=['body'])
print(f"\nRemaining duplicates: {df.duplicated(subset=['body']).sum()}")

Missing values after dropna():
body     0
label    0
dtype: int64

Remaining duplicates: 0


In [58]:
def clean_email_body(html_text):
    """
    Cleans raw email text by:
    1. Parsing HTML and extracting plain text.
    2. Normalizing whitespace (newlines, tabs, etc.).
    3. Removing URLs, email addresses, and all non-alphabetic characters.
    4. Lowercasing and stripping final whitespace.
    """
    # 1. Parse HTML and extract plain text
    try:
        soup = BeautifulSoup(html_text, 'html.parser')
        text = soup.get_text()
    except:
        # Handle cases where the body might not be HTML (e.g., just plain text)
        # Ensure the input is treated as a string
        text = str(html_text)

    # 2. Normalize whitespace (replace multiple spaces, newlines, tabs with a single space)
    text = re.sub(r'\s+', ' ', text)

    # 3. Remove noise: URLs, emails, and punctuation
    text = re.sub(r'http\S+', ' ', text)      # Replace URLs with a space
    text = re.sub(r'\S+@\S+', ' ', text)     # Replace emails with a space
    text = re.sub(r'[^a-zA-Z\s]', ' ', text) # Replace non-letters/non-spaces with a space

    # 4. Final cleanup: lowercase and strip leading/trailing spaces
    text = text.lower().strip()
    
    return text

# Apply the cleaning function to the 'body' column
df["cleaned_email_body"] = df["body"].apply(clean_email_body)

# Drop the original, raw 'body' column as it's no longer needed
df = df.drop(columns="body")

print("\nDataFrame after basic cleaning:")
df.head()


DataFrame after basic cleaning:


Unnamed: 0,label,cleaned_email_body
0,0,just to update you on this project s status ...
1,0,the above referenced meters need to be placed ...
2,0,see attached file hpll xls hpll ...
3,0,daren it s in bob ...
4,0,daren fyi bob ...


## Part 2: NLP Processing

- Downloaded the necessary NLTK packages (punkt, stopwords, wordnet).
- Initialized the `WordNetLemmatizer` and created a set of English `stopwords` for fast lookup.
- Defined a function (`preprocess_text_2`) that:
    - Tokenized the clean text (split it into a list of words).
    - Looped through the list and kept only words that were not in the stopword list.
    - Lemmatized each of those remaining words.
    - Returned `None` if the text became empty (e.g., it only contained stopwords).
    - Joined the final list of processed words back into a single string.
- Applied this function to the `cleaned_email_body` column.
- Stored the output in a new `final_text` column.
- Dropped the intermediate `cleaned_email_body` column.
- Dropped any rows that was null from the DataFrame.

### Download NLTK Assets

In [59]:
# These only need to be downloaded once.
nltk.download('punkt')      # For the tokenizer
nltk.download('punkt_tab')  # Additional tokenizer resource
nltk.download('stopwords')  # For the list of stopwords
nltk.download('wordnet')    # For the lemmatizer
nltk.download('omw-1.4')    # Additional lemmatizer resource
nltk.download('words')      # For English vocabulary (less common, but good to have)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Initialize NLP Tools


In [60]:
# Load stopwords into a set for faster lookup
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

print(f"\nStopwords loaded: {len(stop_words)}")
print(f"Tokenizer test: {word_tokenize('hello world!')}")

def preprocess_text_2(text, min_words=1):
    """
    Applies tokenization, stopword removal, and lemmatization.
    """
    # 1. Tokenize: Split text into a list of words
    tokens = word_tokenize(text)

    processed_tokens = []
    for word in tokens:
        # 2. Filter: Remove stopwords and keep only alphabetic words
        if word not in stop_words:
            # 3. Lemmatize: Reduce word to its root form
            processed_tokens.append(lemmatizer.lemmatize(word))

    # 4. Optional: Skip texts that become empty after processing
    # (e.g., an email that only contained stopwords)
    if len(processed_tokens) < min_words:
        return None # This will be dropped as NaN later

    # 5. Re-join the processed tokens into a single string
    return " ".join(processed_tokens)

# Apply the advanced NLP function
df['final_text'] = df['cleaned_email_body'].apply(preprocess_text_2)

# Drop the intermediate 'cleaned_email_body' column
df = df.drop(columns="cleaned_email_body")

# Drop any rows that became empty (NaN) during the NLP step
df = df.dropna()

print("\nMissing values per column after NLP processing:")
print(df.isna().sum())
df.head()


Stopwords loaded: 198
Tokenizer test: ['hello', 'world', '!']

Missing values per column after NLP processing:
label         0
final_text    0
dtype: int64


Unnamed: 0,label,final_text
0,0,update project status based new report scott m...
1,0,referenced meter need placed k please note inf...
2,0,see attached file hpll xl hpll xl
3,0,daren bob forwarded robert cotten hou ect pm e...
4,0,daren fyi bob forwarded robert cotten hou ect ...


### DataFrame Formatting

In [61]:
# Rename 'label' to 'is_spam' for better readability
df.rename(columns={'label': 'is_spam'}, inplace=True)

# Reorder columns: feature (X) first, target (y) second
df = df[['final_text', 'is_spam']]

# Reset the DataFrame index after dropping rows
df = df.reset_index(drop=True)

print("\nFinal formatted DataFrame:")
df.head()


Final formatted DataFrame:


Unnamed: 0,final_text,is_spam
0,update project status based new report scott m...,0
1,referenced meter need placed k please note inf...,0
2,see attached file hpll xl hpll xl,0
3,daren bob forwarded robert cotten hou ect pm e...,0
4,daren fyi bob forwarded robert cotten hou ect ...,0


In [62]:
# --- File Saving Utility ---

def get_filename_without_ext(file_dir):
    """
    Gets the filename (e.g., "Ling") without its extension
    from a given path (e.g., "../DS/Ling.csv").
    """
    # 1. Get the full filename (e.g., "Ling.csv")
    filename = os.path.basename(file_dir)
    # 2. Split the filename from its extension and return just the name
    filename_without_ext = os.path.splitext(filename)[0]
    return filename_without_ext

In [63]:
# --- Save Cleaned Data ---

# Define the output directory
output_dir = "../cleaned_DS/"

# Create the output directory if it doesn't already exist
os.makedirs(output_dir, exist_ok=True)

# Generate the new filename
filename_without_ext = get_filename_without_ext(file_dir)
output_path = f"{output_dir}{filename_without_ext}_cleaned.csv"

# Save the cleaned DataFrame to a new CSV, without the pandas index
df.to_csv(output_path, index=False)

print(f"\nSuccessfully cleaned and saved data to: {output_path}")
print(f"Total processed emails: {len(df)}")


Successfully cleaned and saved data to: ../cleaned_DS/enron_cleaned.csv
Total processed emails: 29754
