In [None]:
"""
Phishing vs Safe Email Sentiment + Language Pattern Starter Pipeline (NLTK)

What this script does (Parts 1–3):
------------------------------------------------------------
Part 1 - Load & inspect:
  - Load CSV with pandas
  - Show first 5 rows
  - Show class counts (phishing vs safe/other)
  - Print 3 example phishing emails and 3 example safe emails

Part 2 - Preprocessing with NLTK:
  - Tokenize
  - Lowercase
  - Remove stopwords
  - Remove punctuation
  - Show original email + cleaned token list (sample)

Part 3 - Sentiment analysis with VADER:
  - Compute sentiment score (compound) for each email
  - Add sentiment_score column
  - Compute average sentiment by class (phishing vs safe)

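How to run:
------------------------------------------------------------
1. Set CSV_PATH (defined just below the imports) to the location of your CSV
   file. The file must contain a "label" column and a "text" column.
2. Run:
   python phishing_sentiment_pipeline.py

Note: the script does not parse command-line arguments. Rows labelled
"phishing" form the phishing class; the first other label found (in sorted
order) is treated as the safe class.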
"""

import string
from typing import List, Optional

import pandas as pd

# NLTK imports
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer


# ============================================================
# HARDCODED DATASET PATH
# ============================================================
# CSV file that main() loads with pd.read_csv. This is an absolute Windows
# path, so edit it to point at your own copy of the dataset before running.
# main() requires the file to contain 'label' and 'text' columns.
CSV_PATH = r"D:\Year 2 semester 2\Sys. and Proj\sentitment analysis\sample_email_dataset.csv"


# -------------------------------
# NLTK setup helpers
# -------------------------------
def ensure_nltk_resources() -> None:
    """Make sure the NLTK resources used by this pipeline are installed.

    Each resource is looked up with ``nltk.data.find``; when that lookup
    raises ``LookupError`` the matching package is fetched with
    ``nltk.download``.
    """
    # (lookup path, download package) pairs, checked in this order.
    required_resources = (
        ("tokenizers/punkt", "punkt"),
        ("tokenizers/punkt_tab", "punkt_tab"),
        ("corpora/stopwords", "stopwords"),
        ("sentiment/vader_lexicon", "vader_lexicon"),
    )

    for resource_path, package_name in required_resources:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package_name)


# -------------------------------
# Text preprocessing
# -------------------------------
def preprocess_email(text: str, stop_words: set) -> List[str]:
    """Turn one email body into a list of cleaned tokens.

    Steps: tokenize with ``word_tokenize``, lowercase each token, strip
    leading/trailing punctuation, then drop empty tokens and stopwords.

    Args:
        text: Raw email text. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as missing; any other non-string
            value is converted with ``str()``.
        stop_words: Lowercase words to exclude from the result.

    Returns:
        The cleaned tokens in their original order. Missing or blank input
        yields an empty list.
    """
    # NaN is the only float that is not equal to itself. Without this check a
    # missing email would be stringified to "nan" and come back as ['nan'].
    if text is None or (isinstance(text, float) and text != text):
        return []

    if not isinstance(text, str):
        text = str(text)

    # Nothing to tokenize; the tokenizer would return no tokens anyway.
    if not text.strip():
        return []

    # Lowercase and trim punctuation from the edges of every token. Only the
    # edges are stripped, so inner characters (e.g. "don't") are kept.
    trimmed = (
        tok.lower().strip(string.punctuation) for tok in word_tokenize(text)
    )

    # Pure-punctuation tokens are now empty strings; drop them and stopwords.
    return [tok for tok in trimmed if tok and tok not in stop_words]


# -------------------------------
# Label normalization
# -------------------------------
def normalize_label(label_val: str) -> str:
    """Return a label in canonical form: stripped and lowercased.

    Args:
        label_val: Raw label value. Non-string values are converted with
            ``str()``.

    Returns:
        The normalized label, or ``""`` when the value is missing (``None``
        or float NaN, which is how pandas represents an empty CSV cell).
    """
    # NaN is the only float that is not equal to itself. Without this check a
    # missing label would become the literal class name "nan".
    if label_val is None or (isinstance(label_val, float) and label_val != label_val):
        return ""
    return str(label_val).strip().lower()


# -------------------------------
# Main pipeline
# -------------------------------
def main(
    csv_path: Optional[str] = None,
    phishing_label: str = "phishing",
    safe_label: Optional[str] = None,
) -> None:
    """Run the pipeline: load and inspect, preprocess, then score sentiment.

    Args:
        csv_path: CSV file to load. Defaults to the module-level ``CSV_PATH``.
        phishing_label: Label value that marks phishing rows. It is
            normalized the same way as the label column before comparison.
        safe_label: Label value of the comparison class. When omitted, the
            first non-empty label (in sorted order) that differs from
            ``phishing_label`` is used.

    Raises:
        ValueError: If the CSV lacks the 'label' or 'text' column.
    """
    ensure_nltk_resources()

    # ---------------------------------
    # Part 1: Load & inspect data
    # ---------------------------------
    df = pd.read_csv(CSV_PATH if csv_path is None else csv_path)

    # Column names (modify here if needed)
    LABEL_COL = "label"
    TEXT_COL = "text"

    if LABEL_COL not in df.columns or TEXT_COL not in df.columns:
        raise ValueError(
            f"CSV must contain columns '{LABEL_COL}' and '{TEXT_COL}'. "
            f"Found columns: {list(df.columns)}"
        )

    # na_action="ignore" leaves missing labels as NaN instead of turning them
    # into the string "nan", so dropna() below really excludes them and a
    # missing label can never be mistaken for a class.
    df[LABEL_COL] = df[LABEL_COL].map(normalize_label, na_action="ignore")

    print("\n=== Part 1: Dataset Loaded ===")
    print(f"Shape: {df.shape}")
    print("\nFirst 5 rows:")
    print(df[[LABEL_COL, TEXT_COL]].head(5))

    unique_labels = sorted(df[LABEL_COL].dropna().unique().tolist())

    print("\nLabel values found:", unique_labels)

    print("\nCounts by label:")
    print(df[LABEL_COL].value_counts(dropna=False))

    # Identify phishing and safe labels
    phishing_label = normalize_label(phishing_label)
    if safe_label is not None:
        safe_label = normalize_label(safe_label)
    else:
        # Auto-detect: first real (non-empty) label that is not phishing.
        safe_label = next(
            (lab for lab in unique_labels if lab and lab != phishing_label),
            None,
        )

    if safe_label:
        phishing_count = int((df[LABEL_COL] == phishing_label).sum())
        safe_count = int((df[LABEL_COL] == safe_label).sum())
        print(f"\nPhishing count ('{phishing_label}'): {phishing_count}")
        print(f"Safe count ('{safe_label}'): {safe_count}")

    # Print example emails
    def print_examples(label_name: str, n: int = 3) -> None:
        subset = df[df[LABEL_COL] == label_name]
        if subset.empty:
            return
        print(f"\n--- {n} example emails for label '{label_name}' ---")
        for i, txt in enumerate(subset[TEXT_COL].head(n).tolist(), start=1):
            print(f"\nExample {i}:\n{txt}")

    print("\n=== Examples ===")
    print_examples(phishing_label, 3)
    if safe_label:
        print_examples(safe_label, 3)

    # Missing email bodies arrive from pandas as NaN. Map them to "" once so
    # neither the tokenizer nor VADER ever analyses the literal word "nan".
    def as_text(value) -> str:
        return "" if pd.isna(value) else str(value)

    texts = df[TEXT_COL].map(as_text)

    # ---------------------------------
    # Part 2: Preprocessing
    # ---------------------------------
    print("\n=== Part 2: Preprocessing with NLTK ===")

    stop_words = set(stopwords.words("english"))

    df["clean_tokens"] = texts.apply(lambda t: preprocess_email(t, stop_words))

    print("\nShowing original email + cleaned token list for first 3 samples:\n")

    for idx in range(min(3, len(df))):
        row = df.iloc[idx]  # positional lookup, done once per sample
        print(f"Sample {idx+1} (label={row[LABEL_COL]})")
        print("Original email:")
        print(row[TEXT_COL])
        print("Cleaned token list:")
        print(row["clean_tokens"])
        print("-" * 60)

    # ---------------------------------
    # Part 3: Sentiment Analysis
    # ---------------------------------
    print("\n=== Part 3: Sentiment Analysis (VADER) ===")

    sia = SentimentIntensityAnalyzer()

    # VADER is run on the raw text rather than on the cleaned tokens.
    df["sentiment_score"] = texts.apply(
        lambda t: float(sia.polarity_scores(t)["compound"])
    )

    phishing_avg = df.loc[df[LABEL_COL] == phishing_label, "sentiment_score"].mean()
    print(f"\nAverage sentiment_score for phishing ('{phishing_label}'): {phishing_avg:.4f}")

    if safe_label:
        safe_avg = df.loc[df[LABEL_COL] == safe_label, "sentiment_score"].mean()
        print(f"Average sentiment_score for safe ('{safe_label}'): {safe_avg:.4f}")

    print("\nAverage sentiment_score by label:")
    print(df.groupby(LABEL_COL)["sentiment_score"].mean().sort_values(ascending=False))

    print("\nDone.")


# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


=== Part 1: Dataset Loaded ===
Shape: (40, 2)

First 5 rows:
      label                                               text
0  phishing  Urgent: Your account has been compromised. Ver...
1  phishing  We detected unusual activity. Click the link b...
2  phishing  Your payment could not be processed. Update yo...
3  phishing  Final notice: Failure to respond will result i...
4  phishing  You have received a secure message. Open the a...

Label values found: ['phishing', 'safe']

Counts by label:
label
phishing    20
safe        20
Name: count, dtype: int64

Phishing count ('phishing'): 20
Safe count ('safe'): 20

=== Examples ===

--- 3 example emails for label 'phishing' ---

Example 1:
Urgent: Your account has been compromised. Verify your information immediately to restore access.

Example 2:
We detected unusual activity. Click the link below to confirm your identity.

Example 3:
Your payment could not be processed. Update your billing details now.

--- 3 example emails for label 'sa

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amponsah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Amponsah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Amponsah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Amponsah\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


What This Pipeline Does


This pipeline analyzes phishing and safe emails to understand how their language differs, how emotional tone differs and whether phishing emails use more urgency or pressure. It does this in three major stages:


PART 1 - Loading and Inspecting the Data

1.	Loading the Dataset: The script uses pandas to load the CSV file. Once loaded, pandas stores everything in a DataFrame, which works like a spreadsheet inside Python.


2.	Inspecting the Dataset: The script prints the dataset shape and the first 5 rows. This confirms that labels are correct and that emails are loaded properly.


3.	Counting Email Types: It then counts how many phishing vs safe emails exist.



4.	Printing Example Emails: The script prints 3 phishing and 3 safe emails.



PART 2 - Text Preprocessing (Cleaning the Emails)

Before analyzing language, the emails must be cleaned. Raw text contains capital letters, punctuation, common filler words, formatting noise etc. Computers work better with structured tokens.

1.	Tokenization: The email is split into individual words.
Example:


Urgent: The account has been compromised.

Becomes:

[‘Urgent’, ‘:’, ‘The’, ‘account’, ‘has’, ‘been’, ‘compromised’, ‘.’]


2.	Lowercasing: All words are converted to lowercase.


3.	Removing Punctuation: Symbols like ‘:’, ‘.’, ‘,’, ‘!’ etc. are removed because they don’t carry strong meaning for this analysis.


4.	Removing Stopwords: Stopwords are very common words like ‘the’, ‘is’, ‘has’, ‘been’ etc. They appear in almost every sentence and don’t help distinguish phishing from safe emails.

After cleaning:

‘Urgent: Your account has been compromised. Verify your information immediately to restore access.’

Becomes:

[‘urgent’, ‘account’, ‘compromised’, ‘verify’, ‘information’, ‘immediately’, ‘restore’, ‘access’]


PART 3 - Sentiment Analysis (Using VADER)

This measures the emotional tone.
The script uses:
NLTK’s VADER (Valence Aware Dictionary and sEntiment Reasoner)

VADER assigns four scores to each email:

•	positive
•	negative
•	neutral
•	compound (overall score)


The pipeline uses the compound score, which ranges from -1 to 1.

The script calculates the average sentiment for phishing emails and for safe emails.

This is a structured linguistic analysis system that cleans email text, extracts meaningful language features, computes emotional tone, and statistically compares phishing vs safe communication patterns.
