In [None]:
# Hugging Face Transformers
# Used to load pretrained tokenizers, models, and easy-to-use inference pipelines
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# TextBlob
# Simple library for basic sentiment analysis and text polarity
from textblob import TextBlob

# CleanText
# Used for cleaning raw text (removing emojis, special characters, etc.)
from cleantext import clean

# Pandas
# Core library for working with DataFrames and CSV data
import pandas as pd

# NLTK (Natural Language Toolkit)
# Used for tokenization, stopwords, lemmatization, and sentiment analysis
import nltk

# spaCy
# Advanced NLP library for POS tagging, dependency parsing, and NER
import spacy

# Stopwords from NLTK
# Common words (the, is, and, etc.) removed during preprocessing
from nltk.corpus import stopwords

# Tokenizer
# Splits text into individual words (tokens)
from nltk.tokenize import word_tokenize

# Lemmatizer
# Converts words to their base form (e.g., "running" -> "run")
from nltk.stem import WordNetLemmatizer

# Regular expressions
# Used for pattern-based text cleaning
import re

# Download required NLTK datasets (a no-op for anything already installed).
# Newer NLTK releases make word_tokenize look up 'punkt_tab' instead of
# 'punkt', and some releases also need 'omw-1.4' for WordNet lookups, so all
# of them are requested. nltk.download reports an id it does not know and
# returns False instead of raising, so older releases are unaffected.
for nltk_resource in (
    "punkt",       # Tokenizer models (older NLTK)
    "punkt_tab",   # Tokenizer models (newer NLTK)
    "stopwords",   # Stopword lists
    "wordnet",     # Lemmatization dictionary
    "omw-1.4",     # Multilingual WordNet data used by some NLTK releases
):
    nltk.download(nltk_resource)

# Load English stopwords into a set for fast lookup
stop_words = set(stopwords.words('english'))

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# NumPy
# Numerical operations and array handling
import numpy as np

# PyTorch
# Backend used by Hugging Face models for inference
import torch

# VADER Sentiment Analyzer (NLTK)
# Rule-based sentiment analysis optimized for short text
from nltk.sentiment import SentimentIntensityAnalyzer

# Matplotlib
# Used for plotting sentiment distributions, keyword frequency, etc.
import matplotlib.pyplot as plt

In [None]:
# Load the raw emails; the file is read without a header row, so the single
# column is named explicitly afterwards.
emails = pd.read_csv("emails.csv", header=None)
emails.columns = ["email_text"]

# Put a 1-based sequential identifier in front of the text column
emails.insert(0, "id", range(1, len(emails) + 1))

# Quick sanity check of what was loaded
print(emails.head())
emails.info()

In [None]:
# Step 1: normalise the raw text with cleantext (emoji removal requested),
# then drop every character that is neither a word character nor whitespace.
emails["text_clean"] = emails["email_text"].map(
    lambda raw_text: clean(raw_text, no_emoji=True)
)
emails["text_clean"] = emails["text_clean"].map(
    lambda cleaned_text: re.sub(r"[^\w\s]", "", cleaned_text)
)

# Step 2: split each cleaned text into word tokens
emails["tokens"] = emails["text_clean"].map(word_tokenize)

# Step 3: filter out English stopwords
# NOTE(review): punctuation was already stripped above, so stopwords that
# contain an apostrophe presumably never match here - confirm whether that matters.
stop_words = set(stopwords.words("english"))


def _drop_stopwords(token_list):
    """Return the tokens of one email that are not in `stop_words`."""
    return [token for token in token_list if token not in stop_words]


emails["tokens_no_stopwords"] = emails["tokens"].map(_drop_stopwords)

# Step 4: lemmatize every remaining token (lemmatize() is called with its
# default part-of-speech argument)
lemmatizer = WordNetLemmatizer()

emails["lemmatized_tokens"] = emails["tokens_no_stopwords"].map(
    lambda token_list: list(map(lemmatizer.lemmatize, token_list))
)

# Step 5: keep only purely alphanumeric tokens
emails["clean_tokens"] = emails["lemmatized_tokens"].map(
    lambda token_list: [token for token in token_list if token.isalnum()]
)

emails.head(15)

In [None]:
# Load spaCy's small English pipeline; the "en_core_web_sm" model package
# must already be installed for this call to succeed.
nlp = spacy.load("en_core_web_sm")


def spacy_pos(tokens):
    """Return a list of (token text, coarse POS tag) tuples for one email.

    spaCy operates on text rather than token lists, so the tokens are joined
    with single spaces and re-processed by the `nlp` pipeline; the pairs
    describe spaCy's own tokens of that joined string.
    """
    doc = nlp(" ".join(tokens))
    tagged = []
    for spacy_token in doc:
        tagged.append((spacy_token.text, spacy_token.pos_))
    return tagged

# Tag every email's cleaned tokens and store the (token, POS) pairs
emails['pos_tags'] = emails['clean_tokens'].map(spacy_pos)

# Preview the raw text next to the tokens and tags derived from it
preview_columns = ['email_text', 'clean_tokens', 'pos_tags']
print(emails[preview_columns].head())

In [None]:
# POS tags of the email whose id is 2 (first match)
emails.loc[emails['id'] == 2, 'pos_tags'].iloc[0]

In [None]:
# ---------------------------------------------
# Load phishing-related keywords from file
# ---------------------------------------------
# Each line in phishing_keywords.txt represents
# one keyword commonly found in phishing emails
# Read with an explicit encoding so the result does not depend on the
# platform's default; "utf-8-sig" reads plain UTF-8 and also discards a
# leading byte-order mark, which would otherwise stick to the first keyword.
with open("phishing_keywords.txt", "r", encoding="utf-8-sig") as f:
    # Strip whitespace, convert to lowercase,
    # and ignore empty lines
    keywords = [entry.lower() for entry in (line.strip() for line in f) if entry]


# ---------------------------------------------
# Function to flag presence of phishing keywords
# ---------------------------------------------
def keyword_flags(tokens, keyword_list=None):
    """
    Flag which phishing keywords occur among one email's tokens.

    Parameters
    ----------
    tokens : iterable of str
        Cleaned tokens of a single email.
    keyword_list : iterable of str, optional
        Keywords to look for. When omitted, the module-level `keywords`
        list is used.

    Returns
    -------
    dict
        Maps each keyword to 1 if it equals one of the tokens, else 0.
        Matching is exact and per token.
    """
    active_keywords = keywords if keyword_list is None else keyword_list
    # Build the lookup set once so each keyword check is a hash lookup
    # instead of a scan over the whole token list.
    present_tokens = set(tokens)
    return {k: int(k in present_tokens) for k in active_keywords}


# ---------------------------------------------
# Apply keyword flagging to each email
# ---------------------------------------------
# Expand the dictionary of keyword flags into
# individual dataframe columns
# One 0/1 column per keyword, one row per email. Building the frame from the
# list of dicts in a single call avoids creating a pandas Series per row,
# which is what .apply(pd.Series) does.
keyword_df = pd.DataFrame(
    emails["clean_tokens"].apply(keyword_flags).tolist(),
    index=emails.index,
)

# Merge keyword feature columns back into main dataset.
# Columns with the same names are dropped first so that re-running this cell
# replaces the flags instead of appending duplicate columns (duplicate column
# names break later steps such as DataFrame.explode).
emails = pd.concat(
    [emails.drop(columns=keyword_df.columns, errors="ignore"), keyword_df],
    axis=1,
)


# ---------------------------------------------
# Visualize phishing keyword frequency
# ---------------------------------------------
# Count how many emails contain each keyword
keyword_counts = keyword_df.sum().sort_values()

# Create a horizontal bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 20))
keyword_counts.plot.barh(ax=ax)

# Chart labeling
ax.set_title("Phishing Keyword Frequency")
ax.set_xlabel("Number of Emails Containing Keyword")
ax.set_ylabel("Keyword")

# Adjust layout to prevent label cutoff
fig.tight_layout()
plt.show()

In [None]:
# --------------------------------
# Count how often each token-POS pair appears
# --------------------------------
# Explode only the pos_tags column: one (token, POS) tuple per row.
# Exploding the whole frame would repeat every other column for each token
# and fails outright if the frame has duplicate column names.
# Emails with no tags explode to NaN and are dropped.
pos_pairs = emails["pos_tags"].explode().dropna()

pos_df_counts = (
    pd.DataFrame(pos_pairs.tolist(), columns=["token", "pos_tag"])
    .groupby(["token", "pos_tag"])
    .size()
    .reset_index(name="counts")
    .sort_values("counts", ascending=False)
)

# Show the 15 most common token + POS combinations
pos_df_counts.head(15)

In [None]:
# --------------------------------
# Count how many unique tokens belong to each POS tag
# --------------------------------

# Distinct tokens observed under each POS tag, largest group first
tokens_by_pos = pos_df_counts.groupby("pos_tag")["token"]
unique_tokens_per_pos = tokens_by_pos.nunique().reset_index(name="unique_token_count")
unique_tokens_per_pos = unique_tokens_per_pos.sort_values(
    "unique_token_count", ascending=False
)
unique_tokens_per_pos

In [None]:
# ---------------------------------------------
# VADER imports for rule-based sentiment analysis
# ---------------------------------------------
# NOTE(review): this import rebinds SentimentIntensityAnalyzer, which the
# imports cell already bound to nltk.sentiment's class of the same name.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Score the original email text rather than text_clean: VADER's rules use
# capitalisation, punctuation such as "!" and emoticons/emoji as intensity
# cues, and the cleaning step removes all of them.
# Missing values are scored as empty strings.
raw_email_text = emails["email_text"].fillna("").astype(str)

emails["vader_sentiment_score"] = raw_email_text.apply(
    lambda text: analyzer.polarity_scores(text)["compound"]
)

emails.head(15)

In [None]:
# Define sentiment score ranges for classification
# VADER compound scores range from -1 (very negative) to +1 (very positive)
bins = [-1, -0.1, 0.1, 1]

# Human-readable sentiment labels for each range
labels = ["Negative", "Neutral", "Positive"]

# Convert numeric VADER sentiment scores into categorical labels.
# pd.cut intervals are open on the left by default, so without
# include_lowest=True a score of exactly -1 would fall outside every bin and
# be labelled NaN.
emails["vader_sentiment_label"] = pd.cut(
    emails["vader_sentiment_score"],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Count emails per sentiment category. sort=False keeps the categorical order
# (Negative, Neutral, Positive) instead of ordering the bars by frequency.
sentiment_counts = emails["vader_sentiment_label"].value_counts(sort=False)
ax = sentiment_counts.plot.bar()

# Add a title and axis labels to the plot (the rows are emails, not reviews)
ax.set_title("VADER Sentiment Distribution")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Number of Emails")

# Display the plot
plt.show()

In [None]:
# Import the Hugging Face pipeline utility for easy model inference
from transformers import pipeline

# Load a pretrained BERT model fine-tuned specifically for phishing detection
# "text-classification" tells Transformers we want a classifier
# truncation=True ensures long emails are safely trimmed to the model's max length
phish_detection = pipeline(
    "text-classification",
    model="ealvaradob/bert-finetuned-phishing",
    truncation=True
)

# Classify all emails in one batched call instead of one call per row.
# NOTE(review): the model receives text_clean (lower-cased, punctuation
# stripped); confirm this matches the text the model was fine-tuned on.
email_texts = emails["text_clean"].tolist()
phishing_predictions = phish_detection(email_texts, batch_size=16)

# Keep the predictions alongside each email so later cells can filter or plot
# them, rather than only printing them.
emails["phishing_label"] = [pred["label"] for pred in phishing_predictions]
emails["phishing_score"] = [pred["score"] for pred in phishing_predictions]

# Preview instead of printing every email in full
emails[["id", "text_clean", "phishing_label", "phishing_score"]].head(15)