In [None]:
# Install dependencies
!pip install -U \
    pandas \
    numpy \
    scikit-learn \
    torch \
    transformers \
    datasets \
    huggingface_hub \
    fsspec \
    accelerate \
    evaluate


In [None]:
import os
# Limit OpenMP and MKL to one thread each. These variables are set before
# numpy / torch are imported below -- presumably so the libraries pick the
# limits up when they initialise (TODO confirm this ordering is required).
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

import torch

from torch.utils.data import Dataset, DataLoader
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification
)
from torch.optim import AdamW


In [None]:
# Restrict PyTorch to one intra-op thread and one inter-op thread.
# NOTE(review): the model-loading cell later calls torch.set_num_threads(2),
# which replaces the intra-op value set here -- confirm which one is intended.
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


In [None]:
# Load the train and test splits of SetFit/enron_spam (JSON Lines files)
# into pandas DataFrames.
splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}


def _read_split(split_name):
    """Read one split of SetFit/enron_spam, looked up by its key in `splits`."""
    return pd.read_json(
        "hf://datasets/SetFit/enron_spam/" + splits[split_name],
        lines=True,
    )


df_train = _read_split("train")
df_test = _read_split("test")

# Preview the first rows of the training split.
df_train.head()


In [None]:
# Class balance of each split: number of rows per label value.
for split_frame in (df_train, df_test):
    print(split_frame.label.value_counts())


In [None]:
# 2. Naive Bayes Baseline (TF-IDF)

In [None]:
# TF-IDF features over unigrams and bigrams, capped at 20000 features,
# with scikit-learn's built-in English stop-word list removed.
tfidf_params = dict(
    stop_words="english",
    max_features=20000,
    ngram_range=(1, 2),
)
vectorizer = TfidfVectorizer(**tfidf_params)

# The vocabulary is fitted on the training text only; the test text is
# projected onto that same vocabulary.
X_train_tfidf = vectorizer.fit_transform(df_train["text"])
X_test_tfidf = vectorizer.transform(df_test["text"])

In [None]:
# Fit the Naive Bayes baseline on the TF-IDF training matrix (fit returns the estimator).
nb_model = MultinomialNB().fit(X_train_tfidf, df_train["label"])

# Hard predictions and per-class probabilities for the test set.
y_pred_nb = nb_model.predict(X_test_tfidf)
y_proba_nb = nb_model.predict_proba(X_test_tfidf)

nb_true_labels = df_test["label"]
print("Naive Bayes Accuracy:", accuracy_score(nb_true_labels, y_pred_nb))

nb_report = classification_report(
    nb_true_labels,
    y_pred_nb,
    target_names=["Ham", "Spam"],
)
print(nb_report)

In [None]:
# Confusion matrix of the Naive Bayes predictions against the test labels.
cm = confusion_matrix(y_true=df_test["label"], y_pred=y_pred_nb)
print(cm)


In [None]:
# Re-derive the Naive Bayes predictions from the per-class log-probabilities.
log_proba_nb = nb_model.predict_log_proba(X_test_tfidf)

# argmax gives the *column position* of the most likely class. Columns follow
# the order of nb_model.classes_, so map the position back to the actual label
# instead of assuming that position i means label i.
y_pred_nb = nb_model.classes_[log_proba_nb.argmax(axis=1)]

print("Naive Bayes Accuracy:", accuracy_score(df_test["label"], y_pred_nb))
print(classification_report(
    df_test["label"], y_pred_nb,
    target_names=["Ham", "Spam"]
))

In [None]:
# Per-class feature log-probabilities of the fitted Naive Bayes model as a
# labelled table: one row per class, one column per TF-IDF feature.

# Display names for the integer labels (0 = Ham, 1 = Spam, matching the
# target_names used in the classification reports).
LABEL_NAMES = {0: "Ham", 1: "Spam"}

feature_names = vectorizer.get_feature_names_out()

log_prob_df = pd.DataFrame(
    nb_model.feature_log_prob_,
    columns=feature_names,
    # Rows of feature_log_prob_ follow nb_model.classes_; name them from that
    # order so a different class ordering cannot silently swap Ham and Spam
    # (an unexpected label raises KeyError instead).
    index=[LABEL_NAMES[label] for label in nb_model.classes_],
)



In [None]:
# Spam-minus-Ham log-probability gap for the feature "free".
free_log_probs = log_prob_df["free"]
free_log_probs["Spam"] - free_log_probs["Ham"]


In [None]:
# Spam-minus-Ham log-probability gap for a few hand-picked features.
for probe_word in ("company", "new", "fw"):
    print(log_prob_df.loc["Spam", probe_word] - log_prob_df.loc["Ham", probe_word])

In [None]:
# Log-probability of the feature "new" under the Spam class.
log_prob_df.at["Spam", "new"]

In [None]:
# Per-feature difference between the Spam and Ham log-probabilities, ordered
# from the largest (most spam-leaning) to the smallest difference.
spam_minus_ham = log_prob_df.loc["Spam"] - log_prob_df.loc["Ham"]
log_odds = spam_minus_ham.sort_values(ascending=False)

# The 20 features with the largest Spam-minus-Ham difference.
log_odds.head(20)


In [None]:
# The 20 features at the bottom of the descending ranking, i.e. those with the
# smallest Spam-minus-Ham difference.
log_odds.iloc[-20:]

In [None]:
# One row per test email: text, true label, Naive Bayes prediction and the two
# class probabilities, plus the probability of the more likely class.
results_nb = pd.DataFrame({
    "text": df_test["text"].values,
    "true_label": df_test["label"].values,
    "predicted_label": y_pred_nb,
    "prob_ham": y_proba_nb[:, 0],
    "prob_spam": y_proba_nb[:, 1],
}).assign(
    confidence=lambda frame: frame[["prob_ham", "prob_spam"]].max(axis=1)
)


In [None]:
# Test emails where the Naive Bayes prediction disagrees with the true label.
nb_is_wrong = results_nb["true_label"] != results_nb["predicted_label"]
misclassified_nb = results_nb.loc[nb_is_wrong]


In [None]:
# Split the Naive Bayes errors by direction.
nb_true = misclassified_nb["true_label"]
nb_pred = misclassified_nb["predicted_label"]

# Ham (0) predicted as Spam (1).
false_positives_nb = misclassified_nb.loc[(nb_true == 0) & (nb_pred == 1)]

# Spam (1) predicted as Ham (0).
false_negatives_nb = misclassified_nb.loc[(nb_true == 1) & (nb_pred == 0)]

# Errors made with a winning-class probability above 0.9.
wrong_mask = results_nb["true_label"] != results_nb["predicted_label"]
confident_mask = results_nb["confidence"] > 0.9
confident_wrong = results_nb.loc[wrong_mask & confident_mask]

confident_wrong.head()

In [None]:
def print_nb_examples(df, n=5):
    """Print the first ``n`` rows of ``df`` as readable email examples.

    Args:
        df: DataFrame with ``text``, ``true_label`` and ``predicted_label``
            columns; a label equal to 1 is shown as "Spam", anything else as "Ham".
        n: Maximum number of rows to print.
    """
    subset = df.head(n)
    columns = (subset["text"], subset["true_label"], subset["predicted_label"])
    for text, true_label, predicted_label in zip(*columns):
        true_name = "Spam" if true_label == 1 else "Ham"
        predicted_name = "Spam" if predicted_label == 1 else "Ham"
        print("=" * 80)
        print("TRUE LABEL:", true_name)
        print("PREDICTED:", predicted_name)
        print("EMAIL TEXT:")
        # Only the first 1000 characters are shown to keep long emails readable.
        print(text[:1000])
        print()


# Show sample Naive Bayes errors: up to 5 false positives and 14 false negatives.
for heading, examples, limit in (
    ("\nFALSE POSITIVES", false_positives_nb, 5),
    ("\nFALSE NEGATIVES", false_negatives_nb, 14),
):
    print(heading)
    print_nb_examples(examples, n=limit)


In [None]:
# Most Common Words in Spam vs Ham Emails (Stopwords removed)

In [None]:
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re


In [None]:
# scikit-learn's English stop words plus a few extra tokens that are excluded
# from the word counts.
EXTRA_STOPWORDS = ("com", "www", "http", "subject", "message", "ect", "hou")

stopwords = set(ENGLISH_STOP_WORDS)
stopwords.update(EXTRA_STOPWORDS)


def tokenize(text):
    """Split ``text`` into lowercase alphabetic tokens.

    Every character other than a-z or whitespace is replaced by a space before
    splitting. Tokens of two characters or fewer and tokens found in the
    module-level ``stopwords`` set are dropped.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    cleaned = re.sub(r"[^a-z\s]", " ", text.lower())
    return [
        word
        for word in cleaned.split()
        if len(word) > 2 and word not in stopwords
    ]

In [None]:
# Training emails split by label (1 = spam, 0 = ham).
spam_texts = df_train.loc[df_train["label"] == 1, "text"]
ham_texts = df_train.loc[df_train["label"] == 0, "text"]

# Flatten the tokens of every email into one list per class.
spam_tokens = [token for email in spam_texts for token in tokenize(email)]
ham_tokens = [token for email in ham_texts for token in tokenize(email)]

In [None]:
# Token frequencies per class.
spam_counter, ham_counter = Counter(spam_tokens), Counter(ham_tokens)

# The 20 most frequent tokens in spam and in ham, as a (spam, ham) pair.
tuple(counter.most_common(20) for counter in (spam_counter, ham_counter))

In [None]:
# Log-Odds (NOT USED!)

In [None]:
import math

# Every token seen in either class.
vocab = set(spam_counter.keys()) | set(ham_counter.keys())


# Named `token_log_odds` rather than `log_odds` so that it does not overwrite
# the `log_odds` Series built from the Naive Bayes model in an earlier cell;
# with the old name, re-running those earlier cells after this one failed.
def token_log_odds(word, alpha=1):
    """Return the smoothed log ratio of spam to ham counts for ``word``.

    Computes ``log((spam_count + alpha) / (ham_count + alpha))`` from the raw
    token counts in ``spam_counter`` and ``ham_counter``. The counts are not
    normalised by the total number of tokens in each class.

    Args:
        word: Token to score.
        alpha: Additive smoothing applied to both counts. With ``alpha=0`` a
            token missing from one class makes the ratio zero or undefined
            and the call raises.

    Returns:
        float: Positive when the token is more frequent in spam, negative
        when it is more frequent in ham.
    """
    spam_count = spam_counter.get(word, 0) + alpha
    ham_count = ham_counter.get(word, 0) + alpha
    return math.log(spam_count / ham_count)


log_odds_scores = {
    word: token_log_odds(word) for word in vocab
}

# 20 highest-scoring (most spam-leaning) tokens.
top_spam_words = sorted(
    log_odds_scores.items(),
    key=lambda x: x[1],
    reverse=True
)[:20]

# 20 lowest-scoring (most ham-leaning) tokens.
top_ham_words = sorted(
    log_odds_scores.items(),
    key=lambda x: x[1]
)[:20]

top_spam_words, top_ham_words


In [None]:
# DistilBERT

In [None]:
class EmailDataset(Dataset):
    """Dataset that tokenizes one email at a time, on access.

    Args:
        texts: Indexable collection of email texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked with the text and ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword
            arguments; its result must provide ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_length: Length every example is truncated or padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tensors for example ``idx`` as a dict.

        Keys are ``input_ids``, ``attention_mask`` and ``labels``.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading batch dimension of size 1;
        # squeeze it so the DataLoader can stack examples itself.
        item = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
# Pretrained DistilBERT tokenizer and a two-class sequence classifier built on
# the same checkpoint. Eager attention is requested at load time.
MODEL_NAME = "distilbert-base-uncased"

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    attn_implementation="eager",
)

# NOTE(review): an earlier cell calls torch.set_num_threads(1); this call
# replaces that value with 2 -- confirm which setting is intended.
torch.set_num_threads(2)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ok else "cpu")
print("CUDA available:", cuda_ok)

model.to(device)
# Confirm where the model parameters ended up.
print(next(model.parameters()).device)

In [None]:
# Check if graphics card uses memory
#!nvidia-smi


In [None]:
# Report the PyTorch build and whether it can use CUDA.
for caption, info in (
    ("PyTorch version:", torch.__version__),
    ("CUDA available in PyTorch:", torch.cuda.is_available()),
    ("CUDA version PyTorch was built with:", torch.version.cuda),
):
    print(caption, info)


In [None]:
# Wrap the train / test frames as datasets that tokenize on access.
train_dataset = EmailDataset(
    df_train["text"].tolist(),
    df_train["label"].tolist(),
    tokenizer
)

test_dataset = EmailDataset(
    df_test["text"].tolist(),
    df_test["label"].tolist(),
    tokenizer
)

# Training batches are reshuffled every epoch so the optimiser does not see
# the emails in the same fixed file order each time; the seeded generator
# keeps the shuffle order reproducible across runs.
# pin_memory is only enabled when CUDA is available, since it has no benefit
# for CPU-only training.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=0,
    pin_memory=torch.cuda.is_available(),
    generator=torch.Generator().manual_seed(42),
)

# The test loader keeps dataset order: the evaluation cell relies on it to map
# predictions back to rows of df_test.
test_loader = DataLoader(test_dataset, batch_size=32)


In [None]:
# Fine-tune every model parameter with AdamW for a fixed number of epochs.
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 5

model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        # Move the tensors the model needs onto the same device as the model.
        inputs = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        }

        # Passing labels makes the model return the training loss.
        loss = model(**inputs).loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean batch loss for the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


In [None]:
# Put the model in evaluation mode before running inference on the test set.
# As the last expression in the cell, the returned module is also displayed.
model.eval() 

In [None]:
# Collect predictions, true labels and df_test row positions for the test set.
all_preds = []
all_labels = []
all_indices = []

# Row position of the first email in the upcoming batch; valid because the
# test loader iterates the dataset in order.
next_row = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # No labels are passed, so only the logits are needed here.
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        rows_in_batch = labels.size(0)
        all_indices.extend(range(next_row, next_row + rows_in_batch))
        next_row += rows_in_batch



In [None]:


# Headline metrics for DistilBERT on the test set. Precision, recall and F1
# use the "binary" average.
accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1 = (
    metric(all_labels, all_preds, average="binary")
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 score:  {f1:.4f}")

cm = confusion_matrix(all_labels, all_preds)
print("\nConfusion Matrix:")
print(cm)

In [None]:
# Per-class precision / recall / F1 for the DistilBERT predictions.
distilbert_report = classification_report(
    all_labels,
    all_preds,
    target_names=["Ham", "Spam"],
)
print("\nClassification Report:")
print(distilbert_report)

In [None]:
# One row per evaluated test email: its position in df_test, the text, the
# true label and the DistilBERT prediction.
evaluated_texts = df_test["text"].iloc[all_indices].values

results_df = pd.DataFrame({
    "index": all_indices,
    "text": evaluated_texts,
    "true_label": all_labels,
    "predicted_label": all_preds,
})

# Rows where the prediction disagrees with the true label.
is_wrong = results_df["true_label"] != results_df["predicted_label"]
misclassified = results_df.loc[is_wrong]


In [None]:
# Split the DistilBERT errors by direction.
true_col = misclassified["true_label"]
pred_col = misclassified["predicted_label"]

# Ham (0) predicted as Spam (1).
false_positives = misclassified.loc[(true_col == 0) & (pred_col == 1)]

# Spam (1) predicted as Ham (0).
false_negatives = misclassified.loc[(true_col == 1) & (pred_col == 0)]


In [None]:
def print_examples(df, n=5):
    """Print the first ``n`` rows of ``df`` as readable email examples.

    Args:
        df: DataFrame with ``text``, ``true_label`` and ``predicted_label``
            columns; a label equal to 1 is shown as "Spam", anything else as "Ham".
        n: Maximum number of rows to print.
    """
    shown = df.head(n)
    columns = (shown["text"], shown["true_label"], shown["predicted_label"])
    for text, true_label, predicted_label in zip(*columns):
        true_name = "Spam" if true_label == 1 else "Ham"
        predicted_name = "Spam" if predicted_label == 1 else "Ham"
        print("="*80)
        print("TRUE LABEL:", true_name)
        print("Predicted:", predicted_name)
        print("Email text:")
        # Only the first 1000 characters are shown to keep long emails readable.
        print(text[:1000])
        print()

# Show sample DistilBERT errors: up to 5 false positives and 13 false negatives.
for heading, examples, limit in (
    ("FALSE POSITIVES", false_positives, 5),
    ("\nFALSE NEGATIVES", false_negatives, 13),
):
    print(heading)
    print_examples(examples, n=limit)


Attention

In [None]:
# Inspect how much attention the first position ([CLS]) pays to each token of
# one test email, in the last transformer layer.

# The email is chosen explicitly instead of reusing whatever `input_ids` /
# `attention_mask` the evaluation loop happened to leave behind, so this cell
# does not depend on leftover loop state.
ATTENTION_EXAMPLE_IDX = 0

example = test_dataset[ATTENTION_EXAMPLE_IDX]
# Add the batch dimension the model expects.
input_ids = example["input_ids"].unsqueeze(0).to(device)
attention_mask = example["attention_mask"].unsqueeze(0).to(device)

model.eval()
with torch.no_grad():
    # Attention weights are requested for this call only, rather than by
    # mutating model.config, so other forward passes (e.g. re-running the
    # evaluation loop) are not changed. The model was loaded with
    # attn_implementation="eager", which can return the weights.
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        output_attentions=True,
        return_dict=True,
    )

attentions = outputs.attentions

# Last layer, first (and only) sequence in the batch.
attn = attentions[-1][0]
# Average over the leading dimension -- presumably the attention heads
# (TODO confirm against the model's documented output shape).
attn_mean = attn.mean(dim=0)

# Keep only real tokens: padded positions carry no content and would
# otherwise appear in the ranking as "[PAD]" entries.
real_positions = attention_mask[0].bool()
tokens = tokenizer.convert_ids_to_tokens(input_ids[0][real_positions].tolist())

# Row 0 holds the attention paid by the first position to every position.
cls_attention = attn_mean[0][real_positions]

token_attention = list(
    zip(tokens, cls_attention.cpu().numpy())
)


In [None]:
# Rank the (token, attention) pairs from highest to lowest attention score.
top_tokens = list(token_attention)
top_tokens.sort(key=lambda pair: pair[1], reverse=True)

# Print the 15 highest-scoring tokens, right-aligned.
for tok, score in top_tokens[:15]:
    print(f"{tok:>15s}  {score:.4f}")
