In [None]:
from transformers import pipeline
import nltk
# Fetch the NLTK data packages this notebook relies on: the 'punkt_tab'
# tokenizer data (presumably what word_tokenize needs) and the stop-word
# corpus. quiet=True keeps the download log out of the cell output.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import pandas as pd  # provides pd.isna (used in preprocess) and pd.DataFrame (results table)

In [None]:
# English stop-word vocabulary from NLTK, materialised once as a set so the
# per-token membership checks in preprocess() are fast.
stop_words = {*stopwords.words('english')}

def preprocess(text):
    """Reduce raw text to a space-separated string of informative tokens.

    Steps, in order: lower-case the input, replace every character that is
    not an ASCII letter or whitespace with a space, collapse whitespace runs,
    tokenize with ``word_tokenize``, then drop tokens that are in the
    module-level ``stop_words`` set or are two characters long or shorter.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` when ``text``
        is missing according to ``pd.isna`` (or when no token survives).
    """
    # Missing values short-circuit to an empty string.
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', lowered)
    normalised = re.sub(r'\s+', ' ', letters_only)

    kept = []
    for word in word_tokenize(normalised):
        # Very short tokens carry little signal; skip them.
        if len(word) <= 2:
            continue
        if word in stop_words:
            continue
        kept.append(word)
    return ' '.join(kept)

In [None]:
# Candidate categories the zero-shot model chooses between for each email.
labels = [
    "business",
    "personal",
    "promotions",
    "spam",
    "education",
]

# Zero-shot classification pipeline backed by the BART-large MNLI checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [None]:
# Hand-written smoke-test emails; the note above each entry is the category
# the author expects the classifier to pick.
sample_emails = [
    # Expected: promotions
    "Get 40% off Premium Black Friday deal QuillBot upgrade now sign up by Dec 2025",
    # Expected: promotions / education
    "Last chance Pro plan free for university students Gemini no cost 12 months December 2025 terms apply",
    # Expected: spam
    "Security alert verify account now or suspended",
    # Vague text -> expected: personal
    "Doooooo Ggggg",
    # Expected: business (Enron-like)
    "Meeting agenda Q4 review attached team call 2pm",
]

# Classify every sample email and collect one record per email.
predictions = []
for email in sample_emails:
    cleaned = preprocess(email)
    if cleaned:
        result = classifier(cleaned, labels, multi_label=False)
        # The pipeline returns labels and scores ordered from most to least
        # likely, so index 0 is the top prediction.
        pred = result['labels'][0]
        # Keep confidence numeric (rounded to 2 decimals) so the resulting
        # column can be sorted and thresholded.
        conf = round(float(result['scores'][0]), 2)
    else:
        # preprocess() can reduce an email to "" (missing value, or only
        # stop-words / digits / punctuation). The zero-shot pipeline rejects
        # an empty sequence, so record "no prediction" instead of crashing.
        pred = None
        conf = float('nan')
    predictions.append({
        'raw_email': email,
        'cleaned_text': cleaned,
        'predicted_category': pred,
        'confidence': conf
    })

In [None]:
# Gather the per-email prediction records into a table and print the whole
# table as plain text without the index column.
test_df = pd.DataFrame(predictions)
# Header emoji restored: the original literal was the UTF-8 bytes of the
# test-tube emoji mis-decoded as cp1252.
print("🧪 Zero-Shot Model Test Results:")
print(test_df.to_string(index=False))