In [1]:
# Core libraries
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score, f1_score, classification_report, multilabel_confusion_matrix, ConfusionMatrixDisplay

# NLTK for stopwords and stemming
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download NLTK stopwords if needed
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hout\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset("imnim/multiclass-email-classification")
train_data = dataset["train"]

# Combine subject + body
texts_raw = [s + " " + b for s, b in zip(train_data["subject"], train_data["body"])]
labels_raw = train_data["labels"]

print("Sample text:", texts_raw[0])
print("Sample labels:", labels_raw[0])


  from .autonotebook import tqdm as notebook_tqdm


Sample text: Meeting Reminder: Quarterly Sales Review Tomorrow Dear Team, Just a friendly reminder that our Quarterly Sales Review meeting is scheduled for tomorrow at 10:00 AM in the conference room. Please make sure to bring your sales reports and any relevant updates. Coffee and pastries will be provided. Looking forward to a productive meeting. Best regards, [Your Name]
Sample labels: ['Business', 'Reminders']


In [3]:
def preprocess_email(text):
    """
    Enhanced normalization:
    - Remove placeholders like [Your Name], [Recipient], etc.
    - Lowercase
    - Remove punctuation
    - Remove stopwords
    - Simple stemming
    - Remove extra spaces
    """
    # Remove anything inside square brackets
    text = re.sub(r'\[.*?\]', '', text)
    
    # Lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize, remove stopwords, and stem
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words and len(word) > 1]
    
    # Recombine
    return " ".join(tokens)

# Apply preprocessing
texts = [preprocess_email(t) for t in texts_raw]
labels = labels_raw  # keep as is for now


In [4]:
# Convert labels to binary matrix
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(labels)

# Split
X_train, X_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42
)

# Keep raw text for later display
X_train_raw, X_test_raw = train_test_split(
    texts_raw, test_size=0.2, random_state=42
)
labels_train, labels_test = train_test_split(y, test_size=0.2, random_state=42)


In [5]:
# TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1,2),  # unigrams + bigrams
    sublinear_tf=True
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# OneVsRestClassifier with ComplementNB
model = OneVsRestClassifier(ComplementNB())
model.fit(X_train_vec, y_train)


0,1,2
,estimator,ComplementNB()
,n_jobs,
,verbose,0

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,
,norm,False


In [6]:
# Predict with threshold 0.5
threshold = 0.5
y_proba_test = model.predict_proba(X_test_vec)
y_pred_test = (y_proba_test >= threshold).astype(int)

# Subset accuracy
subset_acc = accuracy_score(y_test, y_pred_test)
print("âœ… Subset Accuracy:", subset_acc)

# Per-class classification report
print("\nClassification Report per Class:")
print(classification_report(y_test, y_pred_test, target_names=mlb.classes_))

# Micro / Macro F1
micro_f1 = f1_score(y_test, y_pred_test, average='micro')
macro_f1 = f1_score(y_test, y_pred_test, average='macro')
print(f"\nMicro F1: {micro_f1:.3f}, Macro F1: {macro_f1:.3f}")


âœ… Subset Accuracy: 0.6175771971496437

Classification Report per Class:
                      precision    recall  f1-score   support

            Business       0.75      0.93      0.83       174
    Customer Support       0.80      0.73      0.76        48
Events & Invitations       0.75      0.91      0.82       127
     Finance & Bills       0.87      0.98      0.93        63
     Job Application       1.00      0.81      0.89        26
         Newsletters       0.88      0.46      0.60        46
            Personal       1.00      0.21      0.35        52
          Promotions       0.72      0.78      0.75        27
           Reminders       0.68      0.67      0.68        70
   Travel & Bookings       1.00      0.95      0.97        58

           micro avg       0.79      0.79      0.79       691
           macro avg       0.85      0.74      0.76       691
        weighted avg       0.81      0.79      0.78       691
         samples avg       0.82      0.82      0.80     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [7]:
def analyze_test_example(index, threshold=0.5):
    email_text = preprocess_email(X_test_raw[index])
    true_labels_bin = labels_test[index]
    true_labels_names = [mlb.classes_[i] for i, val in enumerate(true_labels_bin) if val == 1]

    # Vectorize & predict probabilities
    vec = vectorizer.transform([email_text])
    proba = model.predict_proba(vec)[0]

    # Show email preview
    print("\nðŸ“© EMAIL PREVIEW:", X_test_raw[index][:200], "...")
    print("ðŸŽ¯ TRUE LABELS:", true_labels_names)

    # Show label probabilities
    print("\nðŸ“Š LABEL PROBABILITIES:")
    for label, p in zip(mlb.classes_, proba):
        print(f"- {label}: {p:.3f}")

    # Predict using threshold
    pred_bin = (proba >= threshold).astype(int)
    pred_labels = [mlb.classes_[i] for i, val in enumerate(pred_bin) if val == 1]

    print(f"\nPredicted Label (Threshold {threshold}): {pred_labels}")

    return {
        "index": index,
        "text": X_test_raw[index],
        "true_labels": true_labels_names,
        "probs": dict(zip(mlb.classes_, proba)),
        "predicted_labels": pred_labels
    }

# Example usage
result = analyze_test_example(235)



ðŸ“© EMAIL PREVIEW: Upcoming Webinar: Maximizing Your Business's Online Presence Dear valued customers, We are excited to announce our upcoming webinar on 'Maximizing Your Business's Online Presence.' Join us on Thursday ...
ðŸŽ¯ TRUE LABELS: ['Business', 'Events & Invitations']

ðŸ“Š LABEL PROBABILITIES:
- Business: 0.996
- Customer Support: 0.004
- Events & Invitations: 0.998
- Finance & Bills: 0.001
- Job Application: 0.001
- Newsletters: 0.128
- Personal: 0.002
- Promotions: 0.002
- Reminders: 0.003
- Travel & Bookings: 0.001

Predicted Label (Threshold 0.5): ['Business', 'Events & Invitations']
