In [151]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score, classification_report, f1_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import re
import numpy as np


In [152]:
# Load the "imnim/multiclass-email-classification" dataset and keep its "train" split,
# which is the only split used in the cells below.
dataset = load_dataset("imnim/multiclass-email-classification")
train_data = dataset["train"]

# Check which columns are available before building features
print(train_data.column_names)
# Expected output: ['subject', 'body', 'labels']


['subject', 'body', 'labels']


In [153]:
# One document per email: subject and body joined by a single space
texts = [" ".join(pair) for pair in zip(train_data["subject"], train_data["body"])]

# Encode each email's list of labels as a multi-hot row (one column per class)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(train_data["labels"])

# Preview the first example: combined text, raw labels, multi-hot encoded labels
print(texts[0], train_data["labels"][0], y[0], sep="\n")


Meeting Reminder: Quarterly Sales Review Tomorrow Dear Team, Just a friendly reminder that our Quarterly Sales Review meeting is scheduled for tomorrow at 10:00 AM in the conference room. Please make sure to bring your sales reports and any relevant updates. Coffee and pastries will be provided. Looking forward to a productive meeting. Best regards, [Your Name]
['Business', 'Reminders']
[1 0 0 0 0 0 0 0 1 0]


In [154]:
# Hold out 20% of the emails as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42
)


In [155]:
# Tokenizer for the TF-IDF step: drops punctuation, digits and one-letter words
def custom_tokenizer(text):
    """Return the lowercase a-z words of length >= 2 found in ``text``, in order.

    The text is lowercased first; a word is only kept when it is delimited by
    word boundaries, so letter runs glued to digits (e.g. "abc123") are skipped.
    """
    word_pattern = re.compile(r'\b[a-z]{2,}\b')
    return word_pattern.findall(text.lower())

# TF-IDF features over the unigrams and bigrams produced by `custom_tokenizer`.
vectorizer = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    token_pattern=None,       # ignored when a tokenizer is supplied; None silences sklearn's warning
    stop_words='english',
    max_features=10000,       # cap the vocabulary at the 10,000 most frequent terms
    ngram_range=(1,2),        # unigrams + bigrams
    sublinear_tf=True         # scale term frequency as 1 + log(tf)
)

# Learn vocabulary and IDF weights from the training texts only, then reuse
# them for the test texts so no test information reaches the fitted features.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print("Training shape:", X_train_vec.shape)
print("Testing shape:", X_test_vec.shape)


Training shape: (1684, 10000)
Testing shape: (421, 10000)




In [156]:
# OneVsRestClassifier handles multi-label targets by wrapping the base estimator
# (ComplementNB here) and fitting it against the multi-hot matrix y_train.
model = OneVsRestClassifier(ComplementNB())
# Left as the last expression so the notebook displays the fitted estimator.
model.fit(X_train_vec, y_train)


0,1,2
,estimator,ComplementNB()
,n_jobs,
,verbose,0

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,
,norm,False


In [157]:
# Per-label probability estimates for the held-out emails
y_proba = model.predict_proba(X_test_vec)

# Decision cut-off (tunable): a label is assigned when its probability reaches it
threshold = 0.6

# Multi-label predictions as a 0/1 integer matrix
y_pred = np.where(y_proba >= threshold, 1, 0)


In [158]:
# Subset accuracy (strict): a sample counts as correct only when every label matches
subset_acc = accuracy_score(y_test, y_pred)
print("✅ Subset Accuracy:", subset_acc)

# Per-class classification report.
# Some samples get no label above the threshold, which leaves their per-sample
# precision undefined; zero_division=0 scores those cases as 0 explicitly
# (the same value the default uses) without emitting an undefined-metric warning.
print("\nClassification Report per Class:")
print(classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=0))

# F1 scores: micro pools all label decisions, macro averages the per-class F1
micro_f1 = f1_score(y_test, y_pred, average='micro', zero_division=0)
macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
print(f"\nMicro F1: {micro_f1:.3f}, Macro F1: {macro_f1:.3f}")


✅ Subset Accuracy: 0.6294536817102138

Classification Report per Class:
                      precision    recall  f1-score   support

            Business       0.76      0.91      0.83       174
    Customer Support       0.78      0.75      0.77        48
Events & Invitations       0.77      0.91      0.83       127
     Finance & Bills       0.83      0.98      0.90        63
     Job Application       1.00      0.81      0.89        26
         Newsletters       0.87      0.57      0.68        46
            Personal       1.00      0.23      0.38        52
          Promotions       0.72      0.78      0.75        27
           Reminders       0.66      0.67      0.67        70
   Travel & Bookings       1.00      0.97      0.98        58

           micro avg       0.79      0.80      0.80       691
           macro avg       0.84      0.76      0.77       691
        weighted avg       0.81      0.80      0.78       691
         samples avg       0.82      0.82      0.81       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [159]:
# Hand-written sample emails (subject followed by body) used as a quick sanity check
test_emails = [
    "Quarterly Sales Report Hi team, please find attached the Q3 sales report. We need to review the numbers before the next management meeting.",
    "Issue with my order #12345 Hello, I received the wrong item in my order. Can you please assist with a replacement?",
    "Invitation: Annual Company Gala You are cordially invited to attend our Annual Company Gala on December 5th. RSVP by November 20th.",
    "Your October Electricity Bill Dear customer, your electricity bill for October is $120. Please pay by November 15th to avoid late fees.",
    "Application for Software Developer Role Dear HR, I am writing to apply for the Software Developer position. My resume and portfolio are attached for your review."
]

# Featurise the samples with the fitted vectorizer and score every label
test_vec = vectorizer.transform(test_emails)
test_proba = model.predict_proba(test_vec)

# Binarise with a looser cut-off than the one used for the evaluation above
threshold = 0.3
test_pred = (test_proba >= threshold).astype(int)

# For each sample, list the three labels with the highest probability
for idx, (email, scores) in enumerate(zip(test_emails, test_proba), start=1):
    print(f"\nEmail {idx}: {email[:60]}...")
    ranked = sorted(zip(mlb.classes_, scores), key=lambda pair: pair[1], reverse=True)
    for label, p in ranked[:3]:
        print(f"{label}: {p:.3f}")



Email 1: Quarterly Sales Report Hi team, please find attached the Q3 ...
Business: 0.883
Reminders: 0.619
Finance & Bills: 0.338

Email 2: Issue with my order #12345 Hello, I received the wrong item ...
Customer Support: 0.583
Finance & Bills: 0.536
Business: 0.407

Email 3: Invitation: Annual Company Gala You are cordially invited to...
Events & Invitations: 0.994
Business: 0.689
Personal: 0.084

Email 4: Your October Electricity Bill Dear customer, your electricit...
Finance & Bills: 0.990
Customer Support: 0.071
Travel & Bookings: 0.042

Email 5: Application for Software Developer Role Dear HR, I am writin...
Job Application: 0.963
Customer Support: 0.213
Travel & Bookings: 0.162


In [160]:
email_text = """
Hello team,

I hope everyone is doing well. I am sharing the draft of the Q4 financial report for our review. 
Please go through the document carefully and check the budget allocations for each department. 
We also need to finalize any pending approvals, especially for upcoming projects and vendor payments. 
Let me know if there are discrepancies or suggestions for adjustments. 
Additionally, remember that the management meeting is scheduled for next Friday, 
where we will discuss key performance indicators, revenue targets, and upcoming initiatives for the next quarter. 
Your timely feedback will be greatly appreciated.

Best regards,
Finance Department
"""

# -----------------------------
# Vectorize the email with the already-fitted TF-IDF vectorizer
email_vec = vectorizer.transform([email_text])

# -----------------------------
# Predict probabilities using trained model
email_proba = model.predict_proba(email_vec)[0]  # get probabilities for the single email

# -----------------------------
# Apply the current threshold to get 0/1 label predictions.
# This must run after email_proba is computed; thresholding first would raise
# a NameError on a fresh kernel (or silently reuse a stale value).
email_pred = (email_proba >= threshold).astype(int)

# -----------------------------
# Display probabilities for all labels
print("Predicted probabilities for all labels:\n")
for label, proba in zip(mlb.classes_, email_proba):
    print(f"{label}: {proba:.3f}")


Predicted probabilities for all labels:

Business: 0.928
Customer Support: 0.074
Events & Invitations: 0.192
Finance & Bills: 0.287
Job Application: 0.033
Newsletters: 0.020
Personal: 0.128
Promotions: 0.008
Reminders: 0.484
Travel & Bookings: 0.017
