In [2]:

import os
import re
import argparse
from email import policy
from email.parser import BytesParser

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Ensure NLTK stopwords are downloaded
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)      # remove ursls
    text = re.sub(r'[^a-z0-9\s]', ' ', text)               # and punctuation
    text = re.sub(r'\s+', ' ', text).strip()    # add stemming or lemmatizngn??
    return text
print("done")

done


In [4]:
def load_emails(maildir: str):
    texts, labels = [], []
    for user in os.listdir(maildir):
        user_dir = os.path.join(maildir, user)
        if not os.path.isdir(user_dir):
            continue

        # only look at the “sent_items” folder
        folder_dir = os.path.join(user_dir, "sent_items")
        if not os.path.isdir(folder_dir):
            continue

        for fname in os.listdir(folder_dir):
            path = os.path.join(folder_dir, fname)
            try:
                with open(path, 'rb') as f:
                    msg = BytesParser(policy=policy.default).parse(f) #read binary
                body = msg.get_body(preferencelist=('plain',))
                if body is None:
                    continue
                raw = body.get_content()
                text = clean_text(raw)
                if not text:
                    continue
                tokens = [w for w in text.split() if w not in STOPWORDS]  #cleanning. Added lemmentatizn or stemming?
                texts.append(' '.join(tokens))
                labels.append(user)
            except Exception:
                continue

    return texts, labels
print("done")

done


In [5]:
DATA_DIR = "../maildir"     # path to Enron stuf
TEST_SIZE = 0.2
RANDOM_STATE = 42

print("Loading and cleaning emails…")
texts, labels = load_emails(DATA_DIR)
print(f"→ {len(texts)} messages from {len(set(labels))} authors")

Loading and cleaning emails…
→ 6400 messages from 20 authors


In [6]:
print("Vectorizing with TF–IDF…")
vect = TfidfVectorizer(max_features=20_000)
X = vect.fit_transform(texts)
y = labels

Vectorizing with TF–IDF…


In [7]:
X_train, X_test, y_train, y_test = train_test_split( #split and train
    X, y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=RANDOM_STATE
)
print(f"→ Training on {X_train.shape[0]} docs; testing on {X_test.shape[0]}")

→ Training on 5120 docs; testing on 1280


In [8]:
models = {
    "Naive Bayes": MultinomialNB(alpha=1.0),
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Decision Tree": DecisionTreeClassifier(max_depth=20)
}

for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, preds))


Naive Bayes
              precision    recall  f1-score   support

    badeer-r       0.00      0.00      0.00         1
    benson-r       0.00      0.00      0.00         3
     blair-l       0.57      0.99      0.73       186
      cash-m       1.00      0.84      0.92       109
 causholli-m       1.00      0.62      0.77        45
    corman-s       0.94      0.46      0.62       125
    cuilla-m       0.00      0.00      0.00        28
     heard-m       0.86      0.96      0.91       157
    keiser-k       1.00      0.60      0.75        73
       lay-k       0.00      0.00      0.00         3
    martin-t       0.00      0.00      0.00        45
       may-l       0.00      0.00      0.00         8
     mckay-b       0.00      0.00      0.00         9
      neal-s       0.00      0.00      0.00        45
    presto-k       0.45      0.97      0.61       191
   quigley-d       0.99      0.72      0.83       100
      ring-r       0.00      0.00      0.00        10
  scholtes-d  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

    badeer-r       0.00      0.00      0.00         1
    benson-r       0.00      0.00      0.00         3
     blair-l       0.96      0.99      0.98       186
      cash-m       0.99      0.97      0.98       109
 causholli-m       0.98      0.96      0.97        45
    corman-s       0.87      0.90      0.89       125
    cuilla-m       1.00      0.43      0.60        28
     heard-m       0.97      0.97      0.97       157
    keiser-k       0.99      0.90      0.94        73
       lay-k       0.00      0.00      0.00         3
    martin-t       0.95      0.78      0.85        45
       may-l       1.00      0.12      0.22         8
     mckay-b       1.00      0.33      0.50         9
      neal-s       0.91      0.71      0.80        45
    presto-k       0.67      0.98      0.80       191
   quigley-d       0.90      0.85      0.88       100
      ring-r       1.00      0.10      0.18        10
  scholtes-d       1.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
