In [None]:

import argparse
import os
import re
import warnings
from email import policy
from email.parser import BytesParser

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# Ensure the NLTK stop-word corpus is downloaded before it is read on the next
# statement (the download must come first, so keep this order).
nltk.download('stopwords', quiet=True)
# English stop words held as a set; load_emails uses it for membership tests
# when filtering tokens.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    """Normalise raw text into lowercase, space-separated alphanumeric tokens.

    The input is lowercased, then three regex substitutions run in order:
    URLs are removed, every character outside ``a-z``, ``0-9`` and whitespace
    becomes a space, and whitespace runs collapse to a single space. Leading
    and trailing whitespace is stripped from the result.

    Args:
        text: Text to normalise.

    Returns:
        The normalised string; empty when nothing alphanumeric remains.
    """
    # (pattern, replacement) pairs. Order matters: URLs have to go before the
    # punctuation pass breaks them into fragments.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),   # drop URLs
        (r'[^a-z0-9\s]', ' '),            # punctuation/symbols -> space
        (r'\s+', ' '),                    # collapse whitespace runs
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    # TODO: consider adding stemming or lemmatisation here.
    return cleaned.strip()

In [None]:
def _read_plain_body(path: str):
    """Return the ``text/plain`` body of the message stored at ``path``, or None if it has none."""
    with open(path, 'rb') as f:
        # Parse from bytes so the email package handles the declared charset.
        msg = BytesParser(policy=policy.default).parse(f)
    body = msg.get_body(preferencelist=('plain',))
    return None if body is None else body.get_content()


def load_emails(maildir: str):
    """Load cleaned email bodies from a ``<maildir>/<user>/<folder>/<file>`` tree.

    Every regular file exactly two directory levels below ``maildir`` is parsed
    as an email message. Its plain-text body is normalised with ``clean_text``,
    stop words (``STOPWORDS``) are removed, and the remaining tokens are joined
    with single spaces. Messages without a plain-text body, or with no tokens
    left after cleaning, are skipped. Sub-directories inside ``<folder>`` are
    not descended into.

    Directory listings are sorted, so the returned order does not depend on the
    filesystem and a seeded train/test split over the result is reproducible.

    NOTE(review): the label is the name of the top-level ``<user>`` directory,
    not the message's From header -- confirm that this is the intended notion
    of "author" for folders that hold received mail.

    Args:
        maildir: Root directory; each immediate sub-directory supplies one label.

    Returns:
        ``(texts, labels)``: two equal-length lists, where ``texts[i]`` is the
        cleaned document and ``labels[i]`` the ``<user>`` directory it came from.

    Raises:
        OSError: If ``maildir`` (or a directory below it) cannot be listed.

    Warns:
        UserWarning: Once, after loading, if any file could not be read or parsed.
    """
    texts, labels = [], []
    skipped = 0  # files that exist but could not be opened or parsed

    # os.listdir returns entries in arbitrary order; sort at every level.
    for user in sorted(os.listdir(maildir)):
        user_dir = os.path.join(maildir, user)
        if not os.path.isdir(user_dir):
            continue
        for folder in sorted(os.listdir(user_dir)):
            folder_dir = os.path.join(user_dir, folder)
            if not os.path.isdir(folder_dir):
                continue
            for fname in sorted(os.listdir(folder_dir)):
                path = os.path.join(folder_dir, fname)
                if not os.path.isfile(path):
                    # Nested folders are out of scope; skip them explicitly
                    # instead of relying on open() failing.
                    continue
                try:
                    raw = _read_plain_body(path)
                except Exception:
                    # Best effort: one malformed message must not abort the
                    # load, but it is counted so the loss is visible. Only file
                    # reading/parsing is guarded, so a missing clean_text or
                    # STOPWORDS raises instead of yielding an empty corpus.
                    skipped += 1
                    continue
                if raw is None:
                    continue
                text = clean_text(raw)
                if not text:
                    continue
                tokens = [w for w in text.split() if w not in STOPWORDS]
                if not tokens:
                    # Only stop words were left; an empty document carries no signal.
                    continue
                texts.append(' '.join(tokens))
                labels.append(user)

    if skipped:
        warnings.warn(
            f"load_emails: skipped {skipped} unreadable or unparsable file(s) under {maildir!r}",
            stacklevel=2,
        )
    return texts, labels


In [None]:
DATA_DIR = "maildir"     # root of the Enron maildir tree (<user>/<folder>/<message>)
TEST_SIZE = 0.2          # fraction of documents held out for testing
RANDOM_STATE = 42        # seed passed to the train/test split

print("Loading and cleaning emails…")
texts, labels = load_emails(DATA_DIR)
print(f"→ {len(texts)} messages from {len(set(labels))} authors")

# load_emails skips anything it cannot use, so a wrong directory layout yields
# an empty corpus rather than an error. Stop here with an actionable message
# instead of failing later inside the vectorizer.
if not texts:
    raise ValueError(
        f"No usable emails found under {DATA_DIR!r}; "
        "expected a <user>/<folder>/<message> directory layout"
    )

In [None]:
# Upper bound on the number of TF-IDF features kept by the vectorizer.
MAX_FEATURES = 20_000

print("Vectorizing with TF–IDF…")
vect = TfidfVectorizer(max_features=MAX_FEATURES)
# Fits on every document in `texts` and returns the document-term matrix.
X = vect.fit_transform(texts)
# Targets: one label per document, aligned with the rows of X.
y = labels

In [None]:
# Split the cleaned documents themselves rather than the TF-IDF matrix built
# from the whole corpus: fitting the vectorizer before splitting lets the
# vocabulary and IDF weights see the held-out documents (data leakage).
# Seed, test size and stratification are unchanged.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=RANDOM_STATE
)
# Learn vocabulary and IDF from the training documents only, then project the
# test documents into that same feature space. This refits `vect`, so `vect`
# stays the vectorizer that matches the models trained below; any matrix it
# produced before this point uses a different vocabulary.
X_train = vect.fit_transform(texts_train)
X_test = vect.transform(texts_test)
print(f"→ Training on {X_train.shape[0]} docs; testing on {X_test.shape[0]}")

In [None]:
# Candidate classifiers, keyed by the display name printed above each report.
models = {
    "Naive Bayes": MultinomialNB(alpha=1.0),
    "Logistic Regression": LogisticRegression(max_iter=200),
    # Tree building picks randomly among equally good splits, so it is seeded
    # with the notebook-wide RANDOM_STATE to make reruns give the same tree.
    "Decision Tree": DecisionTreeClassifier(max_depth=20, random_state=RANDOM_STATE),
}

# Fit each model on the training split and report per-class metrics plus the
# confusion matrix on the held-out split.
for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, preds))