In [9]:

import os
import re
import argparse
from email import policy
from email.parser import BytesParser

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Ensure NLTK stopwords are downloaded
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words('english'))

import re

from nltk.stem import PorterStemmer, WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # strip URLs
    text = re.sub(r'[^a-z0-9\s]', ' ', text)           # strip punctuation
    text = re.sub(r'\s+', ' ', text).strip()           # collapse whitespace

    tokens = []
    for word in text.split():
        if word in STOPWORDS:
            continue
        stemmed = stemmer.stem(word)
        lemma = lemmatizer.lemmatize(stemmed)
        tokens.append(lemma)

    return ' '.join(tokens)
print("DONE")

DONE


In [10]:
def load_emails(maildir: str):
    texts, labels = [], []
    cnt = 0
    for user in os.listdir(maildir):
        user_dir = os.path.join(maildir, user)
        if not os.path.isdir(user_dir):
            continue

        # only look at the “sent_items” folder
        folder_dir = os.path.join(user_dir, "sent_items")
        print(f"Processing user #{cnt}: {user}")
        cnt += 1
        if not os.path.isdir(folder_dir):
            continue

        for fname in os.listdir(folder_dir):
            path = os.path.join(folder_dir, fname)
            try:
                with open(path, 'rb') as f:
                    msg = BytesParser(policy=policy.default).parse(f)
                body = msg.get_body(preferencelist=('plain',))
                if body is None:
                    continue
                raw = body.get_content()
                text = clean_text(raw)
                if text:  # already lemmatized and stopwords‐free
                    texts.append(text)
                    labels.append(user)
            except Exception:
                continue

    return texts, labels

print("done")

done


In [16]:
import os
import sys
import statistics

def count_sent_messages(maildir):
    counts = []
    for user in os.listdir(maildir):
        sent_dir = os.path.join(maildir, user, "sent_items")
        if os.path.isdir(sent_dir):
            n = sum(
                1
                for fname in os.listdir(sent_dir)
                if os.path.isfile(os.path.join(sent_dir, fname))
            )
            counts.append(n)
    return counts

def getAve(maildir):
    counts = count_sent_messages(maildir)
    if not counts:
        print("No users or no sent_items folders found.")
        return

    avg    = statistics.mean(counts)
    med    = statistics.median(counts)
    stddev = statistics.stdev(counts)

    print(f"Average # of sent messages per user: {avg:.2f}")
    print(f"Median  # of sent messages per user: {med}")
    print(f"Std Dev # of sent messages per user: {stddev:.2f}")
getAve("/WAVE/projects/CSEN-140-Sp25/HHJ140Proj/Sent_Items_only")


Average # of sent messages per user: 278.82
Median  # of sent messages per user: 144.5
Std Dev # of sent messages per user: 318.26


In [11]:
DATA_DIR = "/WAVE/projects/CSEN-140-Sp25/HHJ140Proj/Sent_Items_only"     # path to Enron stuf
TEST_SIZE = 0.3 #preprocess work
RANDOM_STATE = 36

print("Loading and cleaning emails…")
texts, labels = load_emails(DATA_DIR)
print(f"→ {len(texts)} messages from {len(set(labels))} authors")
print(len(texts))

Loading and cleaning emails…
Processing user #0: neal-s
Processing user #1: griffith-j
Processing user #2: zipper-a
Processing user #3: baughman-d
Processing user #4: kuykendall-t
Processing user #5: saibi-e
Processing user #6: quenet-j
Processing user #7: buy-r
Processing user #8: heard-m
Processing user #9: presto-k
Processing user #10: stepenovitch-j
Processing user #11: pimenov-v
Processing user #12: hayslett-r
Processing user #13: parks-j
Processing user #14: holst-k
Processing user #15: campbell-l
Processing user #16: crandell-s
Processing user #17: bass-e
Processing user #18: geaccone-t
Processing user #19: carson-m
Processing user #20: quigley-d
Processing user #21: wolfe-j
Processing user #22: gay-r
Processing user #23: hendrickson-s
Processing user #24: causholli-m
Processing user #25: mann-k
Processing user #26: shively-h
Processing user #27: love-p
Processing user #28: scott-s
Processing user #29: martin-t
Processing user #30: mims-thurston-p
Processing user #31: schwieger-

In [12]:
print("Vectorizing with TF–IDF…")
vect = TfidfVectorizer(max_features=20_000)
X = vect.fit_transform(texts)
y = labels

Vectorizing with TF–IDF…


In [13]:
from collections import Counter

counts = Counter(labels)

keep = {lbl for lbl, cnt in counts.items() if cnt >= 2}

texts_filt = [t for t, l in zip(texts, labels) if l in keep]
labels_filt = [l for l in labels if l in keep]

X = vect.transform(texts_filt)   
y = labels_filt

X_train, X_test, y_train, y_test = train_test_split( #split and train
    X, y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=RANDOM_STATE
)
print(f"→ Training on {X_train.shape[0]} docs; testing on {X_test.shape[0]}")

→ Training on 26364 docs; testing on 11299


In [17]:
from sklearn.svm import LinearSVC

models = {
    "Linear SVC": LinearSVC(C=1.0, max_iter=10000, dual=False)
}

for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, y_train)
    print("fitted")
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))
    #print("Confusion matrix:")
    #print(confusion_matrix(y_test, preds))


Linear SVC
fitted
                 precision    recall  f1-score   support

        allen-p       0.79      0.75      0.77       103
       arnold-j       0.72      0.71      0.72       216
        arora-h       0.79      0.55      0.65        20
       badeer-r       0.50      0.50      0.50         2
       bailey-s       1.00      1.00      1.00         4
         bass-e       0.76      0.70      0.73        73
     baughman-d       0.75      0.64      0.69        28
         beck-s       0.87      0.90      0.89       145
       benson-r       0.50      0.40      0.44         5
        blair-l       0.90      0.95      0.93       278
      brawner-s       0.71      0.68      0.70        22
          buy-r       0.86      0.84      0.85       117
     campbell-l       0.82      0.91      0.86        34
       carson-m       0.89      0.91      0.90        34
         cash-m       0.88      0.88      0.88       164
    causholli-m       0.96      1.00      0.98        67
       corm

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Linear SVM is the best. Preform For the full data set.