In [2]:

import os
import re
import argparse
from email import policy
from email.parser import BytesParser

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Ensure NLTK stopwords are downloaded
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words('english'))

import re
from nltk.stem import PorterStemmer

STOPWORDS = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # strip URLs
    text = re.sub(r'[^a-z0-9\s]', ' ', text)           # strip punctuation
    text = re.sub(r'\s+', ' ', text).strip()           # collapse whitespace

    tokens = [
        stemmer.stem(word)
        for word in text.split()
        if word not in STOPWORDS
    ]
    return ' '.join(tokens)
print("done")

done


In [3]:
def load_emails(maildir: str):
    texts, labels = [], []
    cnt=0
    for user in os.listdir(maildir):
        user_dir = os.path.join(maildir, user)
        if not os.path.isdir(user_dir):
            continue

        # only look at the “sent_items” folder
        folder_dir = os.path.join(user_dir, "sent_items")
        print(cnt)
        cnt+=1
        if not os.path.isdir(folder_dir):
            continue

        for fname in os.listdir(folder_dir):
            path = os.path.join(folder_dir, fname)
            try:
                with open(path, 'rb') as f:
                    msg = BytesParser(policy=policy.default).parse(f) #read binary
                body = msg.get_body(preferencelist=('plain',))
                if body is None:
                    continue
                raw = body.get_content()
                text = clean_text(raw)
                if not text:
                    continue
                tokens = [w for w in text.split() if w not in STOPWORDS]  #cleanning. Added lemmentatizn or stemming?
                texts.append(' '.join(tokens))
                labels.append(user)
            except Exception:
                continue

    return texts, labels
print("done")


done


In [None]:
DATA_DIR = "/WAVE/projects/CSEN-140-Sp25/HHJ140Proj/Sent_Items_only"     # path to Enron stuf
TEST_SIZE = 0.3
RANDOM_STATE = 36

print("Loading and cleaning emails…")
texts, labels = load_emails(DATA_DIR)
print(f"→ {len(texts)} messages from {len(set(labels))} authors")
print(len(texts))

Loading and cleaning emails…
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40


In [13]:
print("Vectorizing with TF–IDF…")
vect = TfidfVectorizer(max_features=20_000)
X = vect.fit_transform(texts)
y = labels

Vectorizing with TF–IDF…


In [14]:
from collections import Counter

counts = Counter(labels)

keep = {lbl for lbl, cnt in counts.items() if cnt >= 2}

texts_filt = [t for t, l in zip(texts, labels) if l in keep]
labels_filt = [l for l in labels if l in keep]

X = vect.transform(texts_filt)   
y = labels_filt

X_train, X_test, y_train, y_test = train_test_split( #split and train
    X, y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=RANDOM_STATE
)
print(f"→ Training on {X_train.shape[0]} docs; testing on {X_test.shape[0]}")

→ Training on 26364 docs; testing on 11299


In [15]:
models = {
    #"Naive Bayes": MultinomialNB(alpha=1.0),
    #"Logistic Regression 4000": LogisticRegression(max_iter=4000),
    #"Logistic Regression 2000": LogisticRegression(max_iter=2000),
    #"Logistic Regression 1000": LogisticRegression(max_iter=1000),
    #"Logistic Regression 500": LogisticRegression(max_iter=500),
    #"Logistic Regression 250": LogisticRegression(max_iter=250),
    "Logistic Regression 125": LogisticRegression(max_iter=125),
    "Logistic Regression 62": LogisticRegression(max_iter=62),
    "Logistic Regression 31": LogisticRegression(max_iter=31),
    #"Decision Tree": DecisionTreeClassifier(max_depth=20)
}

for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, y_train)
    print("fitted")
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))
    #print("Confusion matrix:")
    #print(confusion_matrix(y_test, preds))


Logistic Regression 125
fitted
                 precision    recall  f1-score   support

        allen-p       0.90      0.68      0.77       103
       arnold-j       0.69      0.69      0.69       216
        arora-h       1.00      0.30      0.46        20
       badeer-r       0.00      0.00      0.00         2
       bailey-s       0.00      0.00      0.00         4
         bass-e       0.79      0.66      0.72        73
     baughman-d       0.93      0.46      0.62        28
         beck-s       0.84      0.90      0.87       145
       benson-r       0.00      0.00      0.00         5
        blair-l       0.90      0.96      0.93       278
      brawner-s       0.83      0.23      0.36        22
          buy-r       0.91      0.82      0.86       117
     campbell-l       0.81      0.50      0.62        34
       carson-m       0.96      0.68      0.79        34
         cash-m       0.87      0.87      0.87       164
    causholli-m       0.98      0.94      0.96        6

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


fitted
                 precision    recall  f1-score   support

        allen-p       0.89      0.68      0.77       103
       arnold-j       0.69      0.69      0.69       216
        arora-h       1.00      0.30      0.46        20
       badeer-r       0.00      0.00      0.00         2
       bailey-s       0.00      0.00      0.00         4
         bass-e       0.77      0.66      0.71        73
     baughman-d       0.93      0.46      0.62        28
         beck-s       0.84      0.90      0.87       145
       benson-r       0.00      0.00      0.00         5
        blair-l       0.90      0.96      0.93       278
      brawner-s       0.80      0.18      0.30        22
          buy-r       0.91      0.82      0.86       117
     campbell-l       0.81      0.50      0.62        34
       carson-m       0.96      0.68      0.79        34
         cash-m       0.87      0.87      0.87       164
    causholli-m       0.98      0.94      0.96        67
       corman-s       0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Logistical regression is the best. Preform For the full data set.

In [9]:
models= {"Logistic Regression 100000000": LogisticRegression(max_iter=100)}
for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, y_train)
    print("fitted")
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))
    #print("Confusion matrix:")
    #print(confusion_matrix(y_test, preds))


Logistic Regression 1000000
fitted
                 precision    recall  f1-score   support

        allen-p       0.83      0.79      0.81        68
       arnold-j       0.70      0.74      0.72       144
        arora-h       0.67      0.14      0.24        14
       badeer-r       0.00      0.00      0.00         1
       bailey-s       0.00      0.00      0.00         3
         bass-e       0.85      0.57      0.68        49
     baughman-d       0.86      0.32      0.46        19
         beck-s       0.88      0.86      0.87        96
       benson-r       0.00      0.00      0.00         3
        blair-l       0.92      0.92      0.92       186
      brawner-s       1.00      0.27      0.42        15
          buy-r       0.82      0.81      0.81        78
     campbell-l       0.71      0.45      0.56        22
       carson-m       1.00      0.74      0.85        23
         cash-m       0.86      0.90      0.88       109
    causholli-m       0.98      0.96      0.97     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
