In [10]:

import os
import re
import argparse
from email import policy
from email.parser import BytesParser

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Make sure the NLTK stop-word corpus is available locally before it is read;
# quiet=True keeps the downloader from writing progress text into the cell output.
nltk.download('stopwords', quiet=True)
# English stop words, stored as a set because load_emails tests every token
# for membership.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    """Normalise raw text into lowercase, space-separated alphanumeric words.

    Steps, in order:
      1. lowercase the input;
      2. delete URLs (anything starting with ``http://``, ``https://`` or
         ``www.`` up to the next whitespace);
      3. replace every character that is not ``a-z``, ``0-9`` or whitespace
         with a space (this drops punctuation and non-ASCII letters);
      4. collapse whitespace runs to a single space and trim both ends.

    TODO: stemming / lemmatisation is not applied yet.
    """
    lowered = text.lower()
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', lowered)
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', without_urls)
    squeezed = re.sub(r'\s+', ' ', alnum_only)
    return squeezed.strip()
print("done")

done


In [15]:
def load_emails(maildir: str, folder: str = "sent_items", verbose: bool = True):
    """Collect cleaned message bodies and author labels from a maildir tree.

    The expected layout is ``<maildir>/<user>/<folder>/<message file>``.
    Every message file is parsed with the stdlib ``email`` package; its
    plain-text body is passed through ``clean_text``, stop words are removed,
    and the result is stored with the ``<user>`` directory name as its label.

    Users and message files are visited in sorted order. ``os.listdir`` makes
    no ordering guarantee, and the order of the returned lists feeds a seeded
    train/test split later on, so a stable order is needed for reproducible
    results.

    Parameters
    ----------
    maildir : str
        Root directory containing one sub-directory per user.
    folder : str, default "sent_items"
        Name of the per-user folder whose messages are read.
    verbose : bool, default True
        When true, print a running index for each user directory and, if any
        message raised while being read or parsed, a final count of those.

    Returns
    -------
    (texts, labels)
        Two lists of ``str`` with equal length: ``texts[i]`` is the cleaned,
        stop-word-free body of a message and ``labels[i]`` is its author.

    Notes
    -----
    Skipped without being counted as failures: entries that are not regular
    files, messages without a plain-text part, and messages that are empty
    after cleaning and stop-word removal.
    """
    texts, labels = [], []
    unreadable = 0

    # Only directories directly under maildir are treated as users.
    user_names = sorted(
        name for name in os.listdir(maildir)
        if os.path.isdir(os.path.join(maildir, name))
    )

    for index, user in enumerate(user_names):
        if verbose:
            print(index)

        folder_dir = os.path.join(maildir, user, folder)
        if not os.path.isdir(folder_dir):
            continue

        for fname in sorted(os.listdir(folder_dir)):
            path = os.path.join(folder_dir, fname)
            # Nested directories are not messages; skip them explicitly
            # instead of relying on open() raising.
            if not os.path.isfile(path):
                continue

            try:
                with open(path, 'rb') as f:
                    msg = BytesParser(policy=policy.default).parse(f)
                body = msg.get_body(preferencelist=('plain',))
                if body is None:
                    continue
                text = clean_text(body.get_content())
            except Exception:
                # Best effort over a large corpus: a single unreadable or
                # badly encoded message must not abort the load, but it is
                # counted so the loss is visible.
                unreadable += 1
                continue

            tokens = [w for w in text.split() if w not in STOPWORDS]
            # A message made only of stop words (or nothing at all) carries
            # no features and would become an all-zero document.
            if not tokens:
                continue

            texts.append(' '.join(tokens))
            labels.append(user)

    if verbose and unreadable:
        print(f"skipped {unreadable} unreadable message(s)")

    return texts, labels
print("done")

done


In [16]:
# Run configuration.
DATA_DIR = "../maildir"     # relative path to the root of the Enron maildir tree
TEST_SIZE = 0.2             # fraction of documents held out for testing
RANDOM_STATE = 42           # seed passed to train_test_split

print("Loading and cleaning emails…")
# texts[i] is a cleaned message body; labels[i] is the user directory it was
# found under, which serves as the author label.
texts, labels = load_emails(DATA_DIR)
print(f"→ {len(texts)} messages from {len(set(labels))} authors")

Loading and cleaning emails…
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
→ 37689 messages from 136 authors


In [17]:
print("Vectorizing with TF–IDF…")
# max_features caps the vocabulary size at 20,000 terms.
vect = TfidfVectorizer(max_features=20_000)
# NOTE(review): this fits on every document, before any train/test split, so
# held-out messages influence the vocabulary and IDF weights.
# X and y are reassigned in a later cell once rare authors are filtered out.
X = vect.fit_transform(texts)
y = labels

Vectorizing with TF–IDF…


In [23]:
from collections import Counter

# A stratified split needs at least two documents per author (one for each
# side of the split), so authors with fewer messages are dropped first.
MIN_DOCS_PER_AUTHOR = 2

counts = Counter(labels)
keep = {lbl for lbl, n_docs in counts.items() if n_docs >= MIN_DOCS_PER_AUTHOR}

texts_filt = [t for t, l in zip(texts, labels) if l in keep]
labels_filt = [l for l in labels if l in keep]
assert len(texts_filt) == len(labels_filt), "texts and labels went out of sync"

# Split the raw texts, not a pre-computed matrix, so the vectorizer can be
# fitted on the training portion alone. Fitting TF-IDF on the whole corpus
# would let test documents shape the vocabulary and IDF weights (data leakage).
texts_train, texts_test, y_train, y_test = train_test_split(
    texts_filt, labels_filt,
    test_size=TEST_SIZE,
    stratify=labels_filt,
    random_state=RANDOM_STATE
)
X_train = vect.fit_transform(texts_train)   # learn vocabulary/IDF from train only
X_test = vect.transform(texts_test)         # reuse the training vocabulary

# Full filtered matrix under the original names, encoded with the
# train-fitted vectorizer, for any later cell that still reads X / y.
X = vect.transform(texts_filt)
y = labels_filt

print(f"→ Training on {X_train.shape[0]} docs; testing on {X_test.shape[0]}")

→ Training on 30150 docs; testing on 7538


In [None]:
# Iteration budgets to compare, largest first (same order as the original
# hand-written entries).
MAX_ITER_GRID = (4000, 2000, 1000, 500, 250, 125, 62, 31)

# One LogisticRegression per budget; every other hyper-parameter stays at the
# scikit-learn default. Building the dict from the grid keeps the label and
# the max_iter value from drifting apart.
# Disabled candidates: MultinomialNB(alpha=1.0), DecisionTreeClassifier(max_depth=20).
models = {
    f"Logistic Regression {max_iter}": LogisticRegression(max_iter=max_iter)
    for max_iter in MAX_ITER_GRID
}

# Fit each candidate on the training split and report per-author
# precision/recall/F1 on the held-out split.
for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, y_train)
    print("fitted")
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))
    #print("Confusion matrix:")
    #print(confusion_matrix(y_test, preds))


Logistic Regression 4000


In [None]:
# Logistic regression is the best. Perform it on the full data set.