In [1]:
import pandas as pd
import os

def read_spam():
    category = 'spam'
    directory = './enron1/spam'
    return read_category(category, directory)

def read_ham():
    category = 'ham'
    directory = './enron1/ham'
    return read_category(category, directory)

def read_category(category, directory):
    emails = []
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), 'r') as fp:
            try:
                content = fp.read()
                emails.append({'name': filename, 'content': content, 'category': category})
            except:
                print(f'skipped {filename}')
    return emails
ham = read_ham()
spam = read_spam()

df = pd.DataFrame.from_records(ham)
df = pd.concat([pd.DataFrame.from_records(ham),pd.DataFrame.from_records(spam)])

skipped 2248.2004-09-23.GP.spam.txt
skipped 2526.2004-10-17.GP.spam.txt
skipped 2698.2004-10-31.GP.spam.txt
skipped 4566.2005-05-24.GP.spam.txt


In [3]:
import re

def preprocessor(text):
    re.sub(r'[^a-zA-Z]',' ', text)
    text = text.lower()
    return text

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

vectorizer = CountVectorizer(preprocessor=preprocessor)
matrix = vectorizer.fit_transform(df['content'])

X_train, X_test, y_train, y_test = train_test_split(matrix, df['category'], test_size=0.2, random_state=42)
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


model = LogisticRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)


Shape of X_train: (4134, 50445)
Shape of X_test: (1034, 50445)
Shape of y_train: (4134,)
Shape of y_test: (1034,)
Accuracy: 0.9777562862669246
Confusion Matrix:
[[718  11]
 [ 12 293]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.98      0.98      0.98       729
        spam       0.96      0.96      0.96       305

    accuracy                           0.98      1034
   macro avg       0.97      0.97      0.97      1034
weighted avg       0.98      0.98      0.98      1034



In [5]:

vocabulary = vectorizer.get_feature_names_out()

coefficients = model.coef_[0]

abs_coefficients = abs(coefficients)
sorted_indices = abs_coefficients.argsort()[::-1]

top_positive_features = [vocabulary[i] for i in sorted_indices[:10]]
top_negative_features = [vocabulary[i] for i in sorted_indices[-10:]]

print("Top 10 Positive Features (Spam):", top_positive_features)
print("Top 10 Negative Features (Ham):", top_negative_features)



Top 10 Positive Features (Spam): ['thanks', 'doc', 'enron', 'attached', 'pictures', 'daren', 'xls', 'neon', 'deal', 'revised']
Top 10 Negative Features (Ham): ['burro', 'platte', 'playersclipper', 'burnout', 'playhouse', 'playtime', 'plcs', 'pleasantly', 'pleasures', 'zzsyt']
