In [5]:
import pandas as pd
import os

def read_spam():
    category = 'spam'
    directory = 'enron1\enron1\spam'
    return read_category(category, directory)

def read_ham():
    category = 'ham'
    directory = 'enron1\enron1\ham'
    return read_category(category, directory)

def read_category(category, directory):
    emails = []
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), 'r') as fp:
            try:
                content = fp.read()
                emails.append({'name': filename, 'content': content, 'category': category})
            except:
                print(f'skipped {filename}')
    return emails

ham = read_ham()
spam = read_spam()

df_ham = pd.DataFrame.from_records(ham)
df_spam = pd.DataFrame.from_records(spam)


df = pd.concat([df_ham, df_spam], ignore_index=True)


  directory = 'enron1\enron1\spam'
  directory = 'enron1\enron1\ham'


skipped 0754.2004-04-01.GP.spam.txt
skipped 1414.2004-06-24.GP.spam.txt
skipped 2042.2004-08-30.GP.spam.txt
skipped 2140.2004-09-13.GP.spam.txt
skipped 2248.2004-09-23.GP.spam.txt
skipped 2526.2004-10-17.GP.spam.txt
skipped 2649.2004-10-27.GP.spam.txt
skipped 2698.2004-10-31.GP.spam.txt
skipped 3304.2004-12-26.GP.spam.txt
skipped 3364.2005-01-01.GP.spam.txt
skipped 4142.2005-03-31.GP.spam.txt
skipped 4201.2005-04-05.GP.spam.txt
skipped 4350.2005-04-23.GP.spam.txt
skipped 4566.2005-05-24.GP.spam.txt
skipped 5105.2005-08-31.GP.spam.txt


In [6]:
import re

def preprocessor(e):
    e = re.sub('[^a-zA-Z]', ' ', e)
    return e.lower()
df['content'] = df['content'].apply(preprocessor)


Step 3. We will now train the machine learning model. All the functions that you will need are imported for you. The instructions explain how the work and hint at which functions to use. You will likely need to refer to the scikit learn documentation to see how exactly to invoke the functions. It will be handy to keep that tab open.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

vectorizer = CountVectorizer(preprocessor=preprocessor)

X_train, X_test, y_train, y_test = train_test_split(df['content'], df['category'], test_size=0.3, random_state=42)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)


accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")


Accuracy: 0.9767441860465116
Confusion Matrix:
[[1058   23]
 [  13  454]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.98      0.98      1081
        spam       0.95      0.97      0.96       467

    accuracy                           0.98      1548
   macro avg       0.97      0.98      0.97      1548
weighted avg       0.98      0.98      0.98      1548



Step 4.

In [8]:
feature_names = vectorizer.get_feature_names_out()

importance = model.coef_[0]

top_positive_indices = importance.argsort()[-10:][::-1]
top_negative_indices = importance.argsort()[:10]

print("Top 10 spam words:")
for i in top_positive_indices:
    print(f"{feature_names[i]}: {importance[i]}")

print("\nTop 10 ham words:")
for i in top_negative_indices:
    print(f"{feature_names[i]}: {importance[i]}")


Top 10 spam words:
no: 1.005176874424519
http: 0.9617231391948734
prices: 0.7864730083916752
here: 0.7595177333997772
more: 0.7114563564000623
hello: 0.6832872351265962
off: 0.6792688361908978
rolex: 0.658806728771389
paliourg: 0.6464562411556374
removed: 0.6319666048781791

Top 10 ham words:
attached: -1.4563094813428676
enron: -1.3984084856579755
thanks: -1.3198596174222768
daren: -1.1996589492546939
pictures: -1.1951347880027505
doc: -1.1858760845107634
deal: -1.1669655173851465
neon: -1.0905403084610414
xls: -1.0873520699384176
hpl: -1.0663621894270328
