In [1]:
#import necessary libraries
import os
import tarfile
import urllib.request
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


In [2]:
# Download and extract the SpamAssassin datasets
url = "https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2"
ham_file = "20021010_easy_ham.tar.bz2"
url2 = "https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2"
spam_file = "20021010_spam.tar.bz2"


In [3]:
urllib.request.urlretrieve(url, ham_file)
urllib.request.urlretrieve(url2, spam_file)

with tarfile.open(ham_file, "r:bz2") as ham_tar:
    ham_tar.extractall("ham")

with tarfile.open(spam_file, "r:bz2") as spam_tar:
    spam_tar.extractall("spam")


Preprocess email data and create feature vectors


In [4]:
def preprocess_email(email):
    # Convert to lowercase
    email = email.lower()
    
    # Remove punctuation
    email = re.sub(r"[^\w\s]", "", email)
    
    # Replace URLs with 'URL'
    email = re.sub(r"http\S+|www\S+|https\S+", "URL", email)
    
    # Replace numbers with 'NUMBER'
    email = re.sub(r"\d+", "NUMBER", email)
    
    # Perform stemming (optional)
    # Add additional preprocessing steps if required
    
    return email


In [11]:
def create_feature_vectors(directory):
    emails = []
    labels = []
    for root, dirs, files in os.walk(directory):
        for filename in files:
            with open(os.path.join(root, filename), "r", encoding="latin1") as file:
                email = file.read()
                email = preprocess_email(email)
                emails.append(email)
                labels.append(directory)
    
    return emails, labels



In [12]:
# Load and preprocess email data from ham and spam directories
ham_emails, ham_labels = create_feature_vectors("ham")
spam_emails, spam_labels = create_feature_vectors("spam")


In [13]:
# Combine ham and spam data
emails = ham_emails + spam_emails
labels = ham_labels + spam_labels


In [14]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(emails, labels, test_size=0.2, random_state=42)



In [15]:
# Convert emails to feature vectors
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


In [16]:
# Train the classifier (Multinomial Naive Bayes)
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)



In [17]:
# Predict on the test set
y_pred = classifier.predict(X_test_vectorized)


In [19]:
# Evaluate the classifier
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, pos_label="spam")
recall = metrics.recall_score(y_test, y_pred, pos_label="spam")
f1_score = metrics.f1_score(y_test, y_pred, pos_label="spam")

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)


Accuracy: 0.9819967266775778
Precision: 0.9764705882352941
Recall: 0.9021739130434783
F1 Score: 0.9378531073446328
