In [None]:

# Produced by: Kirubel Temesgen
# College ID: C00260396
# Description: This is a Naïve Bayes model implementation for email spam detection.

In [None]:
import os
import re

import numpy as np
import pandas as pd

# Load the full dataset (no row sampling is applied here).
# The location can be overridden with the EMAILS_CSV environment variable;
# the default is the original hard-coded local path.
file_path = os.environ.get("EMAILS_CSV", r"D:\college\Sem2\DataSci ML\archive\emails.csv")
df = pd.read_csv(file_path)

# Check for missing data and duplicates
df.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print(df.head())

# Duplicates are only counted here, not removed
duplicate_count = df.duplicated(subset=['message']).sum()
print(f"Number of duplicate emails: {duplicate_count}")


In [None]:
# Extract components from emails
def extract_email_parts(email):
    """Extract the sender, subject and body from a raw email string.

    The message is split at the first blank line: the part before it is
    treated as the header section, the part after it as the body. The
    ``From:`` and ``Subject:`` headers are looked up only inside the header
    section and only at the start of a line, so headers such as ``X-From:``
    or a ``From:`` line quoted in the body are not mistaken for the sender.

    Parameters
    ----------
    email : str
        Raw message text. ``\\r\\n`` line endings are accepted.

    Returns
    -------
    dict
        Keys ``"From"``, ``"Subject"`` and ``"Body"``. A value is ``None``
        when the corresponding part is absent (no non-empty ``From:`` header,
        no ``Subject:`` header, or no blank line separating a body). All
        three are ``None`` when ``email`` is not a string.
    """
    if not isinstance(email, str):
        return {"From": None, "Subject": None, "Body": None}

    # Normalise CRLF so the blank-line separator is always "\n\n".
    text = email.replace("\r\n", "\n")
    header, separator, body = text.partition("\n\n")

    # \S.* requires a non-empty sender; an empty Subject is still a match.
    from_ = re.search(r'^From:[ \t]*(\S.*)', header, re.MULTILINE)
    subject = re.search(r'^Subject:[ \t]*(.*)', header, re.MULTILINE)
    return {
        "From": from_.group(1).strip() if from_ else None,
        "Subject": subject.group(1).strip() if subject else None,
        # An empty separator means no blank line was found, i.e. no body.
        "Body": body.strip() if separator else None,
    }

# Build the extracted columns on df's own index so rows stay aligned even when
# the index is not 0..n-1 (for example when this cell is re-run after rows
# were dropped). Naming the columns keeps them present for an empty frame too.
parts = df['message'].apply(extract_email_parts)
parts_df = pd.DataFrame(parts.tolist(), index=df.index, columns=["From", "Subject", "Body"])

# Replace columns left over from a previous run instead of duplicating them
df = pd.concat([df.drop(columns=parts_df.columns, errors="ignore"), parts_df], axis=1)

# Drop missing values
df = df.dropna(subset=["From", "Subject", "Body"])


In [None]:
# Text cleaning function
def clean_text(text):
    """Normalise a piece of email text for keyword matching and vectorising.

    Null input yields an empty string. Otherwise the text is lower-cased and
    a fixed sequence of regex substitutions is applied, in order, before the
    result is stripped of leading and trailing whitespace.
    """
    if pd.isnull(text):
        return ""

    # (pattern, replacement) pairs; the order matters.
    substitutions = (
        (r'\b(re|fw)\b', ''),  # standalone "re" / "fw" tokens (reply/forward markers)
        (r'\W+', ' '),         # runs of non-word characters become one space
        (r'\d+', ''),          # digits are removed
        (r'\s+', ' '),         # collapse whitespace left behind
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Store cleaned copies of the subject and body alongside the original columns
df["Processed_Subject"] = df["Subject"].map(clean_text)
df["Processed_Body"] = df["Body"].map(clean_text)


In [None]:
# Define spam keywords (single words or phrases, matched as whole words)
spam_keywords = [
    "win", "lottery", "free", "offer", "click here", "urgent", "claim",
    "money", "prize", "limited time"
]

# Compile regex pattern for keyword matching.
# Each keyword is escaped so that adding an entry containing regex
# metacharacters (e.g. "$$$" or "100% free") cannot corrupt the pattern.
spam_pattern = re.compile(
    r'\b(' + '|'.join(re.escape(keyword) for keyword in spam_keywords) + r')\b',
    re.IGNORECASE,
)

# Function to check if an email is spam
# Points based 
def is_spam(row):
    """Heuristically label one email row as spam (1) or not spam (0).

    Points are awarded as follows:
      * 1 if the processed subject contains a spam keyword,
      * 1 if the processed body contains a spam keyword,
      * 2 if the sender address does not end with "@enron.com".
    A row is labelled spam when the total is at least 3, so a keyword hit
    alone (at most 2 points) is never enough without an external sender.

    Parameters
    ----------
    row : mapping
        Must provide "Processed_Subject", "Processed_Body" and "From".

    Returns
    -------
    int
        1 for spam, 0 otherwise.
    """
    subject = row["Processed_Subject"]
    body = row["Processed_Body"]
    sender = row["From"]

    score = 0
    if spam_pattern.search(subject):
        score += 1
    if spam_pattern.search(body):
        score += 1

    # A missing, blank or non-string sender earns no points. Domain names are
    # case-insensitive, so the address is normalised before comparing.
    if isinstance(sender, str) and sender.strip():
        if not sender.strip().lower().endswith("@enron.com"):
            score += 2

    return 1 if score >= 3 else 0

# Label every row with the heuristic; axis=1 passes each row to is_spam
df["Spam_Label"] = df.apply(is_spam, axis=1)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# Initialise TF-IDF vectorizers - text to numerical values.
# Subject and body each get their own vectorizer: re-fitting a single shared
# instance would overwrite the subject vocabulary with the body's, leaving no
# fitted object able to transform new subjects.
tfidf_params = dict(stop_words="english", max_features=500, ngram_range=(1, 2))
subject_vectorizer = TfidfVectorizer(**tfidf_params)
body_vectorizer = TfidfVectorizer(**tfidf_params)
vectorizer = body_vectorizer  # backward-compatible name; previously ended up fit on the body text

# Transform Subject & Body separately
# NOTE(review): both vectorizers are fit on every row, before the train/test
# split, so IDF statistics include the test rows.
tfidf_subject = subject_vectorizer.fit_transform(df["Processed_Subject"])
tfidf_body = body_vectorizer.fit_transform(df["Processed_Body"])

# Merge TF-IDF features into a single sparse matrix (CSR supports row indexing)
X = hstack([tfidf_subject, tfidf_body], format="csr")
y = df["Spam_Label"].values


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit a multinomial Naïve Bayes classifier on the training portion
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Predict using test data
y_pred = nb_model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Naïve Bayes Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 even if one class is missing from both
# y_test and y_pred, so it always matches the two tick labels.
class_names = ["Not Spam", "Spam"]
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues",
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix - Naïve Bayes")
plt.show()
