In [None]:
# Produced by: Kirubel Temesgen
# College ID: C00260396
# Description: Neural Network (MLP) for Spam Email Classification


In [None]:

import os
import re

import numpy as np
import pandas as pd

# Location of the raw e-mail CSV (must provide the 'message' column read by the
# parsing step). The EMAILS_CSV environment variable overrides the location so the
# notebook can run on another machine; the original local path is the default.
file_path = os.environ.get("EMAILS_CSV", r"D:\college\Sem2\DataSci ML\archive\emails.csv")
df = pd.read_csv(file_path)

def extract_email_parts(email):
    """Split a raw e-mail string into its sender, subject and body.

    The header section is everything before the first blank line; the body is
    everything after it. ``From:`` and ``Subject:`` are looked up only inside the
    header section and only at the start of a line, so look-alike headers such as
    ``X-From:`` and quoted headers inside the body are not picked up.

    Parameters
    ----------
    email : str
        Raw message text. Any non-string value (e.g. NaN from a missing CSV cell)
        yields a result with every field set to ``None``.

    Returns
    -------
    dict
        Keys ``"From"``, ``"Subject"`` and ``"Body"``. Each value is the stripped
        text, or ``None`` when the part is absent (``"Body"`` is ``None`` when the
        message has no blank line separating headers from body).
    """
    if not isinstance(email, str):
        return {"From": None, "Subject": None, "Body": None}

    # First blank line ends the headers; accept both LF and CRLF line endings.
    boundary = re.search(r'\r?\n\r?\n', email)
    if boundary:
        headers = email[:boundary.start()]
        body = email[boundary.end():].strip()
    else:
        headers = email
        body = None

    # '^' with MULTILINE anchors each header name to the start of a line.
    from_ = re.search(r'^From: (.+)', headers, re.MULTILINE)
    subject = re.search(r'^Subject: (.*)', headers, re.MULTILINE)
    return {
        "From": from_.group(1).strip() if from_ else None,
        "Subject": subject.group(1).strip() if subject else None,
        "Body": body,
    }

# Parse every raw message and attach the extracted parts as new columns.
PART_COLUMNS = ["From", "Subject", "Body"]
parts = df['message'].apply(extract_email_parts)
# Build the parts frame on df's own index: concat(axis=1) aligns on index labels, so
# a fresh 0..n-1 index would misalign rows whenever df is not a plain RangeIndex
# (for example when this cell is re-run after rows were dropped below).
parts_df = pd.DataFrame(parts.tolist(), index=df.index, columns=PART_COLUMNS)
# Drop any part columns left over from a previous run so re-running the cell does
# not produce duplicate column names.
df = pd.concat([df.drop(columns=PART_COLUMNS, errors="ignore"), parts_df], axis=1)
# Keep only messages where all three parts were found.
df = df.dropna(subset=PART_COLUMNS)


In [None]:
def clean(text):
    """Normalise a piece of e-mail text for keyword matching and vectorisation.

    Steps, applied in order to the lower-cased input:
      1. remove the standalone tokens ``re`` and ``fw``
         (presumably reply/forward prefixes),
      2. replace every run of non-word characters with one space,
      3. delete every run of digits,
      4. collapse whitespace runs to one space.

    Returns the result with leading/trailing whitespace stripped.
    """
    substitutions = (
        (r'\b(re|fw)\b', ''),
        (r'\W+', ' '),
        (r'\d+', ''),
        (r'\s+', ' '),
    )
    result = text.lower()
    for regex, replacement in substitutions:
        result = re.sub(regex, replacement, result)
    return result.strip()

# Add normalised copies of the subject and body; the keyword rules and the
# TF-IDF features are computed from these cleaned columns.
df = df.assign(
    Processed_Subject=df["Subject"].apply(clean),
    Processed_Body=df["Body"].apply(clean),
)

# Spam keywords: words and phrases whose presence counts towards the spam score.
keywords = ["win", "lottery", "free", "offer", "click here", "urgent", "claim", "money", "prize", "limited time"]
# One alternation of all keywords, matched as whole words/phrases. Each keyword is
# passed through re.escape so that a keyword containing regex metacharacters is
# still matched literally instead of altering the pattern.
pattern = re.compile(r'\b(' + '|'.join(re.escape(keyword) for keyword in keywords) + r')\b', re.IGNORECASE)

# Rule based spam classification
def is_spam(row):
    """Rule-based spam label for one e-mail row.

    Scoring:
      * +1 if the keyword ``pattern`` matches ``row["Processed_Subject"]``
      * +1 if it matches ``row["Processed_Body"]``
      * +2 if ``row["From"]`` is a non-empty address that does not end with
        ``@enron.com``

    The domain check is case-insensitive, so ``Bob@ENRON.COM`` is treated as an
    internal sender. A missing or non-string sender contributes nothing to the
    score instead of raising.

    Returns 1 when the score is at least 3 (i.e. an external sender plus at
    least one keyword hit), otherwise 0.
    """
    subject_hit = pattern.search(row["Processed_Subject"]) is not None
    body_hit = pattern.search(row["Processed_Body"]) is not None

    sender = row["From"]
    # Normalise case before comparing the domain; treat non-strings (None/NaN) as absent.
    sender = sender.strip().lower() if isinstance(sender, str) else ""
    external_sender = bool(sender) and not sender.endswith("@enron.com")

    score = int(subject_hit) + int(body_hit) + (2 if external_sender else 0)
    return 1 if score >= 3 else 0

# Apply the rule-based scorer row by row and store the 0/1 result as the target column.
spam_flags = df.apply(is_spam, axis=1)
df["Spam_Label"] = spam_flags


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

# Shared TF-IDF settings for both text columns.
TFIDF_PARAMS = dict(stop_words="english", max_features=1000, ngram_range=(1, 2))

y = df["Spam_Label"].values

# Choose the train/test rows BEFORE fitting any vectorizer, so vocabulary and IDF
# weights are learned from training rows only (no test-set leakage). Splitting a
# positional index with the same test_size/random_state is expected to select the
# same rows as splitting the feature matrix, since the shuffle depends only on the
# number of rows and the seed.
train_idx, test_idx = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)

# Separate vectorizers: refitting a single instance on the body would discard the
# subject vocabulary, leaving no way to transform new subjects consistently.
subject_vectorizer = TfidfVectorizer(**TFIDF_PARAMS)
body_vectorizer = TfidfVectorizer(**TFIDF_PARAMS)
subject_vectorizer.fit(df["Processed_Subject"].iloc[train_idx])
body_vectorizer.fit(df["Processed_Body"].iloc[train_idx])
# `tfidf` kept for backward compatibility: it previously ended up fitted on the body text.
tfidf = body_vectorizer

# Transform every row with the train-fitted vectorizers, then stack subject and
# body features side by side (CSR so rows can be selected by position).
tfidf_subject = subject_vectorizer.transform(df["Processed_Subject"])
tfidf_body = body_vectorizer.transform(df["Processed_Body"])
X = hstack([tfidf_subject, tfidf_body]).tocsr()

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Multi-layer perceptron with one hidden layer of 64 ReLU units, trained with Adam
# for at most 100 iterations; the seed is fixed for repeatable runs.
mlp_settings = {
    "hidden_layer_sizes": (64,),
    "activation": 'relu',
    "solver": 'adam',
    "max_iter": 100,
    "random_state": 42,
}
model = MLPClassifier(**mlp_settings)
model.fit(X_train, y_train)

# Predict on the held-out rows and report accuracy plus per-class metrics.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Neural Network Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)

# Confusion-matrix heatmap for the held-out predictions.
# The class order is pinned with labels=[0, 1] so the matrix is always 2x2 and lines
# up with the two tick labels, even if one class is absent from y_test and y_pred.
class_ids = [0, 1]
class_names = ["Not Spam", "Spam"]

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=class_ids),
    annot=True,
    fmt='d',
    cmap="Purples",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix - Neural Network")
plt.show()
