# 📧 Spam Email Analysis and Detection
This notebook performs two major tasks:

1. **Spam Source Tracing** – Extract IP addresses from email headers to analyze spam origins.
2. **Spam Detection Engine** – Train a machine learning model to classify spam and ham emails.

In [17]:
import email
import ipaddress
import os
import re
from email import policy
from email.parser import BytesParser

import matplotlib.pyplot as plt
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [18]:
def _last_header(msg, name):
    """Return the last occurrence of header ``name`` (any letter case), or ""."""
    values = msg.get_all(name)
    return values[-1] if values else ""


def parse_email(file_path):
    """Parse one raw email file into a flat dict of headers and body text.

    Args:
        file_path: Path to a file containing a single raw RFC 5322 message.

    Returns:
        dict with keys ``subject``, ``from``, ``to``, ``received``, ``date``
        and ``body``. A missing header yields ``""``; ``body`` is ``""`` when
        the message has no text/plain or text/html part.

    Raises:
        OSError: if the file cannot be opened.
        Any exception raised by the ``email`` package while decoding the body
        is propagated to the caller.
    """
    with open(file_path, 'rb') as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # Header names are case-insensitive in email, so look them up through the
    # message API rather than through a plain dict keyed on exact spelling.
    # When a header repeats (typically "Received", one per hop) the LAST
    # occurrence in the file is kept, which is what dict(msg.items()) did.
    body = msg.get_body(preferencelist=('plain', 'html'))
    return {
        "subject": _last_header(msg, "Subject"),
        "from": _last_header(msg, "From"),
        "to": _last_header(msg, "To"),
        "received": _last_header(msg, "Received"),
        "date": _last_header(msg, "Date"),
        "body": body.get_content() if body else ""
    }

In [19]:
from bs4 import BeautifulSoup

def clean_email_body(raw_html):
    """Return the visible text of an email body with markup removed.

    Args:
        raw_html: Body content, possibly containing HTML. Any falsy value
            (``None``, ``""``) is treated as an empty body.

    Returns:
        The text nodes joined by single spaces with surrounding whitespace
        stripped, or ``""`` for an empty body.
    """
    if raw_html:
        document = BeautifulSoup(raw_html, "html.parser")
        return document.get_text(separator=" ", strip=True)
    return ""


In [20]:
# ----------------------------
def extract_info_from_received(received_header):
    """Pull the first plausible IPv4 address and sender host from a Received header.

    Args:
        received_header: Text of one ``Received`` header. ``None`` or ``""``
            is accepted and yields ``(None, None)``.

    Returns:
        ``(ip, domain)`` where ``ip`` is the first dotted quad whose four
        octets are all in 0-255, and ``domain`` is the token following the
        first standalone word "from". Either element is ``None`` when nothing
        matches.
    """
    if not received_header:
        return None, None

    # The lookarounds stop the pattern from slicing a piece out of a longer
    # dotted/digit run (e.g. "1234.5.6.7" or "1.2.3.4.5").
    ip = None
    ip_pattern = r'(?<!\d)(?<!\d\.)\d{1,3}(?:\.\d{1,3}){3}(?!\d)(?!\.\d)'
    for candidate in re.finditer(ip_pattern, received_header):
        quad = candidate.group(0)
        # \d{1,3} admits values up to 999; keep only real IPv4 octets.
        if all(int(octet) <= 255 for octet in quad.split('.')):
            ip = quad
            break

    # \b keeps "from" from matching inside another word.
    domain_match = re.search(r'\bfrom\s+([\w\.-]+)', received_header, re.IGNORECASE)
    domain = domain_match.group(1) if domain_match else None

    return ip, domain

In [None]:
def load_emails_from_folder(folder_path, label):
    """Parse every email file in a folder into a list of labelled records.

    Args:
        folder_path: Directory containing one raw email per file.
        label: Value stored under ``"label"`` in every returned record.

    Returns:
        A list of dicts with keys ``from``, ``subject``, ``date``, ``text``,
        ``ip``, ``domain`` and ``label``, ordered by file name. Files that
        cannot be parsed are left out; a one-line summary of them is printed.

    Raises:
        OSError: if ``folder_path`` cannot be listed.
    """
    emails = []
    failures = []
    # Sorted so the record order does not depend on the filesystem's listing order.
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        # Sub-directories are not emails; skip them without counting a failure.
        if not os.path.isfile(file_path):
            continue
        try:
            parsed = parse_email(file_path)
            ip, domain = extract_info_from_received(parsed["received"])
            emails.append({
                "from": parsed["from"],
                "subject": parsed["subject"],
                "date": parsed["date"],
                "text": clean_email_body(parsed["body"]),
                "ip": ip,
                "domain": domain,
                "label": label
            })
        except Exception as e:
            # Best-effort load: keep going, but remember what was dropped so
            # the loss is visible instead of silent.
            failures.append((file, e))
    if failures:
        first_file, first_error = failures[0]
        print(
            f"load_emails_from_folder: skipped {len(failures)} unreadable file(s) "
            f"in {folder_path!r}; first: {first_file!r} ({first_error!r})"
        )
    return emails


In [22]:
# ----------------------------
# Parse the three corpus folders: ham is labelled 0, both spam sets are labelled 1
# ----------------------------
ham_folder, spam_folder, spam2_folder = "easy_ham", "spam", "spam_2"

ham_emails = load_emails_from_folder(folder_path=ham_folder, label=0)
spam_emails = load_emails_from_folder(folder_path=spam_folder, label=1)
spam2_emails = load_emails_from_folder(folder_path=spam2_folder, label=1)

In [23]:
def enrich_ip_metadata(ip):
    """Look up the country and organisation of a public IP address via ip-api.com.

    Args:
        ip: IP address as a string, or ``None``.

    Returns:
        ``{"country": ..., "org": ...}``. Both values are ``None`` when ``ip``
        is missing, is not a valid address, is not globally routable
        (private, loopback, link-local, ...), or the lookup fails.
    """
    empty = {"country": None, "org": None}
    if not ip:
        return empty
    try:
        address = ipaddress.ip_address(ip)
    except ValueError:
        # Not a real address (e.g. an octet above 255): nothing to look up.
        return empty
    # Only globally routable addresses carry geolocation data. This covers
    # 10/8, 172.16/12, 192.168/16 and 127/8 without excluding public 192.x hosts.
    if not address.is_global:
        return empty
    try:
        # NOTE(review): plain HTTP - presumably the endpoint in use is HTTP-only; confirm.
        r = requests.get(f"http://ip-api.com/json/{ip}", timeout=5)
        if r.status_code == 200:
            data = r.json()
            return {
                "country": data.get("country"),
                "org": data.get("org")
            }
    except (requests.RequestException, ValueError):
        # Network failure or a non-JSON reply: fall through to the empty result.
        pass
    return empty

In [26]:
# ----------------------------
# Combine every parsed email into one DataFrame and shuffle it
# ----------------------------
# A fixed seed keeps the shuffle, and therefore the rows chosen for enrichment
# below, the same on every run.
SHUFFLE_SEED = 42
MAX_IP_LOOKUPS = 40

all_emails = pd.DataFrame(ham_emails + spam_emails + spam2_emails)
all_emails = (
    all_emails[all_emails['text'].notnull()]
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# ----------------------------
# Enrich the first 40 shuffled spam rows that have an IP (to avoid rate limiting)
# ----------------------------
sampled = all_emails[all_emails['label'] == 1].dropna(subset=["ip"]).head(MAX_IP_LOOKUPS).copy()

# Query each distinct IP once; rows sharing an IP reuse the same answer.
ip_lookup = {ip: enrich_ip_metadata(ip) for ip in sampled['ip'].unique()}
metadata = sampled['ip'].map(lambda ip: ip_lookup[ip])
# Naming the columns keeps 'country' and 'org' present even when no rows were sampled.
metadata_df = pd.DataFrame(metadata.tolist(), columns=["country", "org"])

# Combine enriched metadata with original DataFrame
sampled = pd.concat([sampled.reset_index(drop=True), metadata_df], axis=1)
# Flatten line breaks so each preview stays on one line (literal match, not regex).
sampled['text_preview'] = (
    sampled['text']
    .str.replace('\n', ' ', regex=False)
    .str.replace('\r', ' ', regex=False)
)


# Show results
print(sampled[['ip', 'domain', 'country', 'org', 'from', 'subject', 'date', 'label', 'text_preview']])

                 ip                   domain         country  \
0     155.89.28.179            155.89.28.179          Angola   
1     212.43.201.51            email1.atc.cz          France   
2     200.75.120.56       1-800-plumbing.com       Venezuela   
3         127.0.0.1                localhost            None   
4     209.197.199.5            209.197.199.5   United States   
5    61.142.238.123  smtp0301.mail.yahoo.com           China   
6    216.227.131.17         national-adv.com   United States   
7      63.180.6.118                      122         Germany   
8    157.237.139.64   aa64.toplinequotes.com          Norway   
9   172.190.151.202         mx06.hotmail.com   United States   
10     149.89.93.47                  unknown   United States   
11    64.86.155.148                 mail.com   United States   
12   217.35.110.225       mx2.mail.yahoo.com  United Kingdom   
13    64.86.155.135                     None   United States   
14    211.162.70.68            211.162.7

In [27]:
# Write the enriched spam sample to disk twice: a CSV for tooling and a
# plain-text report for reading.
csv_path, txt_path = "spam_analysis_output.csv", "spam_analysis_output.txt"

# CSV copy, without the positional index column
sampled.to_csv(csv_path, index=False)

# Text copy: one labelled field per line, records divided by a dashed rule
record_rule = "-" * 60 + "\n"
with open(txt_path, "w", encoding="utf-8") as report:
    for _, record in sampled.iterrows():
        verdict = 'Spam' if record['label'] == 1 else 'Ham'
        report.writelines([
            f"IP: {record['ip']}\n",
            f"Domain: {record['domain']}\n",
            f"Country: {record['country']}\n",
            f"Org: {record['org']}\n",
            f"From: {record['from']}\n",
            f"Subject: {record['subject']}\n",
            f"Date: {record['date']}\n",
            f"Label: {verdict}\n",
            f"Preview: {record['text_preview']}\n",
            record_rule,
        ])

print(f"\n✅ Files saved:\nCSV: {csv_path}\nTXT: {txt_path}")



✅ Files saved:
CSV: spam_analysis_output.csv
TXT: spam_analysis_output.txt
