In [None]:
import os
import re

# Define paths (edit if needed)
spam_dirs = ["./spam", "./spam_2"]  # Set to your actual relative or absolute paths

# Keyword heuristics for categories
CATEGORY_KEYWORDS = {
  "Phishing": [
        "password", "verify", "account", "login", "bank", "confirm", "update", "ssn", "secure",
        "security alert", "reset", "your information", "identity", "limited time", "reactivate",
        "unusual activity", "locked", "access", "credentials", "statement", "protection", "atm",
        "security notice", "validate", "immediately", "confirm your", "click here to", "log in",
        "suspend", "alert", "dear customer", "customer service", "your paypal", "your amazon",
        "verify your email", "we detected", "compromised", "security check"
    ],    "Impersonation": [
        "ceo", "urgent", "wire transfer", "invoice", "payment", "request", "manager", "attention",
        "director", "immediate", "funds", "action required", "process payment", "approve", "remit",
        "from the desk of", "please reply", "staff", "employee", "vendor", "remittance", "outstanding",
        "authorize", "executive", "human resources", "payroll", "boss", "confidential", "financial dept",
        "hr department", "final notice", "settlement", "your supervisor", "tax"
    ],
   "Promotional Ads": [
        "free", "offer", "buy now", "winner", "discount", "click here", "order now", "limited time",
        "sale", "deal", "prize", "promotion", "gift", "risk-free", "guaranteed", "act now", "save big",
        "cheap", "lowest price", "win", "selected", "cash", "bonus", "opportunity", "get paid",
        "amazing", "exclusive", "luxury", "vacation", "bargain", "unsubscribe", "satisfaction", "no cost",
        "trial", "fast cash", "zero risk", "hot deal", "special offer"
    ]
}

In [None]:

def classify_spam(text):
    text_lc = text.lower()
    for category, keywords in CATEGORY_KEYWORDS.items():
        for kw in keywords:
            if kw in text_lc:
                return category
    return "Other"

# Gather results
results = {"Phishing": [], "Impersonation": [], "Promotional Ads": [], "Other": []}

# Read spam files
for folder in spam_dirs:
    for fname in os.listdir(folder):
        fpath = os.path.join(folder, fname)
        try:
            with open(fpath, encoding="latin1") as f:
                content = f.read()
                # For classification, you can combine subject+body if you extract subject
                results[classify_spam(content)].append(fname)
        except Exception as e:
            print(f"Could not read {fname}: {e}")


In [None]:
# Print summary
for category, files in results.items():
    print(f"{category}: {len(files)} emails")
    # Optional: print sample emails
    for sample in files[:3]:
        print(f"  - {sample}")

Phishing: 898 emails
  - 00278.b62c5fc23a2f87760696cb9fa51f073c
  - 00279.1d58a13e343c1e53aca2ed2121a3f815
Promotional Ads: 626 emails
  - 00322.7d39d31fb7aad32c15dff84c14019b8c
  - 00328.73c1a9f83d3b1247522c26eb6d74c215
Other: 95 emails
  - 00010.445affef4c70feec58f9198cfbc22997
  - 00074.51aab41b27a9ba7736803318a2e4c8de


In [19]:

# Output to txt file for your report
with open("spam_categories_summary.txt", "w") as out:
    for category, files in results.items():
        out.write(f"\n{category} ({len(files)} emails):\n")
        for fname in files:  # Only sample 10 per category
            out.write(f"  - {fname}\n")
print("Summary saved to spam_categories_summary.txt")

Summary saved to spam_categories_summary.txt


In [None]:
import os
import shutil

CATEGORY_FOLDER_MAP = {
    "Phishing": "phising", 
    "Promotional Ads": "Promotional Ads"
}

spam_dirs = ["./spam", "./spam_2"]

# Move up to 30 files per category
for category, files in results.items():
    folder_name = CATEGORY_FOLDER_MAP.get(category)
    if not folder_name:
        continue  # Skip categories with no folder

    # Only move up to 30
    for fname in files[:40]:
        found = False
        # Search both spam folders for the file
        for src_dir in spam_dirs:
            src_path = os.path.join(src_dir, fname)
            if os.path.exists(src_path):
                dst_path = os.path.join(folder_name, fname)
                shutil.move(src_path, dst_path)
                print(f"Moved {fname} to {folder_name}")
                found = True
                break
        if not found:
            print(f"{fname} not found in spam folders!")

print("Done moving files.")


Moved 00278.b62c5fc23a2f87760696cb9fa51f073c to phising
Moved 00279.1d58a13e343c1e53aca2ed2121a3f815 to phising
Moved 00280.026da2bd191f11081b8d8428134b0c66 to phising
Moved 00282.0e230e05877f40a522bfb93aa3e314f3 to phising
Moved 00283.e8e42ee52f919afd2a453983f1256b1d to phising
Moved 00285.8a06c91fcdf4a1ae8ca928f3ef3feecb to phising
Moved 00286.efd0b8f0c9c779b7a0ad93505c9b0bae to phising
Moved 00287.b0495a4dbdff36654c3b3ee2f92bdbf3 to phising
Moved 00289.61a681a72c71512f115ad65033acc7c9 to phising
Moved 00290.eb053a191b7509a9399aa16717630414 to phising
Moved 00291.7aa227e74e89bdd529a3875459d0d5a2 to phising
Moved 00292.dbf78a2aaa230d288eb80ab843804252 to phising
Moved 00294.df27a988d82cc82296e33e6d727ac47e to phising
Moved 00301.68fe7955b96d085360ca916289e8e716 to phising
Moved 00302.544366fa4cd0f5d210dd8443a1c2c95a to phising
Moved 00304.ed5fbfc3e6f2be662f29f43f172a1fb3 to phising
Moved 00305.f80c21904d6d4f6facd036450a588b0d to phising
Moved 00307.7ed50c6d80c6e37c8cc1b132f4a19e4d to 