In [5]:
# AHT Note: This implementation expects the Enron dataset to be stored locally. On my machine it lives under my home Downloads directory (~/Downloads/Enron).
from __future__ import annotations

from pathlib import Path
from email.parser import BytesParser
from email import policy
from email.utils import getaddresses
from collections import Counter
import csv

# Configuration: input location, headers to read, and output files.
# Root of the local Enron dump that is scanned for "*/all_documents" folders.
BASE_DIR = Path.home().joinpath("Downloads", "Enron")
# Headers consulted per message: "From" supplies the sender, the rest recipients.
INCLUDE_HEADERS = ("From", "To", "Cc", "Bcc")
# Directory that receives the generated CSV files.
WRITE_DIR = Path.home().joinpath("Downloads")
NODES_CSV = WRITE_DIR.joinpath("enron_nodes.csv")
EDGES_CSV = WRITE_DIR.joinpath("enron_edges.csv")
# When True, build_graph stores each edge with its endpoints in sorted order.
UNDIRECTED = False


def normalize_email(addr: str) -> str | None:
    if not addr:
        return None
    addr = addr.strip().lower()
    if "@" not in addr:
        return None
    return addr

def _header_addresses(msg, header: str) -> list[str]:
    """Return the normalized addresses found in every *header* field of *msg*.

    Addresses that normalize_email rejects are dropped. A header that cannot
    be read yields an empty list.
    """
    try:
        # get_all() covers repeated header fields; get() would return only
        # the first occurrence and silently drop the rest.
        pairs = getaddresses(msg.get_all(header, []))
    except Exception:
        # Best-effort, matching the file-level handling in parse_headers:
        # with policy.default the header value is parsed when it is fetched,
        # so a malformed field raises here rather than in parse(). Skip the
        # field instead of aborting the whole scan.
        return []
    normalized = (normalize_email(address) for _, address in pairs)
    return [address for address in normalized if address]


def parse_headers(fp: Path) -> tuple[str | None, set[str]]:
    """Extract the sender and the set of recipients from one message file.

    Only the headers are parsed. Recipients are collected from every name in
    INCLUDE_HEADERS except "From".

    Args:
        fp: Path of the message file to read.

    Returns:
        A ``(from_addr, recips)`` pair. ``from_addr`` is the first valid
        "From" address or ``None``; ``recips`` is the set of valid recipient
        addresses with the sender removed. A file that cannot be opened or
        parsed yields ``(None, set())``.
    """
    try:
        with fp.open("rb") as f:
            msg = BytesParser(policy=policy.default).parse(f, headersonly=True)
    except Exception:
        # Best-effort scan: an unreadable file contributes nothing.
        return None, set()

    # Sender: first valid address in the From header(s).
    from_addr = next(iter(_header_addresses(msg, "From")), None)

    # Recipients: every configured header other than From.
    recips = set()
    for header in INCLUDE_HEADERS:
        if header != "From":
            recips.update(_header_addresses(msg, header))

    # Remove self-loops; discard() is a no-op when the sender is absent.
    recips.discard(from_addr)

    return from_addr, recips

def iter_all_document_files(base_dir: Path):
    """Yield every regular file found below ``<base_dir>/*/all_documents``.

    Each immediate subdirectory of *base_dir* is checked for a directory
    named ``all_documents``; those directories are walked recursively.

    Args:
        base_dir: Root directory to scan.

    Yields:
        Path objects for the files found.

    Raises:
        FileNotFoundError: If *base_dir* does not exist (raised when
            iteration starts, since this is a generator).
    """
    if not base_dir.exists():
        raise FileNotFoundError(f"Base directory not found: {base_dir}")

    document_roots = (
        candidate
        for candidate in base_dir.glob("*/all_documents")
        if candidate.is_dir()
    )
    for root in document_roots:
        yield from (entry for entry in root.rglob("*") if entry.is_file())

def build_graph(base_dir: Path):
    """Scan the message files under *base_dir* and build the email graph.

    Every sender and recipient returned by parse_headers becomes a node.
    Each (sender, recipient) pair of a message adds 1 to that edge's weight;
    when UNDIRECTED is true the pair is stored with its endpoints sorted.
    Progress is printed every 5,000 files and a summary at the end.

    Args:
        base_dir: Root directory handed to iter_all_document_files.

    Returns:
        A ``(nodes, edges)`` pair: the set of addresses and a Counter mapping
        ``(source, target)`` tuples to weights.
    """
    nodes = set()
    edges = Counter()
    total_files = 0  # stays 0 when the scan yields no files

    for total_files, fp in enumerate(iter_all_document_files(base_dir), start=1):
        sender, recipients = parse_headers(fp)

        nodes.update(recipients)
        if sender:
            nodes.add(sender)
            # An empty recipient set simply contributes no edges.
            if UNDIRECTED:
                edges.update(tuple(sorted((sender, r))) for r in recipients)
            else:
                edges.update((sender, r) for r in recipients)

        if total_files % 5000 == 0:
            print(f"...{total_files:,} files scanned; "
                  f"{len(nodes):,} nodes; {len(edges):,} edges so far")

    edge_kind = 'undirected' if UNDIRECTED else 'directed'
    print(f"Done. Scanned {total_files:,} files.")
    print(f"Unique nodes: {len(nodes):,}")
    print(f"Unique {edge_kind} edges: {len(edges):,}")
    return nodes, edges

def write_nodes_csv(nodes: set[str], path: Path):
    """Write *nodes* to *path* as a one-column CSV with an ``email`` header.

    Addresses are written in sorted order. Missing parent directories of
    *path* are created, and a confirmation line is printed when done.

    Args:
        nodes: Addresses to write.
        path: Destination CSV file; overwritten if it exists.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(("email",))
        writer.writerows((address,) for address in sorted(nodes))
    print(f"Wrote nodes to: {path}")

def write_edges_csv(edges: Counter, path: Path):
    """Write *edges* to *path* as a ``source,target,weight`` CSV.

    Rows are sorted by (source, target) so the file is reproducible between
    runs, matching the sorted output of write_nodes_csv. Missing parent
    directories of *path* are created, and a confirmation line is printed
    when done.

    Args:
        edges: Counter mapping ``(source, target)`` tuples to weights.
        path: Destination CSV file; overwritten if it exists.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["source", "target", "weight"])
        # Counter keys are unique, so sorting the items orders rows by
        # (source, target) without ever comparing weights.
        w.writerows(
            (src, dst, wt) for (src, dst), wt in sorted(edges.items())
        )
    print(f"Wrote edges to: {path}")

# Script entry point: build the graph from BASE_DIR, then write both CSV files.
if __name__ == "__main__":
    # nodes and edges are bound at module level here, so they remain
    # available to any code that runs after this block.
    nodes, edges = build_graph(BASE_DIR)
    write_nodes_csv(nodes, NODES_CSV)
    write_edges_csv(edges, EDGES_CSV)


...5,000 files scanned; 2,872 nodes; 6,099 edges so far
...10,000 files scanned; 5,776 nodes; 12,172 edges so far
...15,000 files scanned; 8,066 nodes; 18,900 edges so far
...20,000 files scanned; 10,814 nodes; 26,530 edges so far
...25,000 files scanned; 13,282 nodes; 36,986 edges so far
...30,000 files scanned; 16,193 nodes; 53,446 edges so far
...35,000 files scanned; 18,409 nodes; 62,737 edges so far
...40,000 files scanned; 20,193 nodes; 75,535 edges so far
...45,000 files scanned; 22,239 nodes; 84,690 edges so far
...50,000 files scanned; 24,906 nodes; 90,881 edges so far
...55,000 files scanned; 26,072 nodes; 98,045 edges so far
...60,000 files scanned; 28,076 nodes; 108,213 edges so far
...65,000 files scanned; 29,056 nodes; 115,583 edges so far
...70,000 files scanned; 29,468 nodes; 118,440 edges so far
...75,000 files scanned; 30,245 nodes; 121,781 edges so far
...80,000 files scanned; 31,615 nodes; 125,651 edges so far
...85,000 files scanned; 32,160 nodes; 129,262 edges so 