In [3]:
# ============================
# Imports & Regex definitions
# ============================

import re
from urllib.parse import urlparse
from collections import Counter

import numpy as np
import pandas as pd

# Optional libs
try:
    from bs4 import BeautifulSoup
except Exception:
    BeautifulSoup = None

try:
    import tldextract
except Exception:
    tldextract = None

# Email parsing
from email import policy
from email.parser import BytesParser, Parser

# sklearn (only used for quick baseline)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate

# Regexes
URL_RE = re.compile(r'https?://[^\s\'">)]+', re.IGNORECASE)
IP_URL_RE = re.compile(r'https?://(?:\d{1,3}\.){3}\d{1,3}(?:[:/]|$)')


In [4]:
# ======================================
# Parse raw RFC-822 email into components
# ======================================

def parse_raw_email(raw):
    """
    Parse raw RFC-822 email (str or bytes) and return dict:
    { 'headers', 'subject', 'plain', 'html', 'attachments' }
    """
    if raw is None:
        return None

    if isinstance(raw, bytes):
        msg = BytesParser(policy=policy.default).parsebytes(raw)
    else:
        msg = Parser(policy=policy.default).parsestr(raw)

    headers = dict(msg.items())
    subject = msg.get("Subject", "") or ""

    plain_parts, html_parts, attachments = [], [], []

    if msg.is_multipart():
        for part in msg.walk():
            ctype = part.get_content_type()
            disp = part.get_content_disposition()

            # Attachments
            if disp == "attachment" or part.get_filename():
                try:
                    payload_bytes = part.get_payload(decode=True) or b""
                    size = len(payload_bytes)
                except Exception:
                    size = 0

                attachments.append({
                    "filename": part.get_filename(),
                    "content_type": ctype,
                    "size": size
                })
                continue

            # Body
            try:
                payload = part.get_content()
            except Exception:
                payload = None

            if ctype == "text/plain" and payload:
                plain_parts.append(str(payload))
            elif ctype == "text/html" and payload:
                html_parts.append(str(payload))
    else:
        ctype = msg.get_content_type()
        try:
            payload = msg.get_content()
        except Exception:
            payload = msg.get_payload(decode=True)
            if isinstance(payload, bytes):
                payload = payload.decode(errors="replace")

        if ctype == "text/plain":
            plain_parts.append(payload or "")
        elif ctype == "text/html":
            html_parts.append(payload or "")

    return {
        "headers": headers,
        "subject": subject,
        "plain": "\n".join([p for p in plain_parts if p]),
        "html": "\n".join([p for p in html_parts if p]),
        "attachments": attachments
    }


In [5]:
# ============================
# URL extraction and features
# ============================

def extract_urls(text):
    if not isinstance(text, str) or not text:
        return []
    return URL_RE.findall(text)


def url_features(urls):
    feats = {}

    feats["url_count"] = len(urls)
    feats["unique_domain_count"] = len({urlparse(u).netloc for u in urls})
    feats["has_ip_url"] = int(any(IP_URL_RE.search(u) for u in urls))

    feats["max_url_length"] = max([len(u) for u in urls], default=0)
    feats["mean_url_length"] = float(np.mean([len(u) for u in urls])) if urls else 0.0

    feats["has_punycode"] = int(any("xn--" in urlparse(u).netloc for u in urls))

    suspicious_tokens = ["login", "secure", "account", "update", "verify", "confirm", "bank", "ebay", "paypal"]
    feats["suspicious_token_in_url"] = int(
        sum(any(tok in u.lower() for tok in suspicious_tokens) for u in urls)
    )

    # Domain extraction
    domains = []
    for u in urls:
        netloc = urlparse(u).netloc.lower()

        if tldextract:
            try:
                ex = tldextract.extract(u)
                domain = ".".join([ex.domain, ex.suffix]) if ex.suffix else ex.domain
            except Exception:
                domain = netloc
        else:
            domain = netloc

        domains.append(domain)

    feats["most_common_domain_freq"] = Counter(domains).most_common(1)[0][1] if domains else 0

    return feats


In [6]:
# ================================
# HTML-based (if exists) features
# ================================

def html_features(html):
    feats = {}

    if not isinstance(html, str) or html.strip() == "":
        feats.update({
            "html_length": 0,
            "html_script_tags": 0,
            "html_iframe_tags": 0,
            "html_hidden_inputs": 0
        })
        return feats

    feats["html_length"] = len(html)

    if BeautifulSoup:
        soup = BeautifulSoup(html, "html.parser")
        feats["html_script_tags"] = len(soup.find_all("script"))
        feats["html_iframe_tags"] = len(soup.find_all("iframe"))
        feats["html_hidden_inputs"] = len(soup.find_all("input", {"type": "hidden"}))
    else:
        feats["html_script_tags"] = html.lower().count("<script")
        feats["html_iframe_tags"] = html.lower().count("<iframe")
        feats["html_hidden_inputs"] = html.lower().count('type="hidden"')

    return feats


In [13]:
# ======================
# Header-based features
# ======================

def header_features(headers):
    feats = {}

    # Normalize keys
    headers = {str(k).lower(): (v if isinstance(v, str) else str(v)) for k, v in headers.items()}

    # Safe get + convert to string
    reply_to = str(headers.get("reply-to", "") or "").lower()
    from_h  = str(headers.get("from", "") or "").lower()

    # Safe boolean
    if reply_to and from_h:
        feats["reply_to_diff_from"] = int(reply_to not in from_h)
    else:
        feats["reply_to_diff_from"] = 0  # no info → not phishing by default

    # SPF / DKIM / DMARC
    auth_res = str(headers.get("authentication-results", "") or "").lower()
    feats["spf_pass"] = int("spf=pass" in auth_res)
    feats["dkim_pass"] = int("dkim=pass" in auth_res)
    feats["dmarc_pass"] = int("dmarc=pass" in auth_res)

    # Subject
    subject = str(headers.get("subject", "") or "")
    s = subject.lower()

    feats["subject_len"] = len(subject)
    feats["subject_has_urgent"] = int("urgent" in s)
    feats["subject_has_verify"] = int("verify" in s)
    feats["subject_has_action"] = int("action required" in s)

    return feats



In [8]:
# ==============================
# Combine everything
# ==============================

def extract_all_features(raw_email):
    parsed = parse_raw_email(raw_email)

    urls = extract_urls(parsed["plain"]) + extract_urls(parsed["html"])

    feats = {}
    feats.update(url_features(urls))
    feats.update(html_features(parsed["html"]))
    feats.update(header_features(parsed["headers"]))

    return feats


In [9]:
# ==============================
# Convert list of raw emails to
# a dataframe of features
# ==============================

def emails_to_feature_df(raw_emails):
    rows = []
    for raw in raw_emails:
        feats = extract_all_features(raw)
        rows.append(feats)
    return pd.DataFrame(rows)


In [16]:
# DIAG 1 — quick sanity checks on df_raw
print("df_raw shape:", df_raw.shape)
print("first 10 rows (id, label, path):")
display(df_raw.head(10)[["id", "label", "path"]])

# show path sizes and counts
import os
sizes = []
for p in df_raw["path"].tolist()[:30]:
    try:
        sizes.append((p, os.path.getsize(p)))
    except Exception:
        sizes.append((p, None))
print("sample path sizes (first 30):")
for p,s in sizes[:30]:
    print(p, "->", s)
    
print("\nLabel distribution:")
print(df_raw['label'].value_counts(dropna=False))


df_raw shape: (1, 5)
first 10 rows (id, label, path):


Unnamed: 0,id,label,path
0,trec05p-1.tgz,spam,/kaggle/input/emails-for-spam-or-ham-classific...


sample path sizes (first 30):
/kaggle/input/emails-for-spam-or-ham-classification-trec-2005/trec05p-1.tgz -> 317224633

Label distribution:
label
spam    1
Name: count, dtype: int64


In [17]:
# Cell 1 — find CSVs and load them (auto-detect common names)
from pathlib import Path
import pandas as pd
import json

INPUT_ROOT = Path("/kaggle/input")
cand = [p for p in INPUT_ROOT.iterdir() if p.is_dir()]

# try to find the folder that contains your unzipped dataset
dataset_root = None
for p in cand:
    name = p.name.lower()
    if "trec" in name or "emails" in name or "email" in name or "spam" in name:
        dataset_root = p
        break
if dataset_root is None:
    dataset_root = cand[0] if cand else None

if dataset_root is None:
    raise RuntimeError("Could not find dataset folder under /kaggle/input. Please set dataset_root manually.")

print("Using dataset_root:", dataset_root)

# List CSVs in that folder (non-recursive) and try to pick header/body CSVs
csvs = list(dataset_root.glob("*.csv"))
print("Top-level CSV files:", [c.name for c in csvs])

# If many CSVs exist, search recursively (some datasets place CSVs in subfolders)
if not csvs:
    csvs = list(dataset_root.rglob("*.csv"))
    print("Recursive CSV files:", [c.name for c in csvs])

# heuristics to choose header and body csv
hdr_candidates = [c for c in csvs if any(k in c.name.lower() for k in ("header","headers","meta"))]
body_candidates = [c for c in csvs if any(k in c.name.lower() for k in ("body","bodies","text","message"))]

# fallback: if no candidates, pick first two distinct CSVs
if not hdr_candidates and len(csvs) >= 1:
    hdr_candidates = [csvs[0]]
if not body_candidates and len(csvs) >= 2:
    body_candidates = [csvs[1]] if csvs[1] != hdr_candidates[0] else ([csvs[0]] if len(csvs)>0 else [])

print("Header candidate(s):", [p.name for p in hdr_candidates])
print("Body candidate(s):", [p.name for p in body_candidates])

if not hdr_candidates or not body_candidates:
    raise RuntimeError("Couldn't auto-detect header/body CSVs. Please check the dataset structure or set paths manually.")

hdr_path = hdr_candidates[0]
body_path = body_candidates[0]

print("Loading header CSV:", hdr_path)
print("Loading body   CSV:", body_path)

df_hdr = pd.read_csv(hdr_path, low_memory=False)
df_body = pd.read_csv(body_path, low_memory=False)

print("Header DF shape:", df_hdr.shape)
print("Body   DF shape:", df_body.shape)

display(df_hdr.head(3))
display(df_body.head(3))


Using dataset_root: /kaggle/input/emails-for-spam-or-ham-classification-trec-2005
Top-level CSV files: ['email_origin.csv', 'email_text.csv']
Header candidate(s): ['email_origin.csv']
Body candidate(s): ['email_text.csv']
Loading header CSV: /kaggle/input/emails-for-spam-or-ham-classification-trec-2005/email_origin.csv
Loading body   CSV: /kaggle/input/emails-for-spam-or-ham-classification-trec-2005/email_text.csv
Header DF shape: (92189, 2)
Body   DF shape: (55075, 2)


Unnamed: 0,label,origin
0,0,Received: from NAHOU-MSMBX01V ([192.168.110.39...
1,0,Received: from nahou-msmbx03v.corp.enron.com (...
2,0,Received: from NAHOU-MSMBX01V ([192.168.110.39...


Unnamed: 0,label,text
0,0,user id enrondlr pw bnawebescapenumber origina...
1,0,hi chris tonight we are rolling out a new repo...
2,0,rika r these new original message from thomas ...


In [18]:
# Cell 2 — detect ID columns and text columns, normalize formats
from ast import literal_eval

def find_id_col(df):
    # common id names
    for c in ("id","msg_id","message_id","mail_id","filename","file"):
        if c in df.columns:
            return c
    # fallback: index as id
    return None

def find_text_col_for_body(df):
    for c in ("body","raw","raw_body","message","text","plain","content","email"):
        if c in df.columns:
            return c
    # fallback: first object column
    for c in df.columns:
        if df[c].dtype == object:
            return c
    return None

def find_headers_col(df):
    for c in ("headers","raw_headers","header","all_headers","metadata"):
        if c in df.columns:
            return c
    # fallback: any object column
    for c in df.columns:
        if df[c].dtype == object:
            return c
    return None

id_hdr = find_id_col(df_hdr)
id_body = find_id_col(df_body)
body_col = find_text_col_for_body(df_body)
hdr_col = find_headers_col(df_hdr)

print("Detected header id col:", id_hdr)
print("Detected body id col:  ", id_body)
print("Detected body text col:", body_col)
print("Detected header col:   ", hdr_col)

# If no id columns, create synthetic ids by index
if id_hdr is None:
    df_hdr = df_hdr.reset_index().rename(columns={"index":"id_hdr"})
    id_hdr = "id_hdr"
if id_body is None:
    df_body = df_body.reset_index().rename(columns={"index":"id_body"})
    id_body = "id_body"

# If ids exist but names differ, try to rename to "id" for merge simplicity
if id_hdr != "id":
    df_hdr = df_hdr.rename(columns={id_hdr:"id"})
    id_hdr = "id"
if id_body != "id":
    df_body = df_body.rename(columns={id_body:"id"})
    id_body = "id"

# If the header column appears to be JSON strings, parse them
def safe_parse_headers_cell(x):
    if pd.isna(x):
        return {}
    if isinstance(x, dict):
        return x
    if isinstance(x, str):
        s = x.strip()
        # try json
        try:
            return json.loads(s)
        except Exception:
            pass
        # try literal_eval (python dict-like)
        try:
            return literal_eval(s)
        except Exception:
            pass
        # otherwise, return the raw string under a special key
        return {"raw_headers_text": s}
    # other types -> string
    return {"raw_headers_text": str(x)}

# Apply parsing if we found a hdr_col
if hdr_col is not None and hdr_col in df_hdr.columns:
    # create a normalized headers_dict column
    df_hdr["_headers_dict"] = df_hdr[hdr_col].apply(safe_parse_headers_cell)
else:
    df_hdr["_headers_dict"] = [{} for _ in range(len(df_hdr))]

# Show a sample header dict
display(df_hdr[["_headers_dict"]].head(3))


Detected header id col: None
Detected body id col:   None
Detected body text col: text
Detected header col:    origin


Unnamed: 0,_headers_dict
0,{'raw_headers_text': 'Received: from NAHOU-MSM...
1,{'raw_headers_text': 'Received: from nahou-msm...
2,{'raw_headers_text': 'Received: from NAHOU-MSM...


In [19]:
# Cell 3 — merge on 'id' and create a synthetic raw_str if needed
import numpy as np

# Merge
df = pd.merge(df_hdr, df_body, on="id", how="outer", suffixes=("_hdr","_body"))
print("Merged df shape:", df.shape)
display(df.head(3))

# Determine actual body column name after merge (body_col may have been renamed)
possible_body_cols = [c for c in df.columns if any(k in c.lower() for k in ("body","plain","text","raw","message","content"))]
# prefer exact body_col if detected earlier
if body_col and body_col in df.columns:
    body_col_name = body_col
else:
    # pick the best candidate
    body_col_name = possible_body_cols[0] if possible_body_cols else None

print("Using body column:", body_col_name)

# Build raw_str: if we have headers dict, convert to header block; else use headers text if available.
def headers_dict_to_block(h):
    if not h:
        return ""
    # if it's already a dict of header->value
    if isinstance(h, dict):
        lines = []
        for k,v in h.items():
            lines.append(f"{k}: {v}")
        return "\n".join(lines)
    # fallback
    return str(h)

def make_raw_str(row):
    hdr_dict = row.get("_headers_dict", {}) or {}
    hdr_block = headers_dict_to_block(hdr_dict)
    # if there is a raw headers string column (some CSVs keep a text column), prefer it
    fallback_hdr_texts = [c for c in df.columns if c.lower() in ("raw_headers","headers","raw","header","all_headers","metadata")]
    hdr_text = ""
    for c in fallback_hdr_texts:
        if c in row and pd.notna(row[c]):
            hdr_text = str(row[c])
            break
    # choose the most informative header part
    header_section = hdr_text if hdr_text else hdr_block

    body = ""
    if body_col_name and pd.notna(row.get(body_col_name, "")):
        body = str(row.get(body_col_name, ""))
    # combine
    combined = f"{header_section}\n\n{body}"
    return combined

df["raw_str"] = df.apply(make_raw_str, axis=1)

# sanity check counts
print("Rows after merge:", len(df))
print("Sample id/path/label/raw_str preview:")
display(df[["id"]].head(3))
print(df["raw_str"].iloc[0][:500])


Merged df shape: (92189, 6)


Unnamed: 0,id,label_hdr,origin,_headers_dict,label_body,text
0,0,0,Received: from NAHOU-MSMBX01V ([192.168.110.39...,{'raw_headers_text': 'Received: from NAHOU-MSM...,0.0,user id enrondlr pw bnawebescapenumber origina...
1,1,0,Received: from nahou-msmbx03v.corp.enron.com (...,{'raw_headers_text': 'Received: from nahou-msm...,0.0,hi chris tonight we are rolling out a new repo...
2,2,0,Received: from NAHOU-MSMBX01V ([192.168.110.39...,{'raw_headers_text': 'Received: from NAHOU-MSM...,0.0,rika r these new original message from thomas ...


Using body column: text
Rows after merge: 92189
Sample id/path/label/raw_str preview:


Unnamed: 0,id
0,0
1,1
2,2


raw_headers_text: Received: from NAHOU-MSMBX01V ([192.168.110.39]) by NAHOU-MSMBX05V.corp.enron.com with Microsoft SMTPSVC(5.0.2195.1600);
	 Fri, 29 Jun 2001 08:36:10 -0500
X-MimeOLE: Produced By Microsoft Exchange V6.0.4418.65
content-class: urn:content-classes:message
Subject: FW: June 29 -- BNA, Inc. Daily Labor Report
MIME-Version: 1.0
Content-Type: text/plain;
Content-Transfer-Encoding: binary
Date: Fri, 29 Jun 2001 08:36:09 -0500
Message-ID: <77DA52C3FD86904D8209C9750CD310B9C79BB3@NAHOU-MS


In [20]:
# Cell 4 — apply extract_all_features (robustly) and save features to CSV
import csv
from tqdm.notebook import tqdm

OUT = "/kaggle/working/trec_features_from_csvs.csv"

# build union of feature keys from a small sample (to create header)
sample_n = min(200, len(df))
all_keys = set()
for i in range(sample_n):
    raw = df.loc[i, "raw_str"]
    try:
        feats = extract_all_features(raw)
        if isinstance(feats, dict):
            all_keys.update(feats.keys())
    except Exception:
        # fallback partial extraction using parse_raw_email + helpers
        try:
            parsed = parse_raw_email(raw)
            tmp = {}
            tmp.update(url_features(extract_urls(parsed.get("plain","") or "")))
            tmp.update(html_features(parsed.get("html","") or ""))
            tmp.update(header_features(parsed.get("headers", {}) or {}))
            all_keys.update(tmp.keys())
        except Exception:
            pass

feat_cols = sorted(list(all_keys))
base_cols = ["id"]
header = base_cols + feat_cols + ["label"]
print("Feature columns detected:", len(feat_cols))

# write header
with open(OUT, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=header)
    writer.writeheader()

# extract for all rows
for idx in tqdm(range(len(df)), desc="extract_all"):
    rid = df.loc[idx, "id"]
    lbl = df.loc[idx].get("label", "")
    raw = df.loc[idx, "raw_str"]
    try:
        feats = extract_all_features(raw)
        if not isinstance(feats, dict):
            feats = {}
    except Exception as e:
        try:
            parsed = parse_raw_email(raw)
            feats = {}
            feats.update(url_features(extract_urls(parsed.get("plain","") or "")))
            feats.update(html_features(parsed.get("html","") or ""))
            feats.update(header_features(parsed.get("headers", {}) or {}))
        except Exception as e2:
            feats = {"error_extract": str(e)}
    rowd = {"id": rid, "label": lbl}
    for c in feat_cols:
        rowd[c] = feats.get(c, 0)
    # append row
    with open(OUT, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writerow(rowd)

print("Saved features to:", OUT)


Feature columns detected: 20


extract_all:   0%|          | 0/92189 [00:00<?, ?it/s]

Saved features to: /kaggle/working/trec_features_from_csvs.csv


In [22]:
# Cell 5 — load the resulting features and check
import pandas as pd
df_feats = pd.read_csv("/kaggle/working/trec_features_from_csvs.csv")
print("Features shape:", df_feats.shape)
display(df_feats)
print("Label distribution:")
print(df_feats['label'].value_counts(dropna=False))


Features shape: (92189, 22)


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,dkim_pass,dmarc_pass,has_ip_url,has_punycode,html_hidden_inputs,html_iframe_tags,html_length,html_script_tags,max_url_length,...,reply_to_diff_from,spf_pass,subject_has_action,subject_has_urgent,subject_has_verify,subject_len,suspicious_token_in_url,unique_domain_count,url_count,label
0,0,0,0,0,0,0,0,0,0,49,...,0,0,0,0,0,43,0,3,41,
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,18,0,0,0,
2,2,0,0,0,0,0,0,0,0,60,...,0,0,0,0,0,17,0,1,3,
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,36,0,0,0,
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,19,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92184,92184,0,0,0,0,0,0,16667,0,107,...,0,0,0,0,0,32,0,2,71,
92185,92185,0,0,0,0,0,0,16667,0,107,...,0,0,0,0,0,32,0,2,71,
92186,92186,0,0,0,0,0,0,16667,0,107,...,0,0,0,0,0,32,0,2,71,
92187,92187,0,0,0,0,0,0,16667,0,107,...,0,0,0,0,0,32,0,2,71,


Label distribution:
label
NaN    92189
Name: count, dtype: int64
