# 02-preprocessing.ipynb
Preprocessing notebook: clean PhishTank dump, extract domains and safe flags, and extract URLs/senders from the Enron mail corpus (if available).

**Safety note:** this notebook does NOT fetch or visit any URLs. It only parses text and extracts features. Do not add code that performs live HTTP requests without sandboxing.

In [3]:
# 1. Environment (use inside notebook)
%pip install --quiet pandas tldextract python-whois ipaddress
import sys
print('python', sys.version.splitlines()[0])

Note: you may need to restart the kernel to use updated packages.
python 3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]


In [4]:
# 2. Imports and helpers
import os
import re
import csv
import pandas as pd
import numpy as np
import tldextract
import ipaddress
from datetime import datetime

# fix: escape the double-quote inside the regex string so the string literal is valid
URL_PATTERN = re.compile(r"https?://[^\s'\"<>]+", flags=re.IGNORECASE)

def extract_urls_from_text(text):
    if pd.isna(text):
        return []
    return URL_PATTERN.findall(str(text))

def extract_domain(url):
    # returns domain.suffix or empty string for invalid input
    try:
        ext = tldextract.extract(str(url))
        if ext.suffix:
            return f"{ext.domain}.{ext.suffix}"
        return ext.domain or ''
    except Exception:
        return ''

def is_ip_host(url):
    try:
        host = re.findall(r"https?://([^/]+)/?", str(url))
        if not host:
            return False
        h = host[0]  # may include port
        h = h.split(':')[0]
        ipaddress.ip_address(h)
        return True
    except Exception:
        return False

In [5]:
# 3. Load and clean PhishTank CSV -> produce data/processed/phish_url_summary.csv
RAW_PHISH = os.path.join('data','raw','verified_online.csv')
OUT_DIR = os.path.join('data','processed')
os.makedirs(OUT_DIR, exist_ok=True)
PHISH_SUMMARY_OUT = os.path.join(OUT_DIR,'phish_url_summary.csv')

def process_phishtank(raw_path=RAW_PHISH, out_path=PHISH_SUMMARY_OUT):
    if not os.path.exists(raw_path):
        print(f'Raw PhishTank file not found: {raw_path}')
        return pd.DataFrame(columns=['phish_id','url','domain','phish_detail_url','submission_time','verified','verification_time','online','target'])
    # read with pandas, keep relevant columns if present
    df = pd.read_csv(raw_path, dtype=str, keep_default_na=False)
    # normalize column names (in case header differs)
    expected = ['phish_id','url','phish_detail_url','submission_time','verified','verification_time','online','target']
    cols = [c for c in expected if c in df.columns]
    df = df[cols].copy()
    # ensure url column exists
    if 'url' not in df.columns:
        # try common alternatives
        candidates = [c for c in df.columns if 'url' in c.lower()]
        if candidates:
            df = df.rename(columns={candidates[0]:'url'})
        else:
            print('No url column found in PhishTank file')
            return pd.DataFrame(columns=['phish_id','url','domain','phish_detail_url','submission_time','verified','verification_time','online','target'])
    df['url'] = df['url'].astype(str)
    # extract domain and flags
    df['domain'] = df['url'].apply(extract_domain)
    df['is_ip'] = df['url'].apply(is_ip_host)
    # reduce to canonical columns
    out_cols = [c for c in ['phish_id','url','domain','is_ip','phish_detail_url','submission_time','verified','verification_time','online','target'] if c in df.columns or c in ['is_ip','domain']]
    df_out = df.reindex(columns=out_cols)
    df_out.to_csv(out_path, index=False)
    print(f'Wrote processed PhishTank summary to {out_path} with shape {df_out.shape}')
    return df_out

# run processing now (safe: no network calls)
phish_summary = process_phishtank()
phish_summary.head()

Raw PhishTank file not found: data/raw/verified_online.csv


Unnamed: 0,phish_id,url,domain,phish_detail_url,submission_time,verified,verification_time,online,target


In [6]:
# 4. Extract URLs from Enron maildir (if available) and save senders & urls
ENRON_DIR = os.path.join('data','raw','enron_emails')
ENRON_URLS_OUT = os.path.join(OUT_DIR,'enron_urls.csv')
ENRON_SENDERS_OUT = os.path.join(OUT_DIR,'enron_senders.csv')

def extract_enron(enron_dir=ENRON_DIR, urls_out=ENRON_URLS_OUT, senders_out=ENRON_SENDERS_OUT):
    if not os.path.exists(enron_dir):
        print(f'Enron maildir not found: {enron_dir}')
        return pd.DataFrame(columns=['url']), pd.DataFrame(columns=['sender'])
    rows = []
    senders = []
    for root, _, files in os.walk(enron_dir):
        for fname in files:
            path = os.path.join(root, fname)
            try:
                with open(path, 'r', errors='ignore') as f:
                    text = f.read()
                    urls = extract_urls_from_text(text)
                    for u in urls:
                        rows.append({'url': u})
                    # try to get From header if present
                    m = re.search(r'^From:*(.+)$', text, flags=re.IGNORECASE|re.MULTILINE)
                    if m:
                        senders.append({'sender': m.group(1).strip()})
            except Exception:
                continue
    df_urls = pd.DataFrame(rows).drop_duplicates().reset_index(drop=True)
    df_senders = pd.DataFrame(senders).drop_duplicates().reset_index(drop=True)
    if not df_urls.empty:
        df_urls.to_csv(urls_out, index=False)
        print(f'Wrote {len(df_urls)} extracted Enron URLs to {urls_out}')
    else:
        print('No URLs extracted from Enron corpus (or corpus missing)')
    if not df_senders.empty:
        df_senders.to_csv(senders_out, index=False)
        print(f'Wrote {len(df_senders)} Enron senders to {senders_out}')
    return df_urls, df_senders

enron_urls, enron_senders = extract_enron()

Enron maildir not found: data/raw/enron_emails


In [7]:
# 5. Quick checks and simple unit-style assertions
# ensure output files exist when dataframes non-empty
if not phish_summary.empty:
    assert os.path.exists(PHISH_SUMMARY_OUT), 'Processed phish summary missing'
    print('Phish summary OK:', phish_summary.shape)
else:
    print('Phish summary is empty (no raw data found or file empty)')

if 'enron_urls' in globals() and not enron_urls.empty:
    assert os.path.exists(ENRON_URLS_OUT), 'Enron URLs output missing'
    print('Enron URLs OK:', enron_urls.shape)
else:
    print('Enron URLs empty or not extracted')

Phish summary is empty (no raw data found or file empty)
Enron URLs empty or not extracted


## Notes and next steps
- This notebook creates `data/processed/phish_url_summary.csv` with columns `phish_id,url,domain,is_ip,...`.
- It does NOT fetch or validate live URLs. For WHOIS or hosting features, add optional cells but run them in an isolated environment.
- Use `phish_summary` as positive training examples and `enron_urls` as candidate negatives (verify and dedupe).

## CI / test snippet
Use this command in CI to run tests for the notebook's helper functions (example):
```
pip install -r requirements.txt && pytest -q
```