In [1]:
# 02-feature-engineering.ipynb
# Feature Engineering: URL & Metadata

# Ensure required packages are available in the notebook environment
# Use the notebook magic to install missing packages if needed.
# This is necessary to avoid ModuleNotFoundError in interactive sessions.
%pip install --quiet pandas numpy tldextract python-whois

"""
## 02 - Feature Engineering

This notebook develops core features for phishing detection:

1. Load summary CSVs from Notebook 01 (phish_url_summary, enron_senders).
2. Extract URLs from email bodies and build unified DataFrame.
3. Compute lexical URL features (length, special chars, token counts).
4. Parse domain information using tldextract and WHOIS (domain age).
5. Identify IP-based URLs and subdomain counts.
6. Merge sender-domain mismatch flags.
7. Save engineered features to CSV for model input.
"""

#%%
# 1. Imports and Config
import os
import re
import logging
import pandas as pd
import numpy as np
import tldextract
import whois
from datetime import datetime

# Notebook options
USE_WHOIS = False  # set True to enable network WHOIS lookups (may be slow)
WHOIS_CACHE = os.path.join('..','data','processed','whois_cache.csv')

# Reduce verbosity from whois library
logging.getLogger('whois.whois').setLevel(logging.ERROR)

# Simple cache helpers
def load_whois_cache(path):
    if os.path.exists(path):
        try:
            return pd.read_csv(path, index_col=0)
        except Exception:
            return pd.DataFrame(columns=['domain','age_days']).set_index('domain')
    return pd.DataFrame(columns=['domain','age_days']).set_index('domain')

def save_whois_cache(df, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path)

whois_cache = load_whois_cache(WHOIS_CACHE)

# File paths
PHISH_SUMMARY = os.path.join('..','data','processed','phish_url_summary.csv')
ENRON_SENDERS = os.path.join('..','data','processed','enron_senders.csv')
OUTPUT_FEATURES = os.path.join('..','data','processed','phishing_features.csv')

#%%
# 2. Load Summaries (robust)
def load_summary(processed_path, raw_path, key_col='url'):
    """Try to load a processed summary CSV, fall back to a raw CSV if present."""
    import os
    if os.path.exists(processed_path):
        try:
            df = pd.read_csv(processed_path)
            print(f'Loaded processed summary: {processed_path} with shape {df.shape}')
            return df
        except Exception as e:
            print(f'Failed to read processed file {processed_path}: {e}')
    # processed missing or unreadable -> try raw
    if os.path.exists(raw_path) and os.path.getsize(raw_path) > 0:
        try:
            df = pd.read_csv(raw_path, names=[key_col], header=0) if os.path.getsize(raw_path) > 0 else pd.DataFrame(columns=[key_col])
            print(f'Loaded raw file: {raw_path} with shape {df.shape}')
            return df
        except Exception as e:
            print(f'Failed to read raw file {raw_path}: {e}')
    # Last resort: return empty dataframe with expected columns
    print('No summary files found or files empty — creating empty DataFrame with `url` column')
    return pd.DataFrame(columns=[key_col])

phish_df = load_summary(PHISH_SUMMARY, os.path.join('..','data','raw','phishtank_urls.csv'))
enron_senders = load_summary(ENRON_SENDERS, os.path.join('..','data','processed','enron_senders.csv'), key_col='sender')

#%%
# 3. Expand URLs (if multiple per row) - demonstration on phishing URLs
# Ensure expected column exists and normalize
if 'url' not in phish_df.columns:
    # If the raw phishtank file used a different column name, try to find a URL-like column
    url_cols = [c for c in phish_df.columns if 'url' in c.lower() or 'link' in c.lower()]
    if url_cols:
        phish_df = phish_df.rename(columns={url_cols[0]: 'url'})
    else:
        phish_df['url'] = None

expanded = phish_df.copy()
# Coerce URL column to string to avoid apply errors on NaN
expanded['url'] = expanded['url'].astype('string')
# Assume one URL per row; for emails, use extract_urls on bodies.

#%%
# 4. Lexical features
expanded['num_dots'] = expanded['url'].apply(lambda u: u.count('.') if pd.notna(u) else 0)
expanded['num_hyphens'] = expanded['url'].apply(lambda u: u.count('-') if pd.notna(u) else 0)
expanded['num_underscores'] = expanded['url'].apply(lambda u: u.count('_') if pd.notna(u) else 0)
expanded['num_qm'] = expanded['url'].apply(lambda u: u.count('?') if pd.notna(u) else 0)
expanded['has_at'] = expanded['url'].apply(lambda u: 1 if (pd.notna(u) and '@' in u) else 0)
expanded['path_length'] = expanded['url'].apply(lambda u: len(re.sub(r"https?://[\\w\\.]+", '', u)) if pd.notna(u) else 0)

#%%
# 5. Domain parsing and age

def get_domain_info(url):
    if pd.isna(url):
        return None
    ext = tldextract.extract(str(url))
    domain = f"{ext.domain}.{ext.suffix}" if ext.suffix else ext.domain
    return domain

expanded['domain'] = expanded['url'].apply(get_domain_info)

# WHOIS-based age with cache and optional network
from time import sleep

def calc_domain_age(domain):
    if domain is None or pd.isna(domain):
        return np.nan
    # check cache first
    if domain in whois_cache.index:
        return float(whois_cache.loc[domain,'age_days'])
    if not USE_WHOIS:
        return np.nan
    try:
        info = whois.whois(domain)
        date = info.creation_date
        if isinstance(date, list):
            date = date[0]
        if date is None:
            age = np.nan
        else:
            age = (datetime.now() - date).days
        # update cache
        whois_cache.loc[domain] = [age]
        # be polite to WHOIS servers
        sleep(0.5)
        return age
    except Exception:
        return np.nan

# compute domain_age_days
expanded['domain_age_days'] = expanded['domain'].apply(calc_domain_age)

# persist cache
save_whois_cache(whois_cache, WHOIS_CACHE)

#%%
# 6. IP-based URL and subdomain count
import ipaddress

def has_ip(url):
    if pd.isna(url):
        return 0
    try:
        host = re.findall(r"https?://([^/]+)/?", str(url))[0]
        # strip possible port
        host = host.split(':')[0]
        ipaddress.ip_address(host)
        return 1
    except Exception:
        return 0

expanded['has_ip'] = expanded['url'].apply(has_ip)
expanded['subdomain_count'] = expanded['url'].apply(lambda u: u.split('://')[-1].count('.') - 1 if pd.notna(u) else 0)

#%%
# 7. Sender-domain mismatch flag
# Load email-level data if needed; here we simulate for phishing: assume sender domain known
# For phishing URLs, label sender_domain as NaN; feature = 0
expanded['sender_domain'] = np.nan
expanded['sender_domain_mismatch'] = 0  # to be computed when email data available

#%%
# 8. Label and save
expanded['label'] = 1  # phishing
# Ensure output directory exists
out_dir = os.path.dirname(OUTPUT_FEATURES)
os.makedirs(out_dir, exist_ok=True)
expanded.to_csv(OUTPUT_FEATURES, index=False)
print(f"Saved features to {OUTPUT_FEATURES} with shape {expanded.shape}")

#%% [markdown]
# --- Benign (Enron) Feature Engineering ---

#%%

# 1. Load Enron senders/emails (already loaded as enron_senders)
# If not loaded, uncomment:
# ENRON_SENDERS = os.path.join('..','data','processed','enron_senders.csv')
# enron_senders = pd.read_csv(ENRON_SENDERS)

# 2. Extract URLs from email bodies
def extract_urls(text):
    if pd.isna(text):
        return []
    return re.findall(r'https?://[^\s]+', str(text))

enron_senders['urls'] = enron_senders['body'].apply(extract_urls)

# 3. Expand rows for each URL
enron_expanded = enron_senders.explode('urls').reset_index(drop=True)
enron_expanded = enron_expanded[enron_expanded['urls'].notna()]
enron_expanded = enron_expanded.rename(columns={'urls': 'url'})

# 4. Feature engineering (match phishing features)
enron_expanded['num_dots'] = enron_expanded['url'].apply(lambda u: u.count('.') if pd.notna(u) else 0)
enron_expanded['num_hyphens'] = enron_expanded['url'].apply(lambda u: u.count('-') if pd.notna(u) else 0)
enron_expanded['num_underscores'] = enron_expanded['url'].apply(lambda u: u.count('_') if pd.notna(u) else 0)
enron_expanded['num_qm'] = enron_expanded['url'].apply(lambda u: u.count('?') if pd.notna(u) else 0)
enron_expanded['has_at'] = enron_expanded['url'].apply(lambda u: 1 if (pd.notna(u) and '@' in u) else 0)
enron_expanded['path_length'] = enron_expanded['url'].apply(lambda u: len(re.sub(r"https?://[\w\.]+", '', u)) if pd.notna(u) else 0)

def get_domain_info(url):
    if pd.isna(url):
        return None
    ext = tldextract.extract(str(url))
    domain = f"{ext.domain}.{ext.suffix}" if ext.suffix else ext.domain
    return domain

enron_expanded['domain'] = enron_expanded['url'].apply(get_domain_info)

def has_ip(url):
    if pd.isna(url):
        return 0
    try:
        host = re.findall(r"https?://([^/]+)/?", str(url))[0]
        host = host.split(':')[0]
        ipaddress.ip_address(host)
        return 1
    except Exception:
        return 0

enron_expanded['has_ip'] = enron_expanded['url'].apply(has_ip)
enron_expanded['subdomain_count'] = enron_expanded['url'].apply(lambda u: u.split('://')[-1].count('.') - 1 if pd.notna(u) else 0)

enron_expanded['sender_domain'] = enron_expanded['sender'].apply(lambda s: s.split('@')[-1] if pd.notna(s) and '@' in s else np.nan)
enron_expanded['sender_domain_mismatch'] = enron_expanded.apply(
    lambda row: int(row['sender_domain'] != row['domain']) if pd.notna(row['sender_domain']) and pd.notna(row['domain']) else 0,
    axis=1
)
enron_expanded['domain_age_days'] = np.nan  # or use WHOIS if you want

enron_expanded['label'] = 0  # benign

# 5. Save benign features
BENIGN_FEATURES = os.path.join('..','data','processed','benign_features.csv')
enron_expanded.to_csv(BENIGN_FEATURES, index=False)
print(f"Saved benign features to {BENIGN_FEATURES} with shape {enron_expanded.shape}")

#%%

# 6. Combine phishing and benign features for modeling
PHISHING_FEATURES = os.path.join('..','data','processed','phishing_features.csv')
benign_features = pd.read_csv(BENIGN_FEATURES)
phishing_features = pd.read_csv(PHISHING_FEATURES)
all_features = pd.concat([phishing_features, benign_features], ignore_index=True)
ALL_FEATURES = os.path.join('..','data','processed','phishing_graph_features.csv')
all_features.to_csv(ALL_FEATURES, index=False)
print(f"Combined phishing and benign features saved to {ALL_FEATURES} with shape {all_features.shape}")

Note: you may need to restart the kernel to use updated packages.
Loaded processed summary: ../data/processed/phish_url_summary.csv with shape (53657, 2)
Loaded processed summary: ../data/processed/enron_senders.csv with shape (1000, 14)
Loaded processed summary: ../data/processed/phish_url_summary.csv with shape (53657, 2)
Loaded processed summary: ../data/processed/enron_senders.csv with shape (1000, 14)
Saved features to ../data/processed/phishing_features.csv with shape (53657, 15)
Saved features to ../data/processed/phishing_features.csv with shape (53657, 15)


KeyError: 'body'