# In [None]:
# 02-feature-engineering.ipynb
# Feature Engineering: URL & Metadata

"""
## 02 - Feature Engineering

This notebook develops core features for phishing detection:

1. Load summary CSVs from Notebook 01 (phish_url_summary, enron_senders).
2. Extract URLs from email bodies and build unified DataFrame.
3. Compute lexical URL features (length, special chars, token counts).
4. Parse domain information using tldextract and WHOIS (domain age).
5. Identify IP-based URLs and subdomain counts.
6. Merge sender-domain mismatch flags.
7. Save engineered features to CSV for model input.
"""

#%%
# 1. Imports and Config
import os
import re
import pandas as pd
import numpy as np
import tldextract
import whois
from datetime import datetime

from utils import extract_urls

# File paths
# All inputs and the output of this notebook live in the same processed-data folder.
_PROCESSED_DIR = os.path.join('..', 'data', 'processed')

PHISH_SUMMARY = os.path.join(_PROCESSED_DIR, 'phish_url_summary.csv')      # input: phishing URL summary
ENRON_SENDERS = os.path.join(_PROCESSED_DIR, 'enron_senders.csv')          # input: Enron sender summary
OUTPUT_FEATURES = os.path.join(_PROCESSED_DIR, 'phishing_features.csv')    # output: engineered features

#%%
# 2. Load Summaries
# Read both summary CSVs in one pass; the order of the paths fixes which
# frame lands in which name.
phish_df, enron_senders = (
    pd.read_csv(csv_path) for csv_path in (PHISH_SUMMARY, ENRON_SENDERS)
)

#%%
# 3. Expand URLs (if multiple per row) - demonstration on phishing URLs
# Work on an independent (deep) copy so phish_df stays untouched by the
# feature columns added below.
expanded = phish_df.copy(deep=True)
# Assume one URL per row; skip if multiple. For emails, use extract_urls on bodies.

#%%
# 4. Lexical features
# Character-count features computed on the raw URL string.
expanded['num_dots'] = expanded['url'].apply(lambda u: u.count('.'))
expanded['num_hyphens'] = expanded['url'].apply(lambda u: u.count('-'))
expanded['num_underscores'] = expanded['url'].apply(lambda u: u.count('_'))
expanded['num_qm'] = expanded['url'].apply(lambda u: u.count('?'))
# 1 when the URL contains '@', else 0.
expanded['has_at'] = expanded['url'].apply(lambda u: 1 if '@' in u else 0)

# Length of everything after the scheme and authority (path + query + fragment).
# The authority is matched as "anything up to the first '/', '?' or '#'" so that
# hyphens, ports and userinfo in the host are stripped as well; a [\w\.]+ class
# stops at the first '-' or ':' and would count the rest of the hostname as path.
# The pattern is anchored so that URLs embedded later in the query string
# (e.g. redirect targets) are left in place and still contribute to the length.
_AUTHORITY_PREFIX = re.compile(r"^https?://[^/?#]+", re.IGNORECASE)
expanded['path_length'] = expanded['url'].apply(lambda u: len(_AUTHORITY_PREFIX.sub('', u)))

#%%
# 5. Domain parsing and age

def get_domain_info(url):
    """Return ``<domain>.<suffix>`` for *url* as split by tldextract.

    When tldextract reports an empty suffix, only the ``domain`` part is
    returned.
    """
    parts = tldextract.extract(url)
    if not parts.suffix:
        return parts.domain
    return ".".join((parts.domain, parts.suffix))

# One domain string per row, derived from that row's URL.
expanded['domain'] = expanded['url'].map(get_domain_info)

# WHOIS-based age
def calc_domain_age(domain):
    """Return the age of *domain* in whole days based on its WHOIS creation date.

    Args:
        domain: Domain name to look up (e.g. ``"example.com"``).

    Returns:
        Number of days between the WHOIS creation date and now, or ``np.nan``
        when the domain is empty / not a string, the lookup fails, or the
        record carries no usable creation date.
    """
    if not isinstance(domain, str) or not domain:
        return np.nan

    # Best-effort lookup: any library or network failure maps to NaN.
    # ``except Exception`` (rather than a bare ``except``) lets
    # KeyboardInterrupt through, so a long per-row run can still be stopped.
    try:
        created = whois.whois(domain).creation_date
    except Exception:
        return np.nan

    # Some registries report several creation dates; use the first real datetime.
    if isinstance(created, (list, tuple)):
        created = next((d for d in created if isinstance(d, datetime)), None)
    if not isinstance(created, datetime):
        return np.nan

    # Match the creation date's timezone-awareness: subtracting an aware
    # datetime from a naive one raises TypeError.
    return (datetime.now(created.tzinfo) - created).days

# WHOIS lookups are slow network calls, so resolve each distinct domain once
# and map the results back onto the rows instead of querying once per row.
_age_by_domain = {
    domain: calc_domain_age(domain)
    for domain in expanded['domain'].dropna().unique()
}
expanded['domain_age_days'] = expanded['domain'].map(_age_by_domain)

#%%
# 6. IP-based URL and subdomain count
import ipaddress

def has_ip(url):
    """Return 1 if the host part of *url* is a literal IP address, else 0.

    The host is taken from ``urllib.parse.urlsplit(...).hostname``, which
    drops userinfo, the port and IPv6 brackets, so ``http://1.2.3.4:8080/x``,
    ``http://user@1.2.3.4/`` and ``http://[::1]/`` are all recognised.
    URLs written without a scheme (``1.2.3.4/login``) are handled too.

    Args:
        url: URL string to inspect. Non-string values yield 0.

    Returns:
        1 when the host parses as an IPv4 or IPv6 address, otherwise 0.
    """
    from urllib.parse import urlsplit

    if not isinstance(url, str):
        return 0

    candidate = url.strip()
    # Without a scheme urlsplit treats the whole string as a path; a leading
    # '//' makes it parse the first component as the network location.
    if '://' not in candidate:
        candidate = '//' + candidate

    try:
        host = urlsplit(candidate).hostname
        if not host:
            return 0
        ipaddress.ip_address(host)
    except ValueError:
        # Raised by urlsplit for malformed bracketed hosts and by
        # ip_address for anything that is not a literal IP.
        return 0
    return 1

expanded['has_ip'] = expanded['url'].apply(has_ip)


def _count_subdomains(url):
    """Return the number of subdomain labels in *url*'s host (0 if none)."""
    # Use tldextract's split so that dots in the path/query (e.g. "index.html"),
    # multi-part suffixes (e.g. "co.uk") and the registered domain itself are
    # not counted as subdomains.
    subdomain = tldextract.extract(url).subdomain
    return len(subdomain.split('.')) if subdomain else 0


expanded['subdomain_count'] = expanded['url'].apply(_count_subdomains)

#%%
# 7. Sender-domain mismatch flag
# Sender information is not available for the phishing URL rows used here, so the
# mismatch cannot be computed yet: sender_domain is set to NaN and the flag defaults to 0.
# Load email-level data and recompute both columns once it is available.
# Placeholder values for the sender-related columns; the mismatch flag is
# to be computed when email data is available.
for _column, _placeholder in (
    ('sender_domain', np.nan),
    ('sender_domain_mismatch', 0),
):
    expanded[_column] = _placeholder

#%%
# 8. Label and save
# Every row in this frame is a phishing sample, so the label is a constant 1.
expanded = expanded.assign(label=1)

# Write the feature table without the index column and report what was saved.
expanded.to_csv(path_or_buf=OUTPUT_FEATURES, index=False)
print(f"Saved features to {OUTPUT_FEATURES} with shape {expanded.shape}")
