# In [None]:
# 03-graph-visuals.ipynb
# Graph-Based Feature Engineering with NetworkX

"""
## 03 - Graph Feature Engineering

This notebook:
1. Loads `phishing_features.csv` (phishing URLs & basic features).
2. Builds a directed graph with edges sender → URL → domain.
3. Computes graph metrics for every node (degree, betweenness, clustering, PageRank).
4. Visualizes the distributions of the per-URL metric values.
5. Merges graph metrics into feature table and saves `phishing_graph_features.csv`.
"""

#%%
# 1. Imports and Config
import os
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Paths: the input table and the enriched output share one processed-data folder.
_PROCESSED_DIR = os.path.join('..', 'data', 'processed')
FEATURES_CSV = os.path.join(_PROCESSED_DIR, 'phishing_features.csv')
OUTPUT_GRAPH = os.path.join(_PROCESSED_DIR, 'phishing_graph_features.csv')

#%%
# 2. Load features
# Read the feature table from FEATURES_CSV and report its dimensions.
df = pd.read_csv(filepath_or_buffer=FEATURES_CSV)
print(f"Loaded {df.shape[0]} rows and {df.shape[1]} columns")

#%%
# 3. Build the directed graph: sender → URL → domain
# Every row contributes a URL node, a domain node and (when a sender is known)
# a sender node, linked by the edges sender → URL and URL → domain.
#
# Missing CSV values are loaded by pandas as NaN, and NaN is truthy, so a plain
# `if sender:` check would add NaN as a node. Missing values are therefore
# tested with pd.isna / pd.notna before anything is added to the graph.
#
# NOTE(review): nodes are keyed by the raw string, so a sender domain that is
# equal to a URL's domain resolves to the same node and the last `type` written
# wins — confirm whether that merging is intended.
G = nx.DiGraph()

# The sender column is optional; without it no row has a sender.
if 'sender_domain' in df.columns:
    sender_values = df['sender_domain']
else:
    sender_values = [None] * len(df)

for url, domain, sender in zip(df['url'], df['domain'], sender_values):
    # A row without a URL has no node to attach metrics to later on.
    if pd.isna(url):
        continue
    has_domain = pd.notna(domain)

    # Add nodes
    G.add_node(url, type='url')
    if has_domain:
        G.add_node(domain, type='domain')
    # Sender → URL (skipped for missing or empty senders)
    if pd.notna(sender) and sender:
        G.add_node(sender, type='sender')
        G.add_edge(sender, url)
    # URL → domain
    if has_domain:
        G.add_edge(url, domain)

print(f"Graph nodes: {len(G.nodes())}, edges: {len(G.edges())}")

#%%
# 4. Compute graph metrics
# Each result is a dict keyed by node, consumed when metrics are merged back
# into the feature table.

# Raw degree (incoming + outgoing edges) per node. This is a plain edge count,
# not the normalized degree centrality.
degree_dict = dict(G.degree())

# Betweenness centrality, approximated from a fixed-seed sample of source nodes
# for speed. Asking for more samples than the graph has nodes raises
# ValueError, so graphs at or below the sample size use the exact computation
# (k=None) instead.
BETWEENNESS_SAMPLES = 100
n_nodes = G.number_of_nodes()
between_dict = nx.betweenness_centrality(
    G,
    k=BETWEENNESS_SAMPLES if n_nodes > BETWEENNESS_SAMPLES else None,
    seed=42,
)

# PageRank on the directed graph (library defaults).
pagerank_dict = nx.pagerank(G)

# Clustering coefficient, computed on the undirected version of the graph.
# NOTE(review): edges only run sender → URL and URL → domain, so triangles can
# only appear when one string plays more than one role; this metric is likely
# zero for most nodes — confirm it is worth keeping as a feature.
clust = nx.clustering(G.to_undirected())

#%%
# 5. Assign metrics back to DataFrame
# Each output column is filled by looking up the row's URL in the matching
# per-node metric dict; URLs absent from a dict come out as NaN.
metric_columns = {
    'graph_degree': degree_dict,
    'graph_betweenness': between_dict,
    'graph_pagerank': pagerank_dict,
    'graph_clustering': clust,
}
for column_name, node_metric in metric_columns.items():
    df[column_name] = df['url'].map(node_metric)

#%%
# 6. Visualize distributions
# One 50-bin histogram per graph metric, laid out row by row on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
panels = [
    ('graph_degree', 'Degree'),
    ('graph_betweenness', 'Betweenness'),
    ('graph_pagerank', 'PageRank'),
    ('graph_clustering', 'Clustering'),
]
for ax, (column_name, title) in zip(axes.flat, panels):
    df[column_name].hist(bins=50, ax=ax)
    ax.set_title(title)
plt.tight_layout()
plt.show()

#%%
# 7. Save enriched features
# Writes the full table (original columns plus the graph_* columns), without
# the DataFrame index, to OUTPUT_GRAPH.
df.to_csv(path_or_buf=OUTPUT_GRAPH, index=False)
print(f"Saved graph-enhanced features to {OUTPUT_GRAPH}")
