In [None]:
# Install dependencies (Colab) - run once
%pip install -q tensorflow==2.13.0 keras==2.13.1 torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cpu
%pip install -q torch_geometric -f https://data.pyg.org/whl/torch-2.3.1+cpu.html
%pip install -q python-dotenv requests matplotlib seaborn scikit-learn pandas tldextract google-cloud-firestore
print('Finished pip installs')

# If a single command fails, re-run this cell -- Colab will show the failing line.


In [None]:
# Minimal imports and deterministic setup (small cell so it can be re-run after failures)
import os, json, random
import numpy as np
import pandas as pd
import tensorflow as tf

# Seed every RNG used by the notebook; TRAIN_SEED overrides the default of 42.
SEED = int(os.environ.get('TRAIN_SEED', '42'))
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
try:
    tf.config.experimental.enable_op_determinism()
except Exception as e:
    # Best effort: keep going, but say so -- without this call results may differ between runs.
    print('Warning: TensorFlow op determinism could not be enabled:', e)

print('Imports done, seed=', SEED)


In [None]:
# Thesis training stats printer
# Tries to load common stats JSONs, otherwise falls back to parsing the Pre_Train_and_Fine_Tune_AE_+_GCN.ipynb notebook
import re
import os
import json

def print_thesis_stats(nb_path='Pre_Train_and_Fine_Tune_AE_+_GCN.ipynb'):
    """Print training statistics from the first readable stats JSON, else from a notebook.

    The candidate JSON files are tried in order in the current working directory; the
    first one that parses is pretty-printed and the function returns. If none can be
    loaded, the notebook at ``nb_path`` is read as JSON and the text of every cell
    (source and saved outputs) is scanned for ``<metric> : <number>`` /
    ``<metric> = <number>`` patterns, which are printed one per line.

    Args:
        nb_path: Path of the ``.ipynb`` file used as a fallback source of metrics.

    Returns:
        None. All results are written to stdout.
    """
    # Each file is listed once so a broken file is reported a single time.
    candidates = ['ae_gcn_training_stats.json', 'training_stats.json', 'training_provenance.json']
    for c in candidates:
        if not os.path.exists(c):
            continue
        try:
            with open(c, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            print('Failed to load', c, e)
            continue
        print(f"Loaded stats from {c}:")
        print(json.dumps(data, indent=2))
        return

    # fallback: parse the notebook for numeric metrics
    if not os.path.exists(nb_path):
        print('No stats JSON or notebook found; run training cell first to generate stats')
        return

    def _joined(value):
        # nbformat stores multi-line text either as one string or as a list of lines
        if isinstance(value, str):
            return value
        if isinstance(value, (list, tuple)):
            return ''.join(str(v) for v in value)
        return ''

    def _cell_text(cell):
        # Printed metrics live in the saved outputs, not in the cell source, so scan both
        parts = [_joined(cell.get('source', []))]
        for out in cell.get('outputs', None) or []:
            if not isinstance(out, dict):
                continue
            parts.append(_joined(out.get('text', '')))
            data = out.get('data')
            if isinstance(data, dict):
                parts.append(_joined(data.get('text/plain', '')))
        return '\n'.join(p for p in parts if p)

    try:
        with open(nb_path, 'r', encoding='utf-8') as f:
            nb = json.load(f)
        text = '\n'.join(_cell_text(cell) for cell in nb.get('cells', []))
        patterns = [r'(?i)(ae\s*loss\s*[:=]\s*\d+\.?\d*)', r'(?i)(gcn\s*loss\s*[:=]\s*\d+\.?\d*)', r'(?i)(auc\s*[:=]\s*\d+\.?\d*)', r'(?i)(accuracy\s*[:=]\s*\d+\.?\d*)', r'(?i)(threshold\s*[:=]\s*\d+\.?\d*)', r'(?i)(final\s*loss\s*[:=]\s*\d+\.?\d*)']
        found = False
        for p in patterns:
            for m in re.findall(p, text):
                print(m)
                found = True
        if not found:
            print('No clear metric patterns found in', nb_path)
    except Exception as e:
        print('Failed to parse notebook', e)

# Call the helper with its default notebook path so that running this cell immediately prints
# whatever stats are currently available on disk
print_thesis_stats()


In [None]:
# Declare model type (hybrid AE + GCN) so subsequent cells can reference it
model_type = 'AE+GCN'
print('Using hybrid model:', model_type)

# Reporting convention: training cells should emit thesis-specific stats (losses, AUC, thresholds)
# either to a JSON file named 'ae_gcn_training_stats.json' or to the notebook output, so that the
# stats-printer cell above can pick them up when it is re-run after training.


In [None]:
# Dakugumen Colab: Reproducible pipeline for AE + GCN hybrid phishing detector
# This notebook installs dependencies, captures provenance, loads data, trains AE/GCN, trains a supervised
# classifier, evaluates, and packages artifacts for deployment. It is designed to match the thesis methods.

# ======= 0) Install (optional - run once) =======
# Uncomment to install (Colab) - may take a while.
# Note: TensorFlow/Keras come from PyPI; only torch/torchvision use the PyTorch CPU wheel index.
# !pip install -q tensorflow==2.13.0 keras==2.13.1
# !pip install -q torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cpu
# !pip install -q torch_geometric -f https://data.pyg.org/whl/torch-2.3.1+cpu.html
# !pip install -q python-dotenv requests matplotlib seaborn scikit-learn pandas tldextract google-cloud-firestore

# ======= 1) Imports & deterministic setup =======
import os, json, hashlib, subprocess, shutil, zipfile, io
from datetime import datetime
import random
import numpy as np
import pandas as pd
import tldextract
import matplotlib.pyplot as plt
import seaborn as sns

# ML frameworks
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split

# Report the versions of the interpreter that is actually running this kernel. Spawning
# `python --version` would query whatever `python` is first on PATH (possibly a different
# interpreter, or none at all, which would abort the cell).
import platform
print('Python, numpy, pandas, TF versions:')
print('python:', 'Python ' + platform.python_version())
print('numpy', np.__version__, 'pandas', pd.__version__, 'tensorflow', tf.__version__)

# Deterministic seed (TRAIN_SEED overrides the default of 42)
SEED = int(os.environ.get('TRAIN_SEED', '42'))
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
try:
    tf.config.experimental.enable_op_determinism()
except Exception as e:
    # Best effort: keep going, but say so -- without this call results may differ between runs.
    print('Warning: TensorFlow op determinism could not be enabled:', e)

# provenance helper
def save_provenance(extra=None, path='training_provenance.json'):
    """Write a JSON provenance record for the current run.

    The record contains the global ``SEED``, a UTC timestamp (ISO format with a trailing
    ``Z``), the output of ``pip freeze`` and the short git commit hash. ``pip_freeze`` and
    ``git_commit`` are ``None`` when the corresponding command cannot be run.

    Args:
        extra: Optional dict merged into the record (its keys override the defaults).
        path: Destination JSON file; overwritten if it exists.
    """
    import sys
    from datetime import timezone

    def _run(cmd):
        # Best-effort command capture: any failure (missing binary, non-zero exit) yields None
        try:
            return subprocess.check_output(cmd, stderr=subprocess.DEVNULL).decode()
        except Exception:
            return None

    # Timezone-aware "now" (datetime.utcnow() is deprecated); tzinfo is dropped only so the
    # serialized value keeps the original '<naive-iso>Z' format.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    prov = {'seed': SEED, 'timestamp': now_utc.isoformat() + 'Z'}
    # Use the kernel's own interpreter so the freeze describes the environment that trains the models
    prov['pip_freeze'] = _run([sys.executable or 'python', '-m', 'pip', 'freeze'])
    commit = _run(['git', 'rev-parse', '--short', 'HEAD'])
    prov['git_commit'] = commit.strip() if commit is not None else None
    if extra:
        prov.update(extra)
    with open(path, 'w') as f:
        json.dump(prov, f, indent=2)
    print('Saved provenance to', path)

# ======= 2) Data loading (public dataset) =======
# Public dataset used in thesis (example). Set the PUBLIC_CSV environment variable to use another
# file; the data-split cell reads the same variable, so both cells load the same data.
PUBLIC_CSV = os.environ.get('PUBLIC_CSV', 'https://raw.githubusercontent.com/GregaVrbancic/Phishing-Dataset/master/dataset_full.csv')
print('Downloading public dataset...')
df_base = pd.read_csv(PUBLIC_CSV)
print('Public dataset', df_base.shape)

# Save initial provenance (row count of the loaded dataset)
save_provenance({'public_rows': int(len(df_base))})

# ======= 3) Feature extraction & preprocessing (shared function) =======
from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode
import re

# Use same feature code as app.py
def normalize_url_for_features(raw_url: str) -> str:
    """Return ``raw_url`` with tracking query parameters removed.

    Parameters named ``fbclid`` or starting with ``utm_`` (case-insensitive) are dropped and
    the remaining query is re-encoded. URLs without a query string are simply re-assembled.
    A falsy input is treated as ``''``; if parsing raises, the input is returned as-is.
    """
    try:
        parsed = urlparse(raw_url or '')
        if not parsed.query:
            return urlunparse(parsed)
        kept = []
        for name, value in parse_qsl(parsed.query, keep_blank_values=True):
            lowered = name.lower()
            if lowered == 'fbclid' or lowered.startswith('utm_'):
                continue  # tracking parameter
            kept.append((name, value))
        return urlunparse(parsed._replace(query=urlencode(kept)))
    except Exception:
        return raw_url or ''

# Stabilize features (log/clip) - same as notebook and app
import math

def stabilize_features_map(feat_dict: dict):
    """Clip count features and log-compress selected features, in place.

    - Four URL character counts are converted to ``int`` and clamped to ``[0, cap]``
      (a missing key is treated as 0 and created).
    - ``url_shortened`` becomes ``1.0`` if truthy, else ``0.0``.
    - Selected length/time/network features, when present and not ``None``, are replaced by
      ``log1p(max(0, value))``; values that cannot be converted are left untouched.

    Returns the same dict object that was passed in.
    """
    count_caps = (
        ('qty_questionmark_url', 3),
        ('qty_slash_url', 10),
        ('qty_dot_url', 6),
        ('qty_hyphen_url', 10),
    )
    for name, cap in count_caps:
        count = int(feat_dict.get(name, 0))
        feat_dict[name] = min(max(count, 0), cap)

    feat_dict['url_shortened'] = 1.0 if feat_dict.get('url_shortened', 0) else 0.0

    log_scaled = (
        'file_length', 'directory_length', 'params_length', 'time_domain_activation',
        'time_domain_expiration', 'ttl_hostname', 'asn_ip', 'time_response',
    )
    for name in log_scaled:
        if name not in feat_dict:
            continue
        value = feat_dict[name]
        if value is None:
            continue
        try:
            # Negative values are floored at 0 before the log transform
            feat_dict[name] = float(math.log1p(max(0.0, float(value))))
        except Exception:
            pass
    return feat_dict

# Compute lexical features (subset - must match feature_cols ordering)
def compute_lexical(u: str):
    """Compute character-count features for a URL and stabilize them.

    The URL is first normalized with ``normalize_url_for_features``. Counts are taken over
    the whole URL and, approximately, over a "directory" and "file" part obtained by
    splitting the text that follows the host at its last ``/``. The resulting dict is passed
    through ``stabilize_features_map`` and returned. Key insertion order is fixed.
    """
    url = normalize_url_for_features(u)
    extracted = tldextract.extract(url)
    host = '.'.join(
        label for label in (extracted.subdomain, extracted.domain, extracted.suffix) if label
    )
    # Text after the first occurrence of the host (path + query); '' if the host is not found
    if host and host in url:
        remainder = url.split(host, 1)[-1]
    else:
        remainder = ''

    url_symbols = (
        ('dot', '.'), ('hyphen', '-'), ('underline', '_'), ('slash', '/'),
        ('questionmark', '?'), ('equal', '='), ('at', '@'), ('and', '&'),
        ('exclamation', '!'), ('space', ' '), ('tilde', '~'), ('comma', ','),
        ('plus', '+'), ('asterisk', '*'), ('hashtag', '#'), ('dollar', '$'),
        ('percent', '%'),
    )
    features = {f'qty_{name}_url': url.count(char) for name, char in url_symbols}
    features['length_url'] = len(url)

    # directory/file breakdown approx: split the remainder at its last slash
    if '/' in remainder:
        directory, filepart = remainder.rsplit('/', 1)
    else:
        directory, filepart = '', remainder

    segment_symbols = url_symbols[:8]  # dot .. and: the subset counted per segment
    for prefix, segment in (('directory', directory), ('file', filepart)):
        for name, char in segment_symbols:
            features[f'qty_{name}_{prefix}'] = segment.count(char)
        features[f'{prefix}_length'] = len(segment)

    # clip / log-scale the raw values
    return stabilize_features_map(features)

# Define feature_cols only if no earlier cell/run has already put it in the notebook globals
# NOTE(review): on a re-used kernel this keeps whatever feature_cols was defined before, even if
# df_base has changed; restart the kernel to force re-inference.
if 'feature_cols' not in globals():
    # infer feature columns from df_base: every column except the 'phishing' label
    feature_cols = [c for c in df_base.columns if c!='phishing']

# ======= Save feature names for audit =======
# Writes the column list as a JSON array to feature_names.json in the working directory
with open('feature_names.json','w') as f:
    json.dump(feature_cols, f)
print('Saved feature_names.json')

# The rest of the notebook contains cells to train AE/GCN and produce artifacts (already present in this notebook)
print('Preprocessing utilities ready. Continue to run remaining cells in the notebook to train models.')



In [None]:
# === Data split & scaler fit (idempotent) ===
# Loads public dataset, splits, and fits scaler on benign training set; saves scaler as scaler.pkl and scaler_final.pkl
import os, json, pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler

# Dataset location; the PUBLIC_CSV environment variable overrides the default URL
PUBLIC_CSV = os.environ.get(
    'PUBLIC_CSV',
    'https://raw.githubusercontent.com/GregaVrbancic/Phishing-Dataset/master/dataset_full.csv',
)
print('Loading public dataset from', PUBLIC_CSV)
df_public = pd.read_csv(PUBLIC_CSV)
print('Public data shape', df_public.shape)

if 'phishing' not in df_public.columns:
    raise RuntimeError('Public CSV must contain "phishing" label column')

# Features: every column except the label, as floats. Labels: integer array.
X = df_public.drop(columns=['phishing']).astype(float)
y = df_public['phishing'].astype(int).values
feature_cols = list(X.columns)

# Stratified splits: 30% held out for test, then 20% of the remainder for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.20, stratify=y_train, random_state=SEED
)
print('Data splits:', X_train.shape, X_val.shape, X_test.shape)

# Fit scaler on benign portion of training set (per thesis).
# STANDARD_SCALER=1/true/yes selects StandardScaler; anything else selects RobustScaler.
use_standard = os.environ.get('STANDARD_SCALER','0').lower() in ('1','true','yes')
scaler_cls = StandardScaler if use_standard else RobustScaler
scaler = scaler_cls().fit(X_train[y_train==0])
scaler_type = scaler_cls.__name__

# Apply the fitted scaler to every split
X_train_s, X_val_s, X_test_s = (scaler.transform(part) for part in (X_train, X_val, X_test))

# Save scaler under both file names for backend compatibility
for scaler_path in ('scaler.pkl', 'scaler_final.pkl'):
    with open(scaler_path,'wb') as f:
        pickle.dump(scaler, f)
print('Saved scaler.pkl and scaler_final.pkl (type=', scaler_type, ')')

# Persist feature names in the column order used for fitting
with open('feature_names.json','w') as f:
    json.dump(feature_cols, f)
print('Saved feature_names.json')


In [None]:
# === Autoencoder model definition and training (pretrain + fine-tune) ===
import os
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

# Number of input features = number of columns of the scaled training matrix from the split cell
input_dim = X_train_s.shape[1]
print('Building AE with input_dim=', input_dim)

def build_autoencoder(input_dim, latent_dim=None):
    """Build and compile a symmetric dense autoencoder.

    Architecture: input -> Dense(hidden, relu) -> Dense(latent, relu) -> Dense(hidden, relu)
    -> Dense(input_dim, linear), where ``hidden = max(64, input_dim // 2)``.

    Args:
        input_dim: Number of input (and reconstructed) features.
        latent_dim: Bottleneck width; defaults to ``max(8, input_dim // 4)`` when ``None``.

    Returns:
        A Keras model named ``'autoencoder'`` compiled with Adam (lr=1e-3) and MSE loss.
    """
    hidden_units = max(64, input_dim // 2)
    bottleneck_units = max(8, input_dim // 4) if latent_dim is None else latent_dim

    inputs = keras.Input(shape=(input_dim,))
    encoded = layers.Dense(hidden_units, activation='relu')(inputs)
    encoded = layers.Dense(bottleneck_units, activation='relu')(encoded)
    decoded = layers.Dense(hidden_units, activation='relu')(encoded)
    outputs = layers.Dense(input_dim, activation='linear')(decoded)

    autoencoder = keras.Model(inputs, outputs, name='autoencoder')
    autoencoder.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3), loss='mse')
    return autoencoder

ae = build_autoencoder(input_dim)

# Training schedule; each value can be overridden through its environment variable
epochs_pre = int(os.environ.get('AE_PRE_EPOCHS','10'))
epochs_fine = int(os.environ.get('AE_FINE_EPOCHS','5'))
batch_size = int(os.environ.get('AE_BATCH', '256'))

# Stage 1: pretrain on the whole scaled training split, validating on the validation split
print('Pretraining AE...')
try:
    hist_pre = ae.fit(
        X_train_s, X_train_s,
        validation_data=(X_val_s, X_val_s),
        epochs=epochs_pre,
        batch_size=batch_size,
        verbose=2,
    )
    print('Pretrain complete')
except Exception as e:
    print('Pretrain failed:', e)

# Stage 2: fine-tune on benign (label 0) training rows only, if there are any
try:
    benign_idx = np.where(y_train==0)[0]
    if len(benign_idx) == 0:
        print('No benign samples for fine-tuning; skipping')
    else:
        print('Fine-tuning on', len(benign_idx), 'benign samples')
        # Validate on benign validation rows when available, otherwise on the full validation split
        if any(y_val==0):
            val_pair = (X_val_s[y_val==0], X_val_s[y_val==0])
        else:
            val_pair = (X_val_s, X_val_s)
        hist_fine = ae.fit(
            X_train_s[benign_idx], X_train_s[benign_idx],
            validation_data=val_pair,
            epochs=epochs_fine,
            batch_size=batch_size,
            verbose=2,
        )
except Exception as e:
    print('Fine-tune failed:', e)

print('AE training finished')


In [None]:
# === Compute reconstruction threshold, save AE model and threshold ===
import numpy as np

# Compute reconstruction errors on validation benign set
import pickle

# Reconstruction errors are measured on benign validation rows (all validation rows if none are benign)
val_benign = X_val_s[y_val==0]
if len(val_benign) == 0:
    val_benign = X_val_s
recons = ae.predict(val_benign)
# Per-row mean squared reconstruction error
errors = np.mean(np.square(val_benign - recons), axis=1)
# Threshold = mean + 3 standard deviations of the validation errors
thr = float(errors.mean() + 3.0 * errors.std())
print('Computed AE threshold:', thr)

# Persist the model (best effort: a failure is reported and the cell continues)
try:
    ae.save('phishing_autoencoder_model.keras')
    print('Saved phishing_autoencoder_model.keras')
except Exception as e:
    print('Failed to save AE model:', e)

# Persist the threshold as plain text
with open('autoencoder_threshold.txt','w') as f:
    f.write(str(thr))
print('Saved autoencoder_threshold.txt')

# Re-create either scaler pickle only if it is missing from the working directory
for scaler_path in ('scaler.pkl', 'scaler_final.pkl'):
    if os.path.exists(scaler_path):
        continue
    with open(scaler_path,'wb') as f:
        pickle.dump(scaler, f)
print('Ensured scaler.pkl and scaler_final.pkl exist')


In [None]:
# === GCN artifacts creation & training (adapted from thesis notebook) ===
# Builds a heterogeneous graph from user reports (posts/domains/users), trains a small GCN to predict
# phishing probability per post node, and saves `gnn_probs.npy` and `post_node_map.json` for the backend.
import os, json
import numpy as np

import pandas as pd  # used for missing-value checks and for the scaler input frame

gnn_probs_path = 'gnn_probs.npy'
post_node_map_path = 'post_node_map.json'


def _write_neutral_artifacts():
    """Write placeholder artifacts: a single probability of 0.5 and an empty post map."""
    probs = np.full(1, 0.5)
    np.save(gnn_probs_path, probs)
    with open(post_node_map_path, 'w') as f:
        json.dump({}, f)
    print('Wrote neutral GCN artifacts')


def _first_present(row, names):
    """Return the first value of ``row`` under ``names`` that is not None, NaN or ''; else None."""
    for name in names:
        value = row.get(name)
        if value is None:
            continue
        if isinstance(value, str):
            if value == '':
                continue
            return value
        try:
            if pd.isna(value):
                continue
        except (TypeError, ValueError):
            pass  # non-scalar value: treat as present
        return value
    return None


def build_report_graph(reports):
    """Build the post/domain/user graph from a reports dataframe.

    Each row contributes a post node, a user node ('anon' when no user id is present) and,
    when a host can be extracted from the URL, a domain node. Post<->domain and post<->user
    edges are added in both directions. Rows with neither a post id nor a URL are skipped.
    Missing values (None/NaN/'') are treated as absent, so rows without an id do not collapse
    into a shared 'post:nan' node.

    Args:
        reports: DataFrame; columns read are 'post_id'/'postId'/'id', 'url'/'link',
            'userId'/'user_id'/'user' and optionally 'label' (0/1).

    Returns:
        (idx_to_key, edges, labels_by_post, url_by_post) where ``idx_to_key`` lists node keys
        by node index, ``edges`` is a list of (src, dst) index pairs, ``labels_by_post`` maps
        post keys to 0/1 and ``url_by_post`` maps post keys to the first URL seen for them.
    """
    from urllib.parse import urlparse

    def domain_of(url):
        try:
            parsed = urlparse(url)
            # Scheme-less URLs ('example.com/x') land in .path; keep only the host part
            host = parsed.netloc or parsed.path.split('/', 1)[0]
            return host.lower()
        except Exception:
            return ''

    idx_to_key = []
    key_to_idx = {}

    def add_node(key):
        if key not in key_to_idx:
            key_to_idx[key] = len(idx_to_key)
            idx_to_key.append(key)
        return key_to_idx[key]

    edges = []
    labels_by_post = {}
    url_by_post = {}

    for _, r in reports.iterrows():
        raw_post_id = _first_present(r, ('post_id', 'postId', 'id'))
        raw_url = _first_present(r, ('url', 'link'))
        raw_user = _first_present(r, ('userId', 'user_id', 'user'))
        post_id = '' if raw_post_id is None else str(raw_post_id)
        url = '' if raw_url is None else str(raw_url)
        user_id = 'anon' if raw_user is None else str(raw_user)
        label = r.get('label') if 'label' in r.index else None

        if not post_id and not url:
            continue

        post_key = f'post:{post_id}' if post_id else f'post_url:{url}'
        dom = domain_of(url)

        pidx = add_node(post_key)
        # Remember the URL now so node features never need a second pass over the dataframe
        if url and post_key not in url_by_post:
            url_by_post[post_key] = url
        if dom:
            didx = add_node(f'domain:{dom}')
            edges.append((pidx, didx))
            edges.append((didx, pidx))
        uidx = add_node(f'user:{user_id}')
        edges.append((uidx, pidx))
        edges.append((pidx, uidx))

        # record label if present and equal to 0 or 1 after int conversion
        try:
            if label is not None and int(label) in (0, 1):
                labels_by_post[post_key] = int(label)
        except (TypeError, ValueError):
            pass  # NaN / non-numeric label: leave the post unlabeled

    return idx_to_key, edges, labels_by_post, url_by_post


def collect_post_probs(idx_to_key, probs_all):
    """Select post-node probabilities and map each post to its row in the saved array.

    Only post nodes are written to ``gnn_probs.npy``, so the map must store positions within
    that array rather than graph node indices (which also count domain and user nodes).

    Args:
        idx_to_key: Node keys by node index.
        probs_all: Per-node probabilities, indexable by node index.

    Returns:
        (post_node_map, post_probs): ``post_node_map`` maps the post id (for 'post:ID' keys)
        or the full 'post_url:URL' key to an index into the list ``post_probs``.
    """
    post_node_map = {}
    post_probs = []
    for i, key in enumerate(idx_to_key):
        if key.startswith('post:'):
            map_key = key.split(':', 1)[1]
        elif key.startswith('post_url:'):
            map_key = key
        else:
            continue
        post_node_map[str(map_key)] = len(post_probs)
        post_probs.append(float(probs_all[i]))
    return post_node_map, post_probs


def _post_node_features(url, feat_dim):
    """Return the scaled lexical feature vector for ``url`` (length ``feat_dim``), or None on failure.

    Uses the notebook-level ``compute_lexical`` and ``scaler``. Features the scaler expects
    but ``compute_lexical`` does not produce are filled with 0.0 before scaling.
    """
    try:
        lexical = compute_lexical(url)
        cols_expected = list(getattr(scaler, 'feature_names_in_', []))
        if cols_expected:
            # Align to the column order the scaler was fitted with
            raw = pd.DataFrame([[float(lexical.get(c, 0.0)) for c in cols_expected]], columns=cols_expected)
        else:
            # fallback: trim/pad the lexical values to feat_dim
            values = np.array(list(lexical.values()), dtype=float)
            raw = np.zeros((1, feat_dim), dtype=float)
            n = min(values.size, feat_dim)
            raw[0, :n] = values[:n]
        vec = np.asarray(scaler.transform(raw), dtype=float).reshape(-1)
        return vec if vec.shape == (feat_dim,) else None
    except Exception:
        return None


try:
    # Prefer real training if user-provided reports dataframe exists
    if 'df_user' in globals() and not df_user.empty:
        print('Building graph from df_user with', len(df_user), 'rows')
        try:
            import torch
            import torch.nn.functional as F
            from torch import nn, optim
            from torch_geometric.data import Data
            from torch_geometric.nn import GCNConv
        except Exception as e:
            print('Required torch/torch_geometric not installed or failed to import:', e)
            raise  # the outer handler writes the neutral artifacts

        idx_to_key, edges, labels_by_post, url_by_post = build_report_graph(df_user)

        if len(idx_to_key) == 0:
            print('No graph nodes constructed from df_user; writing neutral artifacts')
            _write_neutral_artifacts()
        else:
            # Node features: scaled lexical features for post nodes with a URL, zeros otherwise
            num_nodes = len(idx_to_key)
            feat_dim = X_train_s.shape[1] if 'X_train_s' in globals() else len(feature_cols)
            x = np.zeros((num_nodes, feat_dim), dtype=float)
            y_nodes = np.zeros((num_nodes,), dtype=float)
            labeled_mask = np.zeros((num_nodes,), dtype=bool)

            can_featurize = 'scaler' in globals()
            n_failed = 0
            for i, key in enumerate(idx_to_key):
                url = url_by_post.get(key)  # only post nodes have an entry
                if url and can_featurize:
                    vec = _post_node_features(url, feat_dim)
                    if vec is None:
                        n_failed += 1
                    else:
                        x[i] = vec
                if key in labels_by_post:
                    y_nodes[i] = labels_by_post[key]
                    labeled_mask[i] = True
            if n_failed:
                print('Warning: feature extraction failed for', n_failed, 'post nodes; using zeros for them')

            # Build edge_index with shape (2, num_edges)
            if edges:
                edge_index = torch.tensor(np.array(edges, dtype=np.int64).T, dtype=torch.long).contiguous()
            else:
                edge_index = torch.empty((2,0), dtype=torch.long)

            # Train/test masks for labeled nodes
            labeled_idx = np.where(labeled_mask)[0]
            if labeled_idx.size == 0:
                print('No labeled nodes available for supervised GCN training; skipping training and writing neutral artifacts')
                _write_neutral_artifacts()
            else:
                # 80/20 split of the labeled nodes (at least one training node)
                rng = np.random.RandomState(SEED)
                perm = rng.permutation(labeled_idx)
                n_train = max(1, int(0.8 * len(perm)))
                train_idx = perm[:n_train]
                test_idx = perm[n_train:]

                train_mask = np.zeros((num_nodes,), dtype=bool)
                test_mask = np.zeros((num_nodes,), dtype=bool)
                train_mask[train_idx] = True
                test_mask[test_idx] = True

                # Build PyG Data
                data = Data(x=torch.tensor(x, dtype=torch.float), edge_index=edge_index, y=torch.tensor(y_nodes, dtype=torch.float), train_mask=torch.tensor(train_mask), test_mask=torch.tensor(test_mask))

                # Two-layer GCN producing one logit per node
                class GCN(nn.Module):
                    def __init__(self, num_features, num_classes=1):
                        super(GCN, self).__init__()
                        self.conv1 = GCNConv(num_features, 16)
                        self.conv2 = GCNConv(16, num_classes)
                    def forward(self, d):
                        x, ei = d.x, d.edge_index
                        x = F.relu(self.conv1(x, ei))
                        x = self.conv2(x, ei).squeeze(-1)
                        return x

                # Seed torch as well so weight initialisation is repeatable for a given SEED
                torch.manual_seed(SEED)

                # Instantiate and train
                num_features = x.shape[1]
                gnn_model = GCN(num_features=num_features)
                optimizer = optim.Adam(gnn_model.parameters(), lr=1e-2, weight_decay=5e-4)
                criterion = nn.BCEWithLogitsLoss()

                epochs = int(os.environ.get('GCN_EPOCHS', '50'))
                print('Training GCN for', epochs, 'epochs on', int(train_mask.sum()), 'train labeled nodes')
                for ep in range(epochs):
                    gnn_model.train()
                    optimizer.zero_grad()
                    out = gnn_model(data)
                    loss = criterion(out[data.train_mask], data.y[data.train_mask])
                    loss.backward()
                    optimizer.step()
                    if ep % 10 == 0 or ep == epochs-1:
                        gnn_model.eval()
                        with torch.no_grad():
                            logits = gnn_model(data)
                            preds = torch.sigmoid(logits[data.test_mask]).cpu().numpy() if data.test_mask.any() else np.array([])
                            print(f'GCN epoch {ep}/{epochs} loss={loss.item():.4f} test_pos_mean={(preds.mean() if preds.size>0 else float("nan"))}')

                # Save model weights (best effort)
                try:
                    torch.save(gnn_model.state_dict(), 'gnn_model.pth')
                except Exception as e:
                    print('Warning: failed to save gnn_model.pth:', e)

                # Compute probability per node
                gnn_model.eval()
                with torch.no_grad():
                    logits_all = gnn_model(data)
                    probs_all = torch.sigmoid(logits_all).cpu().numpy()

                # Keep only post nodes; map values index into the saved probability array
                post_node_map, post_probs = collect_post_probs(idx_to_key, probs_all)

                # Save artifacts
                np.save(gnn_probs_path, np.array(post_probs, dtype=float))
                with open(post_node_map_path, 'w') as f:
                    json.dump(post_node_map, f)
                print('Saved trained GCN artifacts:', gnn_probs_path, post_node_map_path)
    else:
        print('No df_user available; writing neutral GCN artifacts')
        _write_neutral_artifacts()
except Exception as e:
    print('GCN pipeline failed:', e)
    _write_neutral_artifacts()


In [None]:
# === Supervised classifier training, evaluation, and stats export ===
import numpy as np
import pickle
import json
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

# Build content_score using AE errors on the training set:
# per-row mean squared reconstruction error, divided by twice the saved threshold and capped at 1
recon_tr = ae.predict(X_train_s)
errors_tr = np.mean(np.square(X_train_s - recon_tr), axis=1)
with open('autoencoder_threshold.txt') as f:  # context manager so the handle is closed
    thr_used = float(f.read())
content_scores_tr = np.minimum(errors_tr / (thr_used * 2.0), 1.0)

# The public dataset rows are not nodes of the report graph, so the GCN input is the neutral
# probability 0.5 for every row. post_map is still loaded (empty if unavailable) so the name
# stays defined, but it does not influence the features.
try:
    with open('post_node_map.json') as f:
        post_map = json.load(f)
except (OSError, ValueError) as e:
    print('post_node_map.json not loaded:', e)
    post_map = {}
gcn_p_tr = np.full(len(content_scores_tr), 0.5)

# Classifier input columns: content_score, gcn_prob, then the scaled features
clf_X = np.hstack([content_scores_tr.reshape(-1,1), gcn_p_tr.reshape(-1,1), X_train_s])
clf_y = y_train

clf = LogisticRegression(max_iter=1000)
clf.fit(clf_X, clf_y)

# Save classifier and meta (feature_names follows the column order of clf_X)
with open('phishing_classifier.pkl','wb') as f:
    pickle.dump(clf, f)
meta = {'feature_names': ['content_score','gcn_prob'] + feature_cols, 'model': 'LogisticRegression'}
with open('classifier_meta.json','w') as f:
    json.dump(meta, f)
print('Saved phishing_classifier.pkl and classifier_meta.json')

# Evaluate on test set, building the inputs exactly as for training
recon_te = ae.predict(X_test_s)
errors_te = np.mean(np.square(X_test_s - recon_te), axis=1)
content_scores_te = np.minimum(errors_te / (thr_used * 2.0), 1.0)
gcn_p_te = np.full(len(content_scores_te), 0.5)
clf_X_te = np.hstack([content_scores_te.reshape(-1,1), gcn_p_te.reshape(-1,1), X_test_s])
probs = clf.predict_proba(clf_X_te)[:,1]
auc = float(roc_auc_score(y_test, probs))
# Cast the thresholded predictions to int so they have the same label type as y_test
acc = float(accuracy_score(y_test, (probs > 0.5).astype(int)))

stats = {'ae_threshold': thr_used, 'ae_pretrain_epochs': epochs_pre, 'ae_finetune_epochs': epochs_fine, 'classifier_auc': auc, 'classifier_accuracy': acc}
with open('ae_gcn_training_stats.json','w') as f:
    json.dump(stats, f, indent=2)

print('Training stats saved to ae_gcn_training_stats.json')
print(json.dumps(stats, indent=2))


In [None]:
# === Export & zip artifacts for deployment ===
# Zips backend artifacts into `artifacts_for_deploy.zip` and attempts to download via Colab's files.download
import os, shutil
from pathlib import Path

import zipfile  # ZipFile lives in zipfile; shutil has no ZipFile attribute

# Files produced by the earlier cells that the backend expects; missing ones are skipped
artifacts = [
    'phishing_autoencoder_model.keras', 'scaler.pkl', 'scaler_final.pkl', 'autoencoder_threshold.txt',
    'post_node_map.json', 'gnn_probs.npy', 'gnn_model.pth',
    'phishing_classifier.pkl', 'classifier_meta.json', 'ae_gcn_training_stats.json',
    'artifact_manifest.json', 'MODEL_CARD.md', 'feature_names.json'
]
zip_name = 'artifacts_for_deploy.zip'

# Mode 'w' recreates the archive on every run
with zipfile.ZipFile(zip_name, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
    for a in artifacts:
        if os.path.exists(a):
            zf.write(a)
            print('Added to zip:', a)
        else:
            print('Missing (skipped):', a)

print('\nBuilt', zip_name)

# Attempt Colab download if available
try:
    from google.colab import files
    try:
        files.download(zip_name)
    except Exception as e:
        print('Colab files.download failed:', e)
        print('You can download the zip from the Files sidebar or use gdown / drive integration.')
except Exception:
    print('Not running in Colab or google.colab.files not available — zip saved in notebook working directory.')

print('\nArtifacts ready. Copy `artifacts_for_deploy.zip` to your backend and POST /reload_models or upload files directly to the server directory.')
