In [1]:
"""
WISDM adversarial attack harness (strong, target indices) -> CSV.

Goal: flip every requested target index (including Jogging) by escalating:
  A) MI-FGSM PGD (L_inf) in projection space, eps sweep, 5 restarts
  B) Boosted L_inf in projection space (bigger eps/steps)
  C) L2 PGD in projection space
  D) Hidden-state fallback (attack mean-pooled transformer hidden before projection)

CSV: one row per (index, phase, norm, eps) with success flag.
"""
import os
import csv
import time
import random
import traceback
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from transformers import AutoTokenizer, AutoModel

# ------------------------- CONFIG -------------------------
# Output width of both encoders (sensor MLP and text projection).
EMBEDDING_DIM   = 64
# Cosine-similarity threshold: a clean pair counts as "aligned" when sim > MARGIN,
# and an attack counts as successful when the adversarial sim drops below MARGIN.
MARGIN          = 0.5
TEXT_MODEL_NAME = "distilbert-base-uncased"

# Number of sensor columns expected in DATA_FILE after removing EXCLUDE_COLS;
# the harness raises ValueError when the detected count differs.
SENSOR_FEATURES_COUNT = 3
TEXT_COLUMN_NAME      = "Semantic_Interpretation"
EXCLUDE_COLS          = ["Activity", TEXT_COLUMN_NAME]

SENSOR_MODEL_PATH = "sensor_encoder_wisdom_3col.pth"
TEXT_MODEL_PATH   = "text_encoder_wisdom_3col.pth"
DATA_FILE         = "./data/WISDM_with_semantic_interpretation.csv"

# Target indices you want to force success on:
# (these are pandas row labels of DATA_FILE; the harness does not reset the index
# after dropping non-numeric rows, so a dropped label is skipped with a warning)
TARGET_INDICES = [0, 1000, 2000, 3000, 4000, 5000]

# Phase A: primary L_inf MI-FGSM in projection space
A_STEPS      = 1500
# NOTE: the attacks pass every epsilon through eps_mag(), i.e. abs(). Negative
# entries therefore repeat the radius of their positive counterparts (only the
# random start differs), and the 0 entry yields a zero radius and zero step size.
# The signed value is what gets written to the CSV "epsilon" column.
A_EPS_SWEEP = [-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10]
# Independent random starts per epsilon; the lowest adv_sim among them is logged.
A_RESTARTS   = 5

# Phase B: boosted L_inf (projection)
# Step count is int(A_STEPS * B_STEPS_MULT); radii are |eps| * B_EPS_FACTOR for
# each Phase A epsilon, capped at B_EPS_CAP, plus B_EPS_CAP itself (deduplicated).
B_STEPS_MULT = 1.75
B_EPS_CAP    = 18.0
B_EPS_FACTOR = 1.6

# Phase C: L2 fallback (projection)
C_STEPS     = 2000
C_EPS_SWEEP = [0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0]

# Phase D: hidden-state fallback (attack mean-pooled transformer hidden)
# These radii are applied as L_inf bounds on the pooled hidden vector.
D_STEPS     = 2000
D_EPS_SWEEP = [1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0]
# ----------------------------------------------------------

# Attack loops print a progress line every DBG_INTERVAL steps (and on the last step).
DBG_INTERVAL = 300
# When True, an attack returns as soon as the adversarial similarity is below the margin.
EARLY_STOP   = True
OUTPUT_CSV   = "attack_results_wisdm_targets_strong.csv"
# Seed handed to set_seed() at the start of the harness.
SEED         = 0

# ------------------------- MODELS -------------------------
class SensorEncoder(nn.Module):
    """Two-layer MLP (Linear -> ReLU -> Linear) mapping sensor features to an embedding."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_width = 128
        layers = (
            nn.Linear(input_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, output_dim),
        )
        # The attribute name and layer order determine the state-dict keys
        # (encoder.0.*, encoder.2.*), so saved checkpoints keep loading.
        self.encoder = nn.Sequential(*layers)

    def forward(self, x):
        """Return the ``output_dim``-wide embedding of ``x`` (last dim ``input_dim``)."""
        embedding = self.encoder(x)
        return embedding

class TextEncoder(nn.Module):
    """
    Pretrained transformer (loaded through AutoModel) + linear projection.

    ``encode_projected`` returns the projected embedding used by the
    projection-space attacks; ``encode_hidden`` returns the mean-pooled
    hidden state that feeds the projection, used by the hidden-state fallback.
    """

    def __init__(self, model_name, output_dim):
        super().__init__()
        # Pretrained tokenizer/backbone; a later load_state_dict may overwrite the weights.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        hidden_width = self.model.config.hidden_size
        self.projection = nn.Linear(hidden_width, output_dim)

    def _mean_pooled(self, texts):
        """Tokenize ``texts``, run the backbone and average over the token axis."""
        batch = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        # Follow the projection layer so inputs land on the module's device.
        target = self.projection.weight.device
        batch = {name: tensor.to(target) for name, tensor in batch.items()}
        token_states = self.model(**batch).last_hidden_state
        # NOTE: plain mean over dim=1; the attention mask is not applied to the pooling.
        return token_states.mean(dim=1)                     # [B, hidden]

    def encode_projected(self, texts):
        """Return the projected embedding of each text."""
        pooled = self._mean_pooled(texts)
        return self.projection(pooled)                      # [B, D]

    def encode_hidden(self, texts):
        """Return mean pooled hidden state (no projection)."""
        return self._mean_pooled(texts)

# ------------------------- LOSS -------------------------
class ContrastiveSimilarityLoss(nn.Module):
    """
    Contrastive loss on cosine similarity.

    Entries with label 1 contribute ``1 - sim``; entries with label 0
    contribute ``max(0, sim - margin)``. The result is the mean over all entries.
    """

    def __init__(self, margin=0.5):
        super().__init__()
        self.margin = margin

    def forward(self, z_s, z_t, label):
        dev = z_s.device
        # Column vector so it broadcasts against a [B, 1] (or [1, 1]) label tensor.
        cos = F.cosine_similarity(z_s, z_t).unsqueeze(1)
        floor = torch.tensor(0.0, device=dev)
        threshold = torch.tensor(self.margin, device=dev)

        attract = label * (1.0 - cos)
        hinge = torch.max(floor, cos - threshold)
        repel = (1.0 - label) * hinge
        return (attract + repel).mean()

# ------------------------- UTILS -------------------------
def set_seed(seed):
    """Seed torch's and Python's RNGs, plus every CUDA device when CUDA is available."""
    torch.manual_seed(seed)
    random.seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

def compute_norms(perturbation):
    """
    Return ``(linf, l2)`` for a batch of perturbations.

    Each sample (first axis) is flattened; the per-sample max-abs and
    Euclidean norms are averaged over the batch and returned as Python floats.
    """
    rows = perturbation.detach().cpu()
    rows = rows.view(rows.size(0), -1)
    per_sample_linf = rows.abs().amax(dim=1)
    per_sample_l2 = torch.norm(rows, p=2, dim=1)
    return per_sample_linf.mean().item(), per_sample_l2.mean().item()

def l2_project(eta, epsilon):
    """
    Project each sample of ``eta`` onto the L2 ball of radius ``epsilon``.

    Samples already inside the ball are returned unchanged; larger ones are
    rescaled to norm ``epsilon``. The result has the same shape as ``eta``.
    """
    rows = eta.view(eta.size(0), -1)
    # Floor the norm so an all-zero sample does not divide by zero.
    row_norms = rows.norm(p=2, dim=1, keepdim=True).clamp(min=1e-12)
    shrink = (epsilon / row_norms).clamp(max=1.0)
    return (rows * shrink).view_as(eta)

def eps_mag(eps):
    """Return the attack radius for a (possibly signed) sweep value: ``|eps|`` as a float."""
    magnitude = abs(eps)
    return float(magnitude)

# ------------------------- ATTACKS -------------------------
def mi_pgd_projection(
    z_s, z_t_orig, epsilon, steps, mode="linf",
    margin=MARGIN, momentum_decay=1.0, dbg_interval=DBG_INTERVAL, early_stop=True
):
    """
    Momentum PGD (MI-FGSM) directly in projection space.

    Perturbs the text embedding so its cosine similarity to ``z_s`` drops,
    by ascending the contrastive loss with label 1. Accepts signed
    ``epsilon`` values in the sweep; uses the magnitude internally.

    Args:
        z_s: sensor embedding; used as the fixed reference.
        z_t_orig: clean text embedding; never modified (a detached copy is attacked).
            Scalar ``.item()`` reads below require a single pair (batch size 1).
        epsilon: perturbation radius (sign ignored).
        steps: number of PGD iterations; the step size is ``|epsilon| / steps``.
        mode: ``"linf"`` for sign steps with element-wise clamping; any other
            value selects normalised-gradient steps with L2-ball projection.
        margin: similarity threshold used by the loss and by early stopping.
        momentum_decay: multiplier on the accumulated gradient.
        dbg_interval: print progress every this many steps (falsy disables).
        early_stop: return as soon as the adversarial similarity is below ``margin``.

    Returns:
        dict with ``orig_sim``, ``adv_sim``, ``loss_before``, ``loss_after``,
        ``linf``, ``l2`` (norms of the final perturbation) and ``steps``
        (iterations actually executed). ``adv_sim`` and ``loss_after`` are
        both evaluated at the returned adversarial point.
    """
    device = z_s.device
    eps_val = eps_mag(epsilon)
    alpha = float(eps_val) / float(steps) if steps else 0.0

    # Loss module and positive label are loop-invariant; build them once.
    criterion = ContrastiveSimilarityLoss(margin)
    pos_label = torch.ones((1, 1), device=device)

    z_t_base = z_t_orig.detach().clone()

    # random restart inside the ball
    if mode == "linf":
        # uniform between -eps_val and +eps_val (always low <= high)
        noise = torch.empty_like(z_t_base).uniform_(-eps_val, eps_val)
    else:
        noise = l2_project(torch.randn_like(z_t_base), eps_val)
    z_t_adv = (z_t_base + noise).detach().requires_grad_(True)

    g_m = torch.zeros_like(z_t_adv)

    # stats before
    with torch.no_grad():
        loss_before = float(criterion(z_s, z_t_base, pos_label).item())
        orig_sim = float(F.cosine_similarity(z_s, z_t_base).item())

    def _summary(steps_used):
        # Evaluate every reported statistic at the current adversarial point so
        # adv_sim and loss_after always describe the same embedding.
        with torch.no_grad():
            adv_sim = float(F.cosine_similarity(z_s, z_t_adv).item())
            loss_after = float(criterion(z_s, z_t_adv, pos_label).item())
            linf_val, l2_val = compute_norms(z_t_adv - z_t_base)
        return dict(orig_sim=orig_sim, adv_sim=adv_sim, loss_before=loss_before,
                    loss_after=loss_after, linf=linf_val, l2=l2_val, steps=steps_used)

    for step in range(steps):
        loss = criterion(z_s, z_t_adv, pos_label)
        # autograd.grad returns a fresh gradient each step, so no manual zeroing
        # is needed and no other tensor's .grad is touched.
        (g,) = torch.autograd.grad(loss, z_t_adv)
        g = g / g.abs().mean().clamp(min=1e-12)  # MI-FGSM normalization
        g_m = momentum_decay * g_m + g

        with torch.no_grad():
            if mode == "linf":
                moved = z_t_adv + alpha * torch.sign(g_m)
                # clamp using magnitude
                eta = torch.clamp(moved - z_t_base, -eps_val, eps_val)
            else:
                gm_flat = g_m.view(g_m.size(0), -1)
                gm_norm = torch.norm(gm_flat, p=2, dim=1, keepdim=True).clamp(min=1e-12)
                moved = z_t_adv + (alpha * (gm_flat / gm_norm)).view_as(g_m)
                eta = l2_project(moved - z_t_base, eps_val)
            z_t_adv.copy_(z_t_base + eta)

            sim_now = float(F.cosine_similarity(z_s, z_t_adv).item())

            if dbg_interval and (step % dbg_interval == 0 or step == steps - 1):
                linf_now = (z_t_adv - z_t_base).abs().amax().item()
                # "loss" here is the value measured before this step's update.
                print(f"[proj {mode}] step {step+1}/{steps} loss={loss.item():.6f} linf={linf_now:.4f} sim={sim_now:.4f}")

        if early_stop and sim_now < margin:
            return _summary(step + 1)

    return _summary(steps)

def mi_pgd_hidden(
    z_s, h_orig, proj_layer, epsilon, steps, mode="linf",
    margin=MARGIN, momentum_decay=1.0, dbg_interval=DBG_INTERVAL, early_stop=True
):
    """
    Momentum PGD in mean-pooled hidden space (before projection).

    Perturbs the pooled hidden vector, passes it through ``proj_layer`` and
    ascends the contrastive loss (label 1) to lower the cosine similarity to
    ``z_s``. Accepts signed ``epsilon``, uses magnitude internally. The attack
    starts from the clean hidden vector (no random start).

    Args:
        z_s: sensor embedding; used as the fixed reference.
        h_orig: clean pooled hidden state; never modified (a detached copy is attacked).
            Scalar ``.item()`` reads below require a single pair (batch size 1).
        proj_layer: callable mapping the hidden vector to the embedding space.
        epsilon: perturbation radius in hidden space (sign ignored).
        steps: number of PGD iterations; the step size is ``|epsilon| / steps``.
        mode: ``"linf"`` for sign steps with element-wise clamping; any other
            value selects normalised-gradient steps with L2-ball projection.
        margin: similarity threshold used by the loss and by early stopping.
        momentum_decay: multiplier on the accumulated gradient.
        dbg_interval: print progress every this many steps (falsy disables).
        early_stop: return as soon as the adversarial similarity is below ``margin``.

    Returns:
        dict with ``orig_sim``, ``adv_sim``, ``loss_before``, ``loss_after``,
        ``linf``, ``l2`` (norms of the hidden-space perturbation) and ``steps``
        (iterations actually executed). ``adv_sim`` and ``loss_after`` are
        both evaluated at the returned adversarial point.
    """
    device = z_s.device
    eps_val = eps_mag(epsilon)
    alpha = float(eps_val) / float(steps) if steps else 0.0

    # Loss module and positive label are loop-invariant; build them once.
    criterion = ContrastiveSimilarityLoss(margin)
    pos_label = torch.ones((1, 1), device=device)

    h_base = h_orig.detach().clone()
    h_adv = h_base.clone().detach().requires_grad_(True)

    g_m = torch.zeros_like(h_adv)

    with torch.no_grad():
        z_t_base = proj_layer(h_base)
        loss_before = float(criterion(z_s, z_t_base, pos_label).item())
        orig_sim = float(F.cosine_similarity(z_s, z_t_base).item())

    def _summary(steps_used):
        # Evaluate every reported statistic at the current adversarial point so
        # adv_sim and loss_after always describe the same hidden vector.
        with torch.no_grad():
            z_t_final = proj_layer(h_adv)
            adv_sim = float(F.cosine_similarity(z_s, z_t_final).item())
            loss_after = float(criterion(z_s, z_t_final, pos_label).item())
            linf_val, l2_val = compute_norms(h_adv - h_base)
        return dict(orig_sim=orig_sim, adv_sim=adv_sim, loss_before=loss_before,
                    loss_after=loss_after, linf=linf_val, l2=l2_val, steps=steps_used)

    for step in range(steps):
        z_t = proj_layer(h_adv)
        loss = criterion(z_s, z_t, pos_label)
        # Differentiate w.r.t. h_adv only: unlike loss.backward(), this does not
        # accumulate gradients into proj_layer's parameters.
        (g,) = torch.autograd.grad(loss, h_adv)
        g = g / g.abs().mean().clamp(min=1e-12)
        g_m = momentum_decay * g_m + g

        with torch.no_grad():
            if mode == "linf":
                moved = h_adv + alpha * torch.sign(g_m)
                eta = torch.clamp(moved - h_base, -eps_val, eps_val)
            else:
                gm_flat = g_m.view(g_m.size(0), -1)
                gm_norm = torch.norm(gm_flat, p=2, dim=1, keepdim=True).clamp(min=1e-12)
                moved = h_adv + (alpha * (gm_flat / gm_norm)).view_as(g_m)
                eta = l2_project(moved - h_base, eps_val)
            h_adv.copy_(h_base + eta)

            sim_now = float(F.cosine_similarity(z_s, proj_layer(h_adv)).item())

            if dbg_interval and (step % dbg_interval == 0 or step == steps - 1):
                linf_now = (h_adv - h_base).abs().amax().item()
                # "loss" here is the value measured before this step's update.
                print(f"[hidden {mode}] step {step+1}/{steps} loss={loss.item():.6f} linf={linf_now:.4f} sim={sim_now:.4f}")

        if early_stop and sim_now < margin:
            return _summary(step + 1)

    return _summary(steps)

# ------------------------- HARNESS -------------------------
def run_targets_strong():
    """
    Run the escalating attack phases on every index in TARGET_INDICES and log to CSV.

    Steps:
      1. Seed RNGs, build both encoders and load their checkpoints (best effort:
         a missing or unloadable checkpoint is reported and the run continues).
      2. Read DATA_FILE, coerce sensor columns to numeric and drop rows that
         fail, keeping the original row labels.
      3. For each target row: Phase A (projection L_inf sweep with restarts) is
         always run; Phases B, C and D run only while no attack has succeeded
         and the clean pair was aligned (orig_sim > MARGIN).
      4. Write one CSV row per logged attempt to OUTPUT_CSV.

    Raises:
        ValueError: when the number of detected sensor columns differs from
            SENSOR_FEATURES_COUNT.
    """
    set_seed(SEED)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}\n")

    # models
    sensor_encoder = SensorEncoder(SENSOR_FEATURES_COUNT, EMBEDDING_DIM).to(device)
    text_encoder   = TextEncoder(TEXT_MODEL_NAME, EMBEDDING_DIM).to(device)

    # SECURITY NOTE(review): torch.load unpickles the checkpoint file; only point
    # SENSOR_MODEL_PATH / TEXT_MODEL_PATH at files from a trusted source.

    # try load weights safely (state-dict may or may not match exactly)
    try:
        if os.path.exists(SENSOR_MODEL_PATH):
            sensor_encoder.load_state_dict(torch.load(SENSOR_MODEL_PATH, map_location=device))
            print(f"[info] loaded sensor state from {SENSOR_MODEL_PATH}")
        else:
            print(f"[warn] sensor model path not found: {SENSOR_MODEL_PATH} (continuing with random init)")
    except Exception as e:
        print(f"[error] failed to load sensor state-dict: {e}\n{traceback.format_exc()}")

    try:
        if os.path.exists(TEXT_MODEL_PATH):
            # If you saved the entire TextEncoder.state_dict, this will match.
            # If you saved only projection or different keys, this may raise - handled below.
            text_encoder.load_state_dict(torch.load(TEXT_MODEL_PATH, map_location=device))
            print(f"[info] loaded text encoder state from {TEXT_MODEL_PATH}")
        else:
            print(f"[warn] text model path not found: {TEXT_MODEL_PATH} (continuing with HF pretrained weights + random projection init)")
    except Exception as e:
        print(f"[warn] couldn't load full text state-dict (maybe you saved only the projection). Error: {e}")
        # attempt to load only projection if available inside a dict
        try:
            sd = torch.load(TEXT_MODEL_PATH, map_location=device)
            if "projection.weight" in sd or any(k.startswith("projection.") for k in sd.keys()):
                # overlay only the keys the model actually has
                text_encoder_state = text_encoder.state_dict()
                for k in sd:
                    if k in text_encoder_state:
                        text_encoder_state[k] = sd[k]
                text_encoder.load_state_dict(text_encoder_state)
                print("[info] partial loaded text projection weights from state-dict")
            else:
                print("[warn] no projection.* keys in text state-dict; keeping current text encoder weights")
        except Exception as inner:
            # Still best effort, but say so instead of failing silently: results
            # obtained with an unloaded projection are not meaningful.
            print(f"[warn] partial text state-dict load failed; keeping current text encoder weights. Error: {inner}")

    sensor_encoder.eval(); text_encoder.eval()

    # data (preserve original indices so your TARGET_INDICES stay valid)
    df = pd.read_csv(DATA_FILE)
    sensor_cols = [c for c in df.columns if c not in EXCLUDE_COLS]
    if len(sensor_cols) != SENSOR_FEATURES_COUNT:
        raise ValueError(f"Sensor column mismatch: found {len(sensor_cols)} expected {SENSOR_FEATURES_COUNT}. Detected: {sensor_cols}")

    # numeric clean, but DO NOT reset index
    for col in sensor_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df = df.dropna(subset=sensor_cols)
    df[TEXT_COLUMN_NAME] = df[TEXT_COLUMN_NAME].fillna("").astype(str)

    sensor_map = df[sensor_cols].to_dict("index")   # {orig_idx: {col: val}}
    text_map   = df[TEXT_COLUMN_NAME].to_dict()
    act_map    = df["Activity"].astype(str).to_dict() if "Activity" in df.columns else {}

    header = [
        "index","activity","phase","norm","epsilon","alpha","steps","restarts",
        "orig_aligned","orig_sim","adv_sim","loss_before","loss_after",
        "linf","l2","pgd_steps","success","runtime_sec"
    ]
    # Explicit encoding so the file does not depend on the platform default.
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f); writer.writerow(header)

        for idx in TARGET_INDICES:
            if idx not in sensor_map or idx not in text_map:
                print(f"[WARN] target index {idx} not present after cleaning; skipping")
                continue

            x_s = torch.tensor([sensor_map[idx][c] for c in sensor_cols], dtype=torch.float32, device=device).unsqueeze(0)
            text_str = text_map[idx]
            activity = act_map.get(idx, "NA")

            # Clean embeddings, computed once per target. Both encoders are in
            # eval mode and the attacks only read (detach/clone) these tensors,
            # so every phase below can reuse them instead of re-running the
            # transformer for each restart/epsilon.
            with torch.no_grad():
                z_s = sensor_encoder(x_s)
                z_t = text_encoder.encode_projected([text_str])
                orig_sim_quick = float(F.cosine_similarity(z_s, z_t).item())
                orig_aligned   = int(orig_sim_quick > MARGIN)

            print("-------------------------------------------------------------")
            print(f"TARGET index={idx} activity={activity}  orig_sim={orig_sim_quick:.4f} aligned={bool(orig_aligned)}")
            print(f"text preview: '{text_str[:120]}...'")
            print("-------------------------------------------------------------")

            def log_row(phase, norm, eps, steps, restarts, result, t0):
                # Writes one CSV row for an attempt and echoes a one-line summary.
                alpha = float(eps_mag(eps))/float(steps) if eps is not None and steps else 0.0
                success = int(result["adv_sim"] < MARGIN) if orig_aligned else 0  # only meaningful if originally aligned
                runtime = time.time() - t0
                writer.writerow([
                    idx, activity, phase, norm, eps, round(alpha,6), steps, restarts,
                    orig_aligned, round(result["orig_sim"],6), round(result["adv_sim"],6),
                    round(result["loss_before"],6), round(result["loss_after"],6),
                    round(result["linf"],6), round(result["l2"],6),
                    result["steps"], success, round(runtime,3)
                ])
                # Flush per row so partial results survive an interrupted run.
                f.flush()
                print(f"{phase}/{norm}: eps={eps} adv_sim={result['adv_sim']:.4f} steps={result['steps']} success={success}")

            # If the clean pair is not aligned, we can't "break" alignment; we still log attempts.
            success_any = False

            # PHASE A: projection L_inf (MI-FGSM), eps sweep + restarts
            # (the full sweep is always logged; it does not stop at the first success)
            for eps in A_EPS_SWEEP:
                t0 = time.time()
                best = None
                for _ in range(A_RESTARTS):
                    res = mi_pgd_projection(z_s, z_t, eps, A_STEPS, mode="linf",
                                            momentum_decay=1.0, dbg_interval=DBG_INTERVAL, early_stop=EARLY_STOP)
                    # keep the restart with the lowest adversarial similarity
                    if best is None or res["adv_sim"] < best["adv_sim"]:
                        best = res
                log_row("primary","linf",eps,A_STEPS,A_RESTARTS,best,t0)
                success_any = success_any or bool(orig_aligned and best["adv_sim"] < MARGIN)

            if not success_any and orig_aligned:
                # PHASE B: boosted projection L_inf
                print("[boost] escalating projection L_inf...")
                boosted_steps = int(A_STEPS * B_STEPS_MULT)
                # boosted_eps uses magnitude; keep set to unique sorted values
                boosted_eps = sorted(set([min(eps_mag(e) * B_EPS_FACTOR, B_EPS_CAP) for e in A_EPS_SWEEP] + [B_EPS_CAP]))
                for eps in boosted_eps:
                    t0 = time.time()
                    res = mi_pgd_projection(z_s, z_t, eps, boosted_steps, mode="linf",
                                            momentum_decay=1.0, dbg_interval=DBG_INTERVAL, early_stop=EARLY_STOP)
                    log_row("boost","linf",eps,boosted_steps,1,res,t0)
                    success_any = success_any or (res["adv_sim"] < MARGIN)
                    if success_any: break

            if not success_any and orig_aligned:
                # PHASE C: projection L2 fallback
                print("[fallback] projection L2...")
                for eps in C_EPS_SWEEP:
                    t0 = time.time()
                    res = mi_pgd_projection(z_s, z_t, eps, C_STEPS, mode="l2",
                                            momentum_decay=1.0, dbg_interval=DBG_INTERVAL, early_stop=EARLY_STOP)
                    log_row("fallback","l2",eps,C_STEPS,1,res,t0)
                    success_any = success_any or (res["adv_sim"] < MARGIN)
                    if success_any: break

            if not success_any and orig_aligned:
                # PHASE D: hidden-state fallback (very strong)
                print("[fallback+] hidden-state L_inf...")
                # Pooled hidden state is also loop-invariant; encode it once.
                with torch.no_grad():
                    h = text_encoder.encode_hidden([text_str])   # [1, hidden]
                for eps in D_EPS_SWEEP:
                    t0 = time.time()
                    res = mi_pgd_hidden(z_s, h, text_encoder.projection, eps, D_STEPS, mode="linf",
                                        momentum_decay=1.0, dbg_interval=DBG_INTERVAL, early_stop=EARLY_STOP)
                    log_row("fallback_plus","linf_hidden",eps,D_STEPS,1,res,t0)
                    success_any = success_any or (res["adv_sim"] < MARGIN)
                    if success_any: break

            print("RESULT:", "✅ SUCCESS" if (success_any or not orig_aligned) else "❌ FAILED", "\n")

    print(f"Done. Results saved to {OUTPUT_CSV}")

# ------------------------- ENTRY -------------------------
if __name__ == "__main__":
    run_targets_strong()



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\siu856558563\AppData\Local\anaconda3\envs\contrastive_env\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\siu856558563\AppData\Local\anaconda3\envs\contrastive_env\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\siu856558563\AppData\Local\anaconda3\envs\contrastive_env\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\siu856558563\AppData\Local\anacon

Device: cuda

[info] loaded sensor state from sensor_encoder_wisdom_3col.pth
[info] loaded text encoder state from text_encoder_wisdom_3col.pth


  attn_output = torch.nn.functional.scaled_dot_product_attention(


-------------------------------------------------------------
TARGET index=0 activity=Downstairs  orig_sim=0.5084 aligned=True
text preview: '**General description:** Downstairs involves controlled descent with eccentric muscle actions to absorb impact and regul...'
-------------------------------------------------------------
[proj linf] step 1/1500 loss=0.579972 linf=9.8601 sim=0.4196
[proj linf] step 1/1500 loss=0.710556 linf=9.8791 sim=0.2890
[proj linf] step 1/1500 loss=0.547662 linf=9.6043 sim=0.4519
[proj linf] step 1/1500 loss=0.609519 linf=9.9976 sim=0.3900
[proj linf] step 1/1500 loss=0.621842 linf=9.6781 sim=0.3777
primary/linf: eps=-10 adv_sim=0.2890 steps=1 success=1
[proj linf] step 1/1500 loss=0.668235 linf=8.6656 sim=0.3313
[proj linf] step 1/1500 loss=0.521121 linf=8.9644 sim=0.4785
[proj linf] step 1/1500 loss=0.514575 linf=8.9056 sim=0.4850
[proj linf] step 1/1500 loss=0.615387 linf=8.7816 sim=0.3842
[proj linf] step 1/1500 loss=0.535555 linf=8.8405 sim=0.4640
primar