In [15]:
import torch
import sys
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification

repo_path = "/mnt/e/untitled folder/codebase/LoRO/LoRO"  
if os.path.exists(repo_path) and repo_path not in sys.path:
    sys.path.append(repo_path)

try:
    from utils import model_obfuscation
except ImportError as e:
    print('wrong repo_path')
    sys.exit(1)

# ==========================================
# 1. 加载目标模型 (Private Model)
# ==========================================
model_id = "Creekside/Qwen-3B-gsm8k-GRPO"
device = "cpu"
save_path = "/mnt/e/untitled folder/codebase/LoRO_attack/loro_bart_obfuscated.pt"

print(f"正在加载模型: {model_id} ...")
# bart-large-mnli 是一个分类模型 (SequenceClassification)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id).to(device)

print("模型加载完成。准备进行 LoRO 混淆...")

# ==========================================
# 2. 执行混淆 (调用仓库代码)
# ==========================================
noise_magnitude = 1

print(f"开始混淆 (Noise Magnitude: {noise_magnitude})...")
obfuscated_model = model_obfuscation(model, device=device, noise_mag=noise_magnitude, r=30)

# ==========================================
# 4. 保存混淆后的 Checkpoint
# ==========================================
print(f"正在保存混淆后的模型至: {save_path} ...")
torch.save(obfuscated_model.state_dict(), save_path)

print(f"Checkpoint Path: {os.path.abspath(save_path)}")


正在加载模型: Creekside/Qwen-3B-gsm8k-GRPO ...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Creekside/Qwen-3B-gsm8k-GRPO and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


模型加载完成。准备进行 LoRO 混淆...
开始混淆 (Noise Magnitude: 1)...
Obfuscating: model.layers.0.self_attn.q_proj
Obfuscating: model.layers.0.self_attn.k_proj
Obfuscating: model.layers.0.self_attn.v_proj
Obfuscating: model.layers.0.self_attn.o_proj
Obfuscating: model.layers.0.mlp.gate_proj
Obfuscating: model.layers.0.mlp.up_proj
Obfuscating: model.layers.0.mlp.down_proj
Obfuscating: model.layers.1.self_attn.q_proj
Obfuscating: model.layers.1.self_attn.k_proj
Obfuscating: model.layers.1.self_attn.v_proj
Obfuscating: model.layers.1.self_attn.o_proj
Obfuscating: model.layers.1.mlp.gate_proj
Obfuscating: model.layers.1.mlp.up_proj
Obfuscating: model.layers.1.mlp.down_proj
Obfuscating: model.layers.2.self_attn.q_proj
Obfuscating: model.layers.2.self_attn.k_proj
Obfuscating: model.layers.2.self_attn.v_proj
Obfuscating: model.layers.2.self_attn.o_proj
Obfuscating: model.layers.2.mlp.gate_proj
Obfuscating: model.layers.2.mlp.up_proj
Obfuscating: model.layers.2.mlp.down_proj
Obfuscating: model.layers.3.self_att

In [1]:
import torch
from transformers import AutoModelForSequenceClassification
import numpy as np
import pandas as pd

# ==========================================
# 配置
# ==========================================
model_id_ft = "Creekside/Qwen-3B-gsm8k-GRPO"   # Target (Private/Fine-tuned)
model_id_base = "Qwen/Qwen2.5-3B-Instruct"      # Prior (Public/Base)
device = "cpu"

print(f"Loading Fine-Tuned Model: {model_id_ft}...")
model_ft = AutoModelForSequenceClassification.from_pretrained(model_id_ft).to(device)

print(f"Loading Base Model: {model_id_base}...")

model_base = AutoModelForSequenceClassification.from_pretrained(model_id_base).to(device)

print("\nStarting Comparison (FT vs. Base)...")
print("-" * 80)
print(f"{'Layer Name':<50} | {'Cos Sim':<10} | {'Delta Norm':<12} | {'Rel Diff (%)':<12}")
print("-" * 80)

results = []

# 获取所有模块的字典
modules_ft = dict(model_ft.named_modules())
modules_base = dict(model_base.named_modules())

# 遍历 FT 模型的层
for name, module_ft in model_ft.named_modules():
    if isinstance(module_ft, torch.nn.Linear):
        # 确保 Base 模型中有同名层
        if name in modules_base:
            module_base = modules_base[name]
            
            # 获取权重 (Clone detached to avoid grad issues)
            w_ft = module_ft.weight.detach()
            w_base = module_base.weight.detach()
            
            # 检查形状是否一致 (分类头可能不一致)
            if w_ft.shape != w_base.shape:
                print(f"[Skipping] {name}: Shapes mismatch {w_ft.shape} vs {w_base.shape} (Likely Classification Head)")
                continue
                
            # 1. 计算 Cosine Similarity
            # Flatten 之后计算向量夹角
            cos_sim = torch.nn.functional.cosine_similarity(
                w_ft.flatten(), 
                w_base.flatten(), 
                dim=0
            ).item()
            
            # 2. 计算 Delta (FT - Base)
            delta = w_ft - w_base
            norm_delta = torch.norm(delta).item()
            
            # 3. 计算 Base Norm
            norm_base = torch.norm(w_base).item()
            
            # 4. 计算相对差异 (Relative Difference)
            # diff / norm_base
            rel_diff = norm_delta / norm_base if norm_base > 0 else 0.0
            
            # 打印部分层的结果 (为了展示整洁，可以每隔几层打印一次，或者打印所有)
            # 这里打印所有 Encoder/Decoder 的投影层
            if "proj" in name or "fc" in name:
                print(f"{name:<50} | {cos_sim:.6f}   | {norm_delta:.4f}       | {rel_diff*100:.4f}%")
            
            results.append({
                "Layer": name,
                "Cos_Sim": cos_sim,
                "Delta_Norm": norm_delta,
                "Base_Norm": norm_base,
                "Rel_Diff": rel_diff
            })
        else:
            print(f"[Missing] {name} not found in Base Model.")

# ==========================================
# 统计摘要
# ==========================================
df = pd.DataFrame(results)
print("-" * 80)
print("Summary Statistics:")
print(f"Average Cosine Similarity: {df['Cos_Sim'].mean():.6f}")
print(f"Average Relative Diff:     {df['Rel_Diff'].mean()*100:.4f}%")
print(f"Min Cosine Similarity:     {df['Cos_Sim'].min():.6f}")
print("-" * 80)


Loading Fine-Tuned Model: Creekside/Qwen-3B-gsm8k-GRPO...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Creekside/Qwen-3B-gsm8k-GRPO and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading Base Model: Qwen/Qwen2.5-3B-Instruct...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-3B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting Comparison (FT vs. Base)...
--------------------------------------------------------------------------------
Layer Name                                         | Cos Sim    | Delta Norm   | Rel Diff (%)
--------------------------------------------------------------------------------
model.layers.0.self_attn.q_proj                    | 0.995474   | 7.0397       | 9.4047%
model.layers.0.self_attn.k_proj                    | 0.995679   | 3.4136       | 9.2446%
model.layers.0.self_attn.v_proj                    | 0.995532   | 1.1226       | 9.4010%
model.layers.0.self_attn.o_proj                    | 0.995545   | 4.0921       | 9.2753%
model.layers.0.mlp.gate_proj                       | 0.997714   | 11.8519       | 9.3296%
model.layers.0.mlp.up_proj                         | 0.997950   | 10.4254       | 9.3316%
model.layers.0.mlp.down_proj                       | 0.997777   | 11.1631       | 9.3132%
model.layers.1.self_attn.q_proj                    | 0.995503   | 5.4892       |

In [1]:
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os
import copy
from tqdm import tqdm

# ==========================================
# 1. 配置
# ==========================================
# 攻击者的目标：从 Obfuscated Checkpoint + Base Model 恢复出 Private Model\
base_model_id = "Qwen/Qwen2.5-3B-Instruct"        # 攻击者拥有的先验
target_model_id = "Creekside/Qwen-3B-gsm8k-GRPO"  # 仅用于验证攻击成功率 (GT)
obfuscated_checkpoint = "/mnt/e/untitled folder/codebase/LoRO_attack/loro_bart_obfuscated.pt" # 您的混淆文件路径
save_path_recovered = "/mnt/e/untitled folder/codebase/LoRO_attack/recovered_bart_model"    # 还原后的模型保存路径
device = "cuda" if torch.cuda.is_available() else "cpu"

# SVD 攻击参数
REMOVE_RANK = 30

# ==========================================
# 2. 模型加载
# ==========================================
print(f"1. Loading Base Model (Prior): {base_model_id}...")
# 攻击者初始只有 Base 模型
recovered_model = AutoModelForSequenceClassification.from_pretrained(base_model_id).to(device)

print(f"2. Loading Obfuscated Checkpoint: {obfuscated_checkpoint}...")
if not os.path.exists(obfuscated_checkpoint):
    raise FileNotFoundError("混淆 Checkpoint 未找到，请检查路径。")
obfus_state_dict = torch.load(obfuscated_checkpoint, map_location=device)

print(f"3. Loading Ground Truth (for validation): {target_model_id}...")
gt_model = AutoModelForSequenceClassification.from_pretrained(target_model_id).to(device)

# ==========================================
# 3. 执行全模型攻击
# ==========================================
print("\n" + "="*50)
print(f"STARTING FULL MODEL RECOVERY (Removing Top-{REMOVE_RANK} Singular Components)")
print("="*50)

# 用于统计恢复效果
similarities = []
relative_errors = []

# 获取所有线性层
# 我们遍历 recovered_model (即 base_model) 的模块，然后去 check state_dict 里有没有对应的混淆权重
all_modules = list(recovered_model.named_modules())
linear_layers = [(n, m) for n, m in all_modules if isinstance(m, nn.Linear)]

progress_bar = tqdm(linear_layers, desc="Recovering Layers")

for name, module in progress_bar:
    # 1. 获取 Base 权重 (Prior)
    W_base = module.weight.detach()
    
    # 2. 获取 Obfuscated 权重 (Observation)
    # LoRO 的 state_dict key 通常是 "layer_name.obfus_linear.weight"
    obfus_key = f"{name}.obfus_linear.weight"
    
    # 如果找不到对应的 key，说明这一层可能没有被混淆（或者是分类头等特殊层）
    # 但根据 LoRO 逻辑，Linear 层应该都被混淆了
    if obfus_key not in obfus_state_dict:
        # 尝试直接找 name.weight (有些层可能未被 LoRO 包装)
        if f"{name}.weight" in obfus_state_dict:
            # 如果没混淆，直接加载（或者攻击者认为这就是原样）
            # 但这里我们假设攻击者不知道，只看混淆文件
            continue
        else:
            # 可能是分类头，LoRO 有时也会混淆它。
            # 如果 key 不匹配，跳过
            continue
            
    W_obfus = obfus_state_dict[obfus_key].detach()
    
    # 3. 计算 Diff
    # Diff = W_obfus - W_base
    Diff = W_obfus - W_base
    
    # 4. SVD 攻击 (去噪)
    # 使用 float32 进行 SVD 以保证精度
    U, S, Vh = torch.linalg.svd(Diff.float(), full_matrices=False)
    
    # 剔除前 K 个奇异值 (认为它们是 LoRO 注入的低秩噪声)
    S_clean = S.clone()
    S_clean[:REMOVE_RANK] = 0.0
    
    # 重构 Delta
    Delta_Recovered = (U @ torch.diag(S_clean) @ Vh).to(W_base.dtype)
    
    # 5. 恢复权重
    # W_rec = W_base + Delta_rec
    W_recovered = W_base + Delta_Recovered
    
    # 更新 recovered_model 的权重
    module.weight.data = W_recovered
    
    # 处理 Bias (LoRO 代码中 Bias 是直接存储的，通常没有加噪声，或者加了也可以直接减)
    # 检查 utils.py/loro.py: "self.obfus_linear.bias = torch.nn.Parameter(original_linear.bias)"
    # Bias 没有加噪声！所以直接从 obfus_state_dict 读取即可（攻击者可以直接拿）
    obfus_bias_key = f"{name}.obfus_linear.bias"
    if obfus_bias_key in obfus_state_dict and module.bias is not None:
        module.bias.data = obfus_state_dict[obfus_bias_key].detach()
    
    # 6. 验证 (与 GT 对比)
    # 获取 GT 对应层的权重
    # 注意：需通过 name 从 gt_model 索引
    gt_module = dict(gt_model.named_modules())[name]
    W_gt = gt_module.weight.detach()
    
    # 计算指标
    # 真实的 Delta = W_gt - W_base
    Delta_True = W_gt - W_base
    
    # 计算恢复出的 Delta 和 真实 Delta 的相似度
    sim = torch.nn.functional.cosine_similarity(Delta_True.flatten(), Delta_Recovered.flatten(), dim=0).item()
    rel_err = torch.norm(Delta_Recovered - Delta_True) / torch.norm(Delta_True)
    
    similarities.append(sim)
    relative_errors.append(rel_err.item())
    
    # 更新进度条显示当前层的相似度
    progress_bar.set_postfix({"Sim": f"{sim:.4f}", "Err": f"{rel_err:.4f}"})

# ==========================================
# 4. 结果汇总与保存
# ==========================================
print("\n" + "="*50)
print("RECOVERY COMPLETE")
print("="*50)

avg_sim = sum(similarities) / len(similarities)
avg_err = sum(relative_errors) / len(relative_errors)
min_sim = min(similarities)

print(f"Total Layers Recovered: {len(similarities)}")
print(f"Average Cosine Similarity: {avg_sim:.4f}")
print(f"Average Relative Error:    {avg_err:.4f}")
print(f"Worst Layer Similarity:    {min_sim:.4f}")

if avg_sim > 0.95:
    print("\n[SUCCESS] 模型还原极其成功！基本等同于原始私有模型。")
else:
    print("\n[WARNING] 模型还原效果一般，可能需要调整 REMOVE_RANK 参数。")

# 保存模型
print(f"\nSaving recovered model to {save_path_recovered}...")
recovered_model.save_pretrained(save_path_recovered)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.save_pretrained(save_path_recovered)
print("Saved. You can now load this model with 'AutoModelForSequenceClassification.from_pretrained'.")

1. Loading Base Model (Prior): Qwen/Qwen2.5-3B-Instruct...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-3B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2. Loading Obfuscated Checkpoint: /mnt/e/untitled folder/codebase/LoRO_attack/loro_bart_obfuscated.pt...
3. Loading Ground Truth (for validation): Creekside/Qwen-3B-gsm8k-GRPO...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Creekside/Qwen-3B-gsm8k-GRPO and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



STARTING FULL MODEL RECOVERY (Removing Top-30 Singular Components)


Recovering Layers: 100%|██████████████████████████████████████| 253/253 [05:57<00:00,  1.41s/it, Sim=0.0000, Err=1.0000]



RECOVERY COMPLETE
Total Layers Recovered: 253
Average Cosine Similarity: 0.9579
Average Relative Error:    0.2736
Worst Layer Similarity:    0.0000

[SUCCESS] 模型还原极其成功！基本等同于原始私有模型。

Saving recovered model to /mnt/e/untitled folder/codebase/LoRO_attack/recovered_bart_model...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Saved. You can now load this model with 'AutoModelForSequenceClassification.from_pretrained'.


In [None]:
import torch
import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset
import re

# ==========================================
# 1. 配置路径
# ==========================================
recovered_model_path = "/mnt/e/untitled folder/codebase/LoRO_attack/recovered_bart_model"
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading Recovered Model from: {recovered_model_path} ...")
print(f"Device: {device}")

# ==========================================
# 2. 加载模型与数据集
# ==========================================
try:
    # 加载还原的模型
    tokenizer = AutoTokenizer.from_pretrained(recovered_model_path)
    model = AutoModelForCausalLM.from_pretrained(recovered_model_path, torch_dtype=torch.float16).to(device)
    model.eval()
except Exception as e:
    print(f"模型加载失败: {e}")
    print("请检查路径是否正确，或者模型文件是否完整。")
    exit(1)

print("Loading GSM8K dataset (test split)...")
# 加载 GSM8K 测试集
dataset = load_dataset("gsm8k", "main", split="test")

# 为了快速验证，您可以只测前 100 条。如果要测全集 (1319条)，请注释掉下面这行
dataset = dataset.select(range(100)) 

print(f"Start evaluating on {len(dataset)} samples...")

# ==========================================
# 3. 定义推理逻辑
# ==========================================
# 使用 text-generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device=="cuda" else -1)

def extract_answer(generated_text):
    # 提取 #### 后面的数字
    try:
        # 找到最后出现的 ####
        if "####" in generated_text:
            ans = generated_text.split("####")[-1].strip()
            # 移除可能的非数字字符（保留数字和可能的负号/小数点）
            # 这里简单处理：通常 GSM8K 答案是纯数字
            ans = ans.replace(",", "") # 去掉千分位
            return ans
        else:
            return None
    except:
        return None

# ==========================================
# 4. 执行评估
# ==========================================
correct = 0
total = 0

# 进度条
progress_bar = tqdm.tqdm(dataset)

for sample in progress_bar:
    question = sample['question']
    # GSM8K 的 answer 字段通常包含推理过程，最后是 #### 数字
    # 我们只需要 #### 后面的部分作为 ground truth
    ground_truth = sample['answer'].split('####')[-1].strip()
    
    # 构造 Prompt (参考原作者格式)
    # Qwen 推荐使用 Chat 模板，或者按照 LoRO 原作者的 simple formatting
    # 这里沿用您 notebook 中的 prompt 格式
    messages = [
        {"role": "user", "content": f"{question} Please think step by step, give the final number in ONE new line after ####, without other words. Your answer will be considered wrong if not follow this rule."}
    ]
    
    # 构建输入文本
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    # 推理
    try:
        # 设置 max_new_tokens 防止生成太长
        outputs = pipe(prompt_text, max_new_tokens=512, do_sample=False, temperature=0.0)
        generated_text = outputs[0]['generated_text']
        
        # 提取模型生成的回答部分 (去除 prompt)
        # pipeline 返回通常包含 prompt，Qwen 的 chat template 可能会有特殊处理
        # 简单处理：直接在生成的完整文本里找 ####
        
        pred_ans = extract_answer(generated_text)
        
        # 对比
        # 简单的字符串匹配，或者数值对比
        if pred_ans == ground_truth:
            correct += 1
        elif pred_ans is not None:
             # 尝试转 float 对比 (处理 540.0 vs 540)
            try:
                if float(pred_ans) == float(ground_truth):
                    correct += 1
            except:
                pass
                
    except Exception as e:
        print(f"Error processing sample: {e}")
    
    total += 1
    progress_bar.set_postfix({'acc': f"{correct/total:.2%}"})

# ==========================================
# 5. 输出最终结果
# ==========================================
acc = correct / total
print("\n" + "="*40)
print(f"Evaluation Result on {total} samples")
print("="*40)
print(f"Correct:  {correct}")
print(f"Accuracy: {acc:.4%}")


Loading Recovered Model from: /mnt/e/untitled folder/codebase/LoRO_attack/recovered_bart_model ...
Device: cpu


The tokenizer you are loading from '/mnt/e/untitled folder/codebase/LoRO_attack/recovered_bart_model' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
`torch_dtype` is deprecated! Use `dtype` instead!
Some weights of Qwen2ForCausalLM were not initialized from the model checkpoint at /mnt/e/untitled folder/codebase/LoRO_attack/recovered_bart_model and are newly initialized: ['lm_head.weight', 'model.embed_tokens.weight', 'model.layers.0.input_layernorm.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.

Loading GSM8K dataset (test split)...


README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Device set to use cpu


Start evaluating on 100 samples...


  0%|                                                                                           | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  1%|▋                                                                     | 1/100 [01:25<2:21:14, 85.60s/it, acc=0.00%]