# Part I: Experiment with trained LoRA-XS

In [2]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [1]:
import os
os.chdir("src")

In [2]:
from utils import get_model

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
torch.cuda.is_available()

True

In [4]:
model, tokenizer, config = get_model(model_name="qwen2.5-1.5b-instruct")

In [13]:
adapter_path = r"/scratch/doluk/Compact-Interference-PRAG/offline_loraxs/qwen2.5-1.5b-instruct/rank=64_alpha=16/popqa/lr=0.0003_epoch=4_direct/aug_model=qwen2.5-1.5b-instruct/total/data_0/passage_0"

In [18]:
from peft import PeftModel
peft_model = PeftModel.from_pretrained(model, adapter_path, is_trainable=False)

NameError: name 'adapter_path' is not defined

In [15]:
from utils import model_generate

In [16]:
model_generate("What is George Rankin's occupation?", peft_model, tokenizer, config)



'George Rankin was born into poverty and grew up homeless in Liverpool docks during World War II. He'

In [29]:
model_generate("What is George Rankin's occupation?", model, tokenizer, config)

"I'm sorry, but I couldn't find any information about a person named George Rankin with a"

# Part II: Testing with MoE architecture

### Use the base model with separate A/B matrices (frozen, pretrained PRAG-baseline adapters); only the MoE router is trainable

In [8]:
trained_adapter_dir = r"/scratch/doluk/Compact-Interference-PRAG/offline/qwen2.5-1.5b-instruct/rank=2_alpha=32/popqa/lr=0.0003_epoch=2_direct/aug_model=qwen2.5-1.5b-instruct/total/data_0"
print(os.path.exists(trained_adapter_dir))
# print(os.listdir(trained_adapter_dir))

True


In [56]:
from safetensors.torch import load_file

state = load_file(os.path.join(trained_adapter_dir, "passage_0", "adapter_model.safetensors"))
list(state.keys())

['base_model.model.base_model.model.model.layers.0.mlp.down_proj.lora_A.weight',
 'base_model.model.base_model.model.model.layers.0.mlp.down_proj.lora_B.weight',
 'base_model.model.base_model.model.model.layers.0.mlp.gate_proj.lora_A.weight',
 'base_model.model.base_model.model.model.layers.0.mlp.gate_proj.lora_B.weight',
 'base_model.model.base_model.model.model.layers.0.mlp.up_proj.lora_A.weight',
 'base_model.model.base_model.model.model.layers.0.mlp.up_proj.lora_B.weight',
 'base_model.model.base_model.model.model.layers.1.mlp.down_proj.lora_A.weight',
 'base_model.model.base_model.model.model.layers.1.mlp.down_proj.lora_B.weight',
 'base_model.model.base_model.model.model.layers.1.mlp.gate_proj.lora_A.weight',
 'base_model.model.base_model.model.model.layers.1.mlp.gate_proj.lora_B.weight',
 'base_model.model.base_model.model.model.layers.1.mlp.up_proj.lora_A.weight',
 'base_model.model.base_model.model.model.layers.1.mlp.up_proj.lora_B.weight',
 'base_model.model.base_model.model.

In [20]:
import os
from pathlib import Path
from typing import Dict, List, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from safetensors.torch import load_file

# -------------------------------
# MoE-LoRA block
# -------------------------------
class MoELoRA(nn.Module):
    """Mixture of frozen LoRA experts mixed by a small trainable softmax router.

    Each expert ``i`` is a pre-trained LoRA pair ``(A_i, B_i)``. The forward
    pass computes ``scaling * sum_i score_i * B_i(A_i(x))`` where the scores
    come from a linear router applied to the mean of the experts' rank-``r``
    codes. All ``A_i``/``B_i`` weights are frozen; only the router is trainable.
    """

    def __init__(self, in_dim, out_dim, trained_lora_A_weights, trained_lora_B_weights, r=None, alpha=32):
        """
        trained_lora_A_weights/B_weights: list[Tensor] with shapes:
          A_i: (r, in_dim)   (down-proj)
          B_i: (out_dim, r)  (up-proj)

        r: LoRA rank; inferred from ``A_0`` when None, otherwise it must match.
        alpha: LoRA alpha; the output is multiplied by ``alpha / r``.

        Raises:
            ValueError: if no expert weights are given.
            AssertionError: if the two lists differ in length or ``r`` does not
                match the rank of the first A matrix.
        """
        super().__init__()
        assert len(trained_lora_A_weights) == len(trained_lora_B_weights)
        if len(trained_lora_A_weights) == 0:
            # Previously this surfaced as an opaque IndexError a few lines below.
            raise ValueError("MoELoRA needs at least one (A, B) expert weight pair")
        self.num_heads = len(trained_lora_A_weights)

        # infer r if not provided
        inferred_r = trained_lora_A_weights[0].shape[0]
        if r is None:
            r = inferred_r
        else:
            assert r == inferred_r, f"Provided r={r} != inferred r={inferred_r}"

        self.r = r
        self.alpha = alpha
        self.scaling = alpha / r

        # A_i: in_dim -> r ; B_i: r -> out_dim
        # (construction order As -> Bs -> router is kept so seeded runs
        #  initialise the router exactly as before)
        self.As = nn.ModuleList([nn.Linear(in_dim, r, bias=False) for _ in range(self.num_heads)])
        self.Bs = nn.ModuleList([nn.Linear(r, out_dim, bias=False) for _ in range(self.num_heads)])

        with torch.no_grad():
            for i in range(self.num_heads):
                # Safetensor shapes: A: (r, in_dim) matches Linear(in_dim->r).weight
                self.As[i].weight.copy_(trained_lora_A_weights[i])
                # Safetensor shapes: B: (out_dim, r) matches Linear(r->out_dim).weight
                self.Bs[i].weight.copy_(trained_lora_B_weights[i])

        # Simple dense router on a pooled compressed repr
        self.router = nn.Linear(r, self.num_heads)

        # Freeze all A/B parameters; the router is the only trainable part.
        self.As.requires_grad_(False)
        self.Bs.requires_grad_(False)
        self.router.requires_grad_(True)

    def forward(self, x):
        """Return the scaled, router-weighted sum of expert outputs.

        x: [..., in_dim]  ->  [..., out_dim]
        """
        # compressed reps for each head
        zs = [A(x) for A in self.As]                          # list of [..., r]
        # router uses pooled compressed repr; stopgrad into A
        z_mean = torch.stack(zs, dim=-1).mean(dim=-1)         # [..., r]
        router_logits = self.router(z_mean.detach())
        router_scores = F.softmax(router_logits, dim=-1)      # [..., num_heads]

        # Mix experts. B_i has no bias, so B_i(s_i * z_i) == s_i * B_i(z_i).
        # Scaling the rank-r code *before* the up-projection avoids building
        # (and keeping for backward) a [..., out_dim, num_heads] tensor.
        out = None
        for z, B, score in zip(zs, self.Bs, router_scores.unbind(dim=-1)):
            contribution = B(z * score.unsqueeze(-1))         # [..., out_dim]
            out = contribution if out is None else out + contribution
        return self.scaling * out


# -------------------------------
# Wrapper Linear Layer
# -------------------------------
class LinearWithMoELoRA(nn.Module):
    """Wrap a frozen linear layer and add an adapter's output to its output.

    Args:
        base_linear: the original projection; its parameters are frozen here.
        moe_lora: adapter module applied to the same input (a ``MoELoRA`` in
            this notebook); its own ``requires_grad`` flags are left untouched.
    """

    def __init__(self, base_linear: nn.Linear, moe_lora: nn.Module):
        super().__init__()
        self.base = base_linear
        self.moe_lora = moe_lora
        # base stays frozen (you can unfreeze if you want full finetuning)
        for p in self.base.parameters():
            p.requires_grad = False

    def forward(self, x):
        # Do NOT wrap the base call in torch.no_grad(): requires_grad=False
        # already keeps the base weights fixed, whereas no_grad would also cut
        # the gradient path *through* the base weights back to x, so trainable
        # modules upstream of this layer would receive truncated gradients.
        return self.base(x) + self.moe_lora(x)


# -------------------------------
# Injection Utilities
# -------------------------------
TARGET_MODULES = ["gate_proj", "up_proj", "down_proj"]

def _collect_passage_paths(trained_data_adapters_dir: str) -> List[Path]:
    root = Path(trained_data_adapters_dir)
    assert root.exists(), f"Adapters dir not found: {root}"
    passages = sorted([p for p in root.iterdir() if p.is_dir() and p.name.startswith("passage_")],
                      key=lambda p: int(p.name.split("_")[-1]))
    assert len(passages) > 0, f"No passage_* folders found under {root}"
    return passages

def _load_all_adapter_states(passages: List[Path]) -> List[Dict[str, torch.Tensor]]:
    states = []
    for p in passages:
        st_path = p / "adapter_model.safetensors"
        assert st_path.exists(), f"Missing {st_path}"
        states.append(load_file(str(st_path)))
    return states

def _find_weights_for_module(
    module_name: str,
    adapter_states: List[Dict[str, torch.Tensor]],
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    """
    For a model module name like 'model.layers.0.mlp.down_proj',
    find matching safetensor keys ending with '.{module_name}.lora_A.weight' / B across all passages.
    Returns lists A_list, B_list aligned by passage order.
    """
    suffix_A = f".{module_name}.lora_A.weight"
    suffix_B = f".{module_name}.lora_B.weight"

    A_list, B_list = [], []
    for st in adapter_states:
        # keys have long prefixes; we match by endswith
        kA = next((k for k in st.keys() if k.endswith(suffix_A)), None)
        kB = next((k for k in st.keys() if k.endswith(suffix_B)), None)
        if kA is None or kB is None:
            # Fallback: keys might omit the leading 'model.' in module_name
            alt_suffix_A = "." + module_name.split("model.", 1)[-1] + ".lora_A.weight"
            alt_suffix_B = "." + module_name.split("model.", 1)[-1] + ".lora_B.weight"
            kA = next((k for k in st.keys() if k.endswith(alt_suffix_A)), kA)
            kB = next((k for k in st.keys() if k.endswith(alt_suffix_B)), kB)
        if kA is None or kB is None:
            # If still missing, we cannot inject for this module from this passage
            return [], []
        A_list.append(st[kA])
        B_list.append(st[kB])
    return A_list, B_list

def _replace_module(model: nn.Module, dotted_name: str, new_mod: nn.Module):
    """
    Replace submodule at dotted_name with new_mod.
    """
    parent_name, attr = dotted_name.rsplit('.', 1)
    parent = dict(model.named_modules())[parent_name]
    setattr(parent, attr.split(':')[0], new_mod)  # strip any :type suffix just in case


# -------------------------------
# Main injector
# -------------------------------
def inject_hydra_lora(
    model: nn.Module,
    trained_data_adapters_dir: str,
    r: int = None,
    alpha: int = 32,
    target_modules: List[str] = TARGET_MODULES,
    verbose: bool = True,
):
    """
    For each Linear in TARGET_MODULES, wrap it with LinearWithMoELoRA
    using the LoRA A/B from all 'passage_*' adapters found in the dir.

    Args:
        model: model to modify in place; ALL of its existing parameters are
            frozen first, so only the routers created here remain trainable.
        trained_data_adapters_dir: directory holding ``passage_*`` sub-folders,
            each with an ``adapter_model.safetensors`` file (one expert each).
        r: expected LoRA rank; taken from the loaded weights when None.
        alpha: LoRA alpha forwarded to every MoELoRA block.
        target_modules: module names to wrap. A module is selected when its
            dotted name equals an entry or ends with ``"." + entry``, so the
            inner ``.base`` / ``.moe_lora.*`` Linear layers of an already
            wrapped module are never selected.
        verbose: print one line per module plus a summary.

    Note: calling this a second time on the same model injects nothing new
    (wrapped modules are no longer ``nn.Linear``) but the freeze step still
    turns off ``requires_grad`` on the previously created routers.
    """
    # 1) Load all passage adapters (each passage = one expert head)
    passages = _collect_passage_paths(trained_data_adapters_dir)
    adapter_states = _load_all_adapter_states(passages)
    if verbose:
        print(f"Found {len(adapter_states)} passages: {[p.name for p in passages]}")

    # 2) Freeze base model
    for p in model.parameters():
        p.requires_grad = False

    def _is_target(name: str) -> bool:
        # Match on the trailing path component(s), not on any substring.
        return any(name == t or name.endswith("." + t) for t in target_modules)

    # 3) Traverse modules and inject
    injected = 0
    # Create a stable list of (name, module) so we can replace during iteration
    modules = [
        (n, m) for n, m in model.named_modules()
        if isinstance(m, nn.Linear) and _is_target(n)
    ]
    for name, module in modules:
        # Find A/B weights across passages for this module name
        A_list, B_list = _find_weights_for_module(name, adapter_states)
        if len(A_list) == 0:
            if verbose:
                print(f"[skip] No matching LoRA weights for {name}")
            continue

        in_dim, out_dim = module.in_features, module.out_features
        rank = A_list[0].shape[0]

        # Sanity checks
        for i, (A, B) in enumerate(zip(A_list, B_list)):
            assert A.shape[1] == in_dim, f"{name} passage#{i} A shape {A.shape} incompatible with in_dim={in_dim}"
            assert B.shape[0] == out_dim, f"{name} passage#{i} B shape {B.shape} incompatible with out_dim={out_dim}"
            assert A.shape[0] == B.shape[1], f"{name} passage#{i} r mismatch: {A.shape[0]} vs {B.shape[1]}"
            # All experts share one MoELoRA block, hence one rank.
            assert A.shape[0] == rank, f"{name} passage#{i} rank {A.shape[0]} differs from passage#0 rank {rank}"

        # Instantiate MoE-LoRA for this linear
        moe = MoELoRA(
            in_dim=in_dim,
            out_dim=out_dim,
            trained_lora_A_weights=A_list,
            trained_lora_B_weights=B_list,
            r=(rank if r is None else r),
            alpha=alpha,
        )
        # New modules are created on CPU in the default dtype; align them with
        # the wrapped layer so the adapter matmuls see the same device/dtype as
        # the base projection. Non-floating (e.g. quantised) weights only
        # dictate the device.
        moe.to(device=module.weight.device)
        if module.weight.dtype.is_floating_point:
            moe.to(dtype=module.weight.dtype)
        wrapped = LinearWithMoELoRA(module, moe)

        # Replace module in model
        _replace_module(model, name, wrapped)
        injected += 1
        if verbose:
            r_used = moe.r
            print(f"[ok] Injected MoE-LoRA into {name} (heads={len(A_list)}, r={r_used}, alpha={alpha})")

    if verbose:
        print(f"Done. Injected {injected} modules.")

In [21]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2R

In [22]:
from torch import nn
TARGET_MODULES = ["gate_proj", "up_proj", "down_proj"]

for name, module in model.named_modules():
    if any(target in name for target in TARGET_MODULES) and isinstance(module, nn.Linear):
        print(name)

model.layers.0.mlp.gate_proj
model.layers.0.mlp.up_proj
model.layers.0.mlp.down_proj
model.layers.1.mlp.gate_proj
model.layers.1.mlp.up_proj
model.layers.1.mlp.down_proj
model.layers.2.mlp.gate_proj
model.layers.2.mlp.up_proj
model.layers.2.mlp.down_proj
model.layers.3.mlp.gate_proj
model.layers.3.mlp.up_proj
model.layers.3.mlp.down_proj
model.layers.4.mlp.gate_proj
model.layers.4.mlp.up_proj
model.layers.4.mlp.down_proj
model.layers.5.mlp.gate_proj
model.layers.5.mlp.up_proj
model.layers.5.mlp.down_proj
model.layers.6.mlp.gate_proj
model.layers.6.mlp.up_proj
model.layers.6.mlp.down_proj
model.layers.7.mlp.gate_proj
model.layers.7.mlp.up_proj
model.layers.7.mlp.down_proj
model.layers.8.mlp.gate_proj
model.layers.8.mlp.up_proj
model.layers.8.mlp.down_proj
model.layers.9.mlp.gate_proj
model.layers.9.mlp.up_proj
model.layers.9.mlp.down_proj
model.layers.10.mlp.gate_proj
model.layers.10.mlp.up_proj
model.layers.10.mlp.down_proj
model.layers.11.mlp.gate_proj
model.layers.11.mlp.up_proj
mode

In [23]:
inject_hydra_lora(model, trained_data_adapters_dir=trained_adapter_dir, alpha=32)

Found 3 passages: ['passage_0', 'passage_1', 'passage_2']
[ok] Injected MoE-LoRA into model.layers.0.mlp.gate_proj (heads=3, r=2, alpha=32)
[ok] Injected MoE-LoRA into model.layers.0.mlp.up_proj (heads=3, r=2, alpha=32)
[ok] Injected MoE-LoRA into model.layers.0.mlp.down_proj (heads=3, r=2, alpha=32)
[ok] Injected MoE-LoRA into model.layers.1.mlp.gate_proj (heads=3, r=2, alpha=32)
[ok] Injected MoE-LoRA into model.layers.1.mlp.up_proj (heads=3, r=2, alpha=32)
[ok] Injected MoE-LoRA into model.layers.1.mlp.down_proj (heads=3, r=2, alpha=32)
[ok] Injected MoE-LoRA into model.layers.2.mlp.gate_proj (heads=3, r=2, alpha=32)
[ok] Injected MoE-LoRA into model.layers.2.mlp.up_proj (heads=3, r=2, alpha=32)
[ok] Injected MoE-LoRA into model.layers.2.mlp.down_proj (heads=3, r=2, alpha=32)
[ok] Injected MoE-LoRA into model.layers.3.mlp.gate_proj (heads=3, r=2, alpha=32)
[ok] Injected MoE-LoRA into model.layers.3.mlp.up_proj (heads=3, r=2, alpha=32)
[ok] Injected MoE-LoRA into model.layers.3.mlp.d

In [24]:
def get_closed_book_QA(aug_model, augments, tokenizer, args=None):
    """Build one closed-book prompt per QA pair found in ``augments``.

    Args:
        aug_model: name prefix of the augmentation keys; QA pairs are read
            from ``aug[f"{aug_model}_qa"]``.
        augments: iterable of dicts, each holding a list of
            ``{"question": ..., "answer": ...}`` items under that key.
        tokenizer: forwarded unchanged to ``prompt_template.get_prompt``.
        args: unused; kept so existing call sites keep working.

    Returns:
        A flat list with the result of
        ``get_prompt(tokenizer, question, None, answer)`` for every QA pair,
        in input order.

    The ``f"{aug_model}_rewrite"`` entry is no longer read: its value was
    never used, so an augment without it is now accepted.
    """
    from prompt_template import get_prompt

    qa_key = f"{aug_model}_qa"
    return [
        # the third argument (passage) is None, i.e. no context is supplied
        get_prompt(tokenizer, qa["question"], None, qa["answer"])
        for aug in augments
        for qa in aug[qa_key]
    ]

In [25]:
import json

data_file = r"/scratch/doluk/Compact-Interference-PRAG/data_aug/popqa/qwen2.5-1.5b-instruct/total.json"
with open(data_file, "r") as f:
    data = json.load(f)
augments = data[0]["augment"]

In [26]:
aug_model = "qwen2.5-1.5b-instruct"
prompt_ids = get_closed_book_QA(aug_model, augments, tokenizer)

In [27]:
len(prompt_ids)

9

In [28]:
wtf = [p.shape for p in model.parameters() if p.requires_grad]

In [29]:
print(sum(p.numel() for p in model.parameters() if p.requires_grad))

756


In [30]:
from encode import get_train_data, TrainingData, TrainingDataCollator
device = torch.device("cuda")
model.to(device)
def train(question, augments, model, tokenizer, save_path="please_first_test"):
    """Fine-tune the trainable parameters of ``model`` on closed-book QA prompts.

    Prompts are built from ``augments`` with ``get_closed_book_QA`` and fed one
    example at a time, in order, for 4 epochs of AdamW (lr=3e-4) over the
    parameters that currently have ``requires_grad=True``. Afterwards the model
    is written to ``save_path`` with ``model.save_pretrained`` and returned.

    NOTE(review): ``question`` is not used in the body, and ``aug_model`` is
    read from the enclosing (notebook-global) scope rather than passed in.
    """
    qa_prompts = get_closed_book_QA(aug_model, augments, tokenizer)
    dataset = TrainingData(qa_prompts, tokenizer)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=1,
        collate_fn=TrainingDataCollator(tokenizer, model.device),
        shuffle=False,
    )

    # Flags set on the model object before training (kept as in the original).
    model.is_parallelizable = True
    model.model_parallel = True

    trainable_params = (p for p in model.parameters() if p.requires_grad)
    optimizer = torch.optim.AdamW(trainable_params, lr=3e-4)

    num_epochs = 4
    for _ in range(num_epochs):
        for batch in loader:
            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()

    os.makedirs(save_path, exist_ok=True)
    model.save_pretrained(save_path)
    return model

In [31]:
torch.cuda.empty_cache()
model = train("What is George Rankin's occupation?", augments, model, tokenizer)

OutOfMemoryError: CUDA out of memory. Tried to allocate 308.00 MiB. GPU 0 has a total capacty of 39.50 GiB of which 119.38 MiB is free. Including non-PyTorch memory, this process has 39.37 GiB memory in use. Of the allocated memory 38.03 GiB is allocated by PyTorch, and 875.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import torch

# Load a fresh base model, attach the adapter stored at `adapter_path`, merge it
# into the base weights, save the merged model, and run one test generation.
#
# NOTE(review): the recorded run of this cell fails at PeftModel.from_pretrained
# with "Can't find 'adapter_config.json'" for `adapter_path`. Elsewhere in this
# notebook, train() writes its checkpoint with model.save_pretrained() to the
# *relative* path "please_first_test" (after os.chdir("src")), on a model whose
# layers were wrapped by hand rather than through PEFT, and the recorded training
# run ended with a CUDA OOM error. So this absolute path presumably does not
# contain a PEFT adapter -- verify where (and in which format) the checkpoint was
# actually written before re-running.
base_model = "Qwen/Qwen2.5-1.5B-Instruct"
adapter_path = "/home/doluk/Compact-Interference-PRAG/please_first_test"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load base model in bfloat16 on the first CUDA device
base = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0"
)

# Load the fine-tuned adapters on top of the base model
# (rebinds the notebook-global name `model`)
model = PeftModel.from_pretrained(base, adapter_path)

# Merge LoRA weights if you want a single model
model = model.merge_and_unload()

# Save merged model and tokenizer side by side (optional)
model.save_pretrained("/home/doluk/Compact-Interference-PRAG/merged_model")
tokenizer.save_pretrained("/home/doluk/Compact-Interference-PRAG/merged_model")

# Try inference: the raw question string is tokenized as-is (no prompt/chat
# template is applied in this cell) and up to 100 new tokens are generated.
question = "What is George Rankin's occupation?"
inputs = tokenizer(question, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


ValueError: Can't find 'adapter_config.json' at '/home/doluk/Compact-Interference-PRAG/please_first_test'