In [1]:
# Step 1: GPT-2 quantization migration test
import sys

# Make the parent directory importable so `src` resolves. The membership check
# keeps sys.path from growing by one entry every time this cell is re-run.
if '..' not in sys.path:
    sys.path.append('..')

from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from src.quantization import QuantLinear, replace_with_quant, load_config

# Load model and config
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
cfg = load_config("../configs/config.yaml")
print(f"✅ Config loaded: {cfg['default_w_bits']} bit")

# Execute quantization replacement (keep layer printing)
replace_with_quant(model, cfg)
print("✅ Model quantized successfully")

# Test basic generation
inputs = tokenizer("Hello, world is ", return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=20,
        # Passing the pad id explicitly selects the same value generate() would
        # fall back to (the EOS id) but avoids the "Setting `pad_token_id` to
        # `eos_token_id`" warning being interleaved with the layer listing.
        pad_token_id=tokenizer.eos_token_id,
    )
print("✅ Generation test:", tokenizer.decode(outputs[0]))


✅ Config loaded: 8 bit
 GPT2LMHeadModel
transformer GPT2Model
transformer.wte Embedding
transformer.wpe Embedding
transformer.drop Dropout
transformer.h ModuleList
transformer.h.0 GPT2Block
transformer.h.0.ln_1 LayerNorm
transformer.h.0.attn GPT2Attention
transformer.h.0.attn.c_attn Conv1D
transformer.h.0.attn.c_proj Conv1D
transformer.h.0.attn.attn_dropout Dropout
transformer.h.0.attn.resid_dropout Dropout
transformer.h.0.ln_2 LayerNorm
transformer.h.0.mlp GPT2MLP
transformer.h.0.mlp.c_fc Conv1D
transformer.h.0.mlp.c_proj Conv1D
transformer.h.0.mlp.act NewGELUActivation
transformer.h.0.mlp.dropout Dropout
transformer.h.1 GPT2Block
transformer.h.1.ln_1 LayerNorm
transformer.h.1.attn GPT2Attention
transformer.h.1.attn.c_attn Conv1D
transformer.h.1.attn.c_proj Conv1D
transformer.h.1.attn.attn_dropout Dropout
transformer.h.1.attn.resid_dropout Dropout
transformer.h.1.ln_2 LayerNorm
transformer.h.1.mlp GPT2MLP
transformer.h.1.mlp.c_fc Conv1D
transformer.h.1.mlp.c_proj Conv1D
transformer.h.

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


transformer.h.10.attn.c_proj Conv1D
transformer.h.10.attn.attn_dropout Dropout
transformer.h.10.attn.resid_dropout Dropout
transformer.h.10.ln_2 LayerNorm
transformer.h.10.mlp GPT2MLP
transformer.h.10.mlp.c_fc Conv1D
transformer.h.10.mlp.c_proj Conv1D
transformer.h.10.mlp.act NewGELUActivation
transformer.h.10.mlp.dropout Dropout
transformer.h.11 GPT2Block
transformer.h.11.ln_1 LayerNorm
transformer.h.11.attn GPT2Attention
transformer.h.11.attn.c_attn Conv1D
transformer.h.11.attn.c_proj Conv1D
transformer.h.11.attn.attn_dropout Dropout
transformer.h.11.attn.resid_dropout Dropout
transformer.h.11.ln_2 LayerNorm
transformer.h.11.mlp GPT2MLP
transformer.h.11.mlp.c_fc Conv1D
transformer.h.11.mlp.c_proj Conv1D
transformer.h.11.mlp.act NewGELUActivation
transformer.h.11.mlp.dropout Dropout
transformer.ln_f LayerNorm
lm_head Linear
✅ Model quantized successfully
✅ Generation test: Hello, world is  going to be a lot more interesting than it was before.
I


In [2]:
# Complete Step 1 functionality validation
from src.quantization import requantize_model_to_config

# Count quantized layers
quant_count = sum(
    1 for _name, module in model.named_modules() if isinstance(module, QuantLinear)
)
# This is a validation cell: do not print a success tick when the replacement
# pass left no QuantLinear modules in the model.
assert quant_count > 0, "No QuantLinear layers found; quantization replacement did not take effect"
print(f"✅ Successfully quantized {quant_count} layers")

# Test 2bit/6bit dynamic requantization
test_cfg = load_config("../configs/test_2bit_6bit.yaml")
requantize_model_to_config(model, test_cfg)

# Test generation with quantized model
print("\n--- Generation test ---")
inputs = tokenizer("Hello, world is ", return_tensors="pt")
# Same inference pattern as the other generation checks in this notebook:
# gradients disabled, pad id passed explicitly to avoid the fallback warning.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=50,
        pad_token_id=tokenizer.eos_token_id,
    )
result = tokenizer.decode(outputs[0])
print(f"Result: {result[:80]}...")

print("\n✅ Step 1 completed!")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ Successfully quantized 48 layers

--- Generation test ---
Result: Hello, world is  in the world. 
I've been in the a small small small small small...

✅ Step 1 completed!


In [3]:
# Step 2: LoRA migration test
import torch  # hoisted: torch is used from the first statement of this cell onward

from src.lora import attach_lora_to_quant, activate_lora_by_config, activate_lora_by_bits

# Set same random seed
torch.manual_seed(42)

# LoRA spec with 6-bit support (same as test1)
# Keys are module paths inside GPT-2; each maps a "bw<bits>" branch name to a
# pair of integers. NOTE(review): the pair looks like (rank, alpha) — confirm
# against attach_lora_to_quant.
lora_spec = {
    "transformer.h.0.attn.c_attn": {"bw4": (8,16), "bw8": (4,8), "bw6": (6,12)},
    "transformer.h.1.attn.c_attn": {"bw4": (8,16), "bw8": (4,8), "bw6": (6,12)},
}

# Fresh model + quantization + LoRA
lora_model = GPT2LMHeadModel.from_pretrained("gpt2")
wrappers = attach_lora_to_quant(lora_model, lora_spec, cfg)

# Test 6-bit activation and fallback
test_6bit_cfg = {"per_layer_bits": {"transformer.h.0.attn.c_attn": 6}, "default_w_bits": 8}
activate_lora_by_config(wrappers, test_6bit_cfg)

# Test branch switching and parameter collection
test_wrapper = next(iter(wrappers.values()))
test_wrapper.set_active("bw4")
test_wrapper.set_active("bw999")  # trigger fallback warning

# Trainable tensors across every wrapper's LoRA bank (this counts tensors,
# not scalar elements).
lora_params = [p for w in wrappers.values() for p in w.bank.parameters() if p.requires_grad]

# Set consistent activation state for generation test
for wrapper in wrappers.values():
    wrapper.set_active("bw8")  # ensure same activation state

# Deterministic generation test (minimal)
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

lora_model.eval()
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

inp = tokenizer("Hello, world is ", return_tensors="pt")
with torch.no_grad():
    out = lora_model.generate(
        **inp,
        max_new_tokens=32,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
# Decode only the newly generated tokens (everything after the prompt).
res = tokenizer.decode(out[0, inp["input_ids"].size(1):], skip_special_tokens=True)
# do_sample=False means greedy decoding, so the output is not a sample;
# label it accordingly.
print("Greedy:", res[:80] + "...")

print(f"✅ Step 2 completed: {len(wrappers)} wrappers, {len(lora_params)} params")


 GPT2LMHeadModel
transformer GPT2Model
transformer.wte Embedding
transformer.wpe Embedding
transformer.drop Dropout
transformer.h ModuleList
transformer.h.0 GPT2Block
transformer.h.0.ln_1 LayerNorm
transformer.h.0.attn GPT2Attention
transformer.h.0.attn.c_attn Conv1D
transformer.h.0.attn.c_proj Conv1D
transformer.h.0.attn.attn_dropout Dropout
transformer.h.0.attn.resid_dropout Dropout
transformer.h.0.ln_2 LayerNorm
transformer.h.0.mlp GPT2MLP
transformer.h.0.mlp.c_fc Conv1D
transformer.h.0.mlp.c_proj Conv1D
transformer.h.0.mlp.act NewGELUActivation
transformer.h.0.mlp.dropout Dropout
transformer.h.1 GPT2Block
transformer.h.1.ln_1 LayerNorm
transformer.h.1.attn GPT2Attention
transformer.h.1.attn.c_attn Conv1D
transformer.h.1.attn.c_proj Conv1D
transformer.h.1.attn.attn_dropout Dropout
transformer.h.1.attn.resid_dropout Dropout
transformer.h.1.ln_2 LayerNorm
transformer.h.1.mlp GPT2MLP
transformer.h.1.mlp.c_fc Conv1D
transformer.h.1.mlp.c_proj Conv1D
transformer.h.1.mlp.act NewGELUActiva

In [4]:
# Step 3: Switchable precision training test
import torch

from src.training import create_squad_dataloader, SwitchableTrainer
from src.quantization import load_config

# The generation check at the end of this cell is CUDA-only. Verify the device
# up front so a missing GPU is reported immediately rather than after the
# 1000-iteration training run has already been paid for.
assert torch.cuda.is_available(), "CUDA is required for generation but not found."

# Set random seed for reproducible training
torch.manual_seed(42)

# Prepare multiple configs for switchable training
config_A = load_config("../configs/config.yaml")
config_B = load_config("../configs/config_4bit.yaml")
precision_configs = [config_A, config_B]

# Create dataloader and trainer
train_dataloader = create_squad_dataloader(tokenizer, batch_size=4, subset_size=100)
trainer = SwitchableTrainer(lora_model, wrappers, precision_configs, lr=1e-4)

# Run training (1000 iterations)
trainer.train(train_dataloader, iterations=1000)

# Test generation after training (CUDA-only)
lora_model.eval().to("cuda")
inputs = tokenizer("Hello, world is ", return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = lora_model.generate(
        **inputs,
        max_length=50,
        # Explicit pad id: same value generate() falls back to, without the warning.
        pad_token_id=tokenizer.eos_token_id,
    )
result = tokenizer.decode(outputs[0])
print("After training:", result[:80] + "...")

print(f"✅ Step 3 completed: switchable training with {len(precision_configs)} configs")


  0%|          | 0/1000 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
Iteration 1000 | Loss: 4.004 | Config: 4-bit: 100%|██████████| 1000/1000 [00:42<00:00, 23.72it/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Training completed!
After training: Hello, world is  going to be a lot more interesting than it was before.
I'm not ...
✅ Step 3 completed: switchable training with 2 configs


In [None]:
# Step 4: Evaluation test using src modules
from src.evaluation import analyze_all_configs, save_results
from datasets import load_dataset

# Evaluation target device and the SQuAD validation split used as eval data
device = torch.device("cuda")
val_ds = load_dataset("squad", split="validation")

# Run the migrated analysis over the configs; n=50 is forwarded as-is to
# analyze_all_configs, which returns a table with one row per config.
df = analyze_all_configs(lora_model, tokenizer, val_ds, device, n=50)
results_table = df.to_string(index=False)
print(results_table)

print(f"✅ Step 4 completed: evaluated {len(df)} configs")


                                               

            config  EM       F1  default_bits
           C1_all8 0.0 9.125855             8
            config 0.0 9.125855             8
    test_2bit_6bit 0.0 6.792441             4
   C4_back8_front4 0.0 5.996765             4
C8_mlpfc4_mlpproj8 0.0 5.515476             4
           C2_all4 0.0 4.583590             4
 C9_layernorm_fp32 0.0 4.583590             4
       config_4bit 0.0 4.583590             4
     C7_qkv4_proj8 0.0 4.418269             4
   C3_front8_back4 0.0 4.096109             4
  C10_mixed_budget 0.0 4.048412             4
     C6_qkv8_proj4 0.0 3.658520             4
       C5_sandwich 0.0 3.054139             4
Results saved to step4_results.json
✅ Step 4 completed: evaluated 13 configs




In [None]:
# A/B: CPT (probe) vs JOINT (parallel) on GPU only for step 5
import logging, torch
from time import perf_counter
from scripts.step5_cpt_slope import build_toy_mlp, train
from src.quantization.model_utils import replace_with_quant

# GPU-only comparison: stop right away when no CUDA device is present
assert torch.cuda.is_available(), "CUDA required"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)

# Settings shared by both runs; run_once reads these as module-level globals
device = "cuda"
seed = 42
steps = 300
bitwidths = [8, 6, 4]     # highest first
segment_steps = 60
probe_steps = 8

# Toy-MLP / batch dimensions, one name per line
bs = 32
inp = 128
hid = 64
cls = 10

def run_once(mode: str):
    """Build, quantize and train one fresh toy MLP, timing the training call.

    Reads the module-level settings ``inp``, ``hid``, ``cls``, ``bitwidths``,
    ``steps``, ``segment_steps``, ``probe_steps``, ``bs``, ``device`` and
    ``seed``.

    Args:
        mode: Training mode forwarded unchanged to ``train``; this notebook
            calls it with "cpt" and "joint".

    Returns:
        ``(hist, dt)`` where ``hist`` is whatever ``train`` returns and ``dt``
        is the wall-clock time in seconds spent inside ``train`` (model
        construction and quantization are not timed).
    """
    # Re-seed before building the model so each arm of the A/B comparison
    # starts from the same RNG state instead of whatever the previous run left
    # behind. NOTE(review): assumes build_toy_mlp draws its initial weights
    # from torch's global RNG — confirm.
    torch.manual_seed(seed)
    model = build_toy_mlp(inp=inp, hid=hid, cls=cls)

    # Quantize at the highest configured bit-width. The return value is used
    # when it is not None; otherwise the model is taken as modified in place.
    ret = replace_with_quant(model, {"default_w_bits": bitwidths[0], "per_layer_bits": {}})
    if ret is not None:
        model = ret

    # CUDA kernels are launched asynchronously, so synchronize on both sides
    # of the timed region to keep the measured interval honest.
    on_cuda = str(device).startswith("cuda") and torch.cuda.is_available()
    if on_cuda:
        torch.cuda.synchronize()
    t0 = perf_counter()
    hist = train(model=model, steps=steps, bitwidths=bitwidths, mode=mode,
                 segment_steps=segment_steps, probe_steps=probe_steps,
                 bs=bs, inp=inp, cls=cls, dev=device, lr=1e-3, seed=seed)
    if on_cuda:
        torch.cuda.synchronize()
    dt = perf_counter() - t0
    return hist, dt

def summarize(hist):
    """Condense a training history into three summary figures.

    Args:
        hist: Sequence of ``(bits, loss)`` pairs, one per recorded step.

    Returns:
        A tuple ``(avg_last, dist, switches)``:
        - ``avg_last``: mean loss over the final 20 entries (over all entries
          when there are fewer than 20; 0 for an empty history).
        - ``dist``: dict mapping each ``bits`` value to how often it occurs,
          keyed in first-seen order.
        - ``switches``: number of positions where consecutive ``bits`` values
          differ, counted after dropping entries whose ``bits`` is ``-1``.
    """
    recent = hist if len(hist) < 20 else hist[-20:]
    loss_total = sum(loss for _, loss in recent)
    avg_last = loss_total / max(1, len(recent))

    # Single pass: remember the bit sequence and tally occurrences.
    bit_seq = []
    dist = {}
    for bit, _ in hist:
        bit_seq.append(bit)
        if bit in dist:
            dist[bit] += 1
        else:
            dist[bit] = 1

    # Entries tagged -1 are excluded before counting adjacent changes.
    kept = [bit for bit in bit_seq if bit != -1]
    switches = sum(1 for prev, cur in zip(kept, kept[1:]) if cur != prev)
    return avg_last, dist, switches

# Arm A: CPT run, then its summary statistics
hist_cpt, dt_cpt = run_once("cpt")
cpt_summary = summarize(hist_cpt)
avg_cpt, dist_cpt, sw_cpt = cpt_summary

# Arm B: JOINT run; its switch count is discarded
hist_joint, dt_joint = run_once("joint")
joint_summary = summarize(hist_joint)
avg_joint, dist_joint, _ = joint_summary

# Assemble the report lines first, then emit them in order
cpt_report = (
    f"CPT:   steps={len(hist_cpt)} time={dt_cpt:.2f}s "
    f"avg_last_loss={avg_cpt:.4f} switches={sw_cpt} dist={dist_cpt}"
)
joint_report = (
    f"JOINT: steps={len(hist_joint)} time={dt_joint:.2f}s "
    f"avg_last_loss={avg_joint:.4f} dist={dist_joint}"
)
for report_line in (f"device={device}", cpt_report, joint_report):
    print(report_line)
print("CPT last 5:", hist_cpt[-5:])
print("JOINT last 5:", hist_joint[-5:])
```