In [34]:
# Step 1: GPT-2 quantization migration test
import sys
sys.path.append('..')

from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from src.quantization import QuantLinear, replace_with_quant, load_config

# Load the pretrained GPT-2 tokenizer/model and the project quantization config.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
cfg = load_config("../configs/config.yaml")
print(f"✅ Config loaded: {cfg['default_w_bits']} bit")

# Execute quantization replacement (keep layer printing).
# The call is made for its effect on `model`; its return value is not used here.
replace_with_quant(model, cfg)
print("✅ Model quantized successfully")

# Test basic generation.
# pad_token_id is passed explicitly: without it, generate() falls back to
# eos_token_id on its own and logs a "Setting `pad_token_id`" notice on every call.
inputs = tokenizer("Hello, world is ", return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=20,
        pad_token_id=tokenizer.eos_token_id,
    )
print("✅ Generation test:", tokenizer.decode(outputs[0]))


✅ Config loaded: 8 bit
 GPT2LMHeadModel
transformer GPT2Model
transformer.wte Embedding
transformer.wpe Embedding
transformer.drop Dropout
transformer.h ModuleList
transformer.h.0 GPT2Block
transformer.h.0.ln_1 LayerNorm
transformer.h.0.attn GPT2Attention
transformer.h.0.attn.c_attn Conv1D
transformer.h.0.attn.c_proj Conv1D
transformer.h.0.attn.attn_dropout Dropout
transformer.h.0.attn.resid_dropout Dropout
transformer.h.0.ln_2 LayerNorm
transformer.h.0.mlp GPT2MLP
transformer.h.0.mlp.c_fc Conv1D
transformer.h.0.mlp.c_proj Conv1D
transformer.h.0.mlp.act NewGELUActivation
transformer.h.0.mlp.dropout Dropout
transformer.h.1 GPT2Block
transformer.h.1.ln_1 LayerNorm
transformer.h.1.attn GPT2Attention
transformer.h.1.attn.c_attn Conv1D
transformer.h.1.attn.c_proj Conv1D
transformer.h.1.attn.attn_dropout Dropout
transformer.h.1.attn.resid_dropout Dropout
transformer.h.1.ln_2 LayerNorm
transformer.h.1.mlp GPT2MLP
transformer.h.1.mlp.c_fc Conv1D
transformer.h.1.mlp.c_proj Conv1D
transformer.h.

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


transformer.h.8.attn.c_proj Conv1D
transformer.h.8.attn.attn_dropout Dropout
transformer.h.8.attn.resid_dropout Dropout
transformer.h.8.ln_2 LayerNorm
transformer.h.8.mlp GPT2MLP
transformer.h.8.mlp.c_fc Conv1D
transformer.h.8.mlp.c_proj Conv1D
transformer.h.8.mlp.act NewGELUActivation
transformer.h.8.mlp.dropout Dropout
transformer.h.9 GPT2Block
transformer.h.9.ln_1 LayerNorm
transformer.h.9.attn GPT2Attention
transformer.h.9.attn.c_attn Conv1D
transformer.h.9.attn.c_proj Conv1D
transformer.h.9.attn.attn_dropout Dropout
transformer.h.9.attn.resid_dropout Dropout
transformer.h.9.ln_2 LayerNorm
transformer.h.9.mlp GPT2MLP
transformer.h.9.mlp.c_fc Conv1D
transformer.h.9.mlp.c_proj Conv1D
transformer.h.9.mlp.act NewGELUActivation
transformer.h.9.mlp.dropout Dropout
transformer.h.10 GPT2Block
transformer.h.10.ln_1 LayerNorm
transformer.h.10.attn GPT2Attention
transformer.h.10.attn.c_attn Conv1D
transformer.h.10.attn.c_proj Conv1D
transformer.h.10.attn.attn_dropout Dropout
transformer.h.10.

In [35]:
# Complete Step 1 functionality validation
from src.quantization import requantize_model_to_config

# Count quantized layers: every module in `model` that is a QuantLinear instance.
quant_count = sum(
    1 for _, module in model.named_modules() if isinstance(module, QuantLinear)
)
print(f"✅ Successfully quantized {quant_count} layers")

# Test 2bit/6bit dynamic requantization.
# The call is made for its effect on `model`; its return value is not used here.
test_cfg = load_config("../configs/test_2bit_6bit.yaml")
requantize_model_to_config(model, test_cfg)

# Test generation with quantized model.
# Run under no_grad (inference only, matching the other generation cells) and
# pass pad_token_id explicitly to avoid the per-call "Setting `pad_token_id`" notice.
print("\n--- Generation test ---")
inputs = tokenizer("Hello, world is ", return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=50,
        pad_token_id=tokenizer.eos_token_id,
    )
result = tokenizer.decode(outputs[0])
# Only the first 80 characters of the decoded text are shown.
print(f"Result: {result[:80]}...")

print("\n✅ Step 1 completed!")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ Successfully quantized 48 layers

--- Generation test ---
Result: Hello, world is  in the world. 
I've been in the a small small small small small...

✅ Step 1 completed!


In [36]:
# Step 2: LoRA migration test
from src.lora import attach_lora_to_quant, activate_lora_by_config, activate_lora_by_bits

# Set same random seed
torch.manual_seed(42)

# LoRA spec with 6-bit support (same as test1).
# Keys are module paths; each maps a "bw<bits>" branch name to a 2-tuple.
# NOTE(review): the tuples look like (rank, alpha) — confirm against src.lora.
lora_spec = {
    "transformer.h.0.attn.c_attn": {"bw4": (8,16), "bw8": (4,8), "bw6": (6,12)},
    "transformer.h.1.attn.c_attn": {"bw4": (8,16), "bw8": (4,8), "bw6": (6,12)},
}

# Fresh model + quantization + LoRA.
# `cfg` is the config loaded in the Step 1 cell; that cell must have run first.
lora_model = GPT2LMHeadModel.from_pretrained("gpt2")
wrappers = attach_lora_to_quant(lora_model, lora_spec, cfg)

# Test 6-bit activation and fallback
test_6bit_cfg = {"per_layer_bits": {"transformer.h.0.attn.c_attn": 6}, "default_w_bits": 8}
activate_lora_by_config(wrappers, test_6bit_cfg)

# Test branch switching and parameter collection
test_wrapper = list(wrappers.values())[0]
test_wrapper.set_active("bw4")
test_wrapper.set_active("bw999")  # trigger fallback warning

# Trainable LoRA parameter tensors across all wrappers. len() of this list is a
# count of tensors, not of individual scalar weights.
lora_params = [p for w in wrappers.values() for p in w.bank.parameters() if p.requires_grad]

# Set consistent activation state for generation test
for wrapper in wrappers.values():
    wrapper.set_active("bw8")  # ensure same activation state

# Deterministic generation test (minimal).
# Re-seed and pin the cuDNN flags so repeated runs of this cell are comparable.
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

lora_model.eval()
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

inp = tokenizer("Hello, world is ", return_tensors="pt")
with torch.no_grad():
    out = lora_model.generate(
        **inp,
        max_new_tokens=32,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
# Decode only the newly generated tokens (everything after the prompt).
res = tokenizer.decode(out[0, inp["input_ids"].size(1):], skip_special_tokens=True)
# Decoding uses do_sample=False, so the text is not sampled; label it accordingly.
print("Generated:", res[:80] + "...")

print(f"✅ Step 2 completed: {len(wrappers)} wrappers, {len(lora_params)} params")


 GPT2LMHeadModel
transformer GPT2Model
transformer.wte Embedding
transformer.wpe Embedding
transformer.drop Dropout
transformer.h ModuleList
transformer.h.0 GPT2Block
transformer.h.0.ln_1 LayerNorm
transformer.h.0.attn GPT2Attention
transformer.h.0.attn.c_attn Conv1D
transformer.h.0.attn.c_proj Conv1D
transformer.h.0.attn.attn_dropout Dropout
transformer.h.0.attn.resid_dropout Dropout
transformer.h.0.ln_2 LayerNorm
transformer.h.0.mlp GPT2MLP
transformer.h.0.mlp.c_fc Conv1D
transformer.h.0.mlp.c_proj Conv1D
transformer.h.0.mlp.act NewGELUActivation
transformer.h.0.mlp.dropout Dropout
transformer.h.1 GPT2Block
transformer.h.1.ln_1 LayerNorm
transformer.h.1.attn GPT2Attention
transformer.h.1.attn.c_attn Conv1D
transformer.h.1.attn.c_proj Conv1D
transformer.h.1.attn.attn_dropout Dropout
transformer.h.1.attn.resid_dropout Dropout
transformer.h.1.ln_2 LayerNorm
transformer.h.1.mlp GPT2MLP
transformer.h.1.mlp.c_fc Conv1D
transformer.h.1.mlp.c_proj Conv1D
transformer.h.1.mlp.act NewGELUActiva

In [37]:
# Step 3: Switchable precision training test
from src.training import create_squad_dataloader, SwitchableTrainer
from src.quantization import load_config

# Set random seed for reproducible training
torch.manual_seed(42)

# The post-training generation below is CUDA-only. Check up front so a machine
# without CUDA fails immediately instead of after the full training loop.
assert torch.cuda.is_available(), "CUDA is required for generation but not found."

# Prepare multiple configs for switchable training
config_A = load_config("../configs/config.yaml")
config_B = load_config("../configs/config_4bit.yaml")
precision_configs = [config_A, config_B]

# Create dataloader and trainer.
# `lora_model` and `wrappers` come from the Step 2 cell; that cell must have run first.
train_dataloader = create_squad_dataloader(tokenizer, batch_size=4, subset_size=100)
trainer = SwitchableTrainer(lora_model, wrappers, precision_configs, lr=1e-4)

# Run training (1000 iterations)
trainer.train(train_dataloader, iterations=1000)

# Test generation after training (CUDA-only).
# Model and inputs are both moved to the same device before generate().
lora_model.eval().to("cuda")
inputs = tokenizer("Hello, world is ", return_tensors="pt").to("cuda")
with torch.no_grad():
    # Explicit pad_token_id avoids the per-call "Setting `pad_token_id`" notice.
    outputs = lora_model.generate(
        **inputs,
        max_length=50,
        pad_token_id=tokenizer.eos_token_id,
    )
result = tokenizer.decode(outputs[0])
print("After training:", result[:80] + "...")

print(f"✅ Step 3 completed: switchable training with {len(precision_configs)} configs")


Iteration 1000 | Loss: 3.307 | Config: 4-bit: 100%|██████████| 1000/1000 [01:35<00:00, 10.46it/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Training completed!
After training: Hello, world is  going to be a lot more interesting than it was before.
I'm not ...
✅ Step 3 completed: switchable training with 2 configs


In [None]:
# Step 4: Evaluation test using src modules
from src.evaluation import analyze_all_configs, save_results
from datasets import load_dataset

# This cell targets CUDA unconditionally; check before downloading the dataset
# so a machine without CUDA fails here with a clear message.
assert torch.cuda.is_available(), "CUDA is required for evaluation but not found."

# Load validation dataset
val_ds = load_dataset("squad", split="validation")
device = torch.device("cuda")

# Analyze configs using migrated functions.
# The result is used below as a pandas-style frame (to_string / to_dict / len).
# NOTE(review): `n=50` presumably caps the number of evaluated examples — confirm
# against src.evaluation.
df = analyze_all_configs(lora_model, tokenizer, val_ds, device, n=50)
print(df.to_string(index=False))

# Save results as a list of per-row records
save_results(df.to_dict('records'), "step4_results.json")

print(f"✅ Step 4 completed: evaluated {len(df)} configs")
