In [11]:
# Step 1: GPT-2 quantization migration test
import sys
sys.path.append('..')

from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from src.quantization import QuantLinear, replace_with_quant, load_config

# Load the pretrained GPT-2 tokenizer/model and the quantization config.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
cfg = load_config("../configs/config.yaml")
print(f"✅ Config loaded: {cfg['default_w_bits']} bit")

# Execute quantization replacement. The return value is ignored: `model` is
# modified in place and reused by the cells below.
# NOTE(review): the per-module listing in this cell's output appears to be
# printed by replace_with_quant itself (kept intentionally) — confirm.
replace_with_quant(model, cfg)
print("✅ Model quantized successfully")

# Smoke-test generation on the quantized model.
inputs = tokenizer("Hello, world is ", return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=20,
        # GPT-2 defines no pad token; generate() would fall back to EOS anyway
        # but warn about it. Passing it explicitly keeps the same behaviour
        # without the warning interleaving with the cell output.
        pad_token_id=tokenizer.eos_token_id,
    )
print("✅ Generation test:", tokenizer.decode(outputs[0]))


✅ Config loaded: 8 bit
 GPT2LMHeadModel
transformer GPT2Model
transformer.wte Embedding
transformer.wpe Embedding
transformer.drop Dropout
transformer.h ModuleList
transformer.h.0 GPT2Block
transformer.h.0.ln_1 LayerNorm
transformer.h.0.attn GPT2Attention
transformer.h.0.attn.c_attn Conv1D
transformer.h.0.attn.c_proj Conv1D
transformer.h.0.attn.attn_dropout Dropout
transformer.h.0.attn.resid_dropout Dropout
transformer.h.0.ln_2 LayerNorm
transformer.h.0.mlp GPT2MLP
transformer.h.0.mlp.c_fc Conv1D
transformer.h.0.mlp.c_proj Conv1D
transformer.h.0.mlp.act NewGELUActivation
transformer.h.0.mlp.dropout Dropout
transformer.h.1 GPT2Block
transformer.h.1.ln_1 LayerNorm
transformer.h.1.attn GPT2Attention
transformer.h.1.attn.c_attn Conv1D
transformer.h.1.attn.c_proj Conv1D
transformer.h.1.attn.attn_dropout Dropout
transformer.h.1.attn.resid_dropout Dropout
transformer.h.1.ln_2 LayerNorm
transformer.h.1.mlp GPT2MLP
transformer.h.1.mlp.c_fc Conv1D
transformer.h.1.mlp.c_proj Conv1D
transformer.h.

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


transformer.h.11.attn.c_proj Conv1D
transformer.h.11.attn.attn_dropout Dropout
transformer.h.11.attn.resid_dropout Dropout
transformer.h.11.ln_2 LayerNorm
transformer.h.11.mlp GPT2MLP
transformer.h.11.mlp.c_fc Conv1D
transformer.h.11.mlp.c_proj Conv1D
transformer.h.11.mlp.act NewGELUActivation
transformer.h.11.mlp.dropout Dropout
transformer.ln_f LayerNorm
lm_head Linear
✅ Model quantized successfully
✅ Generation test: Hello, world is  going to be a lot more interesting than it was before.
I


In [None]:
# Complete Step 1 functionality validation
from src.quantization import requantize_model_to_config

# Count quantized layers and fail loudly if the replacement did nothing;
# otherwise this "validation" cell would report success unconditionally.
quant_count = sum(
    isinstance(module, QuantLinear) for _, module in model.named_modules()
)
assert quant_count > 0, "No QuantLinear layers found in the model; quantization did not take effect"
print(f"✅ Successfully quantized {quant_count} layers")

# Test 2bit/6bit dynamic requantization (applied to the already-quantized
# `model` from the previous cell; the return value is not used).
test_cfg = load_config("../configs/test_2bit_6bit.yaml")
requantize_model_to_config(model, test_cfg)

# Test generation with the requantized model
print("\n--- Generation test ---")
inputs = tokenizer("Hello, world is ", return_tensors="pt")
with torch.no_grad():  # inference only; consistent with the first generation test
    outputs = model.generate(
        **inputs,
        max_length=50,
        # Explicit pad token: same value generate() falls back to for GPT-2,
        # but without the "Setting `pad_token_id`..." warning.
        pad_token_id=tokenizer.eos_token_id,
    )
result = tokenizer.decode(outputs[0])
# Only the first 80 characters are shown to keep the cell output short.
print(f"Result: {result[:80]}...")

print("\n✅ Step 1 completed!")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ Successfully quantized 48 layers

--- Generation test ---
Result: Hello, world is  in the world. 
I've been in the a small small small small small...

✅ Step 1 completed!


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ Found 48 QuantLinear layers in the model
✅ get_bits_for_layer('transformer.h.0.attn.c_attn') = 8 bits
✅ All Conv1D layers successfully converted to QuantLinear

--- Testing quantized model generation ---
Generated text: Hello, world is  in the world. 
I've been in the a small small small small small small small small s...
