In [1]:
!pip -q install transformers peft bitsandbytes accelerate --upgrade --quiet

In [2]:
import math, torch

# Storage size of a single parameter, in bytes, for each supported dtype name.
DTYPE_BYTES = {
    'fp32': 4,
    'fp16': 2,
    'bf16': 2,
    'int8': 1,
    'int4': 0.5,  # two 4-bit values share one byte
}

# Number of per-parameter state tensors kept by each recognised optimiser.
# Names are matched case-insensitively; anything not listed counts as 0.
_OPTIMIZER_STATES = {
    'adam': 2,   # momentum + variance
    'adamw': 2,  # same state layout as Adam
}

_BYTES_PER_MB = 1024 ** 2


def _bytes_per_param(dtype: str) -> float:
    """Return the per-parameter size for ``dtype`` with a readable error."""
    try:
        return DTYPE_BYTES[dtype]
    except KeyError:
        raise KeyError(
            f"unknown dtype {dtype!r}; expected one of {sorted(DTYPE_BYTES)}"
        ) from None


def vram_estimate(num_params: int,
                  base_dtype: str = 'fp16',
                  trainable_ratio: float = 1.0,
                  train_dtype: str = 'fp16',
                  optimizer: str = 'adam'):
    """Return VRAM (MB) for weights + grads + optimiser.

    Only these three terms are summed; nothing else is included in the
    estimate. Gradients and optimiser state are both sized with
    ``train_dtype`` and only for the trainable fraction of the parameters.

    Args:
        num_params: Total number of model parameters (must be >= 0).
        base_dtype: Key of ``DTYPE_BYTES`` used to size the stored weights.
        trainable_ratio: Fraction of parameters that receive gradients,
            between 0 and 1 inclusive.
        train_dtype: Key of ``DTYPE_BYTES`` used to size gradients and
            optimiser state.
        optimizer: Optimiser name, case-insensitive. ``'adam'`` and
            ``'adamw'`` keep two state tensors per trainable parameter;
            any other value is treated as keeping no state.

    Returns:
        A dict with ``weights_MB``, ``grads_MB``, ``optim_MB`` and
        ``total_MB`` (1 MB = 1024**2 bytes).

    Raises:
        ValueError: If ``num_params`` is negative or ``trainable_ratio``
            lies outside [0, 1].
        KeyError: If ``base_dtype`` or ``train_dtype`` is not a key of
            ``DTYPE_BYTES``.
    """
    if num_params < 0:
        raise ValueError(f"num_params must be non-negative, got {num_params!r}")
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0.0 <= trainable_ratio <= 1.0:
        raise ValueError(
            f"trainable_ratio must be within [0, 1], got {trainable_ratio!r}"
        )

    base_bytes = _bytes_per_param(base_dtype)
    train_bytes = _bytes_per_param(train_dtype)

    # Non-string optimiser values (e.g. None) fall back to "no state",
    # matching the behaviour for any unrecognised name.
    if isinstance(optimizer, str):
        opt_mult = _OPTIMIZER_STATES.get(optimizer.strip().lower(), 0)
    else:
        opt_mult = 0

    base_mem = num_params * base_bytes
    train_params = num_params * trainable_ratio
    grad_mem = train_params * train_bytes
    opt_mem = train_params * train_bytes * opt_mult
    total_mem = base_mem + grad_mem + opt_mem

    return dict(weights_MB=base_mem / _BYTES_PER_MB,
                grads_MB=grad_mem / _BYTES_PER_MB,
                optim_MB=opt_mem / _BYTES_PER_MB,
                total_MB=total_mem / _BYTES_PER_MB)

In [3]:
llama_7b = 6_740_000_000.0  # parameter count used for the 7B example (6.74 billion)

def pretty(d):
    """Render each value of ``d`` as a comma-grouped whole number followed by ' MB'."""
    formatted = {}
    for label, megabytes in d.items():
        formatted[label] = format(megabytes, ",.0f") + " MB"
    return formatted

# Same model, three setups: every weight trainable, 1% trainable on an
# fp16 base, and 1% trainable on a 4-bit base.
full_ft = vram_estimate(num_params=llama_7b, base_dtype='fp16', trainable_ratio=1.0)
lora_1pct = vram_estimate(num_params=llama_7b, base_dtype='fp16', trainable_ratio=0.01)
qlora = vram_estimate(
    num_params=llama_7b,
    base_dtype='int4',
    trainable_ratio=0.01,
    train_dtype='fp16',
)

# Labels are padded by hand so the colons line up in the printed report.
for label, estimate in (
    ('Full fine‑tune   :', full_ft),
    ('LoRA (1% params) :', lora_1pct),
    ('QLoRA (int4 base):', qlora),
):
    print(label, pretty(estimate))

Full fine‑tune   : {'weights_MB': '12,856 MB', 'grads_MB': '12,856 MB', 'optim_MB': '25,711 MB', 'total_MB': '51,422 MB'}
LoRA (1% params) : {'weights_MB': '12,856 MB', 'grads_MB': '129 MB', 'optim_MB': '257 MB', 'total_MB': '13,241 MB'}
QLoRA (int4 base): {'weights_MB': '3,214 MB', 'grads_MB': '129 MB', 'optim_MB': '257 MB', 'total_MB': '3,600 MB'}


In [4]:
# Tweak the settings below ↓ and re-run this cell to estimate another setup.
model_params = 13e9          # e.g., 13B model
trainable_ratio = 0.0025     # 0.25% (r=8 LoRA)
base_dtype = 'int4'          # 'fp16', 'int8', 'int4'
train_dtype = 'fp16'
print(pretty(vram_estimate(
    num_params=model_params,
    base_dtype=base_dtype,
    trainable_ratio=trainable_ratio,
    train_dtype=train_dtype,
)))

{'weights_MB': '6,199 MB', 'grads_MB': '62 MB', 'optim_MB': '124 MB', 'total_MB': '6,385 MB'}


In [5]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Load a tiny GPT-2 checkpoint and count its parameters *before* LoRA
# adapters are attached, so the adapter weights are not included in the base count.
base = AutoModelForCausalLM.from_pretrained('sshleifer/tiny-gpt2')
base_params = sum(p.numel() for p in base.parameters())

# The targeted `c_attn` projection is a Conv1D layer, which stores its weight
# as (fan_in, fan_out); PEFT expects fan_in_fan_out=True for such layers.
lora_cfg = LoraConfig(r=8, target_modules=['c_attn'], fan_in_fan_out=True)
lora_model = get_peft_model(base, lora_cfg)

print("Model", lora_model)

# Only parameters with requires_grad=True are counted as trainable.
train_params = sum(p.numel() for p in lora_model.parameters() if p.requires_grad)
print(f'Base model params   : {base_params:,}')
print(f'LoRA trainable params: {train_params:,}  ({train_params/base_params:.2%})')

  from .autonotebook import tqdm as notebook_tqdm
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.


Model PeftModel(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 2)
        (wpe): Embedding(1024, 2)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-1): 2 x GPT2Block(
            (ln_1): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=6, nx=2)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=6, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()

