In [1]:
print('Installing packages...')
! pip install torch transformers accelerate sentencepiece  datasets tqdm zstandard

In [2]:
import tqdm
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from functools import partial
import gc

In [3]:
model_path = "facebook/opt-1.3b"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cuda")
dataset=load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [4]:
n_samples=50
def evaluate_perplexity(model,tokenizer):
    tokenized_data=tokenizer("/n/n".join(dataset['text']),return_tensors='pt')
    tokens=tokenized_data.input_ids
    model.eval()
    total_loss=0.0
    tokens.to(model.device)
    for i in tqdm.tqdm(range(n_samples)):
        batch=tokens[:,i*2048:(i+1)*2048].to(model.device)
        with torch.no_grad():
            logits=model(batch).logits
            shift_logits=logits[:,:-1,:].contiguous()
            shift_labels=batch[:,1:].contiguous()
            loss_fct=nn.CrossEntropyLoss()
            loss=loss_fct(shift_logits.view(-1,shift_logits.size(-1)),shift_labels.view(-1))
            total_loss+=loss.item()
        del batch
    return(torch.exp(torch.tensor(total_loss/(n_samples))))

def model_size(model,data_width=16,group_size=-1):
    if group_size!=-1:
        scale_width=16
        zero_point_width=4
        data_width+= (scale_width + zero_point_width)/group_size
    num_params=0
    for n,m in model.named_parameters():
        num_params+=m.numel()
    size_in_bits=num_params*data_width
    size_in_megabytes=size_in_bits/(8*1024*1024)
    return size_in_megabytes



In [5]:
### Model info 
print(f'Model size (in MB): {model_size(model,data_width=32,group_size=-1)}')
print(f'Model perplexity: {evaluate_perplexity(model,tokenizer)}')

Model size (in MB): 5019.21875


100%|██████████| 50/50 [01:15<00:00,  1.51s/it]

Model perplexity: 13.209306716918945





Given a high-precision tensor $W$ (e.g., FP32), uniform quantization aims to represent each weight using a lower bit-width while minimizing the error between $W$ and its quantized version $Q(W)$.

Assume that the values of $W$ lie in the range $[\alpha, \beta]$ and that we target a bit-width of $b$ bits. The quantized representation is:

- Quantized integer: $q$
- Scale factor: $S$
- Zero Point: $z$

The quantized reconstruction is:
$$
Q(W) = S \cdot (q - z)
$$

The scale and zero-point are defined as:
$$
S = \frac{\beta - \alpha}{2^{b} - 1} \tag{1}
$$

$$
z = -\text{Round}\left( \frac{\alpha}{S} \right) \tag{2}
$$

Thus, each weight $w \in W$ is quantized as:
$$
q(w) = \text{Clamp}\left( \text{Round}\left(\frac{w}{S}\right) + z,\ 0,\ 2^{b} - 1 \right) \tag{3}
$$


In [6]:
def pseudo_quantize_tensor(tensor, num_bits=8, group_size=-1):
    original_shape=tensor.shape
    if group_size!=-1:
        if original_shape[-1]%group_size!=0:
            raise ValueError("The last dimension of the tensor must be divisible by group_size")
        tensor=tensor.view(-1,group_size)
    # alpha and beta calculation
    alpha=0
    beta=2**num_bits - 1
    max_int=beta-alpha
    min_val=tensor.min(dim=-1,keepdim=True).values
    max_val=tensor.max(dim=-1,keepdim=True).values
    scales=(max_val - min_val)/(max_int)
    zero_points=torch.round(-min_val/scales).clamp(alpha,beta)
    # Quantization
    quantized_tensor=torch.round(tensor/scales + zero_points).clamp(alpha,beta)
    # Dequantization
    dequantized_tensor=(quantized_tensor - zero_points)*scales
    return dequantized_tensor.view(original_shape)
def pseudo_quantize_model(model, num_bits=4, group_size=-1):
    for n,m in model.named_modules():
        if isinstance(m, nn.Linear):
            m.weight.data=pseudo_quantize_tensor(m.weight.data,num_bits=num_bits,group_size=group_size)
    gc.collect()
    torch.cuda.empty_cache()
        

In [7]:
#evaluate quantized model
gc.collect()
torch.cuda.empty_cache()
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
pseudo_quantize_model(model, num_bits=3, group_size=128)
print(f'Quantized model perplexity: {evaluate_perplexity(model,tokenizer)}')
print(f'Quantized model size (in MB): {model_size(model,data_width=3,group_size=128)}')

100%|██████████| 50/50 [01:17<00:00,  1.54s/it]

Quantized model perplexity: 104.80799865722656
Quantized model size (in MB): 495.0596618652344





As we can see, after pseudo-quantization to 3 bits with group size 128, the model size is significantly reduced but perplexity rocketed.

Let us exmaine , following AWQ logic , the variance of same channels across tokens , and the variance between channels in the same activation

In [23]:
def calib_data(tokenizer , n_samples=256 , seq_len=512):
    dataset=load_dataset('wikitext', 'wikitext-2-raw-v1', split='validation').shuffle(seed=42)
    samples=[]
    it = 0
    for i in dataset['text']:
        i.strip()
        it+=1
        tokenized=tokenizer.encode(i, return_tensors='pt')
        
        if tokenized.shape[1]>=seq_len:
            continue
        sample=torch.tensor(tokenized)
                
        samples.append(sample)
        if it>=n_samples:
            print(f'Collected {n_samples} samples for calibration in  {it} iterations')
            break
    cat_sample=torch.cat(samples,dim=1)
    
    return  [cat_sample[:,i*seq_len:(i+1)*seq_len] for i in range(cat_sample.shape[1]//seq_len)]
        
    

In [43]:
gc.collect()
torch.cuda.empty_cache()
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto") 
@torch.no_grad()
def plot_activation(model , tokenizer , layer_name , n_samples=512 , seq_len=512):
    import matplotlib.pyplot as plt 
    activations = []
    def hook_fn(module, input, output):
        print('Hooked layer output shape' , output.shape)
        activations.append(output.detach().cpu())
    for n, m in model.named_modules():
        if n == layer_name:
            hook=m.register_forward_hook(hook_fn)
            break
        raise ValueError(f'Layer {layer_name} not found in the model')
    calib_samples = calib_data(tokenizer , n_samples=n_samples , seq_len=seq_len)
    print(len(calib_samples))
    for sample in calib_samples:
        sample = sample.to(model.device)
        print(sample.shape)
        _=model(sample)
    hook.remove()
    print('activation of shape' , len(activations))
    activations_cat = torch.cat(activations , dim=0)
    print('Concatenated activation shape' , activations_cat.shape)
    
    

plot_activation(model , tokenizer , layer_name='model.decoder.layers.0.self_attn.q_proj')
    
        

OutOfMemoryError: CUDA out of memory. Tried to allocate 198.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 142.12 MiB is free. Process 5898 has 14.60 GiB memory in use. Of the allocated memory 14.32 GiB is allocated by PyTorch, and 164.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [44]:
!nvidia_smi


/bin/bash: line 1: nvidia_smi: command not found
