In [1]:
# Report the GPU model, driver/CUDA versions and current GPU memory usage before loading anything
!nvidia-smi

Mon Mar 25 10:41:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:01:00.0  On |                  Off |
|  0%   45C    P8              27W / 490W |  24130MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                         

## Importing libraries

In [4]:
import accelerate 
import transformers
import json
import os

# Hugging Face Hub repository that the config, tokenizer and weights are loaded from
REPO_ID = "decapoda-research/llama-7b-hf"

# SECURITY: never hard-code an access token in a notebook. Export
# HUGGINGFACEHUB_API_TOKEN in the shell (or a local, git-ignored .env file)
# before starting the kernel. The token that used to be committed on this line
# must be treated as leaked and revoked.
# NOTE(review): confirm this is the variable name the libraries in use actually read.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    print(
        "HUGGINGFACEHUB_API_TOKEN is not set; export it before starting the kernel "
        "if the model repository requires authentication."
    )

# Local directory intended for downloaded model files
cache_dir = "./models"
# NOTE(review): this is assigned after `transformers` has already been imported;
# if the Hugging Face libraries resolve HF_HOME at import time the setting is
# ignored -- consider setting it before those imports. TODO confirm.
os.environ['HF_HOME'] = './cache/'
# Make sure you set this variable according to the environment ("local" or "colab")
environment = "local"

## Generating the device map on a laptop

In [5]:
# Show that when we do this neither GPU nor CPU memory increases:
# only the configuration is fetched, and the model is instantiated inside
# `init_empty_weights()` so that no real weight tensors are allocated.
config = transformers.AutoConfig.from_pretrained(REPO_ID)

with accelerate.init_empty_weights():
    fake_model = transformers.AutoModelForCausalLM.from_config(config)

# Memory budget per device used for planning: GPU 0 first, then CPU RAM.
# Whatever does not fit in either budget is assigned to "disk".
max_memory = {0: "3GiB", "cpu": "6GiB"}

device_map = accelerate.infer_auto_device_map(
    fake_model,
    max_memory=max_memory,
    # A decoder layer contains residual connections, so it must live on a
    # single device; without this the planner is allowed to split one layer
    # across devices when the budget boundary falls in the middle of it.
    no_split_module_classes=["LlamaDecoderLayer"],
)
print(json.dumps(device_map, indent=4))

OSError: decapoda-research/llama-7b-hf is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [4]:
# This device map was generated using the accelerate.infer_auto_device_map() function.
# Placement, in module order:
#   * token embeddings and decoder layers 0-13  -> GPU 0
#   * decoder layers 14-27                      -> CPU
#   * decoder layers 28-31, final norm, lm_head -> disk
device_map = {
    "model.embed_tokens": 0,
    **{f"model.layers.{idx}": 0 for idx in range(0, 14)},
    **{f"model.layers.{idx}": "cpu" for idx in range(14, 28)},
    **{f"model.layers.{idx}": "disk" for idx in range(28, 32)},
    "model.norm": "disk",
    "lm_head": "disk",
}

## Loading the model memory efficiently

* LLM.int8() quantization
* Offloading: Uses GPU memory to the maximum, then CPU and finally memory-mapped chunks on disk
  * How offloading works: https://huggingface.co/docs/accelerate/usage_guides/big_modeling

**Note**: If you use WSL/Windows, you might run into an issue where the `bitsandbytes` library cannot find the file `libbitsandbytes_cpu.so`. If so, follow the instructions [here](https://github.com/TimDettmers/bitsandbytes/issues/156#issuecomment-1474056975).

In [5]:
import transformers

# Load the tokenizer and the LLM.int8()-quantized model for the selected
# `environment`:
#   * "local": follow the hand-written `device_map` (GPU -> CPU -> disk), with
#     disk-mapped modules offloaded to `offload_folder`.
#   * "colab": let the library place everything automatically.
# Raises ValueError for any other `environment` value.
tokenizer = transformers.LlamaTokenizer.from_pretrained(REPO_ID)

# Check what happens when device_map = auto
# This will fail as the model in FP32 precision cannot be fit on CPU
# model = transformers.LlamaForCausalLM.from_pretrained(REPO_ID)
# This will fail as the model in FP32 precision cannot be fit on GPU
# model = transformers.LlamaForCausalLM.from_pretrained(REPO_ID).to("cuda")
# This will also fail as the model cannot be fit on GPU fully even with the quantization
# model = transformers.LlamaForCausalLM.from_pretrained(REPO_ID, device_map="auto", load_in_8bit=True)

# Quantization options are passed through `BitsAndBytesConfig` (the documented
# interface) rather than as loose `from_pretrained` keyword arguments.
if environment == "local":
    model = transformers.LlamaForCausalLM.from_pretrained(
        REPO_ID,
        device_map=device_map,
        offload_folder="/tmp/.offload",
        quantization_config=transformers.BitsAndBytesConfig(
            load_in_8bit=True,
            # Required because part of the model is mapped to CPU/disk:
            # those modules are kept in FP32 instead of int8.
            llm_int8_enable_fp32_cpu_offload=True,
        ),
    )
elif environment == "colab":
    model = transformers.LlamaForCausalLM.from_pretrained(
        REPO_ID,
        device_map="auto",
        quantization_config=transformers.BitsAndBytesConfig(load_in_8bit=True),
    )
else:
    raise ValueError(f"Environment can only be local/colab. Got {environment}")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/thushv89/anaconda3/envs/ml.torch/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Loading binary /home/thushv89/anaconda3/envs/ml.torch/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [6]:
# Reference numbers recorded for this model:
#   * without quantization -> 27020779520 bytes (~25.1GB)
#   * with quantization    -> 10710692352 bytes (~10GB)
# The more weights placed on the GPU, the better the memory reduction.
memory_footprint_bytes = model.get_memory_footprint()
print(f"Memory footprint in bytes: {memory_footprint_bytes}")

Memory footprint in bytes: 10710692352


In [7]:
# Display the device placement the loaded model reports for each module,
# pretty-printed as JSON.
loaded_device_map_json = json.dumps(model.hf_device_map, indent=4)
print(loaded_device_map_json)

{
    "model.embed_tokens": 0,
    "model.layers.0": 0,
    "model.layers.1": 0,
    "model.layers.2": 0,
    "model.layers.3": 0,
    "model.layers.4": 0,
    "model.layers.5": 0,
    "model.layers.6": 0,
    "model.layers.7": 0,
    "model.layers.8": 0,
    "model.layers.9": 0,
    "model.layers.10": 0,
    "model.layers.11": 0,
    "model.layers.12": 0,
    "model.layers.13": 0,
    "model.layers.14": "cpu",
    "model.layers.15": "cpu",
    "model.layers.16": "cpu",
    "model.layers.17": "cpu",
    "model.layers.18": "cpu",
    "model.layers.19": "cpu",
    "model.layers.20": "cpu",
    "model.layers.21": "cpu",
    "model.layers.22": "cpu",
    "model.layers.23": "cpu",
    "model.layers.24": "cpu",
    "model.layers.25": "cpu",
    "model.layers.26": "cpu",
    "model.layers.27": "cpu",
    "model.layers.28": "disk",
    "model.layers.29": "disk",
    "model.layers.30": "disk",
    "model.layers.31": "disk",
    "model.norm": "disk",
    "lm_head": "disk"
}


## Inferring with the loaded model

In [9]:
import time

# Generate a short continuation of a prompt and report the throughput.
# Remember Llama is not instruction finetuned, so the prompt is written as the
# beginning of a sentence for the model to continue.
batch = tokenizer(
    "I would explain a blackhole to a 5 year old as",
    return_tensors="pt",
    add_special_tokens=False,
)

# The tokenizer returns CPU tensors, while the embedding layer is mapped to
# GPU 0 in `device_map`. Calling .generate() with inputs on a different device
# than the model triggers a transformers UserWarning ("You are calling
# .generate() with the `input_ids` being on a device type different than your
# model's device"), so move every input tensor to the GPU up front.
batch = {k: v.to("cuda") for k, v in batch.items()}
n_input_tokens = batch["input_ids"].shape[-1]

t1 = time.perf_counter()
generated = model.generate(
    batch["input_ids"],
    # Pass the mask explicitly instead of letting generate() infer it.
    attention_mask=batch.get("attention_mask"),
    # Bound the number of *new* tokens directly rather than computing a total
    # length from the prompt length.
    max_new_tokens=25,
)
t2 = time.perf_counter()
print(tokenizer.decode(generated[0]))
# The returned sequence includes the prompt, so subtract it to count new tokens.
n_generated = generated.shape[-1] - n_input_tokens
print(f"It took {t2-t1}s to generate the sequence of {n_generated} tokens ({n_generated/(t2-t1)} tokens/s).")

I would explain a blackhole to a 5 year old as a place where the laws of physics don't apply.
I'm not sure I'd go that far.
It took 69.4328759409982s to generate the sequence of 25 tokens (0.36005998111390614 tokens/s).
