In [1]:
import collections
import enum
import time
import typing


import rich
import rich.console
import rich.table
import torch
import transformers


################################################################################################
# Generic Utils
################################################################################################


def add_check_not_exists(dict_, key, value):
    assert key not in dict_, f"Key already exists: {key}"
    dict_[key] = value


################################################################################################
# Generic HF Utils
################################################################################################



class Precision(str, enum.Enum):
    FP32  = "fp32"
    FP16  = "fp16"
    BF16  = "bf16"
    BITS8 = "bits8"
    BITS4 = "bits4"


def create_kwargs(precision, device_map):
    if precision == Precision.BITS4:
        kwargs = dict(
            load_in_4bit=True,
        )
    elif precision == Precision.BITS8:
        kwargs = dict(
            load_in_8bit = True, 
            torch_dtype  = torch.float16,
        )
    elif precision == Precision.FP16:
        kwargs = dict(torch_dtype = torch.float16)
    elif precision == Precision.BF16:
        kwargs = dict(torch_dtype = torch.bfloat16)
    elif precision == Precision.FP32:
        kwargs = dict(torch_dtype = torch.float32)
    else:
        raise ValueError(f"Invalid precision: {precision}")
    
    assert "device_map" not in kwargs
    kwargs["device_map"] = device_map

    return kwargs


def build_device_map(*, use_device_map, gpu_index, config):
    if use_device_map:
        if gpu_index is None:
            device_map = "balanced"
        else:
            device_map = {"": torch.device(gpu_index)}
    else:
        device_map = None

    return device_map


def make_model(*, config, model_name, kwargs, gpu_index, device_map):
    if config.is_encoder_decoder:
        cls = transformers.AutoModelForSeq2SeqLM
    else:
        cls = transformers.AutoModelForCausalLM

    model = cls.from_pretrained(
        model_name,
        trust_remote_code = True,
        **kwargs,
    )

    if device_map is None:
        model.to(torch.device(gpu_index))
    return model


def load_model(
    *, 
    config_name: str,
    model_name: str, 
    precision:  Precision, 
    gpu_index:  typing.Optional[int],
    use_device_map: bool,
):
    rich.print(f"Loading model: \"{model_name}\" - \"{precision}\"")
    if precision in [Precision.BITS4, Precision.BITS8]:
        assert use_device_map

    precision = Precision(precision)
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    config    = transformers.AutoConfig   .from_pretrained(model_name, trust_remote_code=True)

    device_map = build_device_map(use_device_map=use_device_map, gpu_index=gpu_index, config=config,)
    kwargs     = create_kwargs   (precision=precision, device_map=device_map)
    model      = make_model      (config=config, model_name=model_name, kwargs=kwargs, gpu_index=gpu_index, device_map=device_map)

    print_table(kwargs=kwargs, model=model, config_name=config_name)

    return model, tokenizer


################################################################################################
# Project specific
################################################################################################
def print_table(*, kwargs, model, config_name):
    table = rich.table.Table("Key", "Value", title=f"[green]{config_name}", show_lines=True)
    table.add_row("Model type",               model.config.model_type)
    table.add_row("Loading model with kwargs", str(kwargs))
    table.add_row("Device indices",            str(
        collections.Counter(
            int(x.device.index) 
            if x.device.index is not None 
            else x.device.index 
            for x in model.parameters()
        )))
    table.add_row("Device types",              str(collections.Counter(
        x.device.type for x in model.parameters())))
    rich.print(table)



def text_generation(*, s, m, t: transformers.PreTrainedTokenizerBase):
    conditional_gen_kwargs = {}
    
    ###########################
    # Pad token id stuff
    ###########################
    if t.pad_token_id is None:
        t.pad_token_id = t.eos_token_id

    if m.config.pad_token_id is None:
        conditional_gen_kwargs["pad_token_id"] = m.config.eos_token_id

    ###########################
    # Padding side stuff
    ###########################
    if not m.config.is_encoder_decoder:
        assert hasattr(t       , "padding_side"), "hasattr(t       , 'padding_side') is False"
        t.padding_side = "left"

    ###########################
    # Generation
    ###########################
    sample_toks = t(s, return_tensors="pt", padding=True).to("cuda")
    
    start = time.perf_counter()
    output_toks = m.generate(
        input_ids=sample_toks.input_ids,
        num_return_sequences = 1, 
        max_new_tokens       = 200,
        num_beams            = 1,
        do_sample            = False,
        **conditional_gen_kwargs,
    )
    end = time.perf_counter()
    print(f"\t- {end - start:.2f} seconds")
    return start - end


def print_generations(tokenizer, model, output_toks, sample_toks):
    if not model.config.is_encoder_decoder:
        output_toks = output_toks[:, sample_toks.input_ids.shape[-1]:]
    outputs = tokenizer.batch_decode(output_toks, skip_special_tokens=True)
    outputs = [x.replace("\n", " ").strip() for x in outputs]

    print(f"\n{tokenizer.decode(sample_toks.input_ids)}")
    for line in outputs:
        line = line.replace("  ", " ")
        if line:
            print(f" - {line.strip()}")


def build_configs(*, model_name, big):
    configs = {}

    if not big:
        add_check_not_exists(
            configs,
            "m_fp_16_normal",
            dict(
                model_name = model_name,
                precision  = Precision.BF16, 
                gpu_index  = 0,
                use_device_map = False,
            )
        )
        add_check_not_exists(
            configs,
            "m_fp_16_dm_single",
            dict(
                model_name = model_name,
                precision  = Precision.BF16, 
                gpu_index  = 0,
                use_device_map = True,
            ))
        add_check_not_exists(
            configs,
            "m_b_8_dm_single",
            dict(
                model_name = model_name,
                precision  = Precision.BITS8, 
                gpu_index  = 0,
                use_device_map = True,
            ))
        
        # add_check_not_exists(
        # configs,
        # "m_b_4_dm_single",
        # dict(
        #     model_name = model_name,
        #     precision  = Precision.BITS4,
        #     gpu_index  = 0,
        #     use_device_map = True,
        # ))

    add_check_not_exists(
        configs,
        "m_fp_16_dm_auto",
        dict(
            model_name = model_name,
            precision  = Precision.BF16, 
            gpu_index  = None,
            use_device_map = True,
        ))

    add_check_not_exists(
        configs,
        "m_b_8_dm_auto", 
        dict(
            model_name = model_name,
            precision  = Precision.BITS8, 
            gpu_index  = None,
            use_device_map = True,
        ))

    # add_check_not_exists(
    #     configs,
    #     "m_b_4_dm_auto", 
    #     dict(
    #         model_name = model_name,
    #         precision  = Precision.BITS4,
    #         gpu_index  = None,
    #         use_device_map = True,
    #     ))

    return configs

In [2]:
MODEL_NAME = "huggyllama/llama-7b"
BIG = False
# MODEL_NAME = "tiiuae/falcon-40b-instruct"
# MODEL_NAME     = "google/flan-t5-xl"
###############

N_LOOPS    = 5
BATCH_SIZE = 8 * 8
PRECISION  = Precision.BF16
SAMPLE = [
    "Isabella earns $5 an hour babysitting. "
    "She babysits 5 hours every day, 6 afternoons a week. "
    "After babysitting for 7 weeks, how much money "
    "will Isabella have earned?"
] * BATCH_SIZE

In [3]:
transformers.logging.set_verbosity_warning()
CONSOLE = rich.console.Console()
configs = build_configs(model_name=MODEL_NAME, big=BIG)

for config_name, config in configs.items():
    CONSOLE.rule()
    CONSOLE.rule(config_name)
    CONSOLE.rule()
    model, tokenizer = load_model(config_name=config_name, **config)
    for i in range(N_LOOPS):
        text_generation(s=SAMPLE, m=model, t=tokenizer)
    del model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

	- 21.53 seconds
	- 8.84 seconds
	- 8.82 seconds
	- 8.84 seconds
	- 8.86 seconds


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

	- 10.26 seconds
	- 10.28 seconds
	- 10.31 seconds
	- 10.32 seconds
	- 10.27 seconds



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/mila/g/gagnonju/.main/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /cvmfs/ai.mila.quebec/apps/arch/common/cuda/11.7/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/mila/g/gagnonju/.main/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

	- 28.56 seconds
	- 28.28 seconds
	- 28.36 seconds
	- 28.40 seconds
	- 28.33 seconds


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

	- 10.28 seconds
	- 10.33 seconds
	- 10.28 seconds
	- 10.29 seconds
	- 10.29 seconds


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

	- 28.54 seconds
	- 28.49 seconds
	- 28.56 seconds
	- 28.28 seconds
	- 28.37 seconds


In [4]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tue Jun  6 20:43:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100 80G...  On   | 00000000:43:00.0 Off |                    0 |
| N/A   38C    P0    71W / 300W |  53538MiB / 81920MiB |      0%      Default |
|                               |            