In [2]:
import torch
from vllm import LLM, SamplingParams, EngineArgs
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
    set_seed,
)

import importlib

import vllm

importlib.reload(vllm)
from vllm import LLM, SamplingParams

import transformers

importlib.reload(transformers)
from transformers import AutoModelForCausalLM

In [3]:
# Sanity check: display whether PyTorch can see a CUDA device from this kernel.
torch.cuda.is_available()

False

In [2]:
class DeviceUtil:
    """Static helpers for reporting and releasing GPU memory."""

    @staticmethod
    def gpu_usage():
        """Return one formatted memory-usage line per GPU reported by NVML.

        Returns:
            list[str]: One string per device showing free, total and used
            memory in MB plus the used percentage. An empty list is returned
            when the `nvidia_smi` bindings cannot be imported or an NVML call
            fails, so the helper stays usable on machines without a GPU.
        """
        try:
            import nvidia_smi

            nvidia_smi.nvmlInit()
            lines = []
            for index in range(nvidia_smi.nvmlDeviceGetCount()):
                handle = nvidia_smi.nvmlDeviceGetHandleByIndex(index)
                mem = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
                # Convert bytes to MB once instead of repeating the division.
                free_mb = mem.free / 1024**2
                total_mb = mem.total / 1024**2
                used_mb = total_mb - free_mb
                used_pct = round(used_mb / total_mb * 100, 1)
                lines.append(
                    f"|Device {index}| Mem Free: {free_mb:5.2f}MB / {total_mb:5.2f}MB"
                    + f"(used: {used_mb:5.2f}MB, {used_pct}%)"
                )
            return lines

        # Best-effort by design, but do not swallow KeyboardInterrupt/SystemExit.
        except Exception:
            return []

    @staticmethod
    def empty_gpu(*, args_list, delete=False, wait_seconds=5):
        """Release cached GPU memory and report the resulting usage.

        Args:
            args_list: Container holding the objects (models, tensors, ...)
                that should be released.
            delete: When True, the container is emptied in place so that it no
                longer keeps its items alive. Python cannot delete the caller's
                own variables from here; the caller must also drop (`del`) any
                other references for the memory to be reclaimed.
            wait_seconds: Number of one-second pauses (with a progress line
                each) before usage is read. Defaults to 5.

        Returns:
            list[str]: The output of `DeviceUtil.gpu_usage()` after cleanup.
        """
        import gc
        import time

        # `del` on a loop variable only unbinds that local name, so instead
        # drop the references held by the container itself.
        if delete and hasattr(args_list, "clear"):
            args_list.clear()
        gc.collect()
        torch.cuda.empty_cache()
        for i in range(wait_seconds):
            time.sleep(1)
            print(f"{i} of {wait_seconds}")
        return DeviceUtil.gpu_usage()


class ModelUtil:
    """Loads a model and tokenizer and optionally moves the model to a device.

    Attributes set by `__init__`:
        model: Result of `model_loading_class.from_pretrained(model_checkpoint)`.
        tokenizer: Result of `tokenizer_loading_class.from_pretrained(...)`.
        device: "cuda" or "cpu", the device chosen for the model.
    """

    # Kept from the original: this only binds `ModelUtil.torch`. The methods
    # below resolve `torch` from the module globals, not from this attribute.
    import torch

    def __init__(
        self,
        *,
        model_checkpoint,
        model_loading_class,
        tokenizer_loading_class,
        tokenizer_checkpoint=None,
        cpu_only=False,
        to_device=True,
    ):
        """Load the model and tokenizer and report GPU memory around the move.

        Args:
            model_checkpoint: Identifier passed to
                `model_loading_class.from_pretrained`.
            model_loading_class: Class exposing `from_pretrained` for the model.
            tokenizer_loading_class: Class exposing `from_pretrained` for the
                tokenizer.
            tokenizer_checkpoint: Tokenizer identifier; falls back to
                `model_checkpoint` when None.
            cpu_only: Force the "cpu" device even if CUDA is available.
            to_device: When true, move the model to the selected device.
        """
        self.model = model_loading_class.from_pretrained(model_checkpoint)
        count_params = sum(p.numel() for p in self.model.parameters())
        print("Total Parameters: ", "{:,}".format(count_params))

        if tokenizer_checkpoint is None:
            tokenizer_checkpoint = model_checkpoint
        self.tokenizer = tokenizer_loading_class.from_pretrained(tokenizer_checkpoint)

        # The usage lines were previously computed and discarded, leaving the
        # "Before:" / "After:" headers with nothing under them.
        print("Before: ")
        for line in DeviceUtil.gpu_usage():
            print(line)

        self.device = "cuda" if (not cpu_only) and torch.cuda.is_available() else "cpu"
        if to_device:
            self.model.to(self.device)

        print("After: ")
        for line in DeviceUtil.gpu_usage():
            print(line)


# Display the current per-device GPU memory summary.
DeviceUtil.gpu_usage()

['|Device 0| Mem Free: 81082.38MB / 81920.00MB(used: 837.62MB, 1.0%)']

In [3]:
import time

# temperature=0.8, top_p=0.95,


class PagedAttention:
    """Text generation through a vLLM engine for a given checkpoint."""

    def __init__(self, checkpoint):
        """Remember the checkpoint; the engine itself is built in `generate`."""
        self.checkpoint = checkpoint

    def generate(self, prompts, batch, generationconfig):
        """Generate completions with vLLM and time the generation phase.

        Args:
            prompts: Prompts to complete.
            batch: When true, all prompts are submitted in one `llm.generate`
                call; otherwise one call is made per prompt.
            generationconfig: Mapping that must contain "top_k", "n",
                "max_tokens", "min_tokens", "seed" and "temperature". It is
                only read, never modified, so it can be reused across calls.

        Returns:
            tuple: `(texts, seconds, usage)` where `texts` holds prompt plus
            the first completion for every request output, `seconds` is the
            wall-clock generation time (engine construction excluded) and
            `usage` is `DeviceUtil.gpu_usage()` taken afterwards.

        Raises:
            KeyError: If a required key is missing from `generationconfig`.
        """
        # Index rather than pop: popping mutated the caller's dict, so a second
        # call with the same config raised KeyError.
        sampling_params = SamplingParams(
            top_k=generationconfig["top_k"],
            temperature=generationconfig["temperature"],
            n=generationconfig["n"],
            max_tokens=generationconfig["max_tokens"],
            min_tokens=generationconfig["min_tokens"],
            seed=generationconfig["seed"],
        )

        # A fresh engine is built on every call; this happens before the timer
        # starts, so model loading is not part of the reported duration.
        llm = LLM(model=self.checkpoint)
        print(llm.llm_engine.scheduler_config.max_model_len)

        started = time.time()

        # Batch mode is a single request carrying every prompt; otherwise each
        # prompt is its own request.
        requests = [prompts] if batch else prompts

        texts = []
        for request in requests:
            for output in llm.generate(request, sampling_params=sampling_params):
                # Only the first completion is kept, even when n > 1.
                texts.append(output.prompt + output.outputs[0].text)

        elapsed = time.time() - started
        return texts, elapsed, DeviceUtil.gpu_usage()


class NormalGenerator:
    """Baseline text generation with Hugging Face `generate`."""

    def __init__(self, checkpoint="meta-llama/Meta-Llama-3-8B-Instruct"):
        """Load the model in half precision plus its tokenizer.

        Args:
            checkpoint: Model/tokenizer identifier. Defaults to the checkpoint
                that was previously hard-coded here.

        Note:
            Access tokens must never be written into the notebook, not even in
            comments. Authenticate outside the source instead (for example via
            `huggingface-cli login` or an environment variable).
        """
        # Load directly in fp16 rather than fp32 followed by `.half()`.
        self.model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            attn_implementation="flash_attention_2",
            torch_dtype=torch.float16,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def generate(self, prompts, batch, generationconfig):
        """Generate completions and time tokenization, generation and decoding.

        Args:
            prompts: Prompts to complete.
            batch: When true, all prompts go through one padded `generate`
                call; otherwise each prompt is processed separately.
            generationconfig: Mapping that must contain "max_tokens",
                "min_tokens" and "do_sample". Other keys are ignored. It is
                only read, never modified.

        Returns:
            tuple: `(texts, seconds, usage)` where `texts` are the decoded
            sequences (prompt included), `seconds` is the elapsed wall-clock
            time and `usage` is `DeviceUtil.gpu_usage()` taken afterwards.

        Raises:
            KeyError: If a required key is missing from `generationconfig`.
        """
        self.tokenizer.pad_token = self.tokenizer.eos_token
        # Decoder-only models continue from the last position, so padding must
        # sit on the left when prompts of different lengths share a batch.
        self.tokenizer.padding_side = "left"

        # Index rather than pop so the caller's config survives the call.
        max_tokens = generationconfig["max_tokens"]
        min_tokens = generationconfig["min_tokens"]
        do_sample = generationconfig["do_sample"]

        started = time.time()

        # Batch mode is one request with every prompt; otherwise one per prompt.
        requests = [prompts] if batch else prompts

        texts = []
        for request in requests:
            encoded = self.tokenizer(
                request, padding=True, truncation=True, return_tensors="pt"
            ).to(self.device)

            # NOTE: `min_length` / `max_length` count the prompt tokens as well
            # as the generated ones.
            generated = self.model.generate(
                **encoded,
                pad_token_id=self.tokenizer.eos_token_id,
                do_sample=do_sample,
                min_length=min_tokens,
                max_length=max_tokens,
            )
            decoded = self.tokenizer.batch_decode(generated, skip_special_tokens=True)
            # Per-prompt mode keeps only the first decoded sequence.
            texts.extend(decoded if batch else decoded[:1])

        elapsed = time.time() - started
        return texts, elapsed, DeviceUtil.gpu_usage()

In [4]:
# def my_seed(seed=42):
#     import numpy as np
#     import random
#     import os

#     np.random.seed(seed)
#     random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = False
#     os.environ["PYTHONHASHSEED"] = str(seed)


def main(prompts, machine_type, batched, generationconfig):
    """Run one generation backend and bundle its output with GPU usage lines.

    Args:
        prompts: Prompts forwarded to the backend's `generate`.
        machine_type: "vLLM" selects `PagedAttention`, "regular" selects
            `NormalGenerator`; any other value runs nothing.
        batched: Forwarded to `generate` as its `batch` argument.
        generationconfig: Forwarded to `generate` unchanged.

    Returns:
        list: GPU usage lines taken on entry, then usage lines taken after the
        backend object was created, then the elements of the tuple returned by
        `generate`. For an unknown `machine_type` only the entry usage lines
        are returned.
    """
    usage_on_entry = DeviceUtil.gpu_usage()
    checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"

    if machine_type == "vLLM":
        backend = PagedAttention(checkpoint)
    elif machine_type == "regular":
        backend = NormalGenerator()
    else:
        # Unknown backend: nothing to run, hand back a copy of the usage lines.
        return list(usage_on_entry)

    usage_after_init = DeviceUtil.gpu_usage()
    generated = backend.generate(prompts, batched, generationconfig)

    return usage_on_entry + usage_after_init + list(generated)

In [5]:
if __name__ == "__main__":
    from transformers import AutoTokenizer

    set_seed(42)

    # The same prompt repeated 765 times.
    prompts = ["The biggest challenge we face is" for _ in range(765)]

    # A matching "regular" (Hugging Face) run used to live here with
    # max_tokens/min_tokens of 512, temperature 1e-6 and top_p None; it is
    # currently disabled and only the vLLM backend is exercised.
    generationconfig_vllm = dict(
        max_tokens=505,
        min_tokens=505,
        do_sample=False,
        top_k=1,
        temperature=1,
        n=1,
        seed=42,
    )

    output_vllm = main(prompts, "vLLM", True, generationconfig_vllm)

    # Tokenizer used by the inspection cells further down.
    checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)



INFO 07-09 23:04:31 llm_engine.py:161] Initializing an LLM engine (v0.5.0.post1) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B-Instruct)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


attention backend:  _Backend.FLASH_ATTN
attention backend:  _Backend.FLASH_ATTN
INFO 07-09 23:04:32 weight_utils.py:218] Using model weights format ['*.safetensors']
INFO 07-09 23:04:41 model_runner.py:160] Loading model weights took 14.9595 GB
gpu 0.9
gpu_memory: free, total, cache, gpu_utilization, num_cpu_blocks (59444166656, 85024112640, 2097152, 0.9, 24499)
INFO 07-09 23:04:43 gpu_executor.py:83] # GPU blocks: 24499, # CPU blocks: 2048
kv_cache_shape: 2, num_blocks, block_size, num_kv_heads, head_size (2, 24499, 16, 8, 128)
is_pin_memory_available: False
kv_cache_shape: 2, num_blocks, block_size, num_kv_heads, head_size (2, 2048, 16, 8, 128)
is_pin_memory_available: True
INFO 07-09 23:04:46 model_runner.py:889] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-09 23:04:46 model_runner.py:893] CUDA graphs can take additional 1

Processed prompts:   0%| | 0/102



Processed prompts: 100%|█| 1024/
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [28]:
len(
    tokenizer.encode(
        "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ..."
    )
)

90

In [27]:
len(
    "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...".split(
        " "
    )
)

89

In [34]:
# 24499 GPU KV-cache blocks x 16 tokens per block (both values appear in the vLLM
# startup log), divided by 512 — presumably the tokens per sequence; confirm.
# The quotient is how many such sequences fit in the cache at once.
24499 * 16 / 512

765.59375

In [33]:
from collections import Counter

# Word frequencies over every space-separated word of the generated texts.
# `output_vllm[2]` is the list of texts only while each of the two GPU-usage
# snapshots placed in front of it holds exactly one line (single-GPU machine).
x = Counter(word for text in output_vllm[2] for word in text.split(" "))
x["..."]

f"total blocks: {24499}, blocks required: {1024 * 512 / 16}, unsaturated slots: {89352 / 16}"

'total blocks: 24499, blocks required: 32768.0, unsaturated slots: 5584.5'

5584.5

89352

In [9]:
#### TODO: add this to FileUtils
import os


def find(name, path):
    """Return the path of the first file called `name` found under `path`.

    The tree is traversed with `os.walk`; the first directory whose file list
    contains `name` wins. Returns None when no such file exists.
    """
    matches = (
        os.path.join(folder, name)
        for folder, _subdirs, filenames in os.walk(path)
        if name in filenames
    )
    return next(matches, None)


# Locate vLLM's scheduler source inside the installed package. The package
# directory is derived from the imported module instead of a hard-coded,
# user-specific absolute path, so the cell works in any environment.
find(
    "scheduler.py",
    os.path.dirname(vllm.__file__),
)

'/home/hyohyeongjang/.conda/envs/hyohyeongjang_base/lib/python3.9/site-packages/vllm/core/scheduler.py'

In [10]:
# NOTE(review): the origin of 67845 and 17619 is not recorded in this notebook —
# presumably two memory readings in MB whose difference is converted by /1024; confirm.
(67845 - 17619) / 1024

49.048828125

In [11]:
# NOTE(review): looks like a size estimate scaled by 1024 * 1024; the meaning of
# the individual factors is not recorded in this notebook — confirm.
4 * 128 * 2 * 4096 * 32 * 2 / (1024 * 1024)

256.0

In [12]:
# NOTE(review): undocumented scratch arithmetic; the meaning of 1824 and 32 is
# not recorded in this notebook — confirm or remove.
1824 * 32 / 1024

57.0

In [13]:
# Load the model for inspection; relies on `checkpoint` assigned in the run cell above.
model = AutoModelForCausalLM.from_pretrained(checkpoint)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
# Total number of elements across all parameter tensors of the loaded model.
pytorch_total_params = sum(p.numel() for p in model.parameters())
pytorch_total_params

8030261248