In [1]:
import torch, importlib, time, sys
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
from os import path
import os

utils_path = path.abspath(path.join(os.path.abspath(""), "../myUtils"))
sys.path.append(utils_path)
import deviceUtils, IOUtils, checkUtils, modelUtils, parallelUtils

In [None]:
AutoModelForCausalLM.from_pretrained("")

In [3]:
# temperature=0.8, top_p=0.95,


class PagedAttention:
    def __init__(self, checkpoint):
        self.checkpoint = checkpoint

    def generate(self, prompts, batch, generationconfig):
        top_k = generationconfig.pop("top_k")
        n = generationconfig.pop("n")
        max_tokens = generationconfig.pop("max_tokens")
        min_tokens = generationconfig.pop("min_tokens")
        seed = generationconfig.pop("seed")
        temperature = generationconfig.pop("temperature")

        sampling_params = SamplingParams(
            top_k=top_k,
            temperature=temperature,
            n=n,
            max_tokens=max_tokens,
            min_tokens=min_tokens,
            seed=seed,
        )

        # EngineArgs(self.checkpoint, max_model_len=max_tokens)

        s1 = time.time()
        llm = LLM(model=self.checkpoint)
        print(llm.llm_engine.scheduler_config.max_model_len)

        s2 = time.time()
        lst = []

        if batch:
            outputs = llm.generate(prompts, sampling_params=sampling_params)

            lst = []
            for output in outputs:
                prompt: str = output.prompt
                generated_text = output.outputs[0].text
                lst.append(prompt + generated_text)

            s3 = time.time()

            return lst, s3 - s2, DeviceUtil.gpu_usage()

        else:
            for i in prompts:
                outputs = llm.generate(i, sampling_params=sampling_params)

                for output in outputs:
                    prompt: str = output.prompt
                    generated_text = output.outputs[0].text
                    lst.append(prompt + generated_text)

            s3 = time.time()
            return lst, s3 - s2, DeviceUtil.gpu_usage()


class NormalGenerator:
    def __init__(self):

        self.model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Meta-Llama-3-8B-Instruct",
            attn_implementation="flash_attention_2",
        )  # , token = "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"
        self.model.half()
        self.tokenizer = AutoTokenizer.from_pretrained(
            "meta-llama/Meta-Llama-3-8B-Instruct"
        )  # , token = "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")

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def generate(self, prompts, batch, generationconfig):

        self.tokenizer.pad_token = self.tokenizer.eos_token
        top_k = generationconfig.pop("top_k")
        n = generationconfig.pop("n")
        max_tokens = generationconfig.pop("max_tokens")
        min_tokens = generationconfig.pop("min_tokens")
        do_sample = generationconfig.pop("do_sample")

        if batch:

            s = time.time()

            tokenized_chat = self.tokenizer(
                prompts, padding=True, truncation=True, return_tensors="pt"
            ).to(self.device)

            lst = self.model.generate(
                **tokenized_chat,
                pad_token_id=self.tokenizer.eos_token_id,
                do_sample=do_sample,
                min_length=min_tokens,
                max_length=max_tokens,
            )
            lst = self.tokenizer.batch_decode(lst, skip_special_tokens=True)

            s1 = time.time()

            return lst, s1 - s, DeviceUtil.gpu_usage()

        else:

            s = time.time()

            lst = []
            for i in prompts:
                tokenized_chat = self.tokenizer(
                    i, padding=True, truncation=True, return_tensors="pt"
                ).to(self.device)

                out = self.model.generate(
                    **tokenized_chat,
                    pad_token_id=self.tokenizer.eos_token_id,
                    do_sample=do_sample,
                    min_length=min_tokens,
                    max_length=max_tokens,
                )
                out = self.tokenizer.batch_decode(out, skip_special_tokens=True)
                lst.append(out[0])

            s1 = time.time()

            return lst, s1 - s, DeviceUtil.gpu_usage()

In [4]:
# def my_seed(seed=42):
#     import numpy as np
#     import random
#     import os

#     np.random.seed(seed)
#     random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = False
#     os.environ["PYTHONHASHSEED"] = str(seed)


def main(prompts, machine_type, batched, generationconfig):

    x = DeviceUtil.gpu_usage()
    checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"
    # generationconfig = GenerationConfig.from_pretrained(
    #     checkpoint, max_length=128, min_lenght=128, do_sample=False
    # )

    if machine_type == "vLLM":
        pagedattention = PagedAttention(checkpoint)
        y = DeviceUtil.gpu_usage()

        output = pagedattention.generate(prompts, batched, generationconfig)

    elif machine_type == "regular":
        gen = NormalGenerator()
        y = DeviceUtil.gpu_usage()

        output = gen.generate(prompts, batched, generationconfig)

    else:
        y = []
        output = []

    output = x + y + list(output)

    return output

In [5]:
if __name__ == "__main__":

    set_seed(42)

    prompts = ["The biggest challenge we face is"] * 768

    # generationconfig_regular = {
    #     "max_tokens": 512,
    #     "min_tokens": 512,
    #     "do_sample": False,
    #     "top_k": 1,
    #     "n": 1,
    #     "seed": 42,
    #     "temperature": 1e-6,
    #     "top_p": None,
    # }

    # output_regular = main(
    #     prompts=prompts,
    #     machine_type="regular",
    #     batched=True,
    #     generationconfig=generationconfig_regular,
    # )

    generationconfig_vllm = {
        "max_tokens": 505,
        "min_tokens": 505,
        "do_sample": False,
        "top_k": 1,
        "temperature": 1,
        "n": 1,
        "seed": 42,
    }

    output_vllm = main(
        prompts=prompts,
        machine_type="vLLM",
        batched=True,
        generationconfig=generationconfig_vllm,
    )

    from transformers import AutoTokenizer

    checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

NameError: name 'DeviceUtil' is not defined

In [None]:
len(
    tokenizer.encode(
        "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ..."
    )
)

In [None]:
len(
    "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...".split(
        " "
    )
)

In [None]:
24499 * 16 / 512

In [None]:
from collections import Counter

x = Counter([i for j in [i.split(" ") for i in output_vllm[2]] for i in j])
x["..."]

f"total blocks: {24499}, blocks required: {1024 * 512 / 16}, unsaturated slots: {89352 / 16}"

In [None]:
#### FileUtils에 추가하기
import os


def find(name, path):
    for root, dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)


find(
    "scheduler.py",
    "/home/hyohyeongjang/.conda/envs/hyohyeongjang_base/lib/python3.9/site-packages/vllm",
)

In [None]:
(67845 - 17619) / 1024

In [None]:
4 * 128 * 2 * 4096 * 32 * 2 / (1024 * 1024)

In [None]:
1824 * 32 / 1024

In [None]:
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [None]:
pytorch_total_params = sum(p.numel() for p in model.parameters())
pytorch_total_params