# LLM Serving Framework

In [None]:
!pip install vllm==0.4.1 -qqq

## Offline serving

**Prepare dataset**

In [5]:
import torch
from datasets import load_dataset

def make_prompt(ddl, question, query=''):
    prompt = f"""당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question을 해결할 수 있는 SQL 쿼리를 생성하세요.

### DDL:
{ddl}

### Question:
{question}

### SQL:
{query}"""
    return prompt

In [6]:
dataset = load_dataset("shangrilar/ko_text2sql", "origin")['test']
dataset = dataset.to_pandas()

In [10]:
dataset

Unnamed: 0,db_id,context,question,answer
0,1,CREATE TABLE quests (\n quest_id INT PRIMARY ...,각 보상 아이템별로 보상 경험치의 합을 구해줘,"SELECT reward_items, SUM(reward_experience) AS..."
1,1,CREATE TABLE players (\n player_id INT PRIMAR...,사용자 이름에 'admin'이 포함되어 있는 계정의 수를 알려주세요.,SELECT COUNT(*) FROM players WHERE username LI...
2,1,CREATE TABLE quests (\n quest_id INT PRIMARY ...,퀘스트 진행 상황이 100%인 퀘스트의 이름과 보상 경험치는 얼마인가요?,"SELECT q.name, q.reward_experience FROM quests..."
3,1,CREATE TABLE characters (\n character_id INT ...,경험이 5000000 이상이거나 직업이 전사인 캐릭터들의 이름은 무엇인가,SELECT name FROM characters WHERE experience >...
4,1,CREATE TABLE characters (\n character_id INT ...,레벨이 20 이상인 플레이어의 캐릭터 이름과 해당 캐릭터의 스킬 이름을 알아보세요.,"SELECT C.name, ST.skill_name FROM characters A..."
...,...,...,...,...
107,1,CREATE TABLE quests (\n quest_id INT PRIMARY ...,캐릭터가 완료한 퀘스트 중 보상 경험치가 100 이상 200 이하인 퀘스트의 이름과...,"SELECT q.name, q.reward_items FROM quests AS q..."
108,1,CREATE TABLE characters (\n character_id INT ...,"경험이 1000에서 2000 사이인 캐릭터들의 이름, 레벨, 경험치, 아이템 이름 ...","SELECT c.name, c.level, c.experience, i.item_n..."
109,1,CREATE TABLE npcs (\n npc_id INT PRIMARY KEY ...,각 역할별로 npc의 총 수를 알려주세요.,"SELECT role, COUNT(npc_id) FROM npcs GROUP BY ..."
110,1,CREATE TABLE characters (\n character_id INT ...,장비가 장착된 상태인 캐릭터의 플레이어 ID와 아이템 이름을 나열하십시오.,"SELECT T1.player_id, T2.item_name FROM charact..."


In [9]:
dataset['context'][0], dataset['question'][0]

('CREATE TABLE quests (\n  quest_id INT PRIMARY KEY AUTO_INCREMENT,\n  name VARCHAR(255) NOT NULL,\n  description TEXT,\n  reward_experience INT NOT NULL,\n  reward_items VARCHAR(255)\n);',
 '각 보상 아이템별로 보상 경험치의 합을 구해줘')

In [7]:
for idx, row in dataset.iterrows():
    prompt = make_prompt(row['context'], row['question'])
    dataset.loc[idx, 'prompt'] = prompt

In [12]:
dataset

Unnamed: 0,db_id,context,question,answer,prompt
0,1,CREATE TABLE quests (\n quest_id INT PRIMARY ...,각 보상 아이템별로 보상 경험치의 합을 구해줘,"SELECT reward_items, SUM(reward_experience) AS...",당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question...
1,1,CREATE TABLE players (\n player_id INT PRIMAR...,사용자 이름에 'admin'이 포함되어 있는 계정의 수를 알려주세요.,SELECT COUNT(*) FROM players WHERE username LI...,당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question...
2,1,CREATE TABLE quests (\n quest_id INT PRIMARY ...,퀘스트 진행 상황이 100%인 퀘스트의 이름과 보상 경험치는 얼마인가요?,"SELECT q.name, q.reward_experience FROM quests...",당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question...
3,1,CREATE TABLE characters (\n character_id INT ...,경험이 5000000 이상이거나 직업이 전사인 캐릭터들의 이름은 무엇인가,SELECT name FROM characters WHERE experience >...,당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question...
4,1,CREATE TABLE characters (\n character_id INT ...,레벨이 20 이상인 플레이어의 캐릭터 이름과 해당 캐릭터의 스킬 이름을 알아보세요.,"SELECT C.name, ST.skill_name FROM characters A...",당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question...
...,...,...,...,...,...
107,1,CREATE TABLE quests (\n quest_id INT PRIMARY ...,캐릭터가 완료한 퀘스트 중 보상 경험치가 100 이상 200 이하인 퀘스트의 이름과...,"SELECT q.name, q.reward_items FROM quests AS q...",당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question...
108,1,CREATE TABLE characters (\n character_id INT ...,"경험이 1000에서 2000 사이인 캐릭터들의 이름, 레벨, 경험치, 아이템 이름 ...","SELECT c.name, c.level, c.experience, i.item_n...",당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question...
109,1,CREATE TABLE npcs (\n npc_id INT PRIMARY KEY ...,각 역할별로 npc의 총 수를 알려주세요.,"SELECT role, COUNT(npc_id) FROM npcs GROUP BY ...",당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question...
110,1,CREATE TABLE characters (\n character_id INT ...,장비가 장착된 상태인 캐릭터의 플레이어 ID와 아이템 이름을 나열하십시오.,"SELECT T1.player_id, T2.item_name FROM charact...",당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question...


In [13]:
dataset['prompt'][0]

'당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question을 해결할 수 있는 SQL 쿼리를 생성하세요.\n\n### DDL:\nCREATE TABLE quests (\n  quest_id INT PRIMARY KEY AUTO_INCREMENT,\n  name VARCHAR(255) NOT NULL,\n  description TEXT,\n  reward_experience INT NOT NULL,\n  reward_items VARCHAR(255)\n);\n\n### Question:\n각 보상 아이템별로 보상 경험치의 합을 구해줘\n\n### SQL:\n'

**Set pipeline**

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

2024-11-19 03:56:13.973338: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-19 03:56:14.066764: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-19 03:56:14.070583: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-11-19 03:56:14.070594: I tensorflow/stream_executor/cuda

In [15]:
model_id = "shangrilar/yi-ko-6b-text2sql"

model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

hf_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/9.74k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.28M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


**Check inference time according to batch size**

In [7]:
import time

In [17]:
for batch_size in [1, 2, 4, 8, 16, 32]:
    start_time = time.time()
    hf_pipeline(dataset['prompt'].tolist(), max_new_tokens=128, batch_size=batch_size)
    print(f"{batch_size}: {time.time() - start_time}")

1: 163.83604764938354
2: 131.64016199111938
4: 87.19056105613708
8: 56.38193464279175
16: 33.51246905326843
32: 23.756344318389893


**Load vLLM model**

In [4]:
from vllm import LLM, SamplingParams

In [5]:
model_id = "shangrilar/yi-ko-6b-text2sql"

llm = LLM(model=model_id, dtype=torch.float16, max_model_len=1024)

INFO 11-19 04:25:11 llm_engine.py:98] Initializing an LLM engine (v0.4.1) with config: model='shangrilar/yi-ko-6b-text2sql', speculative_config=None, tokenizer='shangrilar/yi-ko-6b-text2sql', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 11-19 04:25:11 utils.py:608] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 11-19 04:25:12 selector.py:77] Cannot use FlashAttention backend because the flash_attn package is not found. Please install it for better performance.
INFO 11-19 04:25:12 selector.py:33] Using XFormers backend.
INFO 11-19 04:25:13 weight_utils.py:193] Using model weights format ['*.safetensors']
INFO 11-19 04:25:16 model_runner.py:173] Loading model weights took 11.5127 GB
INFO 11-19 04:25:17 gpu_executor.py:119] # GPU blocks: 9049, # CPU blocks: 4096
INFO 11-19 04:25:18 model_runner.py:976] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-19 04:25:18 model_runner.py:980] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eag

**Measuring offline inference time using vLLM**

In [8]:
# max_num_seqs : Control the number of input data to perform inference on simultaneously
for max_num_seqs in [1, 2, 4, 8, 16, 32]:
    start_time = time.time()
    llm.llm_engine.scheduler_config.max_num_seqs = max_num_seqs
    sampling_params = SamplingParams(temperature=1, top_p=1, max_tokens=128)
    output = llm.generate(dataset['prompt'].to_list(), sampling_params)
    print(f"{max_num_seqs}: {time.time() - start_time}")

Processed prompts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 112/112 [01:02<00:00,  1.80it/s]


1: 62.1559956073761


Processed prompts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 112/112 [00:34<00:00,  3.27it/s]


2: 34.31376624107361


Processed prompts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 112/112 [00:19<00:00,  5.61it/s]


4: 19.977014541625977


Processed prompts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 112/112 [00:11<00:00,  9.57it/s]


8: 11.73049283027649


Processed prompts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 112/112 [00:07<00:00, 14.36it/s]


16: 7.821612358093262


Processed prompts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 112/112 [00:06<00:00, 18.36it/s]

32: 6.1246702671051025





## Online serving

**Running a vLLM API server for online serving**

In [1]:
%%bash

nohup python3 -m vllm.entrypoints.openai.api_server \
--model shangrilar/yi-ko-6b-text2sql --host 127.0.0.1 --port 8890 --max-model-len 1024 &

INFO 11-19 04:34:02 api_server.py:151] vLLM API server version 0.4.1
INFO 11-19 04:34:02 api_server.py:152] args: Namespace(host='127.0.0.1', port=8890, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, served_model_name=None, lora_modules=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], model='shangrilar/yi-ko-6b-text2sql', tokenizer=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=False, download_dir=None, load_format='auto', dtype='auto', kv_cache_dtype='auto', quantization_param_path=None, max_model_len=1024, guided_decoding_backend='outlines', worker_use_ray=False, pipeline_parallel_size=1, tensor_parallel_size=1, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=16, enable_prefix_ca

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 11-19 04:34:02 utils.py:608] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 11-19 04:34:03 selector.py:77] Cannot use FlashAttention backend because the flash_attn package is not found. Please install it for better performance.
INFO 11-19 04:34:03 selector.py:33] Using XFormers backend.
INFO 11-19 04:34:04 weight_utils.py:193] Using model weights format ['*.safetensors']
INFO 11-19 04:34:07 model_runner.py:173] Loading model weights took 11.5127 GB
INFO 11-19 04:34:07 gpu_executor.py:119] # GPU blocks: 9049, # CPU blocks: 4096
INFO 11-19 04:34:09 model_runner.py:976] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-19 04:34:09 model_runner.py:980] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eag

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO:     Started server process [18633]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8890 (Press CTRL+C to quit)


INFO 11-19 04:34:24 metrics.py:229] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%
INFO 11-19 04:34:34 metrics.py:229] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%
INFO 11-19 04:34:44 metrics.py:229] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%
INFO 11-19 04:34:54 metrics.py:229] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%
INFO 11-19 04:35:04 metrics.py:229] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapp

INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [18633]


In [None]:
!curl http://localhost:8888/v1/models

**API request**

In [3]:
import json

In [8]:
json_data = json.dumps(
    {"model": "shangrilar/yi-ko-6b-text2sql",
     "prompt": dataset.loc[0, "prompt"],
     "max_tokens": 128,
     "temperature": 1}
)

In [None]:
!curl http://localhost:8888/v1/completions \
    -H "Content-Type: application/json" \
    -d '{json_data}'

**OpenAI 클라이언트를 사용한 API 요청**

In [9]:
from openai import OpenAI

In [None]:
# openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8888/v1"
client = OpenAI(
    # api_key=openai_api_key,
    base_url=openai_api_base,
)
completion = client.completions.create(model="shangrilar/yi-ko-6b-text2sql", prompt=dataset.loc[0, 'prompt'], max_tokens=128)
print("생성 결과:", completion.choices[0].text)