### Offline Serving

In [1]:
import torch
from datasets import load_dataset

def make_prompt(ddl, question, query=''):
    prompt = f"""당신은 SQL을 생성하는 SQL 봇입다. DDL의 테이블을 활용한 Question을 해결할 수 있는 SQL 쿼리를 생성하세요.

### DDL:
{ddl}

### Question:
{question}

### SQL:
{query}"""
    return prompt

dataset = load_dataset("shangrilar/ko_text2sql", "origin")['test']
dataset = dataset.to_pandas()

for idx, row in dataset.iterrows():
    prompt = make_prompt(row['context'], row['question'])
    dataset.loc[idx, 'prompt'] = prompt

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_id = "shangrilar/yi-ko-6b-text2sql"
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_id)
hf_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

2024-11-30 12:55:45.826018: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-30 12:55:45.826089: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-30 12:55:46.014250: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-30 12:55:46.403075: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
The `load_in_4bit` and `load_in_8bit` arguments are d

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
import time
for batch_size in [1, 2, 4, 8, 16, 32]:
    start_time = time.time()
    hf_pipeline(dataset['prompt'].tolist(), max_new_tokens=128, batch_size=batch_size)
    print(f"{batch_size}: {time.time() - start_time}")

1: 308.8398838043213
2: 278.75573992729187
4: 189.21455264091492
8: 128.89672327041626
16: 97.38903379440308
32: 82.73513388633728


In [2]:
from vllm import LLM, SamplingParams

model_id = "shangrilar/yi-ko-6b-text2sql"
llm = LLM(model=model_id, dtype=torch.float16, max_model_len=1024)

2024-11-30 14:23:34.457784: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-30 14:23:34.457842: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-30 14:23:34.459172: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-30 14:23:34.473675: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


INFO 11-30 14:23:53 config.py:350] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
INFO 11-30 14:23:53 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='shangrilar/yi-ko-6b-text2sql', speculative_config=None, tokenizer='shangrilar/yi-ko-6b-text2sql', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=shangrilar/yi-ko-6b-tex

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 11-30 14:25:18 model_runner.py:1077] Loading model weights took 11.5127 GB
INFO 11-30 14:25:57 worker.py:232] Memory profiling results: total_gpu_memory=12.00GiB initial_memory_usage=12.00GiB peak_torch_memory=12.25GiB memory_usage_post_profile=12.00GiB non_torch_memory=0.48GiB kv_cache_size=-1.93GiB gpu_memory_utilization=0.90
INFO 11-30 14:25:57 gpu_executor.py:113] # GPU blocks: 0, # CPU blocks: 4096
INFO 11-30 14:25:57 gpu_executor.py:117] Maximum concurrency for 1024 tokens per request: 0.00x


ValueError: No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.

In [None]:
import time

for max_num_seqs in [1, 2, 4, 8, 16, 32]:
    start_time = time.time()
    llm.llm_engine.scheduler_config.max_num_seqs = max_num_seqs
    sampling_params = SamplingParams(temperature=1, top_p=1, max_tokens=128)
    outputs = llm.generate(dataset['prompt'].tolist(), sampling_params)
    print(f"{max_num_seqs}: {time.time() - start_time}")

### Online Serving

In [3]:
!nohup python -m vllm.entrypoints.openai.api_server \
--model shangrilar/yi-ko-6b-text2sql --host 127.0.0.1 --port 8888 --max-model-len 1024

nohup: ignoring input and appending output to 'nohup.out'


In [4]:
!curl http://localhost:8888/v1/models

curl: (7) Failed to connect to localhost port 8888 after 4 ms: Couldn't connect to server


In [5]:
import json

json_data = json.dumps(
    {
        "model": "shangrilar/yi-ko-6b-text2sql",
        "prompt": dataset.loc[0, "prompt"],
        "max_tokens": 128,
        "temperature": 1
    }
)

!curl http://localhost:8888/v1/completions \
    -H "Content-Type: application/json" \
    -d '{json_data}'

curl: (7) Failed to connect to localhost port 8888 after 4 ms: Couldn't connect to server


In [None]:
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8888/v1"
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
completion = client.completions.create(
    model="shangrilar/yi-ko-6b-text2sql",
    prompt=dataset.loc[0, 'prompt'],
    max_tokens=128
)
print("생성 결과:", completion.choices[0].text)