# vLLM

In [1]:
import re

import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from tqdm.auto import tqdm

In [2]:
# Hugging Face model id; the tokenizer and the vLLM engine are both loaded from it.
model_name = "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated"

In [3]:
# Tokenizer for the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Data preparation

In [4]:
import pandas as pd

In [6]:
# Alternative input file, currently disabled (presumably answers from another model — confirm):
# df = pd.read_csv('data/llama_31_8b_answers.csv')
# Load the answers that will be translated and print a column/dtype summary.
df = pd.read_csv('data/Mistral_7B_Instruct_v02_answers.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27999 entries, 0 to 27998
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   prompt                   27999 non-null  object
 1   lang                     27999 non-null  object
 2   scenario                 27999 non-null  object
 3   mistral_7b_instruct_v02  27999 non-null  object
dtypes: object(4)
memory usage: 875.1+ KB


# No quantization

In [7]:
# Create the vLLM engine in bfloat16; no quantization argument is passed.
# NOTE(review): trust_remote_code=True allows code shipped in the model repository
# to run at load time — confirm this checkpoint actually requires it.
llm = LLM(model=model_name, dtype=torch.bfloat16, trust_remote_code=True)

INFO 08-09 07:48:33 config.py:820] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 08-09 07:48:33 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated', speculative_config=None, tokenizer='mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated, use_v2_block_manager=False, en

[W809 07:48:37.534618625 socket.cpp:697] [c10d] The client socket cannot be initialized to connect to [gn34.zhores]:44461 (errno: 97 - Address family not supported by protocol).


INFO 08-09 07:48:41 weight_utils.py:225] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 08-09 07:51:05 model_runner.py:732] Loading model weights took 14.9888 GB
INFO 08-09 07:51:07 gpu_executor.py:102] # GPU blocks: 28232, # CPU blocks: 2048
INFO 08-09 07:51:18 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-09 07:51:18 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-09 07:51:45 model_runner.py:1225] Graph capturing finished in 27 secs.


In [8]:
def translation_prompt(text):
    """Build the instruction that asks the model to translate ``text`` into English.

    Args:
        text: The text to translate; it is interpolated between single quotes.

    Returns:
        A three-line instruction string: a role line, the translation request
        containing ``text``, and a line asking for the translation only.
    """
    # The wording is kept exactly as before so generated outputs stay comparable.
    return (
        "You are school English teacher\n"
        f"Translate following text into English '{text}'\n"
        "Provide only translation"
    )

In [9]:
def generate_translation_prompt(prompt):
    """Render the translation request for ``prompt`` as a chat-formatted string.

    The instruction text from ``translation_prompt`` is sent as a single user
    turn and passed through the tokenizer's chat template, which returns text
    (``tokenize=False``) with the generation prompt appended.
    """
    chat = [{"role": "user", "content": translation_prompt(prompt)}]
    return tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

In [11]:
# Build one chat-formatted translation prompt per answer. Only a single column is
# needed, so map over that column instead of applying a function row by row.
translate_prompts = df['mistral_7b_instruct_v02'].map(generate_translation_prompt)

In [12]:
# Decoding settings shared by every translation request in this run.
sampling_params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=512)

In [13]:
# Generate translations for all prompts in one batched call.
outputs_translate = llm.generate(list(translate_prompts), sampling_params)

Processed prompts:   5%|▌         | 1532/27999 [01:51<13:20, 33.07it/s, est. speed input: 3319.46 toks/s, output: 1951.19 toks/s] 

KeyboardInterrupt: 

In [None]:
# Keep only the text of the first completion for each prompt.
translated = [result.outputs[0].text for result in outputs_translate]

In [None]:
# Some generations contain a "Here is the translation:" preamble (in any letter case).
# Cut the text where that preamble actually ends rather than at a fixed offset, so
# leading whitespace or extra words before it cannot shift the cut position.
_TRANSLATION_PREAMBLE = re.compile(r'here is the translation:', re.IGNORECASE)


def _strip_translation_preamble(text):
    """Return ``text`` without the translation preamble and surrounding whitespace.

    Everything up to and including the first "here is the translation:" (matched
    case-insensitively) is dropped; text without the preamble is only stripped.
    """
    found = _TRANSLATION_PREAMBLE.search(text)
    if found is None:
        return text.strip()
    return text[found.end():].strip()


translation_fixed = [_strip_translation_preamble(tr) for tr in translated]

In [None]:
# Attach the cleaned translations as a new column; assumes `translation_fixed`
# is in the same row order as `df` — TODO confirm if prompts were filtered or reordered.
df['translated_mistral_7b_instruct_v02'] = translation_fixed

In [None]:
# Save the frame, including the new translation column, without writing the index.
df.to_csv('data/translated_mistral_7b_instruct_v02.csv', index=False)

Processed prompts:   5%|▌         | 1532/27999 [02:04<13:20, 33.07it/s, est. speed input: 3319.46 toks/s, output: 1951.19 toks/s]