# vLLM

In [10]:
import os
# Expose only GPUs 1 and 2 to this process (two devices, matching the
# tensor_parallel_size=2 used when the engine is created further down).
# NOTE(review): presumably this has to run before CUDA is initialised — confirm cell order.
os.environ.update(CUDA_VISIBLE_DEVICES="1,2")

In [3]:
import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from tqdm.auto import tqdm

In [4]:
# Hugging Face model id; used both for the tokenizer and for the vLLM engine below.
model_name = "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated"

In [5]:
# Tokenizer for the same model; used later only to render chat-template prompts as text.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Data preparation

In [6]:
import pandas as pd

In [7]:
# Input table of model responses to translate. The commented-out lines are
# alternative input files; exactly one read_csv should be active at a time.
# NOTE(review): the output path written at the end of the notebook is hard-coded
# to match the quip2 input — update both together when switching inputs.
# df = pd.read_csv('data/llama_31_8b_answers.csv')
# df = pd.read_csv('data/llama_quip4.csv')
df = pd.read_csv('data/responses_quip2.csv')
# Quick schema / null-count check; the `response` column is the text that gets translated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55998 entries, 0 to 55997
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   prompt         55998 non-null  object
 1   lang           55998 non-null  object
 2   scenario       55998 non-null  object
 3   eng_prompt     55998 non-null  object
 4   method         55998 non-null  object
 5   model          55998 non-null  object
 6   tokenized_len  55998 non-null  int64 
 7   source_idx     55998 non-null  int64 
 8   response       55998 non-null  object
dtypes: int64(2), object(7)
memory usage: 3.8+ MB


# No quantization

In [11]:
# Create the vLLM engine in bfloat16, split across two GPUs with tensor parallelism.
# NOTE(review): trust_remote_code=True permits code shipped in the model repo to
# run locally — confirm it is actually required for this checkpoint.
llm = LLM(
    model=model_name,
    tensor_parallel_size=2,
    dtype=torch.bfloat16,
    trust_remote_code=True,
)

INFO 09-08 11:36:53 config.py:813] Defaulting to use mp for distributed inference
INFO 09-08 11:36:53 config.py:911] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 09-08 11:36:53 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated', speculative_config=None, tokenizer='mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_ti

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=729361)[0;0m INFO 09-08 11:37:27 model_runner.py:890] Loading model weights took 7.5122 GB
INFO 09-08 11:37:27 model_runner.py:890] Loading model weights took 7.5122 GB
INFO 09-08 11:37:29 distributed_gpu_executor.py:56] # GPU blocks: 31624, # CPU blocks: 4096
INFO 09-08 11:37:35 model_runner.py:1181] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
[1;36m(VllmWorkerProcess pid=729361)[0;0m INFO 09-08 11:37:35 model_runner.py:1185] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-08 11:37:35 model_runner.py:1181] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not stati

In [12]:
def translation_prompt(text):
    """Return the user instruction asking the model to translate ``text`` into English.

    ``text`` is interpolated between single quotes on the second line. The
    wording is sent to the model verbatim, so it is reproduced exactly.
    """
    return (
        "You are school English teacher\n"
        f"Translate following text into English '{text}'\n"
        "Provide only translation"
    )

In [13]:
def generate_translation_prompt(prompt):
    """Wrap ``prompt`` in the translation instruction and render it as a chat prompt.

    Builds a single-turn conversation (one ``user`` message) and renders it
    with the tokenizer's chat template. The result is returned as text
    (``tokenize=False``) with the generation prompt appended, ready to be
    passed to the engine.
    """
    conversation = [{"role": "user", "content": translation_prompt(prompt)}]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

In [14]:
# Build one chat-formatted translation prompt per row of `df`.
# Only the `response` column is read, so map over that column directly rather
# than using a row-wise `df.apply(..., axis=1)`, which has to build a Series
# for every row just to access one field. The result is still a Series aligned
# with df's index (it is now named "response"), and downstream code only
# consumes it as a list.
translate_prompts = df["response"].map(generate_translation_prompt)

In [15]:
# Decoding settings shared by every translation request: temperature 0.6,
# top-p 0.9, and at most 512 generated tokens per prompt.
# NOTE(review): no seed is passed, so presumably outputs differ between runs —
# confirm whether reproducibility matters for this dataset.
sampling_params = SamplingParams(max_tokens=512, temperature=0.6, top_p=0.9)

In [16]:
# Generate translations for all prompts in one batched call
# (the recorded run took a little over an hour for 55,998 prompts).
outputs_translate = llm.generate(translate_prompts.tolist(), sampling_params)

Processed prompts: 100%|█| 55998/55998 [1:04:35<00:00, 14.45it/s, est. speed input: 2892.51 toks/s, ou


In [17]:
# Keep only the text of the first completion returned for each request.
translated = []
for request_output in outputs_translate:
    first_completion = request_output.outputs[0]
    translated.append(first_completion.text)

In [18]:
import re

# Lead-in phrase to remove from generated translations (matched case-insensitively).
_TRANSLATION_PREAMBLE = re.compile(re.escape('here is the translation:'), re.IGNORECASE)


def _strip_translation_preamble(text):
    """Return ``text`` without the "Here is the translation:" lead-in, whitespace-trimmed.

    If the phrase occurs, everything up to and including its first occurrence
    is dropped. The cut is made at the actual end of the match, not at a fixed
    offset: the earlier fixed ``[24:]`` slice was only correct when the phrase
    started at index 0, and otherwise removed the wrong characters (e.g. when
    the output begins with whitespace or "Sure! ").

    Matching is done with a case-insensitive regex on the original string
    rather than on ``text.lower()``, so the match offsets always refer to
    ``text`` itself.
    """
    found = _TRANSLATION_PREAMBLE.search(text)
    if found is None:
        return text.strip()
    return text[found.end():].strip()


# One cleaned translation per generated output, in the same order as `translated`.
translation_fixed = [_strip_translation_preamble(tr) for tr in translated]

In [19]:
# Attach the cleaned translations as a new column (positional assignment, so the
# list must have one entry per row of df).
# NOTE(review): assumes llm.generate returned outputs in the same order as the prompts — confirm.
df['translated_response'] = translation_fixed

In [20]:
# Persist the input table plus the new translated_response column; index=False
# leaves the RangeIndex out of the file.
df.to_csv('data/translated_quip2.csv', index=False)