### Load the dataset

In [1]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Parallel Spanish/Wayuunaiki corpus backed by two line-aligned text files.

    Both files are read as UTF-8. Every line is stripped of surrounding
    whitespace and blank lines are dropped. Item ``idx`` is the pair
    ``(spanish_line, wayuu_line)`` taken from the same position in each file.

    Args:
        spa_path: Path to the Spanish text file, one segment per line.
        wayuu_path: Path to the Wayuunaiki text file, one segment per line.

    Raises:
        ValueError: If the two files do not yield the same number of
            non-blank lines. Without this check a mismatch would surface
            later as an ``IndexError`` or as silently dropped pairs.
    """

    def __init__(self, spa_path, wayuu_path):
        self.spa_lines = self._read_non_blank_lines(spa_path)
        self.wayuu_lines = self._read_non_blank_lines(wayuu_path)

        if len(self.spa_lines) != len(self.wayuu_lines):
            raise ValueError(
                f"Parallel files are not aligned: {spa_path!r} has "
                f"{len(self.spa_lines)} non-blank lines but {wayuu_path!r} "
                f"has {len(self.wayuu_lines)}"
            )

    @staticmethod
    def _read_non_blank_lines(path):
        """Return the stripped, non-empty lines of the UTF-8 file at *path*."""
        with open(path, 'r', encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            return [line for line in stripped if line]

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.spa_lines)

    def __getitem__(self, idx):
        """Return the ``(spanish, wayuu)`` pair at position *idx*."""
        return self.spa_lines[idx], self.wayuu_lines[idx]
    
# Paths of the two parallel text files used for evaluation (validation split,
# per the variable names): one Spanish file and one Wayuunaiki file.
spanish_val_file = 'datasets/dev.es.txt'
wayuu_val_file = 'datasets/dev.guc.txt'

# Build the paired dataset; the evaluation cells wrap it in a DataLoader.
dataset = TextDataset(spanish_val_file, wayuu_val_file)

### Eval function

In [2]:
import torch
import sacrebleu
from tqdm import tqdm
from vllm import SamplingParams

def get_rewards_translation(generations, correct_translations):
    """Score each generated translation against its reference with sentence BLEU.

    Args:
        generations: Candidate translations, one string per sample.
        correct_translations: Reference translations, paired positionally
            with ``generations`` (pairing stops at the shorter sequence).

    Returns:
        A list holding one sentence-level BLEU ``.score`` per pair. The value
        is returned exactly as sacrebleu reports it (its 0-100 scale); no
        rescaling to [0, 1] is applied here.
    """
    scorer = sacrebleu.BLEU(effective_order=True)

    scores = []
    for hypothesis, reference in zip(generations, correct_translations):
        # Each hypothesis is compared against exactly one reference.
        result = scorer.sentence_score(hypothesis, [reference])
        scores.append(result.score)

    return scores

# User-message template for tool-augmented translation. The single `{}` slot is
# filled with the Spanish source text via str.format.
# NOTE(review): the wording (including the typo "doind") is runtime prompt text;
# presumably the evaluated adapters were trained with this exact string, so
# confirm before editing it.
translate_prompt_template_tool="""Translate the following Spanish text into Wayuunaiki.
Begin by identifying any words or phrases you're unsure how to translate. Then, you may look up those words using the dictionary tool by wrapping the Spanish word in <spa_to_wayuu> and </spa_to_wayuu>,
and doind that for every unknown word. The dictionary will return matches enclosed in <matches> and </matches>. You can use the dictionary as many times as necessary.
Once you have all the information you need, provide the final translation enclosed in <answer> and </answer>. For example: <answer> xxx </answer>.

Spanish text: {}"""
def generate_batch_completion(model, tokenizer, prompts: list, actions_num=1, **kwargs):
    """Generate one (optionally tool-augmented) response per Spanish prompt.

    Each prompt is wrapped in the translation chat template and tokenized.
    Generation then runs in rounds. When a sample stops on a tool's
    ``end_token``, the text following the tool's ``start_token`` is passed to
    the tool's ``api`` callable, and the generated tokens plus the encoded
    tool result are appended to that sample's prompt for the next round.

    Args:
        model: Engine whose ``generate`` accepts ``prompt_token_ids``,
            ``sampling_params``, ``lora_request`` and ``use_tqdm``.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``encode``,
            ``eos_token`` and ``eos_token_id``.
        prompts: Spanish source texts, one per sample.
        actions_num: Number of tool-call rounds allowed; one additional round
            is run after them. Ignored (a single round is run) when no tools
            are enabled.
        **kwargs: Optional settings.
            ``tools``: list of dicts with ``start_token``, ``end_token`` and
            ``api`` keys; ``None`` or missing means no tools.
            ``lora_request``: forwarded to ``model.generate``; defaults to
            ``None``.
            ``temperature`` (0.8), ``top_p`` (0.95), ``max_new_tokens`` (512):
            sampling overrides; ``max_new_tokens`` applies per round.

    Returns:
        A tuple ``(responses, tool_used, how_many_tool_calls)`` of three lists
        aligned with ``prompts``: the accumulated response text, whether the
        sample made at least one tool call, and how many calls it made.
    """
    batch = [[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": translate_prompt_template_tool.format(prompt)}
    ] for prompt in prompts]
    texts = tokenizer.apply_chat_template(
        batch,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Only the known sampling keys are overridden from kwargs; unrelated
    # entries such as ``tools`` or ``lora_request`` stay out of this dict.
    sampling_args = {
        'max_new_tokens': 512,
        'temperature': 0.8,
        'top_p': 0.95,
    }
    for key in sampling_args:
        if key in kwargs:
            sampling_args[key] = kwargs[key]

    # ``tools=None`` is what evaluate_model forwards by default, so treat it
    # the same as "no tools" instead of failing when iterating over it.
    tools_enabled = kwargs.get('tools') or []
    lora_request = kwargs.get('lora_request')
    stop_tokens = [tool['end_token'] for tool in tools_enabled]

    # Per-sample token-id lists; extended in place as rounds progress.
    inputs = tokenizer(texts).input_ids

    dones = [False] * len(prompts)
    responses = [""] * len(prompts)
    tool_used = [False] * len(prompts)
    how_many_tool_calls = [0] * len(prompts)

    # The sampling configuration does not change between rounds.
    sampling_params = SamplingParams(
        temperature=sampling_args['temperature'],
        top_p=sampling_args['top_p'],
        top_k=-1,
        max_tokens=sampling_args['max_new_tokens'],
        stop=stop_tokens,
    )

    num_rounds = actions_num + 1 if tools_enabled else 1
    for action_step in range(num_rounds):
        # Finished samples are still submitted; their outputs are ignored below.
        outputs = model.generate(
            prompt_token_ids=inputs,
            sampling_params=sampling_params,
            lora_request=lora_request,
            use_tqdm=False,
        )

        for j, output in enumerate(outputs):
            if dones[j]:
                continue

            completion = output.outputs[0]

            for tool in tools_enabled:
                if completion.stop_reason == tool['end_token'] and tool['start_token'] in completion.text:
                    # The query is whatever follows the first start token.
                    api_args = completion.text.split(tool['start_token'])[1].strip()
                    api_result = tool['api'](api_args)
                    # The stop string is not part of completion.text, so it is
                    # re-added before the tool result.
                    responses[j] += completion.text + f"{tool['end_token']}" + api_result
                    api_result_tokens = tokenizer.encode(api_result, return_tensors=None)
                    inputs[j] += list(completion.token_ids) + api_result_tokens

                    tool_used[j] = True
                    how_many_tool_calls[j] += 1
                    break  # Only one tool can be used at a time

            if completion.finish_reason == "stop" and completion.stop_reason is None:
                # Normal termination: keep the text and close the sample.
                responses[j] += completion.text
                inputs[j] += list(completion.token_ids)
                dones[j] = True
            elif completion.stop_reason not in stop_tokens:
                # Anything else (e.g. hitting the token limit) closes the
                # sample; the text of this round is discarded and only the
                # EOS marker is appended.
                print(f"Unexpected finish reason: {completion.finish_reason} {completion.stop_reason}")
                responses[j] += tokenizer.eos_token
                inputs[j] += [tokenizer.eos_token_id]
                dones[j] = True
            # Otherwise the sample stopped on a tool end token and stays open
            # for the next round. If the start token was missing, nothing was
            # recorded for this round.

    return responses, tool_used, how_many_tool_calls

import re

def extract_answer(response, transform_fn = lambda x: x, nan_val = None)->str|None:
    ans = re.match('.*?<answer>(.*?)</answer>\s*$', response, re.DOTALL|re.MULTILINE)
    if ans:
        try:
            return transform_fn(ans[1].strip())
        except:
            return nan_val
    return nan_val

def evaluate_model(model, tokenizer, dataloader, actions_num=1, lora_request=None, tools=None):
    """Evaluate translation quality and tool usage over a dataloader.

    For every batch, responses are generated with ``temperature=0``,
    ``top_p=1`` and ``max_new_tokens=768``. The ``<answer>`` content is
    extracted (an empty string when absent) and scored with sentence BLEU
    against the target.

    Args:
        model: Generation engine passed to ``generate_batch_completion``.
        tokenizer: Tokenizer passed to ``generate_batch_completion``.
        dataloader: Iterable yielding ``(inputs, targets)`` batches of
            source and reference strings.
        actions_num: Number of tool-call rounds allowed per sample.
        lora_request: Optional LoRA request forwarded to generation.
        tools: Optional list of tool descriptors; ``None`` disables tools.

    Returns:
        A tuple ``(avg_bleu, tools_used_avg, calls_per_sample_avg)``:
        mean sentence BLEU over all samples, the fraction of samples that
        made at least one tool call, and the mean number of calls among the
        samples that used a tool. Each value is 0 when its denominator is 0.
    """
    sum_bleu = 0
    num_samples = 0
    tools_used_in_total = 0
    calls_per_sample = 0

    with torch.no_grad():
        for inputs, targets in tqdm(dataloader):
            # Generate translations; ``tools or []`` keeps the no-tool default
            # iterable for the generation helper.
            outputs, tools_used, how_many_tool_calls = generate_batch_completion(
                model,
                tokenizer,
                inputs,
                actions_num=actions_num,
                lora_request=lora_request,
                tools=tools or [],
                temperature=0,
                top_p=1,
                max_new_tokens=768,
            )

            tools_used_in_total += sum(tools_used)
            calls_per_sample += sum(how_many_tool_calls)

            # Responses without a well-formed <answer> score as an empty string.
            generated_translations = [
                extract_answer(output, transform_fn=lambda x: x.strip(), nan_val='')
                for output in outputs
            ]

            # Calculate BLEU scores
            bleu_scores = get_rewards_translation(generated_translations, targets)

            sum_bleu += sum(bleu_scores)
            num_samples += len(bleu_scores)

    # Guard every ratio: an empty dataloader or a run in which no sample ever
    # called a tool must not raise ZeroDivisionError.
    avg_bleu = sum_bleu / num_samples if num_samples > 0 else 0
    tools_used_avg = tools_used_in_total / num_samples if num_samples > 0 else 0
    calls_per_sample_avg = (
        calls_per_sample / tools_used_in_total if tools_used_in_total > 0 else 0
    )
    return avg_bleu, tools_used_avg, calls_per_sample_avg

  from .autonotebook import tqdm as notebook_tqdm
2025-05-21 15:19:29,715	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


### Tools

In [3]:
def spa_to_wayu_dictionary(spanish_word, max_matches=5,
                           dictionary_path='assets/spanish_to_wayuunaiki_short.csv'):
    """Look up a Spanish word in a comma-separated Spanish/Wayuunaiki dictionary.

    The file is scanned top to bottom. A row matches when ``spanish_word``
    occurs as a whole word (case-insensitive) in the text before the row's
    first comma. Scanning stops once ``max_matches`` rows have been collected.

    Args:
        spanish_word: Word or phrase to search for; it is regex-escaped.
        max_matches: Maximum number of rows to return.
        dictionary_path: Path of the UTF-8 dictionary file, one
            ``spanish,wayuu`` entry per line.

    Returns:
        A string of the form `` <matches> spa: wayuu\\n... </matches>``, or
        `` <matches> No matches found </matches>`` when nothing matched.
    """
    # Compile once instead of rebuilding the pattern for every line.
    pattern = re.compile(rf'\b{re.escape(spanish_word)}\b', re.IGNORECASE)

    all_matches = []
    with open(dictionary_path, 'r', encoding='utf-8') as f:
        for line in f:
            if len(all_matches) >= max_matches:
                break
            # Split on the first comma only, so commas inside the Wayuunaiki
            # side do not break the (spa, wayuu) unpacking below.
            fields = line.strip().split(',', 1)
            if len(fields) != 2:
                # Blank or malformed row without a separator: skip it.
                continue
            if pattern.search(fields[0]):
                all_matches.append(fields)

    if all_matches:
        result = " <matches> " + '\n'.join(f'{spa}: {wayuu}' for spa, wayuu in all_matches) + " </matches>"
    else:
        result = " <matches> No matches found </matches>"

    return result

# Registry of tools the model may call during generation. Each entry provides:
#   - 'api': the callable invoked with the text found after 'start_token'
#   - 'start_token' / 'end_token': the tags delimiting a tool call in the
#     generated text ('end_token' is also used as a generation stop string)
#   - 'name' / 'description': descriptive metadata
TOOLS = [
    {
        'name': 'spa_to_wayu',
        'description': 'A tool that translates a word from Spanish to Wayuunaiki.',
        'api': spa_to_wayu_dictionary,
        'start_token': '<spa_to_wayuu>',
        'end_token': '</spa_to_wayuu>',
    }
]

### Model with SFT

In [4]:
# from pretrained peft model
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest


# def load_model(model_name_or_path, peft_model_id):
#     # Load the base model
#     model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="auto", torch_dtype='auto')
#     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

#     # Load the PEFT model
#     peft_model = PeftModel.from_pretrained(model, peft_model_id)
    
#     return peft_model, tokenizer


# Evaluation of the SFT adapter: the LoRA weights at `vllm_lora_adapter` are
# applied on top of the base model, and the tokenizer comes from the base model.
vllm_lora_adapter = 'models/sft_base_qwen2'
base_model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# vLLM engine for the base model with LoRA support enabled; the adapter itself
# is attached per request through the LoRARequest passed to evaluate_model.
inference_engine = LLM(
    model=base_model_name,
    enable_lora=True,
    max_lora_rank=64,
    max_loras=1,
    gpu_memory_utilization=0.2,
    # enable_prefix_caching=True,
    swap_space=6,
    scheduling_policy="fcfs",
    dtype=torch.bfloat16,
    max_model_len=2060,
    # enable_sleep_mode=True,
    )

# Batches of 64 sentence pairs; shuffle=True randomizes batch composition.
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Evaluate the model with the dictionary tool enabled (actions_num=4) and
# print the three aggregate metrics returned by evaluate_model.
avg_bleu, tools_used_avg, calls_per_sample_avg = evaluate_model(inference_engine, tokenizer, dataloader, actions_num=4, lora_request=LoRARequest('adapter', 1, vllm_lora_adapter), tools=TOOLS)
print(f"Average BLEU score: {avg_bleu:.4f}")
print(f"Average tools used: {tools_used_avg:.4f}")
print(f"Average calls per sample: {calls_per_sample_avg:.4f}")

[2025-05-21 14:53:23,603] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


INFO 05-21 14:53:24 __init__.py:207] Automatically detected platform cuda.
INFO 05-21 14:53:28 config.py:549] This model supports multiple tasks: {'generate', 'reward', 'classify', 'embed', 'score'}. Defaulting to 'generate'.
INFO 05-21 14:53:28 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='Qwen/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=Fal

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.57it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.57it/s]


INFO 05-21 14:53:30 model_runner.py:1115] Loading model weights took 0.9277 GB
INFO 05-21 14:53:30 punica_selector.py:18] Using PunicaWrapperGPU.





INFO 05-21 14:53:31 worker.py:267] Memory profiling takes 0.88 seconds
INFO 05-21 14:53:31 worker.py:267] the current vLLM instance can use total_gpu_memory (11.99GiB) x gpu_memory_utilization (0.20) = 2.40GiB
INFO 05-21 14:53:31 worker.py:267] model weights take 0.93GiB; non_torch_memory takes 0.04GiB; PyTorch activation peak memory takes 1.39GiB; the rest of the memory reserved for KV Cache is 0.04GiB.
INFO 05-21 14:53:31 executor_base.py:111] # cuda blocks: 233, # CPU blocks: 32768
INFO 05-21 14:53:31 executor_base.py:116] Maximum concurrency for 768 tokens per request: 4.85x
INFO 05-21 14:53:32 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as nee

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:11<00:00,  3.11it/s]

INFO 05-21 14:53:43 model_runner.py:1562] Graph capturing finished in 11 secs, took 0.31 GiB
INFO 05-21 14:53:43 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 12.69 seconds



  0%|          | 0/104 [00:00<?, ?it/s]



  outputs, tools_used, how_many_tool_calls = generate_batch_completion(model, tokenizer, inputs, actions_num=actions_num, lora_request=lora_request, tools=tools, temperature=0, top_p=1)




  1%|          | 1/104 [00:07<13:43,  8.00s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


  2%|▏         | 2/104 [00:18<16:04,  9.46s/it]

Unexpected finish reason: length None


  3%|▎         | 3/104 [00:31<18:27, 10.97s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


  4%|▍         | 4/104 [00:45<20:24, 12.25s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


  5%|▍         | 5/104 [00:58<20:37, 12.50s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


  6%|▌         | 6/104 [01:07<18:44, 11.47s/it]

Unexpected finish reason: length None


  8%|▊         | 8/104 [01:27<16:29, 10.31s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


  9%|▊         | 9/104 [01:40<17:57, 11.34s/it]

Unexpected finish reason: length None


 10%|▉         | 10/104 [01:50<16:53, 10.78s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 11%|█         | 11/104 [02:03<17:35, 11.35s/it]

Unexpected finish reason: length None


 12%|█▏        | 12/104 [02:12<16:18, 10.63s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 12%|█▎        | 13/104 [02:23<16:22, 10.80s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 13%|█▎        | 14/104 [02:34<16:34, 11.05s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 14%|█▍        | 15/104 [02:49<18:00, 12.15s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 15%|█▌        | 16/104 [03:02<18:02, 12.30s/it]

Unexpected finish reason: length None


 16%|█▋        | 17/104 [03:13<17:30, 12.08s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 17%|█▋        | 18/104 [03:26<17:40, 12.33s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 18%|█▊        | 19/104 [03:36<16:13, 11.46s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 20%|██        | 21/104 [03:53<13:49,  9.99s/it]

Unexpected finish reason: length None


 21%|██        | 22/104 [04:04<13:45, 10.06s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 22%|██▏       | 23/104 [04:16<14:22, 10.65s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 23%|██▎       | 24/104 [04:30<15:31, 11.64s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 24%|██▍       | 25/104 [04:42<15:39, 11.89s/it]

Unexpected finish reason: length None


 25%|██▌       | 26/104 [04:53<14:55, 11.49s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 26%|██▌       | 27/104 [05:02<14:01, 10.93s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 27%|██▋       | 28/104 [05:16<14:45, 11.66s/it]

Unexpected finish reason: length None


 28%|██▊       | 29/104 [05:28<14:49, 11.85s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 29%|██▉       | 30/104 [05:40<14:37, 11.86s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 30%|██▉       | 31/104 [05:53<14:52, 12.23s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 31%|███       | 32/104 [06:05<14:42, 12.26s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 32%|███▏      | 33/104 [06:19<15:03, 12.72s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 34%|███▎      | 35/104 [06:40<13:03, 11.35s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 35%|███▍      | 36/104 [06:50<12:33, 11.08s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 36%|███▌      | 37/104 [07:02<12:37, 11.31s/it]

Unexpected finish reason: length None


 37%|███▋      | 38/104 [07:12<11:46, 10.71s/it]

Unexpected finish reason: length None


 38%|███▊      | 39/104 [07:22<11:23, 10.51s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 38%|███▊      | 40/104 [07:36<12:32, 11.75s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 39%|███▉      | 41/104 [07:48<12:29, 11.89s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 42%|████▏     | 44/104 [08:17<10:03, 10.05s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 43%|████▎     | 45/104 [08:30<10:36, 10.80s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 45%|████▌     | 47/104 [08:50<09:41, 10.21s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 46%|████▌     | 48/104 [09:02<09:58, 10.70s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 47%|████▋     | 49/104 [09:15<10:37, 11.60s/it]

Unexpected finish reason: length None


 48%|████▊     | 50/104 [09:28<10:38, 11.82s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 49%|████▉     | 51/104 [09:41<10:44, 12.15s/it]

Unexpected finish reason: length None


 50%|█████     | 52/104 [09:54<10:53, 12.58s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 51%|█████     | 53/104 [10:08<11:01, 12.97s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 53%|█████▎    | 55/104 [10:29<09:17, 11.39s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 54%|█████▍    | 56/104 [10:42<09:36, 12.00s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 55%|█████▍    | 57/104 [10:54<09:26, 12.06s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 57%|█████▋    | 59/104 [11:14<08:07, 10.83s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 58%|█████▊    | 60/104 [11:25<07:58, 10.88s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 59%|█████▊    | 61/104 [11:35<07:34, 10.56s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 60%|█████▉    | 62/104 [11:47<07:42, 11.01s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 61%|██████    | 63/104 [11:58<07:27, 10.91s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 62%|██████▏   | 64/104 [12:09<07:21, 11.04s/it]

Unexpected finish reason: length None


 62%|██████▎   | 65/104 [12:23<07:43, 11.89s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 63%|██████▎   | 66/104 [12:36<07:43, 12.20s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 64%|██████▍   | 67/104 [12:48<07:26, 12.07s/it]

Unexpected finish reason: length None


 65%|██████▌   | 68/104 [12:59<07:05, 11.81s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 66%|██████▋   | 69/104 [13:10<06:47, 11.64s/it]

Unexpected finish reason: length None


 67%|██████▋   | 70/104 [13:23<06:45, 11.93s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 68%|██████▊   | 71/104 [13:35<06:40, 12.13s/it]

Unexpected finish reason: length None


 69%|██████▉   | 72/104 [13:45<06:03, 11.37s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 70%|███████   | 73/104 [13:57<05:56, 11.51s/it]

Unexpected finish reason: length None


 71%|███████   | 74/104 [14:09<05:54, 11.82s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 72%|███████▏  | 75/104 [14:22<05:46, 11.96s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 73%|███████▎  | 76/104 [14:34<05:37, 12.06s/it]

Unexpected finish reason: length None


 74%|███████▍  | 77/104 [14:46<05:23, 11.97s/it]

Unexpected finish reason: length None


 75%|███████▌  | 78/104 [14:55<04:46, 11.03s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 76%|███████▌  | 79/104 [15:07<04:47, 11.51s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 77%|███████▋  | 80/104 [15:17<04:24, 11.04s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 78%|███████▊  | 81/104 [15:29<04:20, 11.34s/it]

Unexpected finish reason: length None


 79%|███████▉  | 82/104 [15:40<04:06, 11.22s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 80%|███████▉  | 83/104 [15:52<03:57, 11.30s/it]

Unexpected finish reason: length None


 81%|████████  | 84/104 [16:03<03:45, 11.27s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 83%|████████▎ | 86/104 [16:22<03:05, 10.31s/it]

Unexpected finish reason: length None


 84%|████████▎ | 87/104 [16:33<02:56, 10.36s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 85%|████████▍ | 88/104 [16:47<03:02, 11.41s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 86%|████████▌ | 89/104 [16:59<02:53, 11.59s/it]

Unexpected finish reason: length None


 87%|████████▋ | 90/104 [17:11<02:43, 11.68s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 88%|████████▊ | 91/104 [17:23<02:33, 11.81s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 89%|████████▉ | 93/104 [17:43<01:58, 10.77s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 90%|█████████ | 94/104 [17:57<01:57, 11.71s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 92%|█████████▏| 96/104 [18:19<01:27, 10.95s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 93%|█████████▎| 97/104 [18:31<01:20, 11.44s/it]

Unexpected finish reason: length None


 94%|█████████▍| 98/104 [18:43<01:09, 11.58s/it]

Unexpected finish reason: length None


 95%|█████████▌| 99/104 [18:55<00:58, 11.79s/it]

Unexpected finish reason: length None


 96%|█████████▌| 100/104 [19:05<00:45, 11.25s/it]

Unexpected finish reason: length None


 97%|█████████▋| 101/104 [19:15<00:32, 10.70s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 99%|█████████▉| 103/104 [19:35<00:10, 10.11s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


100%|██████████| 104/104 [19:43<00:00, 11.38s/it]

Average BLEU score: 13.2998
Average tools used: 0.9025
Average calls per sample: 2.1136





## Model with SFT + RL

In [4]:
# Evaluate the SFT + RL (GRPO) policy: the base model is served by vLLM and
# the trained LoRA adapter is attached per request rather than merged.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest


base_model_name = "Qwen/Qwen2.5-0.5B-Instruct"
vllm_lora_adapter = 'models/grpo_policy_model5'

# The tokenizer is loaded from the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# LoRA-enabled inference engine. Only a single adapter (rank <= 64) is ever
# resident, and the engine is restricted to 20% of the GPU memory.
inference_engine = LLM(
    model=base_model_name,
    dtype=torch.bfloat16,
    max_model_len=2048,
    gpu_memory_utilization=0.2,
    swap_space=6,
    scheduling_policy="fcfs",
    enable_lora=True,
    max_loras=1,
    max_lora_rank=64,
)

# Batches of 48 (source, reference) pairs drawn in shuffled order.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=48)

# Handle telling vLLM which adapter weights to apply during generation.
_grpo_adapter_request = LoRARequest('adapter', 1, vllm_lora_adapter)

# Run the evaluation loop and report the three aggregate metrics.
avg_bleu, tools_used_avg, calls_per_sample_avg = evaluate_model(
    inference_engine,
    tokenizer,
    dataloader,
    tools=TOOLS,
    actions_num=4,
    lora_request=_grpo_adapter_request,
)
print(
    f"Average BLEU score: {avg_bleu:.4f}",
    f"Average tools used: {tools_used_avg:.4f}",
    f"Average calls per sample: {calls_per_sample_avg:.4f}",
    sep="\n",
)

[2025-05-21 15:20:23,860] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


INFO 05-21 15:20:24 __init__.py:207] Automatically detected platform cuda.
INFO 05-21 15:20:29 config.py:549] This model supports multiple tasks: {'score', 'embed', 'classify', 'generate', 'reward'}. Defaulting to 'generate'.
INFO 05-21 15:20:29 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='Qwen/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=Fa

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.90it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.89it/s]


INFO 05-21 15:20:31 model_runner.py:1115] Loading model weights took 0.9277 GB
INFO 05-21 15:20:31 punica_selector.py:18] Using PunicaWrapperGPU.





INFO 05-21 15:20:32 worker.py:267] Memory profiling takes 0.93 seconds
INFO 05-21 15:20:32 worker.py:267] the current vLLM instance can use total_gpu_memory (11.99GiB) x gpu_memory_utilization (0.20) = 2.40GiB
INFO 05-21 15:20:32 worker.py:267] model weights take 0.93GiB; non_torch_memory takes 0.04GiB; PyTorch activation peak memory takes 1.39GiB; the rest of the memory reserved for KV Cache is 0.04GiB.
INFO 05-21 15:20:32 executor_base.py:111] # cuda blocks: 233, # CPU blocks: 32768
INFO 05-21 15:20:32 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 1.82x
INFO 05-21 15:20:32 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as ne

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:11<00:00,  2.99it/s]

INFO 05-21 15:20:44 model_runner.py:1562] Graph capturing finished in 12 secs, took 0.31 GiB
INFO 05-21 15:20:44 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 13.17 seconds



  0%|          | 0/139 [00:00<?, ?it/s]



  outputs, tools_used, how_many_tool_calls = generate_batch_completion(model, tokenizer, inputs, actions_num=actions_num, lora_request=lora_request, tools=tools, temperature=0, top_p=1, max_new_tokens=768)


Unexpected finish reason: length None


  1%|          | 1/139 [00:09<22:23,  9.73s/it]

Unexpected finish reason: length None


  1%|▏         | 2/139 [00:19<21:56,  9.61s/it]

Unexpected finish reason: length None


  2%|▏         | 3/139 [00:28<21:36,  9.53s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


  3%|▎         | 4/139 [00:44<26:45, 11.89s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


  4%|▎         | 5/139 [00:54<25:02, 11.21s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


  4%|▍         | 6/139 [01:09<28:04, 12.67s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


  5%|▌         | 7/139 [01:26<30:36, 13.91s/it]



  7%|▋         | 10/139 [01:43<17:43,  8.25s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


  8%|▊         | 11/139 [01:52<18:13,  8.54s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


  9%|▉         | 13/139 [02:11<18:13,  8.68s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 10%|█         | 14/139 [02:24<20:53, 10.03s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 12%|█▏        | 16/139 [02:45<20:18,  9.90s/it]

Unexpected finish reason: length None


 12%|█▏        | 17/139 [02:54<19:33,  9.62s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 13%|█▎        | 18/139 [03:03<19:09,  9.50s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 14%|█▎        | 19/139 [03:13<19:22,  9.69s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 14%|█▍        | 20/139 [03:29<22:48, 11.50s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 15%|█▌        | 21/139 [03:44<24:53, 12.65s/it]

Unexpected finish reason: length None


 16%|█▌        | 22/139 [04:00<26:26, 13.56s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 17%|█▋        | 23/139 [04:10<24:08, 12.49s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 17%|█▋        | 24/139 [04:26<25:47, 13.46s/it]

Unexpected finish reason: length None


 19%|█▊        | 26/139 [04:40<18:45,  9.96s/it]

Unexpected finish reason: length None


 19%|█▉        | 27/139 [04:49<18:05,  9.69s/it]

Unexpected finish reason: length None


 20%|██        | 28/139 [04:58<17:31,  9.48s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 22%|██▏       | 30/139 [05:16<16:23,  9.02s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 22%|██▏       | 31/139 [05:29<18:19, 10.18s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 23%|██▎       | 32/139 [05:45<21:32, 12.08s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 24%|██▎       | 33/139 [05:55<19:49, 11.23s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 24%|██▍       | 34/139 [06:11<22:15, 12.72s/it]

Unexpected finish reason: length None


 25%|██▌       | 35/139 [06:26<23:32, 13.58s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 26%|██▌       | 36/139 [06:36<21:17, 12.41s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 27%|██▋       | 37/139 [06:52<22:59, 13.53s/it]

Unexpected finish reason: length None


 27%|██▋       | 38/139 [07:02<20:59, 12.47s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 28%|██▊       | 39/139 [07:12<19:36, 11.76s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 29%|██▉       | 40/139 [07:25<19:55, 12.07s/it]

Unexpected finish reason: length None


 29%|██▉       | 41/139 [07:40<20:59, 12.85s/it]

Unexpected finish reason: length None


 31%|███       | 43/139 [08:00<17:47, 11.12s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 32%|███▏      | 44/139 [08:16<19:40, 12.43s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 32%|███▏      | 45/139 [08:32<21:20, 13.62s/it]

Unexpected finish reason: length None


 34%|███▍      | 47/139 [08:55<18:21, 11.97s/it]

Unexpected finish reason: length None


 35%|███▍      | 48/139 [09:10<19:32, 12.89s/it]

Unexpected finish reason: length None


 35%|███▌      | 49/139 [09:19<17:41, 11.80s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 36%|███▌      | 50/139 [09:32<17:58, 12.11s/it]

Unexpected finish reason: length None


 37%|███▋      | 51/139 [09:45<18:00, 12.28s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 37%|███▋      | 52/139 [10:01<19:32, 13.48s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 38%|███▊      | 53/139 [10:18<21:02, 14.68s/it]

Unexpected finish reason: length None


 39%|███▉      | 54/139 [10:33<20:52, 14.74s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 40%|███▉      | 55/139 [10:42<18:17, 13.07s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 40%|████      | 56/139 [10:59<19:21, 13.99s/it]



 42%|████▏     | 59/139 [11:16<11:29,  8.62s/it]

Unexpected finish reason: length None


 43%|████▎     | 60/139 [11:31<13:57, 10.60s/it]

Unexpected finish reason: length None


 44%|████▍     | 61/139 [11:40<13:17, 10.23s/it]

Unexpected finish reason: length None


 45%|████▌     | 63/139 [12:01<12:32,  9.90s/it]

Unexpected finish reason: length None


 46%|████▌     | 64/139 [12:11<12:12,  9.77s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 47%|████▋     | 65/139 [12:29<15:13, 12.35s/it]

Unexpected finish reason: length None


 47%|████▋     | 66/139 [12:39<14:05, 11.58s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 48%|████▊     | 67/139 [12:49<13:18, 11.09s/it]

Unexpected finish reason: length None


 50%|█████     | 70/139 [13:10<09:20,  8.12s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 51%|█████     | 71/139 [13:27<12:16, 10.83s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 53%|█████▎    | 73/139 [13:43<10:08,  9.22s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 53%|█████▎    | 74/139 [13:59<12:08, 11.21s/it]

Unexpected finish reason: length None


 54%|█████▍    | 75/139 [14:14<13:12, 12.38s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 55%|█████▍    | 76/139 [14:23<12:01, 11.45s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 55%|█████▌    | 77/139 [14:33<11:16, 10.91s/it]

Unexpected finish reason: length None


 56%|█████▌    | 78/139 [14:48<12:20, 12.14s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 57%|█████▋    | 79/139 [15:03<13:11, 13.19s/it]

Unexpected finish reason: length None


 58%|█████▊    | 80/139 [15:13<11:57, 12.16s/it]

Unexpected finish reason: length None


 58%|█████▊    | 81/139 [15:22<10:49, 11.20s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 59%|█████▉    | 82/139 [15:37<11:50, 12.47s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 60%|██████    | 84/139 [15:52<08:57,  9.78s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 61%|██████    | 85/139 [16:05<09:29, 10.55s/it]



 62%|██████▏   | 86/139 [16:11<08:08,  9.22s/it]

Unexpected finish reason: length None


 63%|██████▎   | 87/139 [16:27<09:42, 11.20s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 63%|██████▎   | 88/139 [16:42<10:35, 12.46s/it]

Unexpected finish reason: length None


 64%|██████▍   | 89/139 [16:52<09:36, 11.53s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 65%|██████▌   | 91/139 [17:17<09:10, 11.47s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 67%|██████▋   | 93/139 [17:38<08:08, 10.62s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 68%|██████▊   | 94/139 [17:54<09:03, 12.08s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 68%|██████▊   | 95/139 [18:09<09:36, 13.11s/it]

Unexpected finish reason: length None


 69%|██████▉   | 96/139 [18:19<08:40, 12.10s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 70%|██████▉   | 97/139 [18:35<09:13, 13.18s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 71%|███████   | 99/139 [18:56<07:35, 11.38s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 72%|███████▏  | 100/139 [19:11<08:15, 12.71s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 73%|███████▎  | 102/139 [19:33<06:57, 11.27s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 75%|███████▍  | 104/139 [19:49<05:34,  9.55s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 76%|███████▌  | 105/139 [20:01<05:50, 10.30s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 77%|███████▋  | 107/139 [20:22<05:18,  9.96s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 78%|███████▊  | 109/139 [20:43<04:50,  9.68s/it]

Unexpected finish reason: length None


 79%|███████▉  | 110/139 [20:55<05:03, 10.47s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 80%|███████▉  | 111/139 [21:11<05:41, 12.19s/it]

Unexpected finish reason: length None


 81%|████████  | 112/139 [21:27<05:54, 13.14s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 81%|████████▏ | 113/139 [21:40<05:39, 13.07s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 83%|████████▎ | 115/139 [22:01<04:33, 11.38s/it]

Unexpected finish reason: length None


 83%|████████▎ | 116/139 [22:10<04:05, 10.66s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 84%|████████▍ | 117/139 [22:25<04:26, 12.11s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 85%|████████▍ | 118/139 [22:42<04:43, 13.48s/it]

Unexpected finish reason: length None


 86%|████████▌ | 119/139 [22:51<04:03, 12.18s/it]

Unexpected finish reason: length None


 86%|████████▋ | 120/139 [23:03<03:50, 12.13s/it]

Unexpected finish reason: length None


 87%|████████▋ | 121/139 [23:16<03:41, 12.30s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 88%|████████▊ | 122/139 [23:31<03:42, 13.11s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 89%|████████▉ | 124/139 [23:52<02:49, 11.29s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 90%|████████▉ | 125/139 [24:08<02:57, 12.71s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None
Unexpected finish reason: length None


 91%|█████████ | 126/139 [24:28<03:16, 15.13s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 91%|█████████▏| 127/139 [24:38<02:41, 13.42s/it]



 92%|█████████▏| 128/139 [24:44<02:02, 11.14s/it]

Unexpected finish reason: length None


 93%|█████████▎| 129/139 [24:56<01:56, 11.62s/it]

Unexpected finish reason: length None


 94%|█████████▎| 130/139 [25:09<01:46, 11.82s/it]

Unexpected finish reason: length None


 95%|█████████▍| 132/139 [25:26<01:10, 10.02s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 96%|█████████▌| 133/139 [25:36<00:59,  9.92s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 96%|█████████▋| 134/139 [25:51<00:57, 11.45s/it]

Unexpected finish reason: length None


 97%|█████████▋| 135/139 [26:01<00:44, 11.05s/it]

Unexpected finish reason: length None
Unexpected finish reason: length None


 99%|█████████▊| 137/139 [26:17<00:18,  9.32s/it]

Unexpected finish reason: length None


100%|██████████| 139/139 [26:29<00:00, 11.43s/it]

Average BLEU score: 13.3104
Average tools used: 0.9017
Average calls per sample: 2.1093



