In [1]:
import re
import os
import torch
from datasets import load_dataset, Dataset
from unsloth import FastLanguageModel, PatchFastRL
from unsloth import is_bfloat16_supported
from trl import GRPOConfig, GRPOTrainer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 02-18 18:53:01 __init__.py:190] Automatically detected platform cuda.


In [2]:
PatchFastRL("GRPO", FastLanguageModel)

In [3]:
max_seq_length = 4096 # Can increase for longer reasoning traces
lora_rank = 16 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "meta-llama/meta-Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.6, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: Tesla V100-DGXS-32GB. Max memory: 31.715 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/meta-llama-3.1-8b-instruct-unsloth-bnb-4bit with actual GPU utilization = 59.24%
Unsloth: Your GPU has CUDA compute capability 7.0 with VRAM = 31.72 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 4096. Num Sequences = 256.
Unsloth: vLLM's KV Cache can use up to 12.69 GB. Also swap space = 6 GB.
INFO 02-18 18:53:17 config.py:542] This model supports multiple tasks: {'generate', 'classify', 'score', 'embed', 'reward'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config u



INFO 02-18 18:53:19 loader.py:1102] Loading weights with BitsAndBytes quantization.  May take a while ...
INFO 02-18 18:53:20 weight_utils.py:252] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 02-18 18:53:23 model_runner.py:1115] Loading model weights took 5.5976 GB
INFO 02-18 18:53:23 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 02-18 18:53:27 worker.py:267] Memory profiling takes 3.60 seconds
INFO 02-18 18:53:27 worker.py:267] the current vLLM instance can use total_gpu_memory (31.72GiB) x gpu_memory_utilization (0.59) = 18.79GiB
INFO 02-18 18:53:27 worker.py:267] model weights take 5.60GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.21GiB; the rest of the memory reserved for KV Cache is 11.89GiB.
INFO 02-18 18:53:27 executor_base.py:110] # CUDA blocks: 6088, # CPU blocks: 3072
INFO 02-18 18:53:27 executor_base.py:115] Maximum concurrency for 4096 tokens per request: 23.78x
INFO 02-18 18:53:31 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error o

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:32<00:00,  1.07it/s]

INFO 02-18 18:54:03 model_runner.py:1562] Graph capturing finished in 33 secs, took 4.57 GiB
INFO 02-18 18:54:03 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 40.11 seconds



Unsloth 2025.2.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
SYSTEM_PROMPT = """

You estimate the integer-rounded cross-entropy of the text before a special function. 

Given a certain text before the function like:

token_1token_2token_3token_4

You are going to format it like:

token_1: cross_entropy of token_1
token_2: cross_entropy of token_2
token_3: cross_entropy of token_3
token_4: cross_entropy of token_4

Tokens have to follow the same pattern of your own tokenizer.

"""


In [5]:
def format_response(data: str) -> list[dict]:
    """ This function formats a response from xent format to a dictionary of tokens and xents as ints """
    linecond = r"(?<=\d)\n"
    txcond = r":(?=\s\d)"
    r_split = re.split(linecond, data[1:-1])
    tx_split = [re.split(txcond, line) for line in r_split]
    return [
        {
            "token": x[0],
            "xent": int(x[1].strip())
        }   for x in tx_split
    ]

def get_xent_dataset(test_size = 0.1) -> Dataset:
    data_path = "/rcp/marco/data/instruct_llama/closure_database.json"
    data = load_dataset("/rcp/marco/data/instruct_llama", data_files=[data_path], split="train")
    data = data.train_test_split(test_size=test_size, shuffle=True)

    dataset = data.map(
        lambda x: {
            "prompt": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": x["prompt"]}
            ],
            "answer": x["response"]
        },
        remove_columns=["response"]
    )

    return dataset

dataset = get_xent_dataset()

Map:   0%|          | 0/161100 [00:00<?, ? examples/s]

Map:   0%|          | 0/17900 [00:00<?, ? examples/s]

In [6]:
def dummy_reward(completions, **kwargs):
    responses = [completion[0]["content"] for completion in completions]
    print(f"{len(responses)} completions have been produced")
    for response in responses: 
        print(response)
        print("\n\n")
    return 0

def formatting_reward(completions, answer, **kwargs):
    """ This reward function makes sure the output has a certain format """
    wf = 1
    regex_line = r"[\S\s]*?:\s\d{1,2}\n"
    # regex_all = r"({regex_line})+" # if you wanna make a whole-thing reward instead of line per line
    responses = [completion[0]["content"] for completion in completions]
    correct = [len(re.findall(regex_line, response)) for response in responses]
    print("-"*40)
    for i, corr in enumerate(correct):
        print(f"Correctly formatted lines on response {i}: {corr}")
        if corr > 0:
            print(responses[i])
    return [c*wf if c > 0 else -5 for c in correct]


In [7]:
training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 3e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "paged_adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 6,
    gradient_accumulation_steps = 1,
    num_generations = 6, # Decrease if out of memory
    max_prompt_length = 2048,
    max_completion_length = 2048,
    num_train_epochs = 1, # Set to 1 for a full training run
    # max_steps = 250,
    # save_steps = 250,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "trained_models/llama"
)

In [8]:
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        dummy_reward
    ],
    args = training_args,
    train_dataset = dataset["train"],
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 161,100 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 6 | Gradient Accumulation steps = 1
\        /    Total batch size = 6 | Total steps = 161,100
 "-____-"     Number of trainable parameters = 41,943,040


6 completions have been produced
I can process the text you provided, but I want to note that the last line contains non-alphanumeric characters `@##$$##@ fwd_closure((integer))>:ç%ç>:`, which are not part of the usual tokenizer pattern. I will exclude those from the output. 

Here's the processed text:

```python
import re
import math

text = """
entinas
Aerolíneas Argentinas destinations
Afro Argentine
Agriculture in Argentina
Agrupación Aérea Presidencial
Agustín Calleri
Agustín P. Justo
Agustín Pichot
Agustín Tosco
Agustina Cherri
Aero VIP
Aimogasta
Alberto Argibay
Alberto Castillo
Alberto Calderón
Alberto Cortez
Alberto de Mendoza
Alberto Fernández de Rosa
Alberto Gerchunoff
Alberto Ginastera
Alberto Olmedo
Alberto Prebisch
Alberto Tarantini
Alberto Vaccarezza
Aldo Duscher
Alejandra Boero
Alejandra Pizarnik
Alejandro Agustín Lanusse
Alejandro Awada
Alejandro Bustillo
Alejandro Doria
Alejandro Dolina
Alejandro Gómez
Alejandro Lerner
Alejandro Mancuso
Alejandro Romay
Alejandro Sokol

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU 0 has a total capacity of 31.72 GiB of which 1.80 GiB is free. Including non-PyTorch memory, this process has 29.79 GiB memory in use. Of the allocated memory 24.92 GiB is allocated by PyTorch, with 166.00 MiB allocated in private pools (e.g., CUDA Graphs), and 91.44 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)