In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
%%html
<style>
/* Make the ipywidget output background transparent (forced with !important). */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter widget colour and font-size variables at the VS Code editor's values. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [None]:
import art
from dotenv import load_dotenv
import openai

# Load variables from a .env file into the environment before the API is built.
# NOTE(review): presumably this supplies credentials used later (e.g. for
# experiment tracking) — confirm which variables are actually required.
load_dotenv()


# Create the local ART API and fetch the model to train, identified by its
# name and project; `base_model` names the checkpoint it is built on.
# NOTE(review): judging by the method name, an existing model with the same
# name/project is reused rather than recreated — confirm against the ART docs.
api = art.LocalAPI()
model = await api.get_or_create_model(
    name="001",
    project="yes-no-maybe",
    base_model="Qwen/Qwen2.5-7B-Instruct",
)

@art.retry(max_attempts=3)
async def rollout(client: openai.AsyncOpenAI, prompt: str) -> art.Trajectory:
    """Sample one reply to ``prompt`` and package it as a scored trajectory.

    The prompt is sent as a single user message to the chat-completions
    endpoint, using the module-level ``model``'s name and a 100-token cap.
    Only the first returned choice is scored.

    Reward (exact string match on the reply content):
        "yes" -> 0.5, "no" -> 0.75, "maybe" -> 1.0, anything else -> 0.0

    Args:
        client: Async OpenAI-compatible client used to request the completion.
        prompt: Text placed in the single user message.

    Returns:
        An ``art.Trajectory`` holding the request message followed by the
        sampled choice, together with the reward above.

    Raises:
        AssertionError: If the reply content is not a string.

    NOTE(review): ``art.retry(max_attempts=3)`` presumably re-invokes this
    coroutine when it raises — confirm which exceptions it retries on.
    """
    messages: art.Messages = [{"role": "user", "content": prompt}]
    response = await client.chat.completions.create(
        model=model.name, messages=messages, max_tokens=100
    )
    first_choice = response.choices[0]
    reply = first_choice.message.content
    assert isinstance(reply, str)

    # Exact-match scoring table; any reply not listed here earns nothing.
    reward_by_reply = {"yes": 0.5, "no": 0.75, "maybe": 1.0}
    return art.Trajectory(
        messages_and_choices=[*messages, first_choice],
        reward=reward_by_reply.get(reply, 0.0),
    )


# Define the different word combinations we want to test.
# NOTE(review): ["yes", "no"] is listed twice, so the prompts built from it
# appear twice in `prompts` — confirm whether a different pair was intended.
word_combinations = [
    ["yes", "no", "maybe"],
    ["maybe", "yes", "no"],
    ["no", "yes", "maybe"],
    ["yes", "maybe", "no"],
    ["yes", "no"],
    ["maybe", "no"],
    ["no", "maybe"],
    ["no", "yes"],
    ["yes", "no"],
]

# Instruction phrasings placed in front of every word combination.
prompt_prefixes = ["respond", "just respond"]

# Create the list of prompts for our model to respond to: one prompt per
# (prefix, word combination) pair. Two-word combinations are rendered as
# "a or b" and longer ones as a comma-separated list, e.g.
# "respond with yes or no" / "just respond with yes, no, maybe".
prompts = [
    f"{prefix} with {(' or ' if len(words) == 2 else ', ').join(words)}"
    for prefix in prompt_prefixes
    for words in word_combinations
]

openai_client = await model.openai_client()
for _ in range(await model.get_step(), 1_000):
    train_groups = await art.gather_trajectory_groups(
        (
            art.TrajectoryGroup(rollout(openai_client, prompt) for _ in range(64))
            for prompt in prompts
        ),
        pbar_desc="gather",
    )
    await model.train(
        train_groups,
        config=art.TrainConfig(learning_rate=1e-4),
    )

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.50.1. vLLM: 0.7.3.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.096 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-7b-instruct-unsloth-bnb-4bit with actual GPU utilization = 64.48%
Unsloth: Your GPU has CUDA compute capability 9.0 with VRAM = 79.1 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 32768. Num Sequences = 226.
Unsloth: vLLM's KV Cache can use up to 45.14 GB. Also swap space = 6 GB.
Unsloth: vLLM Bitsandbytes config using 

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:01<00:01,  1.41s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.16it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.06it/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:01<00:01,  1.57s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.14it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.02it/s]

Capturing CUDA graph shapes: 100%|██████████| 32/32 [00:21<00:00,  1.51it/s]
Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.
Unsloth: Already have LoRA adapters! We shall skip this step.


train:   0%|          | 0/1152 [00:00<?, ?it/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mopenpipe[0m ([33mopenpipe-team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Prepared tuning data with 1 sequences of length 32768


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 3 | Total steps = 300,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 20,185,088/7,000,000,000 (0.29% trained)
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: openpipe (openpipe-team) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.19.8
wandb: Run data is saved locally in /workspace/doordash-voice-ai-service/agent-reinforcement-training/dev/wandb/run-20250402_001013-zhpzahq0
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run ./.art/models/yes-or-no-unsloth-002
wandb: ⭐️ View project at https://wandb.ai/openpipe-team/huggingface
wandb: 🚀 View run at https://wan

{'loss': 0.0239, 'grad_norm': 0.9114716053009033, 'learning_rate': 5e-06, 'epoch': 1e-05}


train:   0%|          | 0/1152 [00:00<?, ?it/s]

Prepared tuning data with 1 sequences of length 32768
Unsloth: Will smartly offload gradients to save VRAM!
{'loss': 0.0845, 'grad_norm': 6.032585144042969, 'learning_rate': 5e-06, 'epoch': 2e-05}


train:   0%|          | 0/1152 [00:00<?, ?it/s]

Prepared tuning data with 1 sequences of length 32768
{'loss': -0.0021, 'grad_norm': 0.5969564318656921, 'learning_rate': 5e-06, 'epoch': 3e-05}


train:   0%|          | 0/1152 [00:00<?, ?it/s]

Prepared tuning data with 1 sequences of length 32768
{'loss': -0.017, 'grad_norm': 0.2517562508583069, 'learning_rate': 5e-06, 'epoch': 4e-05}


train:   0%|          | 0/1152 [00:00<?, ?it/s]

Prepared tuning data with 1 sequences of length 32768
{'loss': 0.009, 'grad_norm': 0.44602227210998535, 'learning_rate': 5e-06, 'epoch': 5e-05}
