In [None]:
import art
import asyncio
from dotenv import load_dotenv
import re

# Load environment variables from a .env file before any API object is created.
load_dotenv()

# Base model the trainable model is created from; rollouts against the
# untrained baseline also pass this string as the opponent's model name.
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
# Rounds per game: interpolated into the prompt and used as the loop bound
# in rollout_game.
PRISONERS_DILEMMA_ROUNDS = 10

api = art.LocalAPI()
# NOTE(review): presumably this resumes the existing "004" model in the
# "prisoners-dilemma" project when one exists — confirm against the ART docs.
model = await api.get_or_create_model(
    name="004", project="prisoners-dilemma", base_model=BASE_MODEL
)
# OpenAI-style async client; rollout_game calls
# client.chat.completions.create on it for both players.
client = await model.openai_client()


# Opening instruction sent as the first user message to each player.
# The number of rounds is interpolated from PRISONERS_DILEMMA_ROUNDS, and
# .strip() removes the newlines that surround the triple-quoted text.
# The final sentence deliberately ends with a plain period so it matches the
# wording of the per-round follow-up prompt built in rollout_game.
prompt = f"""
You are playing a game of prisoners' dilemma with another player.

You are given a choice between two actions:

1. Cooperate
2. Defect

The payoffs are as follows:

- If both players cooperate, you get 3 points and the other player gets 3 points.
- If one player cooperates and the other defects, the defector gets 5 points and the cooperator gets 0 points.
- If both players defect, you both get 1 point.

You will play this game {PRISONERS_DILEMMA_ROUNDS} times with the same player.

For your first turn, would you like to cooperate or defect? After thinking, respond with either 'cooperate' or 'defect'.
""".strip()


async def rollout_game(
    models: tuple[str, str] = (model.name, model.name)
) -> tuple[art.Trajectory, art.Trajectory]:
    """Play one iterated prisoner's dilemma game between two models.

    Both players start from the same opening ``prompt``. Each round, both
    completions are requested concurrently, the last occurrence of
    "cooperate" or "defect" in each (lower-cased) reply is taken as that
    player's action, rewards are accumulated on the trajectories, and each
    player is shown the other's full reply plus the running scores.

    Args:
        models: Model names for player 0 and player 1. The default (the
            trained model against itself) is evaluated once, when the
            function is defined.

    Returns:
        A pair of trajectories, one per player, each holding that player's
        conversation (user messages and sampled choices) and total reward.
    """
    # (player 0, player 1) payoffs when both replies contain a recognizable action.
    payoff_table = {
        ("cooperate", "cooperate"): (3, 3),
        ("cooperate", "defect"): (0, 5),
        ("defect", "cooperate"): (5, 0),
        ("defect", "defect"): (1, 1),
    }
    # Used when at least one player gave no action: each player is then
    # scored independently of the other.
    default_rewards = {"cooperate": 3, "defect": 5, "none": 0}
    # Loop-invariant helpers, built once instead of once per round.
    action_pattern = re.compile(r"cooperate|defect")
    joiner = "\n> "

    messages: tuple[art.Messages, art.Messages] = (
        [{"role": "user", "content": prompt}],
        [{"role": "user", "content": prompt}],
    )
    trajectories = (
        art.Trajectory(messages_and_choices=[*messages[0]], reward=0),
        art.Trajectory(messages_and_choices=[*messages[1]], reward=0),
    )
    for _ in range(PRISONERS_DILEMMA_ROUNDS):
        # Both players move simultaneously, so request both completions at once.
        chat_completions = await asyncio.gather(
            *(
                client.chat.completions.create(
                    messages=messages[i], model=models[i], max_completion_tokens=512
                )
                for i in range(2)
            )
        )
        choices = [chat_completion.choices[0] for chat_completion in chat_completions]
        # A completion may come back with no content. Normalize to "" once so
        # the assistant message re-sent on the next round, the action parsing
        # and the quoted reply all see the same string (never None).
        replies = [choice.message.content or "" for choice in choices]
        for i in range(2):
            messages[i].append({"role": "assistant", "content": replies[i]})
            # The trajectory keeps the original choice object, not the plain text.
            trajectories[i].messages_and_choices.append(choices[i])

        # The last mention wins, since the prompt asks for the answer after thinking.
        actions = []
        for reply in replies:
            found = action_pattern.findall(reply.lower())
            actions.append(found[-1] if found else "none")

        outcome = (actions[0], actions[1])
        if outcome in payoff_table:
            gains = payoff_table[outcome]
        else:
            # One or both players did not choose an action.
            gains = (default_rewards[actions[0]], default_rewards[actions[1]])
        for i in range(2):
            trajectories[i].reward += gains[i]

        # Show each player the opponent's reply (as a "> " quoted block) and
        # the scores after this round.
        # NOTE(review): this follow-up is also appended after the final round,
        # so every trajectory ends on a user message asking for a next move.
        for i in range(2):
            quoted_reply = joiner.join(replies[1 - i].splitlines())
            messages[i].append(
                {
                    "role": "user",
                    "content": f"The other player responded as follows: \n\n> {quoted_reply}\n\n"
                    f"Your score is {trajectories[i].reward}. The other player's score is {trajectories[1 - i].reward}.\n\n"
                    "For the next round, would you like to cooperate or defect? After thinking, respond with either 'cooperate' or 'defect'.",
                }
            )
            trajectories[i].messages_and_choices.append(messages[i][-1])
    return trajectories


for _ in range(await model.get_step(), 1_000):
    # Simultaneously rollout self-play games, and games versus the base model.
    self_play_trajectories, base_play_trajectories = await asyncio.gather(
        art.gather_trajectories(
            (rollout_game(models=(model.name, model.name)) for _ in range(8)),
            pbar_desc="versus-self",
        ),
        art.gather_trajectories(
            (rollout_game(models=(model.name, BASE_MODEL)) for _ in range(8)),
            pbar_desc="versus-base",
        ),
    )
    # Log performance versus self and the base model, as well as the base model's performance.
    await model.log(
        [t for ts in self_play_trajectories for t in ts], split="versus-self"
    )
    await model.log([ts[0] for ts in base_play_trajectories], split="versus-base")
    await model.log([ts[1] for ts in base_play_trajectories], split="base-model")
    # Train the model on self-play and base-play trajectories.
    await model.train(
        trajectory_groups=[
            # Since all self-play games have the same starting state and are symmetric, we can gather
            # trajectories from all self-play games into a single trajectory group.
            art.TrajectoryGroup(t for ts in self_play_trajectories for t in ts),
            # We can also gather all base-play _trained model_ trajectories into a single trajectory group.
            # We don't want to train on base model trajectories, because they are sampled from a different distribution.
            art.TrajectoryGroup(ts[0] for ts in base_play_trajectories),
        ],
        config=art.TrainConfig(learning_rate=5e-5),
    )


Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth  # type: ignore


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-15 18:05:28 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.51.1. vLLM: 0.7.3.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.109 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-7b-instruct-unsloth-bnb-4bit with actual GPU utilization = 78.3%
Unsloth: Your GPU has CUDA compute capability 9.0 with VRAM = 79.11 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 32768. Nu

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.17it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.80it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.67it/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.12it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.74it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.60it/s]



INFO 04-15 18:05:43 model_runner.py:1115] Loading model weights took 6.6961 GB
INFO 04-15 18:05:43 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-15 18:05:45 worker.py:267] Memory profiling takes 2.40 seconds
INFO 04-15 18:05:45 worker.py:267] the current vLLM instance can use total_gpu_memory (79.11GiB) x gpu_memory_utilization (0.78) = 61.94GiB
INFO 04-15 18:05:45 worker.py:267] model weights take 6.70GiB; non_torch_memory takes 0.15GiB; PyTorch activation peak memory takes 4.72GiB; the rest of the memory reserved for KV Cache is 50.37GiB.
INFO 04-15 18:05:46 executor_base.py:111] # cuda blocks: 58951, # CPU blocks: 7021
INFO 04-15 18:05:46 executor_base.py:116] Maximum concurrency for 32768 tokens per request: 28.78x
INFO 04-15 18:05:48 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error

Capturing CUDA graph shapes: 100%|██████████| 49/49 [00:37<00:00,  1.29it/s]


INFO 04-15 18:06:26 model_runner.py:1562] Graph capturing finished in 38 secs, took 5.56 GiB
INFO 04-15 18:06:26 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 43.44 seconds
