In [1]:
%pip install --quiet transformers accelerate bitsandbytes sentence-transformers #faiss-cpu
%pip install faiss-gpu-cu12  --quiet
%pip install -U --quiet outlines  peft  trl

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import json
import random
import os
import requests
import pandas as pd
from datasets import load_dataset
import torch, gc, os, re
from huggingface_hub import login
from transformers import AutoTokenizer
import numpy as np

# Dataset Preperation

In [2]:
include_guess = False
filter_flag = True

def parse_q20llm_csv(dataframe):
    games = []
    current_game = None
    for _, row in dataframe.sort_index().iterrows():
        turn_number = int(row['turn'])
        
        if turn_number == 1:
            if current_game:
                games.append(current_game)
            
            current_game = {
                "lines": [],
                "word": [row['keyword']], # Use 'keyword' column
                "correct": False  # Default to False, will be updated later
            }

        # If we are in the middle of a game, add the current line
        if current_game:
            current_game['lines'].append(f"{row['question']} {row['answer']}")

    # After the loop finishes, make sure to add the very last game
    if current_game:
        games.append(current_game)
    for game in games:
        if len(game['lines']) <= 20:
            game['correct'] = True
            
    return games

def load_all_samples(
    berkeley_filepath="rl-llm-bench-dataset.json",
    huggingface_cached_filepath="EDA_twenty_questions_dataset.json",
    correct_percent_for_berkeley=0.85
):
    final_samples = []

    # --- 1. Load and Balance the Berkeley dataset ---
    berkeley_data = []
    if os.path.exists(berkeley_filepath):
        print(f"Loading Berkeley dataset from local file: '{berkeley_filepath}'")
        with open(berkeley_filepath, 'r') as f:
            berkeley_data = json.load(f)
    else:
        # Download if not found
        print(f"Local Berkeley file not found. Downloading from URL...")
        url = "https://rail.eecs.berkeley.edu/datasets/rl-llm-bench-dataset/twenty-questions/train.json"
        try:
            response = requests.get(url)
            response.raise_for_status()
            berkeley_data = response.json()
            with open(berkeley_filepath, 'w') as f:
                json.dump(berkeley_data, f)
            print(f"Successfully downloaded and saved Berkeley data to '{berkeley_filepath}'.")
        except requests.exceptions.RequestException as e:
            print(f"Error downloading Berkeley data: {e}")

    if berkeley_data:
        correct_samples = [s for s in berkeley_data if s.get('correct')]
        incorrect_samples = [s for s in berkeley_data if not s.get('correct')]
        
        num_correct_to_keep = int(len(correct_samples) * correct_percent_for_berkeley)
        num_incorrect_to_keep = int(len(incorrect_samples) * (1 - correct_percent_for_berkeley))

        # Ensure we don't request more samples than available
        num_correct_to_keep = min(num_correct_to_keep, len(correct_samples))
        num_incorrect_to_keep = min(num_incorrect_to_keep, len(incorrect_samples))
        
        balanced_berkeley = random.sample(correct_samples, num_correct_to_keep) + random.sample(incorrect_samples, num_incorrect_to_keep)
        random.shuffle(balanced_berkeley)
        final_samples.extend(balanced_berkeley)
        print(f"Added {len(balanced_berkeley)} balanced samples from Berkeley dataset.")

    # --- 2. Load the Hugging Face Q20LLM dataset ---
    hf_data = []
    if os.path.exists(huggingface_cached_filepath):
        print(f"Loading Hugging Face dataset from local cache: '{huggingface_cached_filepath}'")
        with open(huggingface_cached_filepath, 'r') as f:
            hf_data = json.load(f)
    else:
        print(f"Local Hugging Face cache not found. Loading from 'cvmistralparis/Q20LLM' and parsing...")
        try:
            # CORRECTED: Use the 'games' configuration as requested
            ds = load_dataset("cvmistralparis/Q20LLM", "games", split='train')
            df = ds.to_pandas()
            
            # Use the correct parser for the flat CSV-like structure
            hf_data = parse_q20llm_csv(df)
            
            with open(huggingface_cached_filepath, 'w') as f:
                json.dump(hf_data, f)
            print(f"Successfully parsed and saved Hugging Face data to '{huggingface_cached_filepath}'.")
        except Exception as e:
            print(f"Could not load or parse Hugging Face dataset. Error: {e}")
    
    final_samples.extend(hf_data)
    print(f"Added {len(hf_data)} samples from Hugging Face dataset.")
    print(f"\nTotal samples loaded: {len(final_samples)}")
    return final_samples



# --- Example Usage ---
# Step 1: Load all the data. This will download/parse/cache files as needed and balance the Berkeley data.
raw_game_data = load_all_samples(correct_percent_for_berkeley=0.85)


def generate_guess_for_turn(history, model, tokenizer):
    """
    Uses the provided LLM to generate a guess based on the game history.
    """
    history_str = "\n".join([f"Q: {turn['q']} A: {turn['a']}" for turn in history])
    prompt = f"""You are an expert 20 Questions player. Based on the following history, make your best guess for the secret word. Respond with only the word.
    History:
    {history_str}"""
    
    messages = [{"role": "user", "content": prompt}]

    # CORRECTED: Properly format the prompt and get the attention mask
    prompt_str = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt_str, return_tensors="pt").to(model.device)
    
    # Pass the unpacked inputs dictionary and the pad_token_id
    do_sample = True
    outputs = model.generate(
        **inputs,
        max_new_tokens=5,
        do_sample=do_sample,
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id,  eos_token_id=tokenizer.eos_token_id
    )
    
    # Decode only the newly generated tokens
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
    
    guess = response.strip().lower()
    return guess.split(" ")[0].replace(".", "")

def create_20q_dataset(
    all_samples,
    output_filepath,
    include_guess=True,
    filter_flag=True,
    word_list_filepath="words.txt"
):
    """
    Processes a list of game samples according to the specified flags.
    """
    # --- 1. Filter by word list if the flag is set ---
    if filter_flag:
        try:
            with open(word_list_filepath, 'r') as f:
                valid_words = {line.strip().lower() for line in f}
            
            all_samples = [
                sample for sample in all_samples
                if sample['word'][0].lower() in valid_words
            ]
            print(f"Filtered dataset to {len(all_samples)} samples based on '{word_list_filepath}'.")
        except FileNotFoundError:
            print(f"Warning: Word list not found at {word_list_filepath}. Skipping filter.")

    # --- 2. Process samples to create the final dataset ---
    processed_dataset = []
    for i, sample in enumerate(all_samples):
        secret_word = sample['word'][0]
        gameplay = []
        history_for_guess = []

        for line in sample['lines']:
            parts = line.split("? ")
            if len(parts) < 2: continue
            question = parts[0] + "?"
            answer = parts[1].strip().lower()

            turn_data = {"question": question, "answer": answer}
            history_for_guess.append({"q": question, "a": answer})

            if include_guess:
                guess = generate_guess_for_turn(history_for_guess, model, tokenizer)
                turn_data['guess'] = guess
                turn_data['correct_guess'] = "yes" if guess.lower() == secret_word.lower() else "no"
            
            gameplay.append(turn_data)

        processed_dataset.append({"word": secret_word, "gameplay": gameplay})
        # if i%10==0:
        #     print(f"Processing sample {i+1}/{len(all_samples)}...")
        #     print(secret_word, gameplay)
    # --- 3. Save the new dataset ---
    with open(output_filepath, 'w') as f:
        json.dump(processed_dataset, f, indent=2)
    
    print(f"\nSuccessfully created and saved the new dataset to '{output_filepath}'.")



# Step 2: Process the loaded data to create the final SFT dataset.
create_20q_dataset(
    all_samples=raw_game_data,
    output_filepath="sft_dataset_combined.json",
    include_guess=include_guess,
    filter_flag=filter_flag,
    word_list_filepath="/kaggle/input/words-list/200_common_nouns.txt"
)

Local Berkeley file not found. Downloading from URL...
Successfully downloaded and saved Berkeley data to 'rl-llm-bench-dataset.json'.
Added 36676 balanced samples from Berkeley dataset.
Local Hugging Face cache not found. Loading from 'cvmistralparis/Q20LLM' and parsing...


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/70950 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5294 [00:00<?, ? examples/s]

Successfully parsed and saved Hugging Face data to 'EDA_twenty_questions_dataset.json'.
Added 5566 samples from Hugging Face dataset.

Total samples loaded: 42242
Filtered dataset to 18547 samples based on '/kaggle/input/words-list/200_common_nouns.txt'.

Successfully created and saved the new dataset to 'sft_dataset_combined.json'.


# Supervised Fine-tuning

In [3]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, PeftModel
from trl import apply_chat_template
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    TrainerCallback  # 1. Import TrainerCallback
)
from trl import SFTTrainer
from huggingface_hub import login

login(token="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")


# 1. Model and Tokenizer Setup
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
dataset_name = "formatted_20q_dataset.jsonl"
hub_model_id = "shahriar7/mistral-7b-20q-finetuned"

# 2. QLoRA Configuration (4-bit quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# 3. LoRA Configuration
peft_config = LoraConfig(
    lora_alpha=8,
    lora_dropout=0.05,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)
# target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
#
# 4. Load Base Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
model.config.use_cache = False
# model.gradient_checkpointing_enable()

# 5. Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


2025-08-22 13:04:27.730420: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755867868.066783     110 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755867868.167617     110 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [4]:

SYSTEM = "You play 20 Questions. Ask the most informative next yes/no question. The keyword is a simple single-word noun known by most people. Start Broad And divide possibilities in half. Then Narrow Systematically. Do not ask the same question twice"

def to_prompt_completion_rows(game):
    rows, hist = [], []
    for turn in game.get("gameplay", []):
        q, a = turn.get("question"), turn.get("answer")
        if q:
            prompt_msgs = [
                {"role":"system","content":SYSTEM},
                {"role":"user","content":"Game history:\n" + ("\n".join(hist) if hist else "No history yet.")
                                 + "\n\nOnly output a single yes/no question."}
            ]
            # Render ONLY the prompt to the model's chat template as a STRING
            prompt_str = tokenizer.apply_chat_template(
                prompt_msgs, tokenize=False, add_generation_prompt=False
            )
            rows.append({"prompt": prompt_str, "completion": q.strip()})
        if q and a:
            hist.append(f"Q: {q}\nA: {a.strip()}")
    return rows

def process_dataset(in_path, out_path):
    data = json.load(open(in_path, "r", encoding="utf-8"))
    with open(out_path, "w", encoding="utf-8") as f:
        for game in data:
            for row in to_prompt_completion_rows(game):
                f.write(json.dumps(row, ensure_ascii=False) + "\n")

process_dataset("sft_dataset_combined.json", "formatted_20q_dataset.jsonl")
raw_ds = load_dataset("json", data_files=dataset_name, split="train")

# Split the structured dataset into training and evaluation sets
splits = raw_ds.train_test_split(test_size=0.03, seed=42) # Using a 5% eval split
train_ds = splits["train"]
eval_ds = splits["test"]


Generating train split: 0 examples [00:00, ? examples/s]

In [5]:

eos = tokenizer.eos_token or ""

# ---- (1) Full sequence length: prompt + completion + EOS ----
def _batch_full_len(batch):
    texts = [
        p + c + ("" if c.endswith(eos) else eos)
        for p, c in zip(batch["prompt"], batch["completion"])
    ]
    enc = tokenizer(texts, add_special_tokens=False, return_length=True)
    return {"full_len": enc["length"]}

full_stats = train_ds.map(_batch_full_len, batched=True, remove_columns=[])
full_arr = np.array(full_stats["full_len"])
p50_full = int(np.quantile(full_arr, 0.50))
p95_full = int(np.quantile(full_arr, 0.95))
p99_full = int(np.quantile(full_arr, 0.99))
max_full = int(full_arr.max())
print(f"FULL  tokens — p50:{p50_full}  p95:{p95_full}  p99:{p99_full}  max:{max_full}")

# ---- (2) Optional: prompt-only and completion-only diagnostics ----
def _batch_parts_len(batch):
    prompts = batch["prompt"]
    completions = [c + ("" if c.endswith(eos) else eos) for c in batch["completion"]]

    enc_p = tokenizer(prompts, add_special_tokens=False, return_length=True)
    enc_c = tokenizer(completions, add_special_tokens=False, return_length=True)

    return {"prompt_len": enc_p["length"], "completion_len": enc_c["length"]}

parts_stats = train_ds.map(_batch_parts_len, batched=True, remove_columns=[])
p_arr = np.array(parts_stats["prompt_len"])
c_arr = np.array(parts_stats["completion_len"])

print(f"PROMPT    p50:{int(np.quantile(p_arr,0.50))}  p95:{int(np.quantile(p_arr,0.95))}  p99:{int(np.quantile(p_arr,0.99))}  max:{int(p_arr.max())}")
print(f"COMPLETION p50:{int(np.quantile(c_arr,0.50))}  p95:{int(np.quantile(c_arr,0.95))}  p99:{int(np.quantile(c_arr,0.99))}  max:{int(c_arr.max())}")

# ---- (3) Pick SFT max_length from p99 (or a value you prefer), bounded by model context ----
model_max = getattr(tokenizer, "model_max_length", None) or 10**9
suggested_max_length = min(p99_full, model_max)
print("Suggested SFT max_length:", suggested_max_length)


Map:   0%|          | 0/240561 [00:00<?, ? examples/s]

FULL  tokens — p50:174  p95:350  p99:396  max:560


Map:   0%|          | 0/240561 [00:00<?, ? examples/s]

PROMPT    p50:165  p95:340  p99:384  max:550
COMPLETION p50:8  p95:12  p99:16  max:45
Suggested SFT max_length: 396


In [6]:
# ignore the warnings in the output.

import wandb
wandb.login(key="b3f46c193418884a60db8ae0148c0eed62f957e6") 

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshahriarbasiri[0m ([33mshahriarbasiri-sharif-university-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
from trl import SFTTrainer, SFTConfig

training_arguments = SFTConfig(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,   # starting guess
    auto_find_batch_size=True,       # enable automatic shrinking on OOM
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",  
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=5000,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to=["tensorboard", "wandb"], 
    eval_strategy="steps", eval_steps=1000,
    save_strategy="steps",       save_steps=1000,
    save_total_limit=3,          # keep a few recents locally during the run
    logging_strategy="steps",    logging_steps=200,
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="checkpoint", # "every_save",
    hub_private_repo=False,
    
    # assistant_only_loss=True,  
    completion_only_loss=True,
    
    max_length=512,            
    packing=False,               
    # dataset_text_field="text", # set this if your JSONL column name isn't "text"
    dataloader_num_workers=2,   
    dataset_num_proc=2,         
)

# optional VRAM saver
# model.gradient_checkpointing_enable()

# 2. Create a custom callback to print the loss
class LossMonitorCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        """
        Event called after logging the evaluation results.
        """
        if logs is not None and "loss" in logs:
            print(f"Step {state.global_step}: Training Loss = {logs['loss']:.4f}")  


wandb.init(
    project="mistral-20q-finetune", 
    config=training_arguments.to_dict(),
)
run_id = wandb.run.id 

if isinstance(model, PeftModel):
    model = model.unload()
    
trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    peft_config=peft_config,
    args=training_arguments,
    processing_class=tokenizer,
    callbacks=[LossMonitorCallback()],
)


Adding EOS to train dataset (num_proc=2):   0%|          | 0/240561 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/240561 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=2):   0%|          | 0/240561 [00:00<?, ? examples/s]

Adding EOS to eval dataset (num_proc=2):   0%|          | 0/7441 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=2):   0%|          | 0/7441 [00:00<?, ? examples/s]

Truncating eval dataset (num_proc=2):   0%|          | 0/7441 [00:00<?, ? examples/s]

# Checking Masks

In [9]:
found_ids = None
found_labels = None
print("Searching for a sample with less than p50 tokens...")
for batch in trainer.get_train_dataloader():
    for i in range(len(batch["input_ids"])):
        ids = batch["input_ids"][i]
        if len(ids) < 110:
            found_ids = ids
            found_labels = batch["labels"][i]
            break    
    if found_ids is not None:
        break

Searching for a sample with less than p50 tokens...


In [10]:
batch["labels"][i]

tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  3069,  1146,  1032, 26605, 29572,     2],
       device='cuda:0')

In [11]:
 if found_ids is not None:
    active = (found_labels != -100).nonzero().flatten()

    if len(active):
        s = int(active[0])
        print("\n--- PROMPT (Loss Ignored) ---")
        print(trainer.processing_class.decode(found_ids[:s]))
        print("\n--- COMPLETION (Loss Calculated) ---")
        print(trainer.processing_class.decode(found_ids[s:])) 
    else:
        print("No active labels found in this sample.")
else:
    print("Could not find any samples under 200 tokens in the dataset.")


--- PROMPT (Loss Ignored) ---
<s><s>[INST] You play 20 Questions. Ask the most informative next yes/no question. The keyword is a simple single-word noun known by most people. Start Broad And divide possibilities in half. Then Narrow Systematically. Do not ask the same question twice

Game history:
Q: Is it an animal?
A: no.
Q: Is it an inanimate object?
A: yes.

Only output a single yes/no question.[/INST]

--- COMPLETION (Loss Calculated) ---
Is it a mineral?</s>


# Start Fine-Tuning

In [12]:
# Now let's perform SFT:
from pathlib import Path
from transformers.trainer_utils import get_last_checkpoint
from huggingface_hub import snapshot_download

def prepare_resume(output_dir: str, repo_id: str):
    # 1) try local (works during a single Kaggle session)
    local = get_last_checkpoint(output_dir)
    if local:
        return local

    # 2) otherwise pull from the Hub (needs "Internet" ON in Kaggle settings)
    snapshot_download(
        repo_id=repo_id,
        local_dir=output_dir, local_dir_use_symlinks=False,
        allow_patterns=["last-checkpoint/*", "checkpoint-*/*"],  # only what we need
    )
    last = Path(output_dir) / "last-checkpoint"
    if last.exists():
        return str(last)
    ckpts = sorted(Path(output_dir).glob("checkpoint-*"),
                   key=lambda p: int(p.name.split("-")[-1]))
    return str(ckpts[-1]) if ckpts else None


print("--- Starting training Part 1 (up to 3000 steps) ---")
trainer.args.max_steps = 3001
resume_path = prepare_resume("./results", hub_model_id)
trainer.train(resume_from_checkpoint=resume_path or None)
print("--- Finished training Part 1 ---")

wandb.finish()

--- Starting training Part 1 (up to 3000 steps) ---


Fetching 0 files: 0it [00:00, ?it/s]

Step,Training Loss,Validation Loss
1000,0.4505,0.470896
2000,0.4309,0.429644
3000,0.42,0.418181


Step 200: Training Loss = 0.7367
Step 400: Training Loss = 0.4964
Step 600: Training Loss = 0.4936
Step 800: Training Loss = 0.4648
Step 1000: Training Loss = 0.4505
Step 1200: Training Loss = 0.4700
Step 1400: Training Loss = 0.4484
Step 1600: Training Loss = 0.4521
Step 1800: Training Loss = 0.4521
Step 2000: Training Loss = 0.4309
Step 2200: Training Loss = 0.4126
Step 2400: Training Loss = 0.4159
Step 2600: Training Loss = 0.4117
Step 2800: Training Loss = 0.4144
Step 3000: Training Loss = 0.4200
--- Finished training Part 1 ---


0,1
eval/loss,█▃▁
eval/mean_token_accuracy,▁▆█
eval/num_tokens,▁▅█
eval/runtime,█▁▄
eval/samples_per_second,▁█▅
eval/steps_per_second,▁█▁
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇███
train/grad_norm,█▄▃▃▂▄▂▂▂▁▂▁▂▁▁
train/learning_rate,███▇▆▆▅▄▄▃▂▂▁▁▁

0,1
eval/loss,0.41818
eval/mean_token_accuracy,0.88097
eval/num_tokens,4610791.0
eval/runtime,1541.3245
eval/samples_per_second,4.826
eval/steps_per_second,0.603
total_flos,1.977799707144192e+17
train/epoch,0.09982
train/global_step,3001.0
train/grad_norm,0.23756


In [13]:
# Cell 2
from transformers.trainer_utils import get_last_checkpoint

# --- Re-initialize W&B in "resume" mode ---
print(f"Resuming W&B run with ID: {run_id}")
wandb.init(project="mistral-20q-finetune-resumed", id=run_id, resume= 'auto')

# --- Modify the arguments of the EXISTING trainer ---
print("Updating trainer arguments for Part 2")
trainer.args.max_steps = 5000            

print("--- Starting training Part 2 (from 400 to 600 steps) ---")
# Resume training
resume_path = prepare_resume("./results", hub_model_id)
trainer.train(resume_from_checkpoint=resume_path or None)
print("--- Finished training Part 2 ---")

wandb.finish()

Resuming W&B run with ID: z9109u7y


Updating trainer arguments for Part 2
--- Starting training Part 2 (from 400 to 600 steps) ---


Step,Training Loss,Validation Loss
4000,0.3967,0.416607
5000,0.4001,0.412342


Step 3200: Training Loss = 0.4180
Step 3400: Training Loss = 0.4200
Step 3600: Training Loss = 0.4108
Step 3800: Training Loss = 0.3961
Step 4000: Training Loss = 0.3967
Step 4200: Training Loss = 0.4137
Step 4400: Training Loss = 0.4015
Step 4600: Training Loss = 0.3889
Step 4800: Training Loss = 0.4168
Step 5000: Training Loss = 0.4001
--- Finished training Part 2 ---


0,1
eval/loss,█▁
eval/mean_token_accuracy,▁█
eval/num_tokens,▁█
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁▁
train/epoch,▁▂▃▃▄▄▅▆▆▇███
train/global_step,▁▂▃▃▄▄▅▆▆▇███
train/grad_norm,▂▃█▂▃▂▂▁▁▁
train/learning_rate,█▇▅▄▃▃▂▁▁▁

0,1
eval/loss,0.41234
eval/mean_token_accuracy,0.88224
eval/num_tokens,7664308.0
eval/runtime,1543.7381
eval/samples_per_second,4.819
eval/steps_per_second,0.602
total_flos,3.285436704175227e+17
train/epoch,0.16631
train/global_step,5000.0
train/grad_norm,0.16977


<font color="blue">At this step of final code, the execution was interupted because of electricity cut. fortunately the 6000-step checkpoint was truely pushed to HF, so let's resume from there (step 5000 to 6000 logs are not shown here)

In [9]:
# Cell 2
from transformers.trainer_utils import get_last_checkpoint

# --- Re-initialize W&B in "resume" mode ---
print(f"Resuming W&B run with ID: {run_id}")
wandb.init(project="mistral-20q-finetune-resumed", id=run_id, resume= 'auto')

# --- Modify the arguments of the EXISTING trainer ---
print("Updating trainer arguments for Part 3")
trainer.args.max_steps = 10000            

print("--- Starting training Part 3 (from 5000 to 10000 steps) ---")
# Resume training
resume_path = prepare_resume("./results", hub_model_id)
trainer.train(resume_from_checkpoint=resume_path or None)
print("--- Finished training Part 3 ---")

wandb.finish()

Resuming W&B run with ID: f8tsl6x3


Updating trainer arguments for Part 3
--- Starting training Part 3 (from 5000 to 10000 steps) ---


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/54.6M [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/28.7M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/896 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

Step,Training Loss,Validation Loss
7000,0.4042,0.40363
8000,0.4017,0.399362
9000,0.3912,0.395003
10000,0.4103,0.394489


Step 6200: Training Loss = 0.4151
Step 6400: Training Loss = 0.4101
Step 6600: Training Loss = 0.4001
Step 6800: Training Loss = 0.4028
Step 7000: Training Loss = 0.4042
Step 7200: Training Loss = 0.4014
Step 7400: Training Loss = 0.4016
Step 7600: Training Loss = 0.4056
Step 7800: Training Loss = 0.3997
Step 8000: Training Loss = 0.4017
Step 8200: Training Loss = 0.4155
Step 8400: Training Loss = 0.4032
Step 8600: Training Loss = 0.3851
Step 8800: Training Loss = 0.4012
Step 9000: Training Loss = 0.3912
Step 9200: Training Loss = 0.3988
Step 9400: Training Loss = 0.3920
Step 9600: Training Loss = 0.4036
Step 9800: Training Loss = 0.3784
Step 10000: Training Loss = 0.4103
--- Finished training Part 3 ---


0,1
eval/loss,█▅▁▁
eval/mean_token_accuracy,▁▄█▇
eval/num_tokens,▁▃▆█
eval/runtime,█▅▁▃
eval/samples_per_second,▁▄█▆
eval/steps_per_second,▁▅█▅
train/epoch,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇████
train/global_step,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇████
train/grad_norm,▂▁▁▃▄▁▃▃▂▅█▄▂▂▂▂▂▂▃▄
train/learning_rate,█▇▇▆▆▅▅▄▄▃▃▂▂▂▂▁▁▁▁▁

0,1
eval/loss,0.39449
eval/mean_token_accuracy,0.88477
eval/num_tokens,6129685.0
eval/runtime,1579.9198
eval/samples_per_second,4.71
eval/steps_per_second,0.589
total_flos,6.646717324079923e+17
train/epoch,0.33255
train/global_step,10000.0
train/grad_norm,0.39423


In [None]:
# Cell 2
from transformers.trainer_utils import get_last_checkpoint

# --- Re-initialize W&B in "resume" mode ---
print(f"Resuming W&B run with ID: {run_id}")
wandb.init(project="mistral-20q-finetune-resumed", id=run_id, resume= 'auto')

# --- Modify the arguments of the EXISTING trainer ---
print("Updating trainer arguments for Part 4")
trainer.args.max_steps = 14000            

print("--- Starting training Part 4 (from 10000 to 14000 steps) ---")
# Resume training
resume_path = prepare_resume("./results", hub_model_id)
trainer.train(resume_from_checkpoint=resume_path or None)
print("--- Finished training Part 4 ---")

wandb.finish()

Resuming W&B run with ID: f8tsl6x3


Updating trainer arguments for Part 4
--- Starting training Part 4 (from 10000 to 14000 steps) ---


Step,Training Loss,Validation Loss
11000,0.3936,0.397903


Step 10200: Training Loss = 0.4130
Step 10400: Training Loss = 0.4069
Step 10600: Training Loss = 0.3849
Step 10800: Training Loss = 0.3947
Step 11000: Training Loss = 0.3936
Step 11200: Training Loss = 0.4096
Step 11400: Training Loss = 0.3901
Step 11600: Training Loss = 0.3867


# Evaluation

You can start from here. the model will be loaded from Hugging Face checkpoint.

In [5]:
%%writefile test.py

import random
import torch
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

GAME_WORDS = [
    "cat", "dog", "cow", "horse", "rabbit", "lion", "bear", "shark", "eagle", "ant",
    "apple", "banana", "orange", "carrot", "bread", "cheese", "pizza", "cookie", "egg", "ice-cream",
    "chair", "table", "sofa", "bed", "lamp", "clock", "mirror", "door", "window", "carpet",
    "car", "bicycle", "bus", "train", "airplane", "boat", "rocket", "helmet", "engine", "wheel",
    "pencil", "pen", "book", "paper", "scissors", "ruler", "eraser", "backpack", "laptop", "phone",
    "ball", "doll", "puzzle", "kite", "yo-yo", "drum", "guitar", "camera", "radio", "television",
    "shirt", "pants", "jacket", "hat", "shoes", "gloves", "umbrella", "watch", "glasses",
    "moon", "sun", "star", "cloud", "rain", "snow", "mountain", "river", "ocean", "island",
    "doctor", "teacher", "chef", "farmer", "artist", "pilot", "police", "firefighter", "singer", "dancer",
    "gold", "silver", "iron", "sand", "water", "oil", "soap", "sugar", "salt", "honey"
]

SYSTEM_PROMPT = "You are a precise answering engine. Based on the keyword and question, provide a 'Yes' or 'No' answer."
FEW_SHOT_EXAMPLES = """
[EXAMPLE 1]
Keyword: car
Question: Is it a living thing?
Answer: No
[EXAMPLE 2]
Keyword: water
Question: Is it used for cleaning?
Answer: Yes
[EXAMPLE 3]
Keyword: tree
Question: Is it man-made?
Answer: No
"""

class ValidatorModel:
    
    _model = None
    _tokenizer = None
    _words_dataset = None
    
    def __init__(self):

                
        if ValidatorModel._model is None: # Check if the model has already been loaded (by a previous instantiation)
            print("--- Loading model and tokenizer first time ---")
            login("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", add_to_git_credential=False)
            model_id = "mistralai/Mistral-7B-Instruct-v0.3"

            # Load and assign to the CLASS attributes
            ValidatorModel._tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
            
            bnb_cfg = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16,
                                         bnb_4bit_use_double_quant=True,
                                         bnb_4bit_quant_type="nf4")

            ValidatorModel._model = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map="auto",
                quantization_config=bnb_cfg,
                trust_remote_code=True,
            ).eval()
            ValidatorModel._words_dataset = GAME_WORDS.copy()
            print("--- Model and tokenizer loaded and cached. ---")
        
        # Assign the cached model/tokenizer to this specific instance
        self.model = ValidatorModel._model
        self.tokenizer = ValidatorModel._tokenizer
        self.words_dataset = ValidatorModel._words_dataset
        
        self.keyword = random.choice(self.words_dataset)
        random.choice(self.words_dataset)
        print(f"\n--- Validator secret word: {self.keyword} ---")
        self.words_dataset.remove(self.keyword) # to not ask same word twice

    def _ask_ai(self, messages: List[Dict], max_new=8, temp=0.01) -> str:
        prompt = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        out = self.model.generate(
            **inputs,
            max_new_tokens=max_new,
            temperature=temp,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id
        )
        return self.tokenizer.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()

    def validate_question(self, question: str) -> str:
        user_content = f"{FEW_SHOT_EXAMPLES}\n[FINAL TASK]\nKeyword: {self.keyword}\nQuestion: {question}\nAnswer:"
        messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user_content}]
        model_ans = self._ask_ai(messages)
        return "yes" if "yes" in model_ans.lower() else "no"

    def validate_guess(self, guess: str) -> str:
        return 'Yes' if guess and self.keyword and guess.lower() == self.keyword.lower() else 'No'

Writing test.py


In [83]:
%%writefile evaluate_20Q.py

import argparse
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from test import ValidatorModel
from huggingface_hub import login
import re 

# System prompt for the question-asking model
SYSTEM_PROMPT = "You play 20 Questions. Ask the most informative next yes/no question. The keyword is a simple single-word noun known by most people. Start Broad And divide possibilities in half. Then Narrow Systematically. Do not ask the same question twice"

class Agent:
    """
    An AI agent that uses a single fine-tuned model and disables the LoRA
    adapter to revert to base model behavior for guessing.
    """
    def __init__(self, model_id="shahriar7/mistral-7b-20q-finetuned"):
        print("Loading agent model (base + LoRA adapter)...")
        
        # Authenticate with Hugging Face
        login(token="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", add_to_git_credential=False)
        
        # 4-bit quantization configuration
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        
        # Load the single model. Transformers will load the base and apply the adapter.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        print("Agent model loaded successfully.")

    def _generate_response(self, model, messages, temperature=0.7, max_tokens=20):
        """Helper function to generate a response from a specific model."""
        prompt = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.tokenizer(prompt, return_tensors="pt").to(model.device)
        
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=0.9,
            pad_token_id=self.tokenizer.eos_token_id
        )
        
        response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        return response.strip()

    def ask_question(self, history):
        """Uses the FINE-TUNED model with an improved strategic prompt."""
        history_str = "\n".join(history) if history else "No history yet. Start with a broad question."
        
        system_prompt = (
            "ou are playing a game of 20 Questions. Your task is to ask a strategic and simple 'yes' or 'no' question to help you guess the secret object "
            "Analyze the history carefully. Do not repeat questions. "
        )
        user_prompt = f""""Here is the game history so far:\n{history_str}\n\n
        Your most important rule is: "
        "**NEVER repeat a question that is already in the asked questions in history.**\n\n 
            Generate only the question."""
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
        
        # This call is correct, we'll make make_guess match it
        question = self._generate_response(self.model, messages, temperature=0.1, max_tokens=25)
        question = question.split('\n')[0].strip()

        temperature = 0.7
        for retry in range(4):
            if question in history_str:
                print("repeated question, highering temperature")
                temperature += 0.5
                messages[1]["content"]+= "\n\nATTENTION: Your last question was a repeat. You MUST generate a new, unique question."
                question = self._generate_response(self.model, messages, temperature=temperature, max_tokens=25)
                question = question.split('\n')[0].strip()
            else:
                break
            
        return question

    def make_guess(self, history):
        """Temporarily disables the adapter to use the BASE model for guessing."""
        history_str = "\n".join(history)
        
        system_prompt = (
            "You are playing a game of 20 Questions. Your task is to guess the secret item. Based on the provided history of questions and answers, your goal is to provide a single, concrete noun as your guess. Do not provide any explanation or surrounding text."
        )
        user_prompt = (
            f"Here is the history of the game so far: \n{history_str}\n\n "
            "Based on this information, what is your single-word guess for the secret item? Your answer must be a single common word"
            "Your answer must be only the single word you are guessing."
        )
        
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
        
        try:
            self.model.disable_adapters()
            # --- FIX IS HERE: Pass self.model to the function call ---
            guess = self._generate_response(self.model, messages, temperature=0.6, max_tokens=10)
        finally:
            self.model.enable_adapters()

        guess = guess.strip().split()[0]
        guess = re.sub(r"[^a-zA-Z]", "", guess).lower()
        return guess



def play_game(agent, validator):
    """
    Plays a single game of 20 Questions.
    Returns True if the agent wins, False otherwise.
    """
    history = []
    for turn in range(1, 21):
        print(f"\n--- Turn {turn} ---")
        
        # 1. Agent asks a question
        question = agent.ask_question(history)
        print(f"Agent Question: {question}")

        # 2. Validator provides an answer
        answer = validator.validate_question(question)
        print(f"Validator Answer: {answer}")
        
        # 3. Update history with the question and answer
        history.append(f"Q: {question}\nA: {answer}")

        # 4. Agent makes a guess
        guess = agent.make_guess(history)
        print(f"Agent Guess: {guess}")

        # 5. Validator checks the guess
        is_correct_str = validator.validate_guess(guess)
        
        if "yes" in is_correct_str.lower():
            print(f"Agent guessed correctly! The word was '{guess}'.")
            return True
        else:
            print("Agent's guess was incorrect.")
            # --- FIX IS HERE ---
            # Add the incorrect guess to the history for the model to learn from
            history.append(f"My guess '{guess}' was incorrect.")
            
    print(f"Agent failed to guess the word in 20 turns. The word was '{validator.keyword}'.")
    return False

def run_evaluation(num_games):
    """
    Runs the full evaluation for a specified number of games.
    """
    agent = Agent()
    wins = 0

    for i in range(num_games):
        print(f"\n====================\n  Starting Game {i + 1}/{num_games}\n====================")
        # A new validator is created for each game to get a new random word [cite: 78]
        validator = ValidatorModel() 
        if play_game(agent, validator):
            wins += 1
    
    print("\n--- Evaluation Finished ---")
    print(f"Final Score: {wins} / {num_games} wins ({wins/num_games:.2%})")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate a 20 Questions agent.")
    login("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", add_to_git_credential=False)
    parser.add_argument(
        "-N",
        "--num_games",
        type=int,
        required=True,
        help="The number of games to play for the evaluation."
    )
    args = parser.parse_args()
    
    run_evaluation(args.num_games)

Overwriting evaluate_20Q.py


In [77]:
!python evaluate_20Q.py -N 10

Loading agent model (base + LoRA adapter)...
2025-08-25 22:47:32.272795: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756162052.294959     915 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756162052.301754     915 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:14<00:00,  4.92s/it]
Agent model loaded successfully.

  Starting Game 1/10
--- Loading model and tokenizer first time ---
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:14<00:00,  4.99s/it]
--- Model and tokenizer loaded and cached. ---

--- Validator secret word: pencil ---

--- Turn 1 ---
Agent Question: Is the obj

In [81]:
!python evaluate_20Q.py -N 10

Loading agent model (base + LoRA adapter)...
2025-08-25 23:06:29.619946: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756163189.642506     969 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756163189.649291     969 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:14<00:00,  4.92s/it]
Agent model loaded successfully.

  Starting Game 1/10
--- Loading model and tokenizer first time ---
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:14<00:00,  4.88s/it]
--- Model and tokenizer loaded and cached. ---

--- Validator secret word: boat ---

--- Turn 1 ---
Agent Question: Is the objec

In [84]:
!python evaluate_20Q.py -N 10

Loading agent model (base + LoRA adapter)...
2025-08-25 23:19:46.427112: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756163986.451659     996 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756163986.458733     996 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:14<00:00,  4.81s/it]
Agent model loaded successfully.

  Starting Game 1/10
--- Loading model and tokenizer first time ---
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:14<00:00,  4.87s/it]
--- Model and tokenizer loaded and cached. ---

--- Validator secret word: pilot ---

--- Turn 1 ---
Agent Question: Is the obje