<a href="https://colab.research.google.com/github/NBK-code/ARC/blob/main/ARC_SFT_Eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare Data

In [None]:
!pip -q install -U transformers accelerate datasets peft bitsandbytes tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import random, json, os, ast
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from peft import PeftModel
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Seed every RNG this notebook has in scope so repeated runs are comparable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)      # NumPy is imported above; seed it alongside random and torch
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


In [None]:
# Identifier of the base checkpoint passed to AutoModelForCausalLM.from_pretrained.
BASE_MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
# Directory the tokenizer and the PEFT adapter are loaded from (absolute path).
ADAPTER_DIR   = "/content/adaptors"

In [None]:
# The tokenizer is loaded from the adapter directory, not from the base model id.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=True)
# If the tokenizer defines no pad token, reuse its EOS token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Replace the chat template only when the loaded one has no `{% generation %}`
# marker; a template that already contains the marker is left untouched.
# The replacement wraps each system/user/assistant turn in <|im_start|>/<|im_end|>
# and appends an opening assistant header when `add_generation_prompt` is set.
# NOTE(review): presumably this must match the template used for fine-tuning — confirm.
if "{% generation %}" not in (tokenizer.chat_template or ""):
    tokenizer.chat_template = """
{% for message in messages %}
{% if message['role'] == 'system' %}
<|im_start|>system
{{ message['content'] }}<|im_end|>
{% elif message['role'] == 'user' %}
<|im_start|>user
{{ message['content'] }}<|im_end|>
{% elif message['role'] == 'assistant' %}
<|im_start|>assistant
{% generation %}{{ message['content'] }}{% endgeneration %}<|im_end|>
{% endif %}
{% endfor %}
{% if add_generation_prompt %}
<|im_start|>assistant
{% endif %}
""".strip()

In [None]:
def load_jsonl_datasets(train_file, eval_file, seed=42):
    """Load the train and eval JSON files and return both splits shuffled.

    Args:
        train_file: Path of the file loaded as the "train" split.
        eval_file: Path of the file loaded as the "eval" split.
        seed: Seed handed to ``shuffle`` for both splits.

    Returns:
        A ``(train_data, eval_data)`` pair, each shuffled with ``seed``.
    """
    split_files = {"train": train_file, "eval": eval_file}
    splits = load_dataset("json", data_files=split_files)

    shuffled_train = splits["train"].shuffle(seed=seed)
    shuffled_eval = splits["eval"].shuffle(seed=seed)
    return shuffled_train, shuffled_eval

In [None]:
# Load both JSONL files (paths relative to the working directory); each split comes back shuffled.
train_data, eval_data = load_jsonl_datasets("sft_train_icl.jsonl", "sft_eval_icl.jsonl")

# Report the split sizes, then show the first training example.
print("Train data length: ", len(train_data))
print("Eval data length: ", len(eval_data))
print("\n")
print(train_data[0])

In [None]:
# Display the train split's summary (features and row count).
train_data

Dataset({
    features: ['messages'],
    num_rows: 13902
})

In [None]:
# Display the eval split's summary (features and row count).
eval_data

Dataset({
    features: ['messages'],
    num_rows: 381
})

In [None]:
def add_len(example):
    """Store the token count of the fully rendered conversation under ``len_full``.

    The messages are rendered through the chat template with the assistant
    turn included and no generation prompt appended, then tokenized; the
    number of resulting input ids is written onto the example, which is returned.
    """
    rendered = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False,
    )
    token_ids = tokenizer(rendered).input_ids
    example["len_full"] = len(token_ids)
    return example

# Only the eval split is measured; the train-split call is left disabled.
#train_len = train_data.map(add_len)
eval_len  = eval_data.map(add_len)

In [None]:
# Examples whose rendered length exceeds this many tokens are dropped.
LEN_CAP = 20_000

# Train-split equivalents of the steps below (disabled):
#train_kept = train_len.filter(lambda ex: ex["len_full"] <= LEN_CAP)
#train_kept = train_kept.remove_columns(["len_full"])
#t_all = np.array(train_len["len_full"], dtype=int)
#print(f"TRAIN kept {len(train_kept)}/{len(train_len)}  (dropped {(t_all > LEN_CAP).sum()})")

# Snapshot all lengths first so the report can count what was dropped.
e_all = np.array(eval_len["len_full"],  dtype=int)

# Keep rows within the cap, then discard the helper column used for filtering.
eval_kept = (
    eval_len
    .filter(lambda ex: ex["len_full"] <= LEN_CAP)
    .remove_columns(["len_full"])
)

# Quick report of kept vs. dropped rows.
print(f"EVAL  kept {len(eval_kept)}/{len(eval_len)}    (dropped {(e_all > LEN_CAP).sum()})")

In [None]:
#print("Train kept data length: ", len(train_kept))
# Report how many eval examples survived the length filter, then show the first one.
print("Eval kept data length: ", len(eval_kept))
print("\n")
print(eval_kept[0])

Eval kept data length:  371


{'messages': [{'role': 'system', 'content': 'You are an ARC puzzle solver. You will be shown a few example input/output pairs and then a new input. Return only the output grid as a list of lists.'}, {'role': 'user', 'content': 'Demonstrations:\n1) INPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0],\n  [0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0],\n  [0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0],\n  [0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0],\n  [0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0],\n  [0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0],\n  [0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0],\n  [0, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0],\n  [0, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0],\n  [0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n   OUTPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 2, 2, 2, 2, 0, 8, 8, 8, 8, 0],\n  [0, 2, 0

In [None]:
# Re-declares SEED (same value as the config cell) and shuffles the filtered eval split.
# NOTE: eval_kept is reassigned from itself, so re-running this cell on its own
# shuffles the already-shuffled data again and changes the example order.
SEED = 42
#train_kept = train_kept.shuffle(seed=SEED)
eval_kept  = eval_kept.shuffle(seed=SEED)

# Load Model

In [None]:
# 4-bit compute dtype: bf16 when the GPU supports it, fp16 otherwise.
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

# NF4 4-bit quantization with double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the quantized base model and let `device_map="auto"` place it.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=bnb_config,
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
)

# Attach the adapter stored in ADAPTER_DIR and switch to eval mode.
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
model.eval()
print("✅ Model + LoRA adapter loaded successfully")

# Evaluate

In [None]:
def split_messages_for_inference(example):
    """
    Separate one dataset row into the prompt messages and the gold grid.

    The final message must have the ``assistant`` role. Its content is parsed
    with ``ast.literal_eval`` to obtain the gold grid, and every earlier
    message is returned as the model input.

    Returns:
        (messages_for_model, gold_grid)

    Raises:
        AssertionError: if the last message is not from the assistant.
    """
    conversation = example["messages"]
    answer = conversation[-1]
    assert answer["role"] == "assistant", "Last message must be assistant"

    return conversation[:-1], ast.literal_eval(answer["content"])

In [None]:
@torch.no_grad()
def generate_arc_output(
    model,
    tokenizer,
    messages_for_model,
    max_new_tokens=1024,
):
    """
    Greedy-decode the model's answer for one prompt and try to parse it as a grid.

    Args:
        model: Object exposing ``.device`` and ``.generate(**kwargs)``.
        tokenizer: Tokenizer used to render the chat template, encode the
            prompt and decode the completion.
        messages_for_model: Chat messages without the gold assistant turn.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        ``(text, pred_grid)``: the stripped completion text, and the value of
        ``ast.literal_eval(text)`` or ``None`` when the text cannot be parsed.
    """
    prompt = tokenizer.apply_chat_template(
        messages_for_model,
        add_generation_prompt=True,
        tokenize=False,
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        # Greedy (deterministic) decoding. No `temperature` is passed: it only
        # applies to sampling, and 0.0 is not a valid sampling temperature.
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens (everything after the prompt).
    gen_tokens = outputs[0][prompt_len:]
    text = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

    # Best-effort parse: any failure marks the prediction as invalid (None)
    # instead of aborting a long evaluation run.
    try:
        pred_grid = ast.literal_eval(text)
    except Exception:
        pred_grid = None

    return text, pred_grid

In [None]:
def exact_grid_match(pred, gold):
    """Return True only when both grids are lists of equal length whose rows match pairwise.

    A ``None`` prediction, a non-list on either side, or a differing number
    of rows all yield False.
    """
    both_lists = isinstance(pred, list) and isinstance(gold, list)
    if not both_lists or len(pred) != len(gold):
        return False
    return not any(pred_row != gold_row for pred_row, gold_row in zip(pred, gold))

In [None]:
# Sanity-check the pipeline on one fixed example so the printed output is
# reproducible. Change `idx` to any value in range(len(eval_kept)) to inspect
# another row.
idx = 40
example = eval_kept[idx]

messages_for_model, gold_grid = split_messages_for_inference(example)

gen_text, pred_grid = generate_arc_output(
    model,
    tokenizer,
    messages_for_model,
)

print("=== MODEL OUTPUT ===")
print(gen_text)

print("\n=== GOLD OUTPUT ===")
print(gold_grid)

print("\n=== EXACT MATCH ===")
print(exact_grid_match(pred_grid, gold_grid))

=== MODEL OUTPUT ===
[
  [1, 1, 1, 1, 1],
  [1, 0, 1, 0, 1],
  [1, 1, 1, 1, 1],
  [1, 0, 1, 0, 1],
  [1, 1, 1, 1, 1]
]

=== GOLD OUTPUT ===
[[1, 1, 1, 1, 1], [1, 0, 1, 0, 1], [1, 1, 1, 1, 1], [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]]

=== EXACT MATCH ===
True


In [None]:
def evaluate_dataset(model, tokenizer, dataset, max_examples=None, verbose=True):
    """Score exact-match accuracy of the model over a dataset of chat examples.

    Args:
        model: Model handed to ``generate_arc_output``; put into eval mode first.
        tokenizer: Tokenizer handed to ``generate_arc_output``.
        dataset: Rows accepted by ``split_messages_for_inference``.
        max_examples: If given, only the first ``max_examples`` rows are scored.
        verbose: When True (default), print each task's prompt, gold grid,
            prediction and whether it was solved, exactly once per task.

    Returns:
        ``(accuracy, stats)`` where ``stats`` holds the ``total``, ``correct``,
        ``invalid`` (unparseable predictions) and ``accuracy`` values.
    """
    model.eval()

    n_total   = 0
    n_correct = 0
    n_invalid = 0

    iterator = dataset
    if max_examples is not None:
        iterator = dataset.select(range(min(max_examples, len(dataset))))

    pbar = tqdm(iterator, desc="Evaluating")

    for example in pbar:
        # 1) prepare inference input + gold
        messages_for_model, gold_grid = split_messages_for_inference(example)

        # 2) generate model output
        _, pred_grid = generate_arc_output(
            model,
            tokenizer,
            messages_for_model,
        )

        # 3) score: an unparseable prediction is invalid, never correct
        is_correct = exact_grid_match(pred_grid, gold_grid)
        if pred_grid is None:
            n_invalid += 1
        elif is_correct:
            n_correct += 1

        # 4) log each task once; "Solved" marks the tasks that matched
        if verbose:
            print("Task no: ", n_total)
            print("Message: ", messages_for_model)
            print("Gold: ", gold_grid)
            print("Pred: ", pred_grid)
            print("Solved: ", is_correct)

        n_total += 1

        # live stats (n_total >= 1 here, so the division is safe)
        acc = n_correct / n_total
        pbar.set_postfix({
            "solved": f"{n_correct}/{n_total}",
            "acc":    f"{acc:.3f}",
            "invalid": n_invalid,
        })

    accuracy = n_correct / n_total if n_total > 0 else 0.0
    stats = {
        "total":    n_total,
        "correct":  n_correct,
        "invalid":  n_invalid,
        "accuracy": accuracy,
    }
    return accuracy, stats

In [None]:
# Evaluate on EVAL
# max_examples=None scores every row of eval_kept; pass an int to score only the first N rows.
eval_acc, eval_stats = evaluate_dataset(
    model,
    tokenizer,
    eval_kept,
    max_examples=None,
)

# Print the final accuracy followed by the full stats dictionary.
print("EVAL accuracy:", eval_acc)
print(eval_stats)

Solved 14 out of 371 tasks in eval_kept.

Accuracy = 3.77%

### Wrong Solution

Task no:  0

Message:  [{'role': 'system', 'content': 'You are an ARC puzzle solver. You will be shown a few example input/output pairs and then a new input. Return only the output grid as a list of lists.'}, {'role': 'user', 'content': 'Demonstrations:\n1) INPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n   OUTPUT:\n[\n  [0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0],\n  [0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0],\n  [0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0],\n  [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],\n  [0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 0, 0],\n  [0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 0, 0],\n  [0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 0, 0],\n  [0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 0, 0],\n  [0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 0, 0],\n  [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],\n  [0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0],\n  [0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0],\n  [0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0]\n]\n2) INPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0]\n]\n   OUTPUT:\n[\n  [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],\n  [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],\n  [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],\n  [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],\n  [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],\n  [0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0],\n  [0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0],\n  [0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0],\n  [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],\n  [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],\n  [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],\n  [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],\n  [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0]\n]\n\nNow solve:\nINPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n\nReturn only the OUTPUT grid.'}]

Gold:  [[0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [0, 0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 0], [0, 0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 0], [0, 0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 0], [0, 0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 0], [0, 0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 0], [0, 0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 0], [0, 0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 0], [0, 0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 0], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0], [0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0]]

Pred:  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

Task no:  2
Message:  [{'role': 'system', 'content': 'You are an ARC puzzle solver. You will be shown a few example input/output pairs and then a new input. Return only the output grid as a list of lists.'}, {'role': 'user', 'content': 'Demonstrations:\n1) INPUT:\n[\n  [0, 0],\n  [0, 7]\n]\n   OUTPUT:\n[\n  [2, 0, 2, 0, 2, 0],\n  [0, 7, 0, 7, 0, 7],\n  [2, 0, 2, 0, 2, 0],\n  [0, 7, 0, 7, 0, 7],\n  [2, 0, 2, 0, 2, 0],\n  [0, 7, 0, 7, 0, 7]\n]\n2) INPUT:\n[\n  [0, 0, 0],\n  [0, 0, 6],\n  [6, 0, 0]\n]\n   OUTPUT:\n[\n  [0, 2, 0, 0, 2, 0, 0, 2, 0],\n  [0, 0, 6, 0, 0, 6, 0, 0, 6],\n  [6, 0, 0, 6, 0, 0, 6, 0, 0],\n  [0, 2, 0, 0, 2, 0, 0, 2, 0],\n  [0, 0, 6, 0, 0, 6, 0, 0, 6],\n  [6, 0, 0, 6, 0, 0, 6, 0, 0],\n  [0, 2, 0, 0, 2, 0, 0, 2, 0],\n  [0, 0, 6, 0, 0, 6, 0, 0, 6],\n  [6, 0, 0, 6, 0, 0, 6, 0, 0]\n]\n3) INPUT:\n[\n  [0, 0, 0, 0, 0],\n  [0, 8, 0, 0, 0],\n  [0, 8, 0, 0, 0],\n  [0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0]\n]\n   OUTPUT:\n[\n  [2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0],\n  [2, 8, 0, 0, 0, 2, 8, 0, 0, 0, 2, 8, 0, 0, 0],\n  [0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0],\n  [2, 8, 0, 0, 0, 2, 8, 0, 0, 0, 2, 8, 0, 0, 0],\n  [0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0],\n  [2, 8, 0, 0, 0, 2, 8, 0, 0, 0, 2, 8, 0, 0, 0],\n  [0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n4) INPUT:\n[\n  [0, 0, 0, 0],\n  [0, 0, 5, 0],\n  [0, 0, 0, 0],\n  [0, 5, 0, 0]\n]\n   OUTPUT:\n[\n  [0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0],\n  [0, 0, 5, 0, 0, 0, 5, 0, 0, 0, 5, 0],\n  [2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0],\n  [0, 5, 0, 0, 0, 5, 0, 0, 0, 5, 0, 0],\n  [0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0],\n  [0, 0, 5, 0, 
0, 0, 5, 0, 0, 0, 5, 0],\n  [2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0],\n  [0, 5, 0, 0, 0, 5, 0, 0, 0, 5, 0, 0],\n  [0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0],\n  [0, 0, 5, 0, 0, 0, 5, 0, 0, 0, 5, 0],\n  [2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0],\n  [0, 5, 0, 0, 0, 5, 0, 0, 0, 5, 0, 0]\n]\n5) INPUT:\n[\n  [0, 0, 0, 0],\n  [0, 0, 0, 0],\n  [0, 0, 0, 0],\n  [1, 0, 0, 0]\n]\n   OUTPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2],\n  [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2],\n  [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2],\n  [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]\n]\n\nNow solve:\nINPUT:\n[\n  [0, 0, 0, 0],\n  [0, 0, 4, 0],\n  [0, 0, 0, 0],\n  [4, 0, 0, 0]\n]\n\nReturn only the OUTPUT grid.'}]
Gold:  [[0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0], [0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0], [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2], [4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0], [0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0], [0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0], [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2], [4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0], [0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0], [0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0], [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2], [4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0]]
Pred:  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0]]

### Right Solution

Task no:  7
Message:  [{'role': 'system', 'content': 'You are an ARC puzzle solver. You will be shown a few example input/output pairs and then a new input. Return only the output grid as a list of lists.'}, {'role': 'user', 'content': 'Demonstrations:\n1) INPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n   OUTPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [2, 3, 0, 2, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 3],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n2) INPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [6, 8, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n   OUTPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [6, 8, 0, 1, 0, 0, 6, 0, 0, 0, 8, 0, 0, 0, 0, 1, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n3) INPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n   OUTPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [1, 2, 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n\nNow solve:\nINPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n\nReturn only the OUTPUT grid.'}]
Gold:  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 1, 0, 8, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
Pred:  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 1, 0, 8, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

Task no:  40
Message:  [{'role': 'system', 'content': 'You are an ARC puzzle solver. You will be shown a few example input/output pairs and then a new input. Return only the output grid as a list of lists.'}, {'role': 'user', 'content': 'Demonstrations:\n1) INPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 7, 7, 7, 0, 7, 0, 0, 7, 7, 7, 7, 7, 0, 0, 0, 0],\n  [0, 0, 7, 0, 7, 7, 7, 0, 0, 7, 7, 7, 7, 7, 0, 0, 0, 0],\n  [0, 0, 7, 7, 7, 7, 7, 0, 0, 7, 0, 7, 7, 7, 0, 0, 0, 0],\n  [0, 0, 7, 7, 7, 7, 7, 0, 0, 7, 7, 7, 0, 7, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 7, 7, 7, 7, 7, 0, 0, 0, 0, 7, 7, 7, 7, 7, 0],\n  [0, 0, 0, 7, 7, 7, 0, 7, 0, 0, 0, 0, 7, 0, 7, 7, 7, 0],\n  [0, 0, 0, 7, 0, 7, 7, 7, 0, 0, 0, 0, 7, 7, 7, 0, 7, 0],\n  [0, 0, 0, 7, 7, 7, 7, 7, 0, 0, 0, 0, 7, 0, 7, 7, 7, 0],\n  [0, 0, 0, 7, 7, 7, 7, 7, 0, 0, 0, 0, 7, 7, 7, 7, 7, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n   OUTPUT:\n[\n  [7, 7, 7, 7, 7],\n  [7, 0, 7, 7, 7],\n  [7, 7, 7, 0, 7],\n  [7, 0, 7, 7, 7],\n  [7, 7, 7, 7, 7]\n]\n2) INPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8],\n  [0, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 8, 8],\n  [0, 8, 0, 8, 0, 8, 0, 0, 8, 8, 8, 8, 8, 0, 8, 8, 8, 0, 8],\n  [0, 8, 8, 8, 8, 8, 0, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 8, 8],\n  [0, 8, 0, 8, 8, 8, 0, 0, 8, 8, 8, 8, 8, 0, 8, 8, 8, 8, 8],\n  [0, 8, 8, 8, 8, 8, 0, 0, 8, 8, 8, 0, 8, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 0, 0],\n  [0, 0, 0, 8, 8, 8, 8, 8, 0, 0, 0, 0, 8, 8, 8, 0, 8, 0, 0],\n  [0, 0, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 8, 0, 8, 8, 8, 0, 0],\n  [0, 0, 0, 8, 8, 8, 8, 8, 0, 0, 0, 0, 8, 8, 8, 0, 8, 0, 0],\n  [0, 0, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 
8, 8, 8, 8, 8, 0, 0],\n  [0, 0, 0, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n   OUTPUT:\n[\n  [8, 8, 8, 8, 8],\n  [8, 0, 8, 0, 8],\n  [8, 8, 8, 8, 8],\n  [8, 0, 8, 0, 8],\n  [8, 8, 8, 8, 8]\n]\n3) INPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 0, 0],\n  [0, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 6, 0, 6, 0, 6, 0, 0],\n  [0, 6, 0, 6, 0, 6, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 0, 0],\n  [0, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 6, 0, 6, 6, 6, 0, 0],\n  [0, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 6, 0, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0],\n  [0, 6, 6, 6, 6, 6, 0, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0],\n  [0, 6, 6, 0, 6, 6, 0, 6, 6, 6, 0, 6, 0, 0, 0, 0, 0, 0, 0],\n  [0, 6, 6, 6, 6, 6, 0, 6, 6, 6, 6, 6, 0, 0, 6, 6, 6, 6, 6],\n  [0, 6, 0, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6],\n  [0, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 6, 6, 6],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 0, 6],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6]\n]\n   OUTPUT:\n[\n  [6, 6, 6, 6, 6],\n  [6, 0, 6, 0, 6],\n  [6, 6, 6, 6, 6],\n  [6, 0, 6, 6, 6],\n  [6, 6, 6, 6, 6]\n]\n4) INPUT:\n[\n  [0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0],\n  [0, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0],\n  [0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0],\n  [0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2],\n  [0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 2],\n  [0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2],\n  [0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2],\n  [0, 0, 2, 2, 0, 2, 2, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0],\n  [0, 0, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0],\n  [0, 0, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n   OUTPUT:\n[\n  [2, 2, 2, 2, 2],\n  [2, 2, 2, 2, 2],\n  [2, 0, 2, 2, 2],\n  [2, 2, 2, 2, 2],\n  [2, 2, 2, 2, 2]\n]\n\nNow solve:\nINPUT:\n[\n  [0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],\n  [0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0],\n  [0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],\n  [0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],\n  [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1],\n  [1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1],\n  [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1],\n  [1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1],\n  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n]\n\nReturn only the OUTPUT grid.'}]
Gold:  [[1, 1, 1, 1, 1], [1, 0, 1, 0, 1], [1, 1, 1, 1, 1], [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]]
Pred:  [[1, 1, 1, 1, 1], [1, 0, 1, 0, 1], [1, 1, 1, 1, 1], [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]]