In [1]:
import os
import json
from glob import glob

import numpy as np
import matplotlib

import openai

import ast
import pickle

In [2]:
from openai import OpenAIError

In [3]:
def format_array(arr):
    """Render a grid as text with one row per line.

    Every element of ``arr`` is converted with ``str`` and the resulting
    pieces are joined with newline characters. An empty ``arr`` produces an
    empty string.
    """
    row_texts = [str(row) for row in arr]
    return "\n".join(row_texts)

def format_examples(train):
    """Build the prompt text that lists every training example.

    ``train`` is iterated in order and each item is indexed with the keys
    ``'input'`` and ``'output'``; both values are rendered with
    ``format_array``. Examples are numbered starting at 1, and surrounding
    whitespace is stripped from the combined text before it is returned.
    """
    sections = []
    for number, pair in enumerate(train, start=1):
        sections.append(
            f"Example {number}:\n"
            f"Input:\n{format_array(pair['input'])}\n"
            f"Output:\n{format_array(pair['output'])}\n\n"
        )
    return "".join(sections).strip()

def format_tests(test_inputs):
    """Build the prompt text that lists every test input grid.

    Each grid in ``test_inputs`` is rendered with ``format_array`` under a
    ``Test N:`` heading (numbered from 1). Surrounding whitespace is stripped
    from the combined text before it is returned.
    """
    blocks = [
        f"Test {number}:\n{format_array(grid)}\n\n"
        for number, grid in enumerate(test_inputs, start=1)
    ]
    return "".join(blocks).strip()

def query_chatgpt_for_solution(train, test_input_arrays, client, model="gpt-4"):
    """Ask a chat model to predict the outputs for a set of test grids.

    Args:
        train: Iterable of training examples; each is rendered through
            ``format_examples`` (which reads its ``'input'`` and ``'output'``
            entries).
        test_input_arrays: Sized collection of test input grids, rendered
            through ``format_tests``.
        client: Object exposing ``chat.completions.create`` (the notebook
            builds it with ``openai.OpenAI``).
        model: Name of the chat model to query. Defaults to ``"gpt-4"``, the
            value that used to be hard-coded, so existing callers are
            unaffected; pass a model with a larger context window for tasks
            whose prompt does not fit.

    Returns:
        The ``message.content`` of the first choice in the response, exactly
        as returned by the API (free-form text, not parsed).
    """
    prompt = f"""
    You are given several examples of input/output pairs. Each input is a 2D grid of integers. Each output is a transformed version of the input.
    
    Here are the training examples:
    
    {format_examples(train)}
    
    Now, predict the outputs for the following test inputs. There are {len(test_input_arrays)} inputs.
    
    {format_tests(test_input_arrays)}
    
    Return a list of output grids, one for each test input. Each output should be a 2D list of integers.
    """

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert in 2D pattern recognition and transformation."},
            {"role": "user", "content": prompt}
        ],
        # Temperature 0 is requested to keep the answer as repeatable as possible.
        temperature=0
    )

    return response.choices[0].message.content

In [4]:
def stream_chat_completion(train, test_input_arrays, client, model="gpt-4"):
    """Stream a JSON-formatted prediction from a chat model and return its text.

    The prompt lists the training examples and test inputs and asks for a JSON
    object with a ``"reason"`` string and a ``"prediction"`` list of 2D grids.
    Tokens are echoed to stdout as they arrive.

    Args:
        train: Iterable of training examples; each is rendered through
            ``format_examples`` (which reads its ``'input'`` and ``'output'``
            entries).
        test_input_arrays: Sized collection of test input grids, rendered
            through ``format_tests``.
        client: Object exposing ``chat.completions.create`` (the notebook
            builds it with ``openai.OpenAI``).
        model: Name of the chat model to query. Defaults to ``"gpt-4"``, the
            value that used to be hard-coded, so existing callers are
            unaffected.

    Returns:
        The concatenation of every streamed content token, unparsed. The
        caller is responsible for decoding it as JSON.
    """
    prompt = f"""
    You are given several examples of input/output pairs. Each input is a 2D grid of integers. Each output is a transformed version of the input.
    
    Here are the training examples:
    
    {format_examples(train)}
    
    Now, predict the outputs for the following test inputs. There are {len(test_input_arrays)} inputs.
    
    {format_tests(test_input_arrays)}
    
    Return the result in JSON format using Python typing style. The format should be:
    
    {{
      "reason": "Your reasoning for the transformation logic, in natural language.",
      "prediction": List[List[List[int]]]  # List of 2D outputs
    }}
    
    Only output valid JSON with this structure. Do not include any extra explanation or markdown.
    """

    stream = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert in 2D pattern recognition and transformation."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        stream=True
    )

    # Tokens are gathered in a list and joined once at the end, which avoids
    # rebuilding an ever-growing string on every chunk.
    pieces = []
    print(f"\n🧠 Streaming response for test set ({len(test_input_arrays)} items):\n")
    for chunk in stream:
        # A stream chunk may carry an empty ``choices`` list; indexing [0]
        # on it would raise IndexError and discard the whole response.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        if token:
            print(token, end='', flush=True)
            pieces.append(token)
    print("\n\n✅ Done\n")

    return "".join(pieces)

In [5]:
# Read the API key from the first line of a local file so the secret is not
# written into the notebook itself.
with open("../api_key.txt", "r") as f:
    api_key = list(f)[0].strip()

# Client object passed to every request helper in this notebook.
client = openai.OpenAI(api_key=api_key)

# Paths of all JSON files found directly inside the dataset directory.
data_path = '../arc-prize-2025'
json_pattern = os.path.join(data_path, '*.json')
json_files = glob(json_pattern)

In [6]:
# Training challenges: parsed JSON, later indexed by key and then by
# 'train' / 'test'.
with open("../arc-prize-2025/arc-agi_training_challenges.json") as f:
    training_challenges = json.loads(f.read())

# Training solutions: parsed JSON, later indexed by the same keys.
with open("../arc-prize-2025/arc-agi_training_solutions.json") as f:
    training_solutions = json.loads(f.read())

# Evaluation challenges: parsed JSON (presumably the same layout as the
# training challenges -- confirm before use).
with open("../arc-prize-2025/arc-agi_evaluation_challenges.json") as f:
    evaluation_challenges = json.loads(f.read())

In [7]:
# Top-level keys of the training challenges. This is a live dict view, not a
# copy, so it reflects any later change to `training_challenges`.
keys = training_challenges.keys()

In [8]:
# Resume from the checkpoint when one exists so finished keys are not queried
# again.
# NOTE(security): pickle.load can execute arbitrary code. Only load a
# checkpoint file that this notebook wrote itself.
if os.path.exists("chatgpt_predictions.pkl"):
    with open("chatgpt_predictions.pkl", "rb") as f:
        results = pickle.load(f)
else:
    results = {}

for key in keys:
    if key in results:
        print(f"⏭️  Skipping already processed key: {key}")
        continue

    train = training_challenges[key]['train']
    test = training_challenges[key]['test']
    solutions = training_solutions[key]
    test_inputs = [d['input'] for d in test]

    try:
        # Query the model
        predicted_str = stream_chat_completion(train, test_inputs, client)

        # Parse the JSON response
        parsed_json = json.loads(predicted_str)
        predicted_outputs = parsed_json.get("prediction", [None] * len(test_inputs))
        reasoning = parsed_json.get("reason", "")
    except OpenAIError as e:
        # Failed keys are not recorded, so they are retried on the next run.
        print(f"❌ Skipping key '{key}' due to OpenAIError: {e}")
        continue
    except Exception as e:
        print(f"❌ Skipping key '{key}' due to unexpected error: {e}")
        continue

    # A JSON null or any non-list "prediction" carries no usable grids; treat
    # it as "no predictions" instead of letting zip() raise and abort the run.
    if not isinstance(predicted_outputs, list):
        predicted_outputs = []

    # zip() stops at the shorter sequence, so a response with too few grids
    # would silently drop test cases from the results. Pad with None so every
    # solution is recorded (as a mismatch). Surplus predictions are still
    # ignored, as before.
    missing = len(solutions) - len(predicted_outputs)
    predicted_outputs = predicted_outputs + [None] * missing

    key_results = []
    for pred, sol in zip(predicted_outputs, solutions):
        key_results.append({
            "prediction": pred,
            "solution": sol,
            "match": (pred == sol),
            "reason": reasoning
        })

    results[key] = key_results

    # Checkpoint after every key. Write to a temporary file first and then
    # swap it into place, so an interrupt in the middle of the dump cannot
    # leave a truncated checkpoint behind.
    with open("chatgpt_predictions.pkl.tmp", "wb") as f:
        pickle.dump(results, f)
    os.replace("chatgpt_predictions.pkl.tmp", "chatgpt_predictions.pkl")


⏭️  Skipping already processed key: 00576224
⏭️  Skipping already processed key: 007bbfb7
⏭️  Skipping already processed key: 009d5c81
⏭️  Skipping already processed key: 00d62c1b
⏭️  Skipping already processed key: 00dbd492
⏭️  Skipping already processed key: 017c7c7b
⏭️  Skipping already processed key: 025d127b
⏭️  Skipping already processed key: 03560426
❌ Skipping key '045e512c' due to OpenAIError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 9455 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
⏭️  Skipping already processed key: 0520fde7
⏭️  Skipping already processed key: 05269061
❌ Skipping key '05a7bcf2' due to OpenAIError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 19094 tokens. Please reduce the length of the messages.

KeyboardInterrupt: 

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 9455 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [None]:
# with open("chatgpt_predictions.pkl", "rb") as f:
#     results = pickle.load(f)

# total_cases = 0
# total_correct = 0

# for key, cases in results.items():
#     correct = sum(1 for case in cases if case["match"])
#     total = len(cases)
#     accuracy = correct / total * 100

#     print(f"\n🔑 {key}: {correct}/{total} correct ({accuracy:.2f}%)")

#     for i, case in enumerate(cases):
#         if not case["match"]:
#             print(f"\n❌ Failed Case {i+1} for key '{key}':")
#             print("Prediction:")
#             print(case["prediction"])
#             print("Solution:")
#             print(case["solution"])
#             if "reason" in case:
#                 print("Reason:")
#                 print(case["reason"])

#     total_cases += total
#     total_correct += correct

# overall_accuracy = total_correct / total_cases * 100 if total_cases > 0 else 0
# print(f"\n✅ Overall Accuracy: {total_correct}/{total_cases} ({overall_accuracy:.2f}%)")