In [1]:
# ! rm -rf .data/cmft-data.ipynb
! mkdir .data
! mkdir .data/cmft-data.ipynb

import utulek
import openai
import dotenv
import os
import json
import random
import csv
import re
import dill

# Load variables from the .env file found by searching from the current working
# directory; override=True lets values in that file replace ones already set.
dotenv.load_dotenv(
    dotenv.find_dotenv(usecwd=True),
    override=True,
)

# Two async OpenAI clients, each using its own API key read from the environment.
oai_client_anp = openai.AsyncOpenAI(api_key=os.getenv("OAI_KEY_ANP"))
oai_client_far = openai.AsyncOpenAI(api_key=os.getenv("OAI_KEY_FAR"))

# Cipher helpers provided by `utulek`.
W53 = utulek.WalnutSubstitutionCipher()
# NOTE(review): the path looks like a dill cache owned by another notebook's
# data directory — confirm it exists before relying on ES.
ES = utulek.EndSpeakCipher(
    ".data/cmft-encoding.ipynb/end_speak_cipher_cache.dill",
    oai_client_anp,
)

mkdir: cannot create directory ‘.data’: File exists
mkdir: cannot create directory ‘.data/cmft-data.ipynb’: File exists


In [2]:
# Instruction placed ahead of the question when a short reasoning step is wanted.
PROMPT_THINK = (
    "You will be given a question from the ARC dataset. "
    "Think step by step. "
    "Do not spend more than 1 paragraph thinking. "
    "Then output the correct letter answer."
)

# Instruction used when only the bare letter is wanted.
PROMPT_ANSWER_FORMAT_SHORT = "Output the correct letter answer and nothing else."

# Response template appended to long prompts. These are plain (non-f) strings,
# so the double braces appear literally in the prompt text.
PROMPT_ANSWER_FORMAT_LONG = "\n".join(
    [
        "Your response should take the following structure:",
        "Thinking:",
        "{{ Insert your thinking here. }}",
        "Answer:",
        "{{ Insert the letter answer. }}",
    ]
)


def load_from_csv(filename):
    """Read multiple-choice rows from a CSV file.

    Every non-empty row must hold exactly four fields: the question text, a
    JSON-encoded list of choices, a JSON-encoded list of option labels, and
    the answer. Blank lines are ignored.

    Args:
        filename: Path of the CSV file to read (opened as UTF-8).

    Returns:
        A list of ``(question, choices, options, answer)`` tuples with the two
        JSON fields decoded. If the file does not exist an empty list is
        returned; if a JSON field cannot be decoded, the rows parsed before
        the failure are returned. Both cases print an error message instead
        of raising.

    Raises:
        ValueError: If a non-empty row does not have exactly four fields.
    """
    data = []
    try:
        # newline="" is what the csv module asks for, so that line breaks
        # inside quoted fields are handed to the reader untouched.
        with open(filename, "r", encoding="utf-8", newline="") as file:
            for row in csv.reader(file):
                if not row:
                    # csv.reader yields [] for a blank line; unpacking it
                    # below would raise, so skip it.
                    continue
                question, choices_str, options_str, answer = row
                choices = json.loads(choices_str)
                options = json.loads(options_str)
                data.append((question, choices, options, answer))
    except FileNotFoundError:
        print(f"Error: The file {filename} was not found.")
    except json.JSONDecodeError:
        print("Error: JSON decoding failed.")
    return data


def format_prompt(question, choices, options):
    """Render a question and its labelled choices as one text block.

    Args:
        question: The question text.
        choices: The choice texts.
        options: The labels paired positionally with ``choices``; pairing
            stops at the shorter of the two sequences.

    Returns:
        ``"Question: ..."``, a blank line, an ``"Options:"`` header, then one
        ``"<label>) <choice>"`` line per pair.
    """
    option_lines = []
    for choice_text, label in zip(choices, options):
        option_lines.append(f"{label}) {choice_text}")
    rendered_options = "\n".join(option_lines)
    return f"Question: {question}\n\nOptions:\n{rendered_options}"


def create_prompts(data, use_long_prompt):
    """Build prompt strings and the parallel list of answers.

    Args:
        data: Iterable of ``(question, choices, options, answer)`` tuples.
        use_long_prompt: When truthy, wrap each formatted question with
            ``PROMPT_THINK`` before and ``PROMPT_ANSWER_FORMAT_LONG`` after;
            otherwise prefix it with ``PROMPT_ANSWER_FORMAT_SHORT``.

    Returns:
        ``(prompts, answers)``: two lists of equal length, in input order.
    """
    prompts = []
    answers = []

    for question, choices, options, answer in data:
        question_block = format_prompt(question, choices, options)
        prompt_text = (
            f"{PROMPT_THINK}\n\n{question_block}\n\n{PROMPT_ANSWER_FORMAT_LONG}"
            if use_long_prompt
            else f"{PROMPT_ANSWER_FORMAT_SHORT}\n\n{question_block}"
        )
        prompts.append(prompt_text)
        answers.append(answer)

    return prompts, answers

def extract_answers_from_response(response):
    """Pull the letter answer out of a response string.

    Takes the text after the last occurrence of the word "answer", matched
    case-insensitively so the ``Answer:`` header requested by the long prompt
    format is recognised, and returns the first ASCII letter found there. If
    the word does not occur, the whole response is searched.

    Args:
        response: The response text (or a bare answer such as ``"C"``).

    Returns:
        The first ASCII letter after the last "answer" marker, with its
        original case, or ``"Z"`` when no letter is found.
    """
    # A case-sensitive split on "answer" never matched the capitalised
    # "Answer:" header, so the first letter of the thinking section was
    # returned instead of the actual answer.
    end_resp = re.split(r"answer", response, flags=re.IGNORECASE)[-1]
    match = re.search(r"[a-zA-Z]", end_resp)
    return match.group(0) if match else "Z"

In [3]:
# Build long-form (think, then answer) prompts for every row of the ARC CSV.
arc_prompts, arc_answers = create_prompts(
    load_from_csv("cmft-data.ipynb-assets/arc_challenge.csv"),
    use_long_prompt=True,
)

# Sanity check: matching counts, then the first prompt, its stored answer, and
# the letter extracted from that answer.
print(len(arc_prompts), len(arc_answers))
print(
    arc_prompts[0],
    arc_answers[0],
    extract_answers_from_response(arc_answers[0]),
    sep="\n",
)

1172 1172
You will be given a question from the ARC dataset. Think step by step. Do not spend more than 1 paragraph thinking. Then output the correct letter answer.

Question: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?

Options:
A) Planetary density will decrease.
B) Planetary years will become longer.
C) Planetary days will become shorter.
D) Planetary gravity will become stronger.

Your response should take the following structure:
Thinking:
{{ Insert your thinking here. }}
Answer:
{{ Insert the letter answer. }}
C
C


In [4]:
# Pair every prompt with its letter answer. extract_answers_from_response
# returns "Z" for any stored answer that contains no ASCII letter.
arc = [
    {"prompt": prompt_text, "answer": extract_answers_from_response(raw_answer)}
    for prompt_text, raw_answer in zip(arc_prompts, arc_answers)
]
print(arc[:1])

# Persist the records with dill.
with open(".data/cmft-data.ipynb/arc.dill", "wb") as f:
    dill.dump(arc, f)

[{'prompt': 'You will be given a question from the ARC dataset. Think step by step. Do not spend more than 1 paragraph thinking. Then output the correct letter answer.\n\nQuestion: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?\n\nOptions:\nA) Planetary density will decrease.\nB) Planetary years will become longer.\nC) Planetary days will become shorter.\nD) Planetary gravity will become stronger.\n\nYour response should take the following structure:\nThinking:\n{{ Insert your thinking here. }}\nAnswer:\n{{ Insert the letter answer. }}', 'answer': 'C'}]
