In [1]:
# ! rm -rf .data/cmft-eval.ipynb
! mkdir .data
! mkdir .data/cmft-eval.ipynb

import utulek
import openai
import dotenv
import os
import json
import random
import csv
import re
import dill
import asyncio
from tqdm.auto import tqdm

# Load environment variables from a .env file located by searching upward from
# the current working directory; override=True lets values from the file
# replace variables that are already set.
dotenv.load_dotenv(dotenv.find_dotenv(usecwd=True), override=True)
# Two async OpenAI clients, one per API key read from the environment
# (os.environ.get yields None when the variable is unset).
oai_client_anp = openai.AsyncOpenAI(api_key=os.environ.get("OAI_KEY_ANP"))
oai_client_far = openai.AsyncOpenAI(api_key=os.environ.get("OAI_KEY_FAR"))
# Cipher helpers from utulek. ES is given a cache path under another notebook's
# data directory plus the ANP client.
# NOTE(review): presumably that cache file has to exist already — confirm.
W53 = utulek.WalnutSubstitutionCipher()
ES = utulek.EndSpeakCipher(".data/cmft-encoding.ipynb/end_speak_cipher_cache.dill", oai_client_anp)

# Each MODEL_* is a (client, model id) pair; complete_prompt reads them as
# model[0] and model[1].
MODEL_G53 = (oai_client_far, "gpt-3.5-turbo-1106")
MODEL_W53_P1 = (oai_client_anp, "ft:gpt-4-0613:academicsnyuperez:p1-20k:9ec2CJJ5")
MODEL_W53_P2 = (oai_client_anp, "ft:gpt-4-0613:academicsnyuperez:p1-20k-p2-400:9essFfjH")
MODEL_ES_P1 = (oai_client_far, "ft:gpt-3.5-turbo-1106:far-ai:p1-es-20k:9gWPSeh6")
MODEL_ES_P2 = (oai_client_far, "ft:gpt-3.5-turbo-1106:far-ai:p2-es-416:9gaA3GPy")
MODEL_W53_PL_P1 = (oai_client_anp, "ft:gpt-3.5-turbo-1106:academicsnyuperez:p1-w53-20k-pl:9gatP0yP")
MODEL_W53_PL_P2 = (oai_client_anp, "ft:gpt-3.5-turbo-1106:academicsnyuperez:p2-w53-416-pl:9gc2EDUe")
# NOTE(review): the two model ids below are empty strings — presumably
# placeholders; fill them in before passing these to any scoring call.
MODEL_ES_PL_P1 = (oai_client_far, "")
MODEL_ES_PL_P2 = (oai_client_far, "")

mkdir: cannot create directory ‘.data’: File exists
mkdir: cannot create directory ‘.data/cmft-eval.ipynb’: File exists


In [2]:
# Load the evaluation dataset (presumably written by the cmft-data notebook).
# The rest of this notebook uses it as a sized, integer-indexable collection of
# records with "prompt" and "answer" keys.
# NOTE: dill.load can execute arbitrary code; only load files you produced.
with open(".data/cmft-data.ipynb/arc.dill", "rb") as f:
    arc = dill.load(f)

def arc_extract(response):
    """Pull the answer letter out of a model response.

    Only the text after the last "Answer", and within that after the last
    "answer", is inspected. Returns the first ASCII letter found there, or
    "Z" when that text contains no ASCII letter.
    """
    # rpartition(...)[2] is the part after the last separator, or the whole
    # string when the separator is absent.
    tail = response.rpartition("Answer")[2]
    tail = tail.rpartition("answer")[2]
    found = re.search(r"[a-zA-Z]", tail)
    if found is None:
        return "Z"
    return found.group(0)

async def complete_prompt(model, system_prompt, user_prompt, max_tokens=256):
    """Send one system + user exchange to a chat model and return the reply text.

    Args:
        model: pair whose first item is the client and second the model name.
        system_prompt: content of the system message.
        user_prompt: content of the user message.
        max_tokens: completion length limit passed to the API.

    Returns:
        The message content of the first returned choice.
    """
    client = model[0]
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    reply = await client.chat.completions.create(
        model=model[1],
        messages=conversation,
        max_tokens=max_tokens,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content

async def arc_score_model(model, dataset, system_prompt="", batch_size=8, postprocess=None, max_tokens=256):
    """Return the fraction of dataset items the model answers correctly.

    Prompts are sent in batches of concurrent requests. Each reply is optionally
    passed through postprocess, reduced to a single letter by arc_extract, and
    compared case-insensitively with the item's "answer".

    Args:
        model: (client, model name) pair forwarded to complete_prompt.
        dataset: sized, integer-indexable collection of records with "prompt"
            and "answer" keys.
        system_prompt: system message sent with every prompt.
        batch_size: number of requests awaited together.
        postprocess: optional async callable applied to each reply before the
            answer letter is extracted.
        max_tokens: completion length limit forwarded to complete_prompt.

    Returns:
        Accuracy as correct / len(dataset); 0.0 for an empty dataset.
    """
    n_items = len(dataset)
    if n_items == 0:
        # Nothing to score; avoids dividing by zero at the end.
        return 0.0

    c_correct = 0
    for batch in tqdm(
        utulek.transform_batched_iterable(range(n_items), batch_size=batch_size),
        # Round up so a trailing partial batch is counted by the progress bar.
        total=(n_items + batch_size - 1) // batch_size,
    ):
        indices = list(batch)
        completions = await asyncio.gather(*(
            complete_prompt(model, system_prompt, dataset[idx]["prompt"], max_tokens=max_tokens)
            for idx in indices
        ))
        # Pair each reply with the index it was requested for, rather than
        # assuming the batch is a contiguous run starting at its first index.
        for idx, completion in zip(indices, completions):
            # The API can return a message without text content; count that as
            # a wrong answer instead of crashing the whole run.
            completion = completion or ""
            if postprocess:
                completion = await postprocess(completion)
            if arc_extract(completion).lower() == dataset[idx]["answer"].lower():
                c_correct += 1
    return c_correct / n_items

# Accumulates what each evaluation returns; later cells append to it.
# Re-running this cell resets it to an empty list.
results = []

In [3]:
async def eval_1_1():
    """Score models on the dataset in plain text and in W53-encoded form.

    Returns five accuracies, in order: MODEL_G53, MODEL_W53_P1 and MODEL_W53_P2
    on the plain prompts, then MODEL_W53_P1 and MODEL_W53_P2 on W53-encoded
    prompts, with replies passed through W53.decode before answer extraction.
    """
    # Encode the prompts one at a time; answers are kept as they are.
    arc_w53 = []
    for i in range(len(arc)):
        encoded_prompt = await W53.encode(arc[i]["prompt"])
        arc_w53.append({"prompt": encoded_prompt, "answer": arc[i]["answer"]})

    async def postprocess_w53(response):
        # NOTE(review): W53.encode is awaited above but W53.decode is not —
        # confirm that decode is synchronous.
        return W53.decode(response)

    cipher_options = {
        "batch_size": 16,
        "max_tokens": 2048,
        "system_prompt": "TASK 4\n\nRespond only with text encoded in the Walnut53 cipher.",
        "postprocess": postprocess_w53,
    }

    # Runs are awaited one after another, in the order of the returned list.
    scores = []
    for plain_model in (MODEL_G53, MODEL_W53_P1, MODEL_W53_P2):
        scores.append(await arc_score_model(plain_model, arc, batch_size=16))
    for cipher_model in (MODEL_W53_P1, MODEL_W53_P2):
        scores.append(await arc_score_model(cipher_model, arc_w53, **cipher_options))
    return scores

# Run the evaluation and record its scores.
# Re-running this cell appends another entry to results.
results.append(await eval_1_1())
print(results[-1])

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

In [None]:
# Persist the most recently appended entry of results.
# NOTE(review): this saves whatever was appended last — presumably the
# eval_1_1 scores; confirm the cell execution order before running.
with open(".data/cmft-eval.ipynb/1_1.dill", "wb") as f:
    dill.dump(results[-1], f)