In [None]:
import numpy as np

In [None]:
from dataclasses import dataclass
from typing import List, Optional

@dataclass()
class SentencePuzzle:
    """A single multiple-choice sentence puzzle.

    Values are stored exactly as given; nothing is validated.

    Fields:
        id: identifier of the puzzle, or None when the record has none.
        question: the puzzle text.
        answer: the answer text, or None when no answer is available (the
            repr then shows '<question mode>').
        label: only stored and exported by ``json()``; None when absent.
            Presumably the index of the correct choice -- TODO confirm.
        choices: the candidate answers.
        choice_order: optional list of indices used by ``__repr__`` to
            reorder ``choices`` for display; None shows them as stored.
    """

    id: Optional[str]
    question: str
    answer: Optional[str]
    label: Optional[int]
    choices: List[str]
    choice_order: Optional[List[int]]

    def __repr__(self):
        """Return a multi-line, human-readable description of the puzzle.

        Every choice is wrapped in double quotes, whether or not
        ``choice_order`` is set. ``label`` is not part of the output.
        """
        if self.choice_order is not None:
            # Display slot ``idx`` shows the choice stored at the position
            # where ``idx`` occurs in ``choice_order`` (presumably undoing a
            # shuffle -- TODO confirm). ``list.index`` raises ValueError when
            # an index is missing from ``choice_order``.
            ordered = [
                self.choices[self.choice_order.index(idx)]
                for idx in range(len(self.choices))
            ]
        else:
            ordered = self.choices
        # Quote in one place so both branches are formatted the same way.
        choices = "\n\t".join(f"\"{choice}\"" for choice in ordered)
        answer = self.answer if self.answer is not None else '<question mode>'
        return f"""SentencePuzzle(
id=\"{self.id}\",
question=\"{self.question}\",
answer=\"{answer}\",
choices=[
    {choices}
])"""

    def json(self):
        """Return the puzzle as a plain dict with one key per field.

        The ``choices`` and ``choice_order`` lists are returned as-is (not
        copied).
        """
        return {
            "id": self.id,
            "question": self.question,
            "answer": self.answer,
            "label": self.label,
            "choices": self.choices,
            "choice_order": self.choice_order
        }


class SentencePuzzles:
    """Collection of ``SentencePuzzle`` objects read from a NumPy file."""

    def __init__(self, file_path):
        """Remember ``file_path`` and immediately load the puzzles from it."""
        self.file_path = file_path
        self.puzzles: list[SentencePuzzle] = []
        self.load()

    def load(self):
        """Read ``self.file_path`` and rebuild ``self.puzzles`` from it.

        Every record must provide 'question' and 'choice_list'. The keys
        'id', 'answer', 'label' and 'choice_order' fall back to None when a
        record does not have them. Progress is reported with ``print``.
        """
        print(f"Loading sentence puzzles from {self.file_path}")
        # NOTE(security): allow_pickle=True lets NumPy unpickle arbitrary
        # Python objects, so only load files from a trusted source.
        records = np.load(self.file_path, allow_pickle=True)
        loaded = []
        for record in records:
            puzzle = SentencePuzzle(
                id=record.get('id', None),
                question=record['question'],
                answer=record.get('answer', None),
                label=record.get('label', None),
                choices=record['choice_list'],
                choice_order=record.get('choice_order', None),
            )
            loaded.append(puzzle)
        # Assign only after every record converted, so a failure part-way
        # through leaves the previous list untouched.
        self.puzzles = loaded
        print(f"Loaded {len(self.puzzles)} sentence puzzles")



In [None]:
# Dataset locations. Keep exactly one SP_PATH line active.
# SP_PATH = "../datasets/data/SP-train.npy"
SP_PATH = "../datasets/test/SP_new_test.npy"
# SP_PATH = "../datasets/data/SP-val-nolabel.npy"
# WP_PATH has to be defined: the word-puzzle cell further down calls
# np.load(WP_PATH, ...) and raises NameError on a fresh kernel otherwise.
WP_PATH = "../datasets/pilot/word_puzzle.npy"

In [None]:
# Load the configured puzzle file, then show the first puzzle as the cell output.
sp = SentencePuzzles(file_path=SP_PATH)
sp.puzzles[0]

In [None]:
import pandas as pd

# One CSV column per answer slot (named 1-4) and one row per puzzle.
data = {column: [] for column in (1, 2, 3, 4)}
for p in sp.puzzles:
    # Column k holds choices[k - 1]; a puzzle with fewer than four choices
    # raises IndexError.
    for column, values in data.items():
        values.append(p.choices[column - 1])

pd.DataFrame(data).to_csv("test.csv", index=False)


In [None]:
# List the id of every loaded puzzle, one per line.
for puzzle in sp.puzzles:
    print(puzzle.id)

In [None]:
# Print the first three puzzles using SentencePuzzle.__repr__.
for position in range(3):
    print(sp.puzzles[position])

In [None]:
import pandas as pd

# Walk the puzzles in consecutive groups of three and file each member under
# "original", "semantic" and "context" by its position within the group.
# Puzzles left over when the total is not a multiple of three are ignored.
sp_dict = {"original": [], "semantic": [], "context": []}
for group in range(len(sp.puzzles) // 3):
    first = group * 3
    original, semantic, context = sp.puzzles[first:first + 3]
    sp_dict["original"].append(original.json())
    sp_dict["semantic"].append(semantic.json())
    sp_dict["context"].append(context.json())

# Each cell of the resulting frame holds the dict returned by json().
sp_df = pd.DataFrame(sp_dict)
sp_df.to_csv("./sentence_puzzle_grouped.csv", index=False)

In [None]:
def test_data_to_csv(sample: SentencePuzzle):
    return {"QUESTION": sample.question, "OPTION 1": sample.choices[0], "OPTION 2": sample.choices[1], "OPTION 3": sample.choices[2], "OPTION 4": sample.choices[3]}

In [None]:
# Build one CSV row per puzzle. The loop variable must not be called `sp`:
# rebinding that name would replace the SentencePuzzles collection with a
# single puzzle and break any later (or repeated) use of `sp.puzzles`.
datas = [test_data_to_csv(puzzle) for puzzle in sp.puzzles]
tdf = pd.DataFrame(datas)
tdf.to_csv("./test_data.csv", index=False)

In [None]:
# Load the word-puzzle file and show how many records it holds.
# Requires WP_PATH to be defined before this cell runs.
# NOTE(security): allow_pickle=True unpickles arbitrary Python objects, so
# only load files from a trusted source.
wp_ds = np.load(WP_PATH, allow_pickle=True)
len(wp_ds)

In [None]:
# Show the first word-puzzle record as the cell output.
wp_ds[0]

In [None]:
import gym

# Create the Taxi environment and reset it with a fixed seed.
env = gym.make('Taxi-v3')
# reset() is unpacked into two values; the second one gets a real name
# because binding `_` in a notebook shadows IPython's "last output" variable
# and stops IPython from updating it.
state, reset_info = env.reset(seed=42)
state

In [None]:
# Show the class name of the object returned by gym.make.
type(env).__name__

In [None]:
# Gym's TaxiEnv.decode returns a reversed() iterator, which would display as
# an opaque iterator object; materialise it so the decoded values are shown.
list(env.decode(state))