In [None]:
import os
import pandas as pd
import polars as pl
from dotenv import load_dotenv
from toucans import PromptFunction
from skolegpt_instruct_dataset.utils import merge_new_examples_to_master_dataset

# Load variables from a local .env file into the process environment.
# HUGGINGFACE_TOKEN is read from os.environ in the final cell; presumably the
# model API key used by PromptFunction is supplied the same way — TODO confirm.
load_dotenv()

def load_from_newline_separated_txt(file_name):
    """Build the survey-question frame from a text file with one question per line.

    Every line of the file becomes one row, with surrounding whitespace
    stripped. Blank lines are kept and yield an empty question. Each row gets
    an id of the form ``skolegpt_survey.<row index>``, the constant source
    ``skolegpt_survey`` and an empty system prompt.

    Args:
        file_name: Path to a UTF-8 encoded text file.

    Returns:
        A pandas DataFrame with the columns ``id``, ``system_prompt``,
        ``question`` and ``source``, in that order.
    """
    with open(file_name, 'r', encoding='utf-8') as handle:
        stripped_lines = [raw_line.strip() for raw_line in handle.readlines()]

    questions = pd.DataFrame(stripped_lines, columns=['question'])
    # Ids are derived from the positional row index, so they follow file order.
    row_ids = [f"skolegpt_survey.{row_number}" for row_number in questions.index.tolist()]
    enriched = questions.assign(
        id=row_ids,
        source="skolegpt_survey",
        system_prompt="",
    )
    return enriched[["id", "system_prompt", "question", "source"]]


In [None]:
# load and preprocess data
# Reads one survey question per line and returns a pandas DataFrame with the
# columns id, system_prompt, question and source (see
# load_from_newline_separated_txt). The path is relative to the notebook's
# working directory.
df = load_from_newline_separated_txt("./data/survey_questions.txt")
df.head()  # preview the first rows as the cell output

In [None]:
# Define prompt
# The template consists solely of the `instruction` placeholder, so the only
# message sent is a single user message with no system message. The placeholder
# is presumably filled by PromptFunction from the `instruction` keyword passed
# at call time (see the generation loop) — TODO confirm against toucans docs.
template = "{{ instruction }}"
# `answer` is called once per survey question in the generation cell.
answer = PromptFunction(
    model="gpt-4-0125-preview",
    temperature=0.7,
    messages=[
        {"role": "user", "content": template},
        
    ],
)


In [None]:
# run generation
# Calls `answer` once per survey question and collects the replies.
# A failed call is recorded as None instead of aborting the run, so the
# three result lists always stay aligned with the rows of `df`.
from tqdm import tqdm

meta_data_output = []  # raw completion objects (None where the call failed)
ids = []               # question ids, in iteration order
responses = []         # extracted answer text (None where the call failed)

for _, r in tqdm(df.iterrows(), total=len(df)):
    # The id is recorded regardless of the outcome of the call.
    ids.append(r["id"])
    try:
        output = answer(instruction=r["question"])
        output_content = output.choices[0].message.content
    except Exception as exc:
        # Best-effort: keep going on API/parsing errors, but report them.
        # Catching Exception (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate, so the run can still be interrupted manually.
        tqdm.write(f"Generation failed for {r['id']}: {exc!r}")
        output = None
        output_content = None
    meta_data_output.append(output)
    responses.append(output_content)

# Attach the responses and fix the column order; `assign` returns a new frame,
# so the frame produced by the load cell is not modified in place.
df = df.assign(response=responses)[["id", "system_prompt", "question", "response", "source"]]

In [None]:
# save generated instructions and responses
# Converts the pandas frame to a polars DataFrame and writes it as a Parquet
# file; the path is relative to the notebook's working directory.
pl.from_pandas(df).write_parquet("./data/skolegpt_survey_instructions.parquet")

In [None]:
# merge generated examples to master dataset
# The helper is given the examples as a polars DataFrame.
# NOTE(review): the keyword is named `df_translated` although these examples
# are generated rather than translated — presumably it simply names the
# incoming frame; confirm against merge_new_examples_to_master_dataset.
ds = merge_new_examples_to_master_dataset(df_translated=pl.from_pandas(df))

In [None]:
# push merged updates to the master dataset
# Disabled by the `if False:` guard so that running all cells never publishes.
# Change the condition to True only when the merged dataset should be pushed.
# The access token is read from the HUGGINGFACE_TOKEN environment variable.
if False:
    ds.push_to_hub(repo_id="kobprof/skolegpt-instruct", token=os.environ["HUGGINGFACE_TOKEN"])