In [1]:
#!pip install datasets trl einops tokenizers sentencepiece peft
#!pip uninstall -y transformers
#!pip install git+https://github.com/huggingface/transformers
#!pip install --upgrade protobuf

import os, sys, json, copy
# Select protobuf's Python implementation through its environment switch.
# This runs in the first cell, i.e. before the later cells import autocog,
# datasets, trl and transformers.
os.environ.update({'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'python'})

In [2]:
from autocog import CogArch
from autocog.architecture.utility import PromptTee
# Build the cognitive architecture used by every later cell. Its `pipe` is a
# PromptTee configured with prefix 'mmlu' and `tee=sys.stdout`.
# NOTE(review): presumably this echoes each prompt to the cell output — confirm
# against the autocog PromptTee implementation.
arch = CogArch(pipe=PromptTee(prefix='mmlu', tee=sys.stdout))



In [3]:
from autocog.lm import TfLM
# Create the constructor arguments for a 'gpt2-medium' model on the CPU, then
# build the language-model wrapper with `max_new_tokens` set to 20 in its
# completion settings.
# NOTE(review): completion_kwargs is presumably forwarded to generation — confirm.
kwargs = TfLM.create(model_path='gpt2-medium', device='cpu')
llm = TfLM(**kwargs, completion_kwargs={ 'max_new_tokens' : 20 })
# Register the model with the orchestrator under the key 'text'.
arch.orchestrator.LMs.update({ 'text' : llm })

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [4]:
# Program source that this cell passes to arch.load(..., language='sta').
# It declares one prompt, `mmlu_choice`, with the fields topic, question,
# choices[4] and answer(choice).
# The `{ground_truth_channels}` marker on the last channel line matches the
# `ground_truth_channels` keyword given to arch.load for the 'template' tag.
# NOTE(review): the trailing `?ground_truth_channels=` line looks like an empty
# default for that marker — confirm against the autocog loader.
program="""\
preamble: You are taking the MMLU Examination, a multiple choice questionnaire on a broad variety of subjects. For each question, you must pick the correct answer from four choices.
basics: The following questionnaire present one question from the exam. As an expert in the domain, you will select the correct answer.

entry(mmlu_choice): Your method is to first evaluate each choice separately, then decide on the correct solution.

formats:
- choice(repeat=.choices): Repeat the correct answer

prompt(mmlu_choice):
- target(topic) source(?topic)
- target(question) source(?question)
- target(choices) source(?choices){ground_truth_channels}
> topic: the topic of the question
> question: the question from the MMLU on topic specified above
> choices[4]: List of four choices, a single one of them answer the question correctly
> answer(choice): Which choice is correct? Repeat the correct answer
__exit(answer):

?ground_truth_channels=
"""

# Load the same program under two tags:
#   'template' - passes an extra channel line mapping source(?answer) to
#                target(answer), so the answer is supplied as an input;
#   'actual'   - no ground_truth_channels argument is given.
# NOTE: `cog` is rebound by the second call, so only the 'actual' result stays
# bound to the name. NOTE(review): presumably both programs stay reachable
# through `arch` by tag — confirm.
cog = arch.load(tag='template', language='sta', program=program, ground_truth_channels="\n- target(answer) source(?answer)")
cog = arch.load(tag='actual',   language='sta', program=program)

In [5]:
# TODO move to CogArch once fleshed out

from datasets import Dataset

async def gen_prompt_dataset(arch, data, ground_truth_formater):
    """Run every sample through ``arch`` and gather the recorded prompts.

    Each element of ``data`` is expanded as keyword arguments into
    ``ground_truth_formater``; the resulting list is handed to ``arch.run``.
    For every ``(output, frame id)`` pair that comes back, all prompt lists
    stored in ``arch.orchestrator.frames[fid].prompts`` are concatenated into
    one flat list.

    The orchestrator's ``pipe`` is set to ``None`` while the samples run and
    is put back afterwards, including when the formatter or the run raises.
    Previously an exception left the pipe disabled for the rest of the session.

    Args:
        arch: Object exposing an awaitable ``run`` and an ``orchestrator``
            with ``pipe`` and ``frames`` attributes.
        data: Iterable of mappings, one per sample.
        ground_truth_formater: Callable turning one sample's fields into the
            input passed to ``arch.run``.

    Returns:
        A ``Dataset`` with a single ``'text'`` column holding the prompts.
    """
    saved_pipe = arch.orchestrator.pipe
    # NOTE(review): presumably disables prompt echoing during bulk generation.
    arch.orchestrator.pipe = None
    texts = []
    try:
        results = await arch.run([ ground_truth_formater(**sample) for sample in data ])
        # Only the frame id is needed; the run output itself is ignored.
        for (_out, fid) in results:
            for prompts in arch.orchestrator.frames[fid].prompts.values():
                texts.extend(prompts)
    finally:
        # Always restore the caller's pipe, even on failure.
        arch.orchestrator.pipe = saved_pipe
    return Dataset.from_dict({ 'text' : texts })

In [6]:
from utility import mmlu_list, mmlu_subset

def mmlu_truth_formater(topic, question, choices, answer, repeat=True, **kwargs):
    """Turn one MMLU sample into the input record for the 'template' program.

    Args:
        topic: Topic of the question.
        question: Question text.
        choices: Sequence of candidate answers.
        answer: Letter of the correct choice, counted from ``'A'``.
        repeat: When true, the record carries the text of the correct choice;
            otherwise it carries the zero-based index of that choice.
        **kwargs: Any other sample fields; accepted and ignored.

    Returns:
        A dict with the keys ``tag`` (always ``'template'``), ``topic``,
        ``question``, ``choices`` and ``answer``.
    """
    index = ord(answer) - ord('A')
    truth = choices[index] if repeat else index
    return dict(
        tag='template',
        topic=topic,
        question=question,
        choices=choices,
        answer=truth,
    )

# Read the MMLU data from disk. The context manager closes the file handle as
# soon as parsing finishes, instead of leaving it open for the kernel's lifetime.
with open('/home/ubuntu/mmlu-data.json') as mmlu_file:
    data = json.load(mmlu_file)

In [7]:
# Topics used to build the training prompts; passed as `topic=` together with
# mode='aux' when the training set is generated.
train_topics = [
    'mc_test',
    'science_elementary',
    'science_middle',
    'arc_easy',
    'arc_hard',
    'obqa',
]
# To inspect the selection: mmlu_list(mmlu_subset(data, mode='aux', topic=train_topics))

In [None]:
train_ds = await gen_prompt_dataset(arch, data=mmlu_subset(data, topic=train_topics, mode='aux'), ground_truth_formater=mmlu_truth_formater)
valid_ds = await gen_prompt_dataset(arch, data=mmlu_subset(data, topic=None, mode='dev'), ground_truth_formater=mmlu_truth_formater)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Reuse the end-of-sequence token as the padding token.
llm.tokenizer.pad_token = llm.tokenizer.eos_token

# Training schedule: evaluate and checkpoint every 500 steps, warm up for
# 500 steps, train for 30 epochs. Outputs go to /home/ubuntu/finetuning.
training_args = TrainingArguments(
    output_dir='/home/ubuntu/finetuning',
    evaluation_strategy='steps',
    warmup_steps=500,
    eval_steps=500,
    save_steps=500,
    num_train_epochs=30,
)

# Supervised fine-tuning on the 'text' column of the generated prompt datasets.
trainer = SFTTrainer(
    llm.model,
    tokenizer=llm.tokenizer,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    dataset_text_field="text",
    max_seq_length=512,
    args=training_args,
)
trainer.train()