In [1]:
#!pip install datasets trl einops tokenizers sentencepiece peft
#!pip uninstall -y transformers
#!pip install git+https://github.com/huggingface/transformers
#!pip install --upgrade protobuf
#!pip install git+https://github.com/LLNL/AutoCog@v0.2

import os, sys, json, copy
# Ask protobuf for its pure-Python implementation. Presumably this has to be set
# before any protobuf-dependent package is imported in later cells — TODO confirm.
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION']='python'

In [2]:
# TODO move to CogArch once fleshed out

import itertools
import copy
from datasets import Dataset

async def gen_prompt_dataset(arch, tags, data, formater_input, formater_output):
    """Run every (tag, sample) pair through its training program and collect the prompts.

    Args:
        arch: object exposing an awaitable ``run(commands)`` that yields
            ``(out, fid)`` pairs, and ``orchestrator.frames[fid].prompts`` (a mapping
            whose values are iterables of prompt texts).
        tags: program tags; each job is submitted under ``f"{tag}-train"``.
        data: iterable of sample dicts, expanded as keyword arguments of the formatters.
        formater_input: ``(tag=..., **sample) -> dict`` building the input fields.
        formater_output: ``(tag=..., **sample) -> dict`` building the expected outputs.

    Returns:
        ``Dataset.from_dict({'text': [...]})`` holding every collected prompt.
    """
    commands = []
    for (tag, sample) in itertools.product(tags, data):
        # Inputs first, then expected outputs merged on top, then the program tag.
        job = formater_input(tag=tag, **sample)
        job.update(formater_output(tag=tag, **sample))
        job['tag'] = f"{tag}-train"
        commands.append(job)

    outcomes = await arch.run(commands)

    texts = []
    for (_, fid) in outcomes:
        # TODO scan frames recursively
        frame = arch.orchestrator.frames[fid]
        for prompts in frame.prompts.values():
            texts.extend(prompts)
    return Dataset.from_dict({ 'text' : texts })

async def run_test(arch, tags, data, formater, checker):
    """Evaluate every (tag, sample) pair and aggregate pass rates per topic and tag.

    Jobs are submitted under the ``f"{tag}-test"`` program, i.e. the variant that
    ``mmlu_load_program_for_finetuning`` loads with blanked output channels, so the
    answer is not fed in from the inputs.

    Args:
        arch: object exposing an awaitable ``run(commands)`` that returns one
            ``(out, fid)`` pair per command, in command order.
        tags: program tags to evaluate (any iterable; it is materialised once).
        data: sequence of sample dicts; each must carry a ``'topic'`` key and is
            expanded as keyword arguments of ``formater``.
        formater: ``(tag=..., **sample) -> dict`` building the job inputs.
        checker: ``(tag, sample, out) -> truthy/falsy`` verdict for one result.

    Returns:
        ``(results, scores)`` where ``results`` is a list of sample copies extended
        with ``'tag'``, ``'out'``, ``'fid'`` and ``'res'``, and ``scores`` maps each
        topic (plus ``'TOTAL'``) to ``{tag: pass_rate}``. A tag with no evaluated
        sample gets ``None`` in ``'TOTAL'`` instead of raising ``ZeroDivisionError``.
    """
    tags = list(tags)  # iterated three times below; a one-shot iterable would be exhausted
    workload = list(itertools.product(tags, data))
    commands = []
    for (tag, sample) in workload:
        job = formater(tag=tag, **sample)
        job.update({ 'tag' : f"{tag}-test" })
        commands.append(job)

    results = []
    stats = {}
    for ((tag, sample), (out, fid)) in zip(workload, await arch.run(commands)):
        res = copy.deepcopy(sample)
        res.update({ 'tag' : tag, 'out' : out, 'fid' : fid, 'res' : checker(tag, sample, out) })
        results.append(res)
        counts = stats.setdefault((tag, sample['topic']), { 'pass' : 0, 'fail' : 0 })
        counts['pass' if res['res'] else 'fail'] += 1

    scores = {}
    passfail = { tag : [0, 0] for tag in tags }
    for ((tag, topic), counts) in stats.items():
        passfail[tag][0] += counts['pass']
        passfail[tag][1] += counts['fail']
        # Every stats entry has at least one sample, so this division is safe.
        scores.setdefault(topic, {})[tag] = counts['pass'] / (counts['pass'] + counts['fail'])

    total = {}
    for tag in tags:
        (passed, failed) = passfail[tag]
        total[tag] = passed / (passed + failed) if (passed + failed) else None
    scores.update({ 'TOTAL' : total })
    return (results, scores)

# TODO move to autocog/utility/pynb.py

import tabulate

def scores_to_table(tags, scores):
    """Format a ``{topic: {tag: score}}`` mapping as a table via ``tabulate`` (``tablefmt='html'``).

    The first row is ``'TOPIC'`` followed by the tags; each following row is a topic
    and its score for every tag, in the order given by ``tags``.
    """
    header = ['TOPIC', *tags]
    rows = [
        [topic, *(per_tag[tag] for tag in tags)]
        for (topic, per_tag) in scores.items()
    ]
    return tabulate.tabulate([header, *rows], tablefmt='html')

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
# TODO move to ./utility.py once fleshed out

from utility import mmlu_list, mmlu_subset, mcq_checkers, mmlu_exec

def mmlu_formater_input(tag, topic, question, choices, **kwargs):
    """Build the input fields of one MMLU job.

    For tags containing ``'seqannot'`` every choice is wrapped in a
    ``{'candidate': choice}`` record; otherwise ``choices`` is passed through as is.
    Extra sample fields arriving in ``kwargs`` are ignored.
    """
    if 'seqannot' in tag:
        choices = [ { 'candidate' : text } for text in choices ]
    return dict(topic=topic, question=question, choices=choices)

def mmlu_formater_output(tag, choices, answer, **kwargs):
    """Build the expected-output fields of one MMLU job.

    ``answer`` is a single letter; its offset from ``'A'`` indexes ``choices``.
    Tags ending in ``'repeat'`` get the text of that choice as ``'answer'``; every
    other tag keeps the letter unchanged. For tags containing ``'seqannot'`` each
    choice becomes a ``{'candidate', 'correct'}`` record flagging the right one.
    """
    correct_idx = ord(answer) - ord('A')
    expected = choices[correct_idx] if tag.endswith('repeat') else answer
    if 'seqannot' in tag:
        choices = [
            { 'candidate' : text, 'correct' : idx == correct_idx }
            for (idx, text) in enumerate(choices)
        ]
    return { 'answer' : expected, 'choices' : choices }

def mmlu_formater_inout(tag, **kwargs):
    """Return the input fields of a sample overlaid with its expected-output fields.

    Keys produced by ``mmlu_formater_output`` win over those produced by
    ``mmlu_formater_input`` when both define the same key.
    """
    merged = mmlu_formater_input(tag, **kwargs)
    for (key, value) in mmlu_formater_output(tag, **kwargs).items():
        merged[key] = value
    return merged

def mmlu_checker(tag, sample, result):
    """Tell whether ``result['answer']`` matches the expected answer of ``sample``.

    The expected value is the text of the correct choice for tags ending in
    ``'repeat'``, and otherwise the zero-based index of the correct choice as a string.
    """
    # NOTE(review): the select prompt describes the index as "1, 2, 3, or 4" while
    # this compares against a zero-based index — confirm what the select format returns.
    correct_idx = ord(sample['answer']) - ord('A')
    expected = sample['choices'][correct_idx] if tag.endswith('repeat') else str(correct_idx)
    return result['answer'] == expected

def mmlu_load_program_for_finetuning(arch, tag, program, output_channels, language='sta', **kwargs):
    """Load ``program`` twice into ``arch``: as ``{tag}-train`` and as ``{tag}-test``.

    The train variant receives ``output_channels`` unchanged; the test variant
    receives a list of empty strings of the same length. All other keyword
    arguments are forwarded to ``arch.load`` for both variants.
    """
    for suffix in ('train', 'test'):
        if suffix == 'train':
            channels = output_channels
        else:
            channels = [''] * len(output_channels)
        arch.load(tag=f'{tag}-{suffix}', language=language, program=program, **kwargs, output_channels=channels)



In [4]:
from autocog import CogArch
from autocog.architecture.utility import PromptTee

# Two prompt-program templates keyed by short name:
#   'inout'    - the prompt lists topic, question and choices, then asks for the answer;
#   'seqannot' - same, but each choice is a record with a `correct(bool)` field
#                before the final answer.
# The `{choice_mode}`, `{choice_desc}` and `{output_channels[0]}` placeholders are
# presumably substituted from the keyword arguments given when the program is loaded
# — TODO confirm against `CogArch.load`.
programs = { 'inout' : """\
preamble: You are taking the MMLU Examination, a multiple choice questionnaire on a broad variety of subjects. For each question, you must pick the correct answer from four choices.
basics: The following questionnaire present one question from the exam. As an expert in the domain, you will select the correct answer.

entry(mmlu_choice):

formats:
- choice({choice_mode}=.choices): {choice_desc}

prompt(mmlu_choice):
- target(topic) source(?topic)
- target(question) source(?question)
- target(choices) source(?choices){output_channels[0]}
> topic: the topic of the question
> question: the question from the MMLU on topic specified above
> choices[4]: List of four choices, a single one of them answer the question correctly
> answer(choice): Which choice is correct? {choice_desc}
__exit(answer):
""",
            'seqannot' : """\
preamble: You are taking the MMLU Examination, a multiple choice questionnaire on a broad variety of subjects. For each question, you must pick the correct answer from four choices.
basics: The following questionnaire present one question from the exam. As an expert in the domain, you will select the correct answer.

entry(mmlu_choice): Your method is to first evaluate each choice one by one as they get presented to you, then decide on the correct solution.

formats:
- choice({choice_mode}=.choices.candidate): {choice_desc}

prompt(mmlu_choice):
- target(topic) source(?topic)
- target(question) source(?question)
- target(choices) source(?choices){output_channels[0]}
> topic: the topic of the question
> question: the question from the MMLU on topic specified above
> choices[4](record): For each choice, you judge whether it is the correct choice or not
> > candidate: one of the possible answer for this question
> > correct(bool): do you think that this is the correct answer?
> answer(choice): Which choice is correct? {choice_desc}
__exit(answer):
"""}

# Per-program channel lines handed to the loader as `output_channels`.
output_channels = {
    'inout'    : ["\n- target(answer) source(?answer)"],
    'seqannot' : ["\n- target(answer) source(?answer)"],
}

# Answer formats: (choice_mode, description shown in the prompt).
choice_modes = (
    ('repeat', "Repeat the correct answer verbatim"),
    ('select', "Index of the correct answer (1, 2, 3, or 4)"),
)

arch = CogArch() # pipe=PromptTee(prefix='mmlu', tee=sys.stdout)
tags = []
for (tag,program) in programs.items():
    for (choice_mode, choice_desc) in choice_modes:
        full_tag = f'{tag}-{choice_mode}'
        tags.append(full_tag)
        # Each load gets its own copy of the channel list, as each original call had its own literal.
        mmlu_load_program_for_finetuning(
            arch,
            tag=full_tag,
            program=program,
            choice_mode=choice_mode,
            choice_desc=choice_desc,
            output_channels=list(output_channels[tag]),
        )

In [5]:
# data = mmlu_data()
# Read the MMLU dump inside a context manager so the file handle is closed as soon
# as parsing finishes (the previous `json.load(open(...))` left it open).
with open('/home/ubuntu/mmlu-data.json') as mmlu_file:
    data = json.load(mmlu_file)

# Three subsets selected by `mmlu_subset`'s `mode` argument: 'aux' restricted to the
# listed topics for training, 'dev' for validation, 'test' for testing
# (`topic=None` presumably means "all topics" — TODO confirm in utility.py).
train_subset = mmlu_subset(data, topic=['mc_test','science_elementary','science_middle','arc_easy','arc_hard','obqa'], mode='aux')
valid_subset = mmlu_subset(data, topic=None, mode='dev')
test_subset  = mmlu_subset(data, topic=None, mode='test')

# mmlu_list(mmlu_subset(data, mode='aux', topic=train_topic))

In [6]:
# arch.cogs.keys()

# pipe = PromptTee(prefix='mmlu', tee=sys.stdout)
# arch.orchestrator.pipe = PromptTee(prefix='mmlu', tee=sys.stdout)
# tag = 'seqannot-repeat'
# sample = data[0]
# inputs = mmlu_formater_inout(tag, **sample)
# print(f"inputs={inputs}")
# await arch(f"{tag}-train", **inputs)
# arch.orchestrator.pipe = None

# arch.orchestrator.pipe = None
# for tag in tags:
#     for sample in data:
#         try:
#             inputs = mmlu_formater_input(tag, **sample)
#             inputs.update(mmlu_formater_output(tag, **sample))
#             inputs.update({ 'tag' : f"{tag}-train" })
#             await arch.run([inputs], progress=False)
#         except:
#             print(f"tag={tag}")
#             arch.orchestrator.pipe = pipe
#             inputs = mmlu_formater_input(tag, **sample)
#             inputs.update(mmlu_formater_output(tag, **sample))
#             inputs.update({ 'tag' : f"{tag}-train" })
#             await arch.run([inputs])
#             arch.orchestrator.pipe = None
#             break

In [7]:
from autocog.lm import TfLM
# `TfLM.create` returns the keyword arguments used to construct the wrapper below;
# here it is asked for 'gpt2-medium' on the 'cuda' device.
kwargs = TfLM.create(model_path='gpt2-medium', device='cuda')
# `completion_kwargs` presumably reaches the generation call, limiting each
# completion to 20 new tokens — TODO confirm in autocog.lm.
llm = TfLM(**kwargs, completion_kwargs={ 'max_new_tokens' : 20 })
# Register the model under the 'text' key of the orchestrator's language models.
arch.orchestrator.LMs.update({ 'text' : llm })

# Reuse the end-of-sequence token as padding token (presumably because the GPT-2
# tokenizer defines none and the trainer pads batches — verify).
llm.tokenizer.pad_token = llm.tokenizer.eos_token

In [8]:
train_ds = await gen_prompt_dataset(arch, tags=tags, data=train_subset, formater_input=mmlu_formater_input, formater_output=mmlu_formater_output)
valid_ds = await gen_prompt_dataset(arch, tags=tags, data=valid_subset, formater_input=mmlu_formater_input, formater_output=mmlu_formater_output)

100%|██████████| 44044/44044 [00:58<00:00, 756.56it/s] 
100%|██████████| 1140/1140 [00:01<00:00, 760.01it/s]


In [None]:
import datetime
from IPython.display import display

from trl import SFTTrainer
from transformers import TrainingArguments

datadir    = '/home/ubuntu/finetuning-results'
experiment = 'mmlu-gpt2-medium'

# scores = [ (await run_test(arch, tags=tags, data=valid_subset, formater=mmlu_formater_input, checker=mmlu_checker))[1] ]
# display(scores_to_table(tags, scores[-1]))

n_steps = 200
timestamp  = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
trainer = SFTTrainer(
    llm.model,
    tokenizer=llm.tokenizer,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    dataset_text_field="text",
    max_seq_length=512,
    args=TrainingArguments(
        output_dir=f'{datadir}/{experiment}/{timestamp}',
        evaluation_strategy='steps',
        warmup_steps=5*n_steps,
        eval_steps=n_steps,
        logging_steps=n_steps,
        save_strategy='no',
        save_steps=n_steps,
        num_train_epochs=10,
    )
)

trainer.train()
trainer.save_model(f'{datadir}/{experiment}/{timestamp}/backup')

scores.append( (await run_test(arch, tags=tags, data=valid_subset, formater=mmlu_formater_input, checker=mmlu_checker))[1] )
display(scores_to_table(tags, scores[-1]))

Map:   0%|          | 0/44044 [00:00<?, ? examples/s]

Map:   0%|          | 0/1140 [00:00<?, ? examples/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
200,1.3988,0.640763
400,0.4539,0.638973
600,0.4428,0.64616
800,0.4178,0.658638
1000,0.4105,0.665569
1200,0.3919,0.674477
1400,0.3549,0.685426
1600,0.3393,0.693093
1800,0.3195,0.691979
2000,0.3121,0.706165
