In [2]:
from openicl import (DatasetReader, PromptTemplate, 
                     ZeroRetriever, RandomRetriever, BM25Retriever, TopkRetriever,
                     GenInferencer, PPLInferencer)
from openicl.icl_dataset_reader import load_dataset
import pandas as pd
from accelerate import Accelerator
from QPKTabuRetriever import QPKTabuRetriever
import numpy as np
import matplotlib.pyplot as plt

In [3]:
MODELS = ["xlm-roberta-large", "gpt2-large", 'facebook/xglm-1.7B']
TASKS = ['question-answering', 'sentiment-analysis']
DATASET_NAMES = {
    'question-answering':['commonsense_qa','tasksource/bigbench', 'openbookqa'],
    'sentiment-analysis':['gpt3mix/rt20', 'gpt3mix/sst2', "rotten_tomatoes"]
}
RETRIEVERS = ['qkp', 'bm25','zero', 'topk','random']

In [4]:
def cmqa_pre_process(example):
    for i in range(5):
        example[chr(ord('A') + i)] = example['choices']['text'][i]
    return example

def obqa_pre_process(example):
    for i in range(4):
        example[chr(ord('A') + i)] = example['choices']['text'][i]
    example['context'] = "Open book questions"
    return example

def yelp_pre_process(example):
    example['label'] = int(int(example['label'] >= 2))
    return example

def bb_pre_process(example):
    for i in range(3):
        example[chr(ord('A') + i)] = example['multiple_choice_targets'][i]
    example['multiple_choice_scores'] = chr(ord('A') + np.where(np.array(example['multiple_choice_scores']) == 1)[0][0])
    example['context'] = "Disambiguation"
    return example

In [5]:
def select_dataset(name, test_size, train_size):
    if name == 'commonsense_qa':
        dataset = load_dataset(name, split='train')
        dataset = dataset.train_test_split(test_size=test_size, train_size=train_size, shuffle=True)
        dataset = dataset.map(cmqa_pre_process)
        dataset = dataset.rename_column("question_concept","context")
        dataset = dataset.rename_column("answerKey","answer")
        input_cols = ["question", "context", "A", "B", "C", "D", "E"]
        return DatasetReader(dataset=dataset, input_columns=input_cols, output_column="answer")
    elif name == 'openbookqa':
        dataset = load_dataset(name, 'main', split='train')
        dataset = dataset.train_test_split(test_size=test_size, train_size=train_size, shuffle=True)
        dataset = dataset.map(obqa_pre_process)
        dataset = dataset.rename_column("question_stem","question")
        dataset = dataset.rename_column("answerKey","answer")
        input_cols = ["question", "context", "A", "B", "C", "D"]
        return DatasetReader(dataset=dataset, input_columns=input_cols, output_column="answer")
    elif name == 'tasksource/bigbench':
        dataset = load_dataset(name, 'disambiguation_qa', split='train')
        dataset = dataset.train_test_split(test_size=test_size, train_size=train_size, shuffle=True)
        dataset = dataset.map(bb_pre_process)
        dataset = dataset.rename_column("multiple_choice_scores","answer")
        dataset = dataset.rename_column("inputs","question")
        input_cols = ["question", "context", "A", "B", "C"]
        return DatasetReader(dataset=dataset, input_columns=input_cols, output_column="answer")
    elif name == 'gpt3mix/rt20' or name == 'gpt3mix/sst2':
        dataset = load_dataset(name, split='train')
        dataset = dataset.train_test_split(test_size=test_size, train_size=train_size, shuffle=True)
        return DatasetReader(dataset=dataset, input_columns=["text"], output_column="label")
    elif name == "rotten_tomatoes":
        dataset = load_dataset(name, split='test')
        dataset = dataset.train_test_split(test_size=test_size, train_size=train_size, shuffle=True)
        return DatasetReader(dataset=dataset, input_columns=["text"], output_column="label")


In [6]:
TEMPLATES = {
    'commonsense_qa':PromptTemplate(
        {
            'A': "</E>Answer the following question:\n</Q>\nAnswer: </Ans1>",
            'B': "</E>Answer the following question:\n</Q>\nAnswer: </Ans2>",
            'C': "</E>Answer the following question:\n</Q>\nAnswer: </Ans3>",
            'D': "</E>Answer the following question:\n</Q>\nAnswer: </Ans4>",
            'E': "</E>Answer the following question:\n</Q>\nAnswer: </Ans5>",
        },
        {'question':'</Q>', 'A': '</Ans1>', 'B': '</Ans2>', 'C': '</Ans3>', 'D': '</Ans4>', 'E': '</Ans5>'},
        ice_token='</E>' 
    ),
    'openbookqa':PromptTemplate(
        "</E>Multiple choice question:\n</Q>\nChoices:\nA:</Ans1>\nB:</Ans2>\nC:</Ans3>\nD:</Ans4>\n:Answer:</A>",
        {'question':'</Q>', 'A': '</Ans1>', 'B': '</Ans2>', 'C': '</Ans3>', 'D': '</Ans4>', "answer":"</A>"},
        ice_token='</E>' 
    ),
    'tasksource/bigbench':PromptTemplate(
        {
            'A': "</E>Answer the following question:\n</Q>\nAnswer: </Ans1>",
            'B': "</E>Answer the following question:\n</Q>\nAnswer: </Ans2>",
            'C': "</E>Answer the following question:\n</Q>\nAnswer: </Ans3>"
        },
        {'question':'</Q>', 'A': '</Ans1>', 'B': '</Ans2>', 'C': '</Ans3>'},
        ice_token='</E>' 
    ),
    'gpt3mix/rt20':PromptTemplate({
            0: '</E>Positive Movie Review: \"<X>\"', 
            1: '</E>Negative Movie Review: \"<X>\"',
        }, column_token_map={'text' : '<X>'}, 
        ice_token='</E>'
    ),
    'gpt3mix/sst2':PromptTemplate({
            0: '</E>Positive Movie Review: \"<X>\"', 
            1: '</E>Negative Movie Review: \"<X>\"',
        }, column_token_map={'text' : '<X>'}, 
        ice_token='</E>'
    ),
    'rotten_tomatoes':PromptTemplate({
            1: '</E>Positive Movie Review: \"<X>\"', 
            0: '</E>Negative Movie Review: \"<X>\"',
        }, column_token_map={'text' : '<X>'}, 
        ice_token='</E>'
    ),
    
}

In [7]:
def select_retriever(retr_name, data, model, task, ice_num):
    if retr_name == 'zero':
        return ZeroRetriever(data)
    elif retr_name == 'random':
        return RandomRetriever(data, ice_num=ice_num)
    elif retr_name == 'bm25':
        return BM25Retriever(data, ice_num=ice_num)
    elif retr_name == 'qkp':
        return QPKTabuRetriever(data, model=model, task=task, ice_num=ice_num)
    elif retr_name == 'topk':
        return TopkRetriever(data, ice_num=ice_num)
    else:
        raise Exception()

In [8]:
def do_single_run(model, task, dataset_name, retr_name, ice_num, test_size, train_size):
    data = select_dataset(dataset_name, test_size, train_size)
    retriever = select_retriever(retr_name, data, model, task, ice_num)
    if retriever.tokenizer.pad_token is None:
        retriever.tokenizer.pad_token = "[PAD]"
    inferencer = GenInferencer(model_name=model)
    ice_template = TEMPLATES[dataset_name] 
    predictions = inferencer.inference(retriever, ice_template=ice_template, output_json_filename="output_gen.json")
    pred_new = list()
    for pred in predictions:
        try:
            pred_new.append(re.findall('Answer:[A-Z]', pred)[0].split(':')[-1])
        except:
            pred_new.append('')
    inputs = {
            input_col:retriever.test_ds[input_col] for input_col in retriever.dataset_reader.input_columns
        }
    outputs = retriever.test_ds[retriever.dataset_reader.output_column]
    accuracy = np.sum(np.array(retriever.test_ds[retriever.dataset_reader.output_column]) == np.array(predictions))
    return pred_new, inputs, outputs, accuracy


In [9]:
# results = {
#     'model':[],
#     'task':[],
#     'dataset':[],
#     'retriever':[],
#     'accuracy_mean':[],
#     'accuracy_std':[],
#     'predictions':[],
#     'inputs':[],
#     'outputs':[]
# }

# accelerator = Accelerator()
# ice_num = 5
# reps = 2

# for model in MODELS[1:]:
#     for task in TASKS[1:]:
#         for dataset_name in DATASET_NAMES[task][1:]:
#             for retr_name in RETRIEVERS[:2]:
                
#                 print(retr_name)
#                 accuracies = list()
#                 all_predictions = list()
#                 all_inputs = list()
#                 all_outputs = list()

#                 results['model'].append(model)
#                 results['task'].append(task)
#                 results['dataset'].append(dataset_name)
#                 results['retriever'].append(retr_name)

#                 for _ in range(reps):
#                     predictions, inputs, outputs, accuracy = do_single_run(model, task, dataset_name, retr_name, ice_num, accelerator)
#                     all_predictions.append(predictions)
#                     all_inputs.append(inputs)
#                     all_outputs.append(outputs)
#                     accuracies.append(accuracy)
                
#                 results['accuracy_mean'].append(np.mean(accuracies))
#                 results['accuracy_std'].append(np.std(accuracies))
#                 results['predictions'].append(all_predictions)
#                 results['inputs'].append(all_inputs)
#                 results['outputs'].append(all_outputs)

            

In [10]:
predictions, inputs, outputs, accuracy = do_single_run(MODELS[0], TASKS[0], 'openbookqa', 'bm25', 3, 10, 100)
print(predictions)
print(inputs)
print(outputs)
print(accuracy)

Found cached dataset openbookqa (/home/nlonyuk/.cache/huggingface/datasets/openbookqa/main/1.0.1/f338ccacfbc86fb8c2de3aa1c06d2ce686933de3bca284dba97d32592c52b33f)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

If you want to use `XLMRobertaLMHeadModel` as a standalone, add `is_decoder=True.`
[2023-06-15 17:54:55,795] [openicl.icl_retriever.icl_bm25_retriever] [INFO] Retrieving data for test set...
100%|██████████| 10/10 [00:00<00:00, 1432.14it/s]
[2023-06-15 17:54:55,817] [openicl.icl_inferencer.icl_gen_inferencer] [INFO] Starting inference process...
  0%|          | 0/10 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 10%|█         | 1/10 [00:01<00:13,  1.45s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 20%|██        | 2/10 [00:03<00:12,  1.59s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 30

In [None]:
import re
for i in range(len(inputs['question'])):
    print(inputs['question'][i])
    print(inputs['A'][i])
    print(inputs['B'][i])
    print(inputs['C'][i])
    print(inputs['D'][i])

    print(outputs[i])
    print(re.findall('Answer:[A-Z]', predictions[i])[0].split(':')[-1])



usually plants die or become dormant after the
lowest solstice
naptime
a good book
lunch
A


IndexError: list index out of range

In [None]:
prompt_template = PromptTemplate(
        "Answer the following question: </Q>\nChoose the correct answer:{A: </Ans1>,B: </Ans2>,C: </Ans3>,D: </Ans4>}</A>",
        {'question':'</Q>', 'A': '</Ans1>', 'B': '</Ans2>', 'C': '</Ans3>', 'D': '</Ans4>', 'answer': '</A>'},
)

inf = GenInferencer(model_name="gpt2-large")
pred = inf.inference(BM25Retriever(select_dataset('openbookqa', 2, 20)), ice_template=TEMPLATES['openbookqa'])
pred

In [None]:
pred[0]

In [None]:
def plot(x, ys, title, labels, savefile):
    configs = ['g*-', 'bo-', 'r+-']
    fig = plt.figure()
    ax = fig.gca()
    # ax.set_xscale('log')
    ax.set_xticks(x)
    ax.set_xticklabels(x)
    ax.set_title(title)
    plt.grid()
    for idx, y in enumerate(ys):
        plt.plot(x, y, configs[idx], label=labels[idx])
        plt.legend()
    plt.savefig(f'{savefile}.png')

In [None]:
for task in TASKS:
    df_instance = df_results[df_results['task'] == task]
    evals = [df_instance[df_instance['retriever'] == retr_name]['accuracy_mean'] for retr_name in RETRIEVERS]

    plot(DATASET_NAMES[task], evals, f'Mean acccuracy for task: {task}', RETRIEVERS, f'{task}_evals')
