In [7]:
import json 
import pandas as pd

def load_jsonl(file_path):
    """Read a JSON Lines file into a DataFrame.

    Args:
        file_path: Location of the file, passed straight to ``pd.read_json``.

    Returns:
        The DataFrame produced by ``pd.read_json`` with ``lines=True``, i.e.
        one row per JSON object line.
    """
    frame = pd.read_json(file_path, lines=True)
    return frame

def answers_to_string(answers) -> str:
    """Render answer options as one bulleted, lettered line per option.

    Each option becomes ``・<text> (<letter>)`` followed by a newline, with
    letters handed out in order starting from ``A``.

    Args:
        answers: Iterable of option values. An option wrapped in (possibly
            nested) tuples is unwrapped to its first element before rendering.

    Returns:
        The concatenated option lines; an empty string when ``answers`` is
        empty.

    Raises:
        IndexError: If there are more than 26 options (no letter left), or if
            an option is an empty tuple.
    """
    # Full alphabet instead of the former A-J table, so an 11th option no
    # longer fails; indexing past 'Z' still raises IndexError as before.
    ordering = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    lines = []
    for index, answer in enumerate(answers):
        # Peel off tuple wrappers until the bare option value is reached.
        while isinstance(answer, tuple):
            answer = answer[0]
        lines.append(f"・{answer} ({ordering[index]})\n")
    return "".join(lines)

# Prompt for multiple-choice questions: passage, question, the rendered option
# list, then a request for the correct answer.
MCQA_question_template = (
    '<Passage>: {passage}\n'
    '<Question>: {question}\n'
    '{option_string}\n'
    '\n'
    'What is the correct answer?'
)

# Prompt for single-option questions: same header, but asks whether one
# specific option ({target_option}) is the correct answer.
option_question_template = (
    '<Passage>: {passage}\n'
    '<Question>: {question}\n'
    '{option_string}\n'
    '\n'
    'Is option "・{target_option}" the correct answer?'
)


def build_ecfinetuning_dataset_EM(dset):
    """Turn JSONL question files into prompt/answer records grouped by split.

    Every listed file is loaded with ``load_jsonl`` and each of its rows yields
    one record. Files whose name contains ``'option'`` produce yes/no questions
    about a single target option; all other files produce multiple-choice
    questions answered with the row's label.

    Args:
        dset: Mapping of split name to a list of JSONL file paths. Rows are
            read through the columns ``id_string``, ``context``, ``question``
            and ``answers``. Option files additionally need
            ``target_option_string`` and ``target_option_correctness``; other
            files need ``label``, which is used as a letter index into
            ``answers`` (``'A'`` -> first answer).

    Returns:
        A dict that always has the keys ``"train"``, ``"valid"`` and ``"test"``
        (plus any other split name from ``dset`` that lists files). Each value
        is a list of dicts with the keys ``qid``, ``question``, ``answer``,
        ``answer_sentence`` and ``explanation`` (always ``None``).
    """
    result_dict = {
        "train": [],
        "valid": [],
        "test": []
    }

    for dtype, file_names in dset.items():
        for file_name in file_names:
            print(file_name)
            data = load_jsonl(file_name)
            # setdefault lets splits beyond train/valid/test be collected
            # instead of failing with KeyError on the first append.
            records = result_dict.setdefault(dtype, [])
            # The file name, not the row content, selects the record flavour.
            is_option_file = 'option' in file_name

            for _, row in data.iterrows():
                id_string = row['id_string']
                passage = row['context']
                question = row['question']
                option_string = answers_to_string(row['answers'])

                # The two module-level templates are used for every file. They
                # were previously bound only when the file name contained
                # "RULE", "ReClor" or "logiqa", so any other file name hit an
                # unbound local variable.
                if is_option_file:
                    target_option = row['target_option_string']
                    input_string = option_question_template.format(
                        passage=passage,
                        question=question,
                        option_string=option_string,
                        target_option=target_option,
                    )
                    answer = 'yes.' if row['target_option_correctness'] else 'no.'
                    answer_sentence = target_option
                else:
                    input_string = MCQA_question_template.format(
                        passage=passage,
                        question=question,
                        option_string=option_string,
                    )
                    answer = row['label']
                    # Map the label letter to its position in the answer list.
                    answer_sentence = row['answers'][ord(answer) - ord('A')]

                records.append({
                    "qid": id_string,
                    "question": input_string,
                    "answer": answer,
                    "answer_sentence": answer_sentence,
                    "explanation": None
                })

    return result_dict

RULE_mainq_AMR-LDA.jsonl
RULE_mainq_AMR-LDA_options.jsonl


In [None]:
# Input files per split; only the test split is populated for this setting.
setting_datas = {
    'train': [],
    'valid': [],
    'test': ['RULE_mainq_AMR-LDA.jsonl', 'RULE_mainq_AMR-LDA_options.jsonl'],
}

setting_result = build_ecfinetuning_dataset_EM(setting_datas)

name = 'setting_test_reclor_AMR-LDA'
# Save all splits into one JSON file. A single write is enough: the former
# per-split loop rewrote this same file with the same content on every pass.
with open(f'{name}.json', 'w') as f:
    json.dump(setting_result, f, indent=4)