In [2]:

# "question": "Is capturing giant squid in natural habitat impossible with no gear?",
# "answer": "yes",
# "explanation": "Giant squids live between 1,000 and 3,800 feet in the ocean. With a dry dive suit, a scuba tank, gloves, and so on, divers can reach depths of around 1000 feet. Without scuba gear people can safely dive a maximum of 60 feet without feeling the negative risks associated with diving beyond the limit."

# Passage, Question, Options를 모두 포함해서 question을 구성

import json 
import pandas as pd

def load_jsonl(file_path):
    return pd.read_json(file_path, lines=True)


In [3]:
# setting1_datas = {
#     'train': ['Trainable_ReClor.jsonl'],
#     'valid': ['ReClor-val.jsonl', 'ReClor-val_options.jsonl', 'ReClor-val_shuffled.jsonl'],
#     'test': ['RULE_mainq.jsonl', 'RULE_mainq_options.jsonl', 'RULE_mainq_shuffled.jsonl', 'RULE_subq_all.jsonl'],
# }


def answers_to_string(answers) -> str:
    answer_str = ""
    ordering = ['・', '・', '・', '・', '・', 'F', 'G', 'H', 'I', 'J']
    for a, answer in enumerate(answers):
        while type(answer) == tuple:
            answer = answer[0]
        answer_str += f"{ordering[a]} {answer}\n"
    return answer_str

MCQA_question_template = '<Passage>: {passage}\n<Question>: {question}\nChoose the correct answer from the following options.\n{option_string}\n\nWhat is the correct answer?'
option_question_template = '<Passage>: {passage}\n{question}\n{option_string}\n\n<Question>: Is the option "{target_option}" is a correct answer?'
MCQA_no_passage_template = '<Question>: {question}\nChoose the correct answer from the following options.\n{option_string}\n\nWhat is the correct answer?'
option_no_passage_template = '<Question>: {question}\n{option_string}\n\n<Question>: Is the option "{target_option}" is correct?'

def build_ecfinetuning_dataset_EM(dset):
    result_dict = {
        "train": [],
        "valid": [],
        "test": []
    }

    for dtype in dset.keys():
        for file in dset[dtype]:
            data = load_jsonl(file)
            for row in data.iterrows():
                row = row[1]
                id_string = row['id_string']
                passage = row['context']
                question = row['question']
                option_string = answers_to_string(row['answers'])
                
                if "RULE" in file or "ReClor" in file or "logiqa" in file:
                    option_template = option_question_template
                    mcqa_template = MCQA_question_template
                else: 
                    option_template = option_no_passage_template
                    mcqa_template = MCQA_no_passage_template
                
                if 'option' in file:
                    target_option = f"・ {row['target_option_string']}"
                    label = 'yes.' if row['target_option_correctness'] else 'no.'
                    
                    input_string = option_template.format(passage=passage, question=question, option_string=option_string, target_option=target_option)
                    result_dict[dtype].append({
                        "qid": id_string,
                        "question": input_string,
                        "answer": label,
                        "explanation": None
                    })
                else:
                    input_string = mcqa_template.format(passage=passage, question=question, option_string=option_string)
                    label = row['answers'][ord(row['label'])-ord('A')]
                    
                    result_dict[dtype].append({
                        "qid": id_string,
                        "question": input_string,
                        "answer": label,
                        "explanation": None
                    })

    return result_dict


setting_datas = {
    'train': ['Trainable_ReClor.jsonl', 'Trainable_ReClor_options.jsonl']
    ,'valid': ['ReClor-val.jsonl', 'ReClor-val_options.jsonl']
    ,'test': ['ReClor-val.jsonl'],
}

# setting_datas_EM = {
#     'train': ['Trainable_ReClor.jsonl', 'Trainable_ReClor_options.jsonl']
#     ,'valid': ['ReClor-val.jsonl', 'ReClor-val_options.jsonl']
#     ,'test': ['RULE_mainq.jsonl', 'RULE_mainq_options.jsonl', 'RULE_mainq_shuffled.jsonl', 
#              'RULE_subq_all.jsonl',
#              'logiqa-val.jsonl', 'logiqa-val_options.jsonl', 'logiqa-val_shuffled.jsonl',
#              'MMLU-val.jsonl', 'MMLU-val_options.jsonl', 'MMLU-val_shuffled.jsonl',
#     ]
# }

setting_result = build_ecfinetuning_dataset_EM(setting_datas)

setting_result['train'] = setting_result['train'][:7391] # ReClor Train의 개수는 3695, # ReClor Train Option 개수는 14780
setting_result['valid'] = setting_result['valid'][:1000]

name = 'setting_EM_3_No_bias_in_val'
# 저장
for dtype in setting_result.keys():
    with open(f'../{name}.json', 'w') as f:
        json.dump(setting_result, f, indent=4)

In [11]:

name = 'q_and_o_onebyone'
setting_datas = {
    'train': ['Trainable_ReClor.jsonl', 'Trainable_ReClor_options.jsonl'],
    'valid': ['ReClor-val.jsonl', 'ReClor-val_options.jsonl'],
    'test': ['ReClor-val.jsonl', 'ReClor-val_options.jsonl', 'ReClor-val_shuffled.jsonl'],
}

dset = setting_datas

result_dict = {
    "train": [],
    "valid": [],
    "test": []
}

for dtype in dset.keys():
    for file in dset[dtype]:
        data = load_jsonl(file)
        for row in data.iterrows():
            row = row[1]
            id_string = row['id_string']
            passage = row['context']
            question = row['question']
            option_string = answers_to_string(row['answers'])
            
            if "RULE" in file or "ReClor" in file or "logiqa" in file:
                option_template = option_question_template
                mcqa_template = MCQA_question_template
            else: 
                option_template = option_no_passage_template
                mcqa_template = MCQA_no_passage_template
            
            if 'option' in file:
                target_option = f"{row['target_option_alphabet']}: {row['target_option_string']}"
                label = 'yes' if row['target_option_correctness'] else 'no'
                
                input_string = option_template.format(passage=passage, question=question, option_string=option_string, target_option=target_option)
                result_dict[dtype].append({
                    "qid": id_string,
                    "question": input_string,
                    "answer": label,
                    "explanation": None
                })
            else:
                input_string = mcqa_template.format(passage=passage, question=question, option_string=option_string)
                label = row['label']
                
                result_dict[dtype].append({
                    "qid": id_string,
                    "question": input_string,
                    "answer": label,
                    "explanation": None
                })


In [58]:
result_dict['train'] = result_dict['train'][:7391]
# ReClor Train의 개수는 3695
# ReClor Train Option 개수는 14780

# 그러므로 ReClor Train Option을 1/4만 남기고 뒤는 지운다

7391

In [62]:

name = 'q_and_o_onebyone'
setting_datas = {
    'train': ['Trainable_ReClor.jsonl', 'Trainable_ReClor_options.jsonl'],
    'valid': ['ReClor-val.jsonl', 'ReClor-val_options.jsonl'],
    'test': ['ReClor-val.jsonl', 'ReClor-val_options.jsonl', 'ReClor-val_shuffled.jsonl'],
}

setting_result = build_ecfinetuning_dataset(setting_datas)

setting_result['train'] = setting_result['train'][:7391] # ReClor Train의 개수는 3695, # ReClor Train Option 개수는 14780

# 저장
for dtype in setting_result.keys():
    with open(f'../{name}.json', 'w') as f:
        json.dump(setting_result, f, indent=4)

In [4]:
setting1_datas = {
    'train': ['Trainable_ReClor.jsonl'],
    'valid': ['ReClor-val.jsonl'],
    'test': ['ReClor-val.jsonl', 'ReClor-val_options.jsonl', 'ReClor-val_shuffled.jsonl'],
}

setting1_result = build_ecfinetuning_dataset(setting1_datas)

# 저장
for dtype in setting1_result.keys():
    with open(f'../setting1_.json', 'w') as f:
        json.dump(setting1_result, f, indent=4)

In [5]:
setting2_datas = {
    'train': ['Trainable_ReClor.jsonl', 'Trainable_ReClor_options.jsonl'],
    'valid': ['ReClor-val.jsonl', 'ReClor-val_options.jsonl'],
    'test': ['ReClor-val.jsonl', 'ReClor-val_options.jsonl', 'ReClor-val_shuffled.jsonl'],
}

setting2_result = build_ecfinetuning_dataset(setting2_datas)

# 저장
for dtype in setting2_result.keys():
    with open(f'../setting2_.json', 'w') as f:
        json.dump(setting2_result, f, indent=4)

In [8]:
settings_test = {
    'train': ['Trainable_ReClor.jsonl'],
    'valid': ['ReClor-val.jsonl'],
    'test': ['RULE_mainq.jsonl', 'RULE_mainq_options.jsonl', 'RULE_mainq_shuffled.jsonl', 
             'RULE_subq_all.jsonl',
             'logiqa-val.jsonl', 'logiqa-val_options.jsonl', 'logiqa-val_shuffled.jsonl',
             'MMLU-val.jsonl', 'MMLU-val_options.jsonl', 'MMLU-val_shuffled.jsonl',
    ]
}

settings_test_result = build_ecfinetuning_dataset(settings_test)

# 저장
for dtype in settings_test_result.keys():
    with open(f'../setting_test.json', 'w') as f:
        json.dump(settings_test_result, f, indent=4)

In [4]:
setting3_datas = {
    'train': ['Trainable_ReClor_options.jsonl'],
    'valid': ['ReClor-val_options.jsonl'],
    'test': ['ReClor-val.jsonl', 'ReClor-val_options.jsonl', 'ReClor-val_shuffled.jsonl'],
}

setting3_datas = build_ecfinetuning_dataset(setting3_datas)

# 저장
for dtype in setting3_datas.keys():
    with open(f'../setting3.json', 'w') as f:
        json.dump(setting3_datas, f, indent=4)

In [74]:
import json 
import pandas as pd

setting2_EM_path = '../label_train/setting2_EM.json'

datas = json.load(open(setting2_EM_path, 'r'))

datas_train = datas['train']    

datas_train_option = datas_train[3695:]

datas_train_df =  pd.DataFrame(datas_train[:3695])
datas_train_option_df = pd.DataFrame(datas_train_option)

datas_train_option_balanced = []

from tqdm import tqdm

total_qids = datas_train_df.qid

# qid_to_mainqid
def qid_to_mainqid(qid):
    # qid에서 '_option'을 찾고, 그 이후를 모두 지운다
    return qid[:qid.find('_option')]

for qid in tqdm(total_qids):
    # datas_train_option_df 중에서 datas_train_option_df['qid']에 qid가 포함되는 것들만 검색
    target_option_question_rows = datas_train_option_df[datas_train_option_df.apply(lambda x: qid == qid_to_mainqid(x['qid']), axis=1)]
    yes_row = target_option_question_rows[target_option_question_rows['answer'] == 'yes.']
    no_rows = target_option_question_rows[target_option_question_rows['answer'] == 'no.']
    # no_rows를 섞고 그 중에 하나를 고름
    no_row = no_rows.sample(1)
    # to_dict할 때 index가 같이 나오지 않도록 해야함
    yes_row = yes_row.reset_index(drop=True)
    no_row = no_row.reset_index(drop=True)
    # 그래도 0인덱스가 나오는데 이거 지우면서 넣기
    
    yes_row_dict = yes_row.to_dict(orient='records')[0]
    no_row_dict = no_row.to_dict(orient='records')[0]
    datas_train_option_balanced.append(yes_row_dict)
    datas_train_option_balanced.append(no_row_dict)
    
# datas_train_option_balanced를 jsonl로 저장
output_path = '../label_train/setting2_EM_balanced_option.jsonl'
with open(output_path, 'w') as f:
    for row in datas_train_option_balanced:
        f.write(json.dumps(row, ensure_ascii=False) + '\n')
datas_train_option_balanced_df = pd.DataFrame(datas_train_option_balanced)



100%|██████████| 3695/3695 [06:24<00:00,  9.62it/s]


In [82]:
# datas_train_option_balanced를 setting1_EM 뒤에 넣기
setting1_EM_path = '../label_train/setting1_EM.json'
datas = json.load(open(setting1_EM_path, 'r'))
len(datas['train'])

datas['train'] += datas_train_option_balanced
datas['valid'] += datas_valid_option_balanced

In [101]:
# datas를 '../label_train/setting1_EM_balanced_tmp.json'로 저장
output_path = '../label_train/setting1_EM_balanced_tmp.json'
with open(output_path, 'w') as f:
    json.dump(datas, f, indent=4)

In [108]:
import json 
import pandas as pd

setting2_EM_path = '../label_train/setting2_EM.json'

datas = json.load(open(setting2_EM_path, 'r'))

datas_valid = datas['valid']    

datas_valid_option = datas_valid[500:]

datas_valid_df =  pd.DataFrame(datas_valid[:500])
datas_valid_option_df = pd.DataFrame(datas_valid_option)

datas_valid_option_balanced = []

from tqdm import tqdm

total_qids = datas_valid_df.qid

# qid_to_mainqid
def qid_to_mainqid(qid):
    # qid에서 '_option'을 찾고, 그 이후를 모두 지운다
    return qid[:qid.find('_option')]

for qid in tqdm(total_qids):
    # datas_valid_option_df 중에서 datas_valid_option_df['qid']에 qid가 포함되는 것들만 검색
    target_option_question_rows = datas_valid_option_df[datas_valid_option_df.apply(lambda x: qid == qid_to_mainqid(x['qid']), axis=1)]
    yes_row = target_option_question_rows[target_option_question_rows['answer'] == 'yes.']
    no_rows = target_option_question_rows[target_option_question_rows['answer'] == 'no.']
    # no_rows를 섞고 그 중에 하나를 고름
    no_row = no_rows.sample(1)
    # to_dict할 때 index가 같이 나오지 않도록 해야함
    yes_row = yes_row.reset_index(drop=True)
    no_row = no_row.reset_index(drop=True)
    # 그래도 0인덱스가 나오는데 이거 지우면서 넣기
    
    yes_row_dict = yes_row.to_dict(orient='records')[0]
    no_row_dict = no_row.to_dict(orient='records')[0]
    datas_valid_option_balanced.append(yes_row_dict)
    datas_valid_option_balanced.append(no_row_dict)
    
# datas_valid_option_balanced를 jsonl로 저장
output_path = '../label_train/setting2_EM_balanced_option_withvalid.jsonl'
with open(output_path, 'w') as f:
    for row in datas_valid_option_balanced:
        f.write(json.dumps(row, ensure_ascii=False) + '\n')
datas_valid_option_balanced_df = pd.DataFrame(datas_valid_option_balanced)



100%|██████████| 500/500 [00:08<00:00, 62.26it/s]


In [119]:
# datas_train_option_balanced를 setting1_EM 뒤에 넣기
setting1_EM_path = '../label_train/setting1_EM.json'
datas = json.load(open(setting1_EM_path, 'r'))
len(datas['train'])

datas['train'] += datas_train_option_balanced
datas['valid'] += datas_valid_option_balanced

# datas를 '../label_train/setting1_EM_balanced_tmp.json'로 저장
output_path = '../label_train/setting4_EM.json'
with open(output_path, 'w') as f:
    json.dump(datas, f, indent=4)

In [124]:
datafile = '../label_train/setting4_EM.json'
datas = json.load(open(datafile, 'r'))
valid_df = pd.DataFrame(datas['valid'])
train_df = pd.DataFrame(datas['train'])