In [18]:
import json, datetime, string
from collections import Counter
import pandas as pd

# 모든 원본 데이터를 담은 df를 만든다 (Answer를 일치시키도록 복원하기 위해서)
import pandas as pd

# Load the original main-question sets (JSON Lines, one record per line).
LogiQA2_val = pd.read_json('/hdd/hjl8708/workspace/Data/LogiQA-2/logiqa-val.jsonl', lines=True)
# MMLU is currently disabled, so MMLU rows are not part of original_data.
# MMLU_val = pd.read_json('/hdd/hjl8708/workspace/Data/MMLU/MMLU-val.jsonl', lines=True)
RULE_mainq = pd.read_json('/hdd/hjl8708/workspace/Data/RULE/RULE_mainq.jsonl', lines=True)

# Concatenate the loaded sets (only LogiQA-2 and RULE while the MMLU line is commented out).
original_data = pd.concat([LogiQA2_val, RULE_mainq], ignore_index=True)

def exact_match_score(prediction, ground_truth):
    """Return True when prediction and ground truth normalize to the same string.

    Both texts are passed through ``normalize_answer`` with
    ``len(ground_truth)`` supplied as ``max_len``.
    """
    limit = len(ground_truth)
    normalized_prediction = normalize_answer(prediction, limit)
    normalized_gold = normalize_answer(ground_truth, limit)
    return normalized_prediction == normalized_gold

def f1_score(prediction, ground_truth):
    """Token-level F1 between the normalized prediction and ground truth.

    Both texts are normalized with ``normalize_answer`` (``max_len`` is the
    length of ``ground_truth``) and split on whitespace. Returns the integer 0
    when the two token multisets share nothing, otherwise the harmonic mean of
    precision and recall as a float.
    """
    limit = len(ground_truth)
    pred_tokens = normalize_answer(prediction, limit).split()
    gold_tokens = normalize_answer(ground_truth, limit).split()

    # Multiset intersection: each shared token counts min(pred, gold) times.
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if not overlap:
        return 0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)

def metric_match(metric, prediction, ground_truth):
    """Apply ``metric`` to ``(prediction, ground_truth)`` and return its result unchanged."""
    return metric(prediction, ground_truth)

# def normalize_answer_fewshot_LLaMA(s):
def normalize_answer(s, max_len):
    """Reduce a raw answer string to the short token used for matching.

    The text is lower-cased, stripped of ASCII punctuation, and has its
    whitespace collapsed/trimmed. If what remains starts with "yes" or "no",
    that word is returned; otherwise only the first remaining character is
    returned.

    Args:
        s: Raw prediction or gold answer text.
        max_len: Unused; kept so existing callers keep working.

    Returns:
        "yes", "no", a single character, or "" when nothing is left after
        cleaning (empty or punctuation/whitespace-only input).
    """
    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    # Clean first and decide on the cleaned text. Slicing the raw string (as
    # before) cut the keyword short whenever it was preceded by punctuation,
    # e.g. '"Yes."' -> 'ye'.
    cleaned = white_space_fix(remove_punc(lower(s)))
    if not cleaned:
        # Previously an IndexError (empty) or '' by accident (leading space).
        return ''

    for keyword in ('yes', 'no'):
        if cleaned.startswith(keyword):
            return keyword
    return cleaned[0]

In [20]:
import itertools
import json
# from utils.tools import read_jsonl, check_jsonls, metric_max_over_ground_truths,  f1_score, exact_match_score, metric_match
import pandas as pd 

# RULE sub-question records; dtype_processing looks rows up by 'id_string' and
# reads others['main_option_correctness'] from them.
original_data_RULE_path = '/hdd/hjl8708/workspace/Data/RULE/RULE_subq_all.jsonl'
original_data_RULE = pd.read_json(original_data_RULE_path, lines=True) # NOTE(review): an earlier comment described rows as {'question_id', 'prediction'}, which does not match the columns used below - confirm schema

def dtype_processing(pred):
    """Build the evaluation-bucket label for one prediction record.

    The label is the '_'-joined list of every dataset tag and question-variant
    tag that occurs as a substring of ``pred["qid"]`` (dataset tags first),
    followed by 'selective' / 'eliminative' markers:

    * 'option' questions: 'selective' when 'yes' occurs in ``pred['answer']``,
      otherwise 'eliminative'.
    * 'sub' questions: decided by ``others['main_option_correctness']`` of the
      row in ``original_data_RULE`` whose 'id_string' equals the qid.
    """
    dataset_tags = ('logiqa', 'ReClor_val', 'RULE_train', 'MMLU_val')
    variant_tags = ('sub', 'option', 'shuffle')

    qid = pred["qid"]
    parts = [tag for tag in dataset_tags + variant_tags if tag in qid]

    if 'option' in parts:
        parts.append('selective' if 'yes' in pred['answer'] else 'eliminative')

    if 'sub' in parts:
        matched = original_data_RULE[original_data_RULE['id_string'] == qid]
        # .item() requires exactly one matching row.
        is_selective = matched['others'].item()['main_option_correctness']
        parts.append('selective' if is_selective else 'eliminative')

    return '_'.join(parts)

def gen_eval_new(preds):
    """Compute exact-match accuracy, in percent, per question-type bucket.

    Args:
        preds: Iterable of prediction records. Each record needs 'output' and
            'answer' here; ``dtype_processing`` additionally reads 'qid'.

    Returns:
        Dict mapping the bucket label from ``dtype_processing`` to
        ``{'em': <percentage>, 'count': <number of records>}``. Empty input
        yields an empty dict.
    """
    results = {}  # bucket label -> running EM sum and record count

    for pred in preds:
        em_current = metric_match(exact_match_score, pred["output"], pred["answer"])
        bucket = results.setdefault(dtype_processing(pred), {'em': 0, 'count': 0})
        bucket['em'] += em_current
        bucket['count'] += 1

    # Every bucket has count >= 1, so the division is safe.
    for bucket in results.values():
        bucket['em'] = bucket['em'] / bucket['count'] * 100

    return results

In [21]:

# Start here (results): load one experiment's predictions and score them.

dataname = 'Test-AMR_LDA-mixtral_instruct'
# NOTE: the name `pred_foler` (sic) is referenced by later cells, so it is kept as is.
pred_foler = f'/hdd/hjl8708/workspace/experiments/{dataname}'
pred_file: str = f'{pred_foler}/greedy_preds.json'
with open(pred_file, 'r') as f:
    preds = json.load(f) # {'qid': str, 'question': str, 'output': str, 'answer': str}

# golds = read_jsonl(gold_file)
# check_jsonls(preds, golds)

# Per-question-type exact-match percentages.
results = gen_eval_new(preds)

import pandas as pd

# Convert the results to a pandas DataFrame (one row per question type).
df = pd.DataFrame(results).T
# Frame view of the raw predictions; later cells filter it by 'qid'.
preds_df = pd.DataFrame(preds)

# Persist the per-type scores next to the predictions.
with open(f'{pred_foler}/main_results.json', 'w') as f:
    json.dump(results, f, indent=4)

In [4]:
# 정답 여부와 관계 없이, 선택지에 대해서 같은 판단을 내렸는가에 관한 평가 (정답인 경우와 아닌 경우도 나눠서 확인하기)
# preds에서 각 선택지에 대한 판단을 추출
    # MCQA type: 정답 배열 중에서 output에 가장 가까운 문장을 선택함
    # Option type: 데이터 원본을 확인해서 원본 정답 문장을 확인하고 yes/no를 통해 선택지 판단을 확인

def mcqa_answer_extraction(pred: str, answers, normalize_function):
    """Pick the answer option whose normalized text is closest to the prediction.

    The prediction is normalized with ``max_len`` set to the longest option
    length; each option is normalized with its own length. Closeness is the
    Levenshtein edit distance between the normalized strings, and the first
    option with the smallest distance is returned (un-normalized).
    """
    def levenshtein(source, target):
        # Standard edit-distance recurrence, keeping only the previous row.
        previous = list(range(len(target) + 1))
        for row, src_char in enumerate(source, start=1):
            current = [row]
            for col, tgt_char in enumerate(target, start=1):
                substitution = previous[col - 1] + (src_char != tgt_char)
                current.append(min(previous[col] + 1,      # deletion
                                   current[col - 1] + 1,   # insertion
                                   substitution))          # replace / match
            previous = current
        return previous[-1]

    longest = max(len(option) for option in answers)
    normalized_pred = normalize_function(pred, longest)

    distances = [
        levenshtein(normalized_pred, normalize_function(option, len(option)))
        for option in answers
    ]
    # min() over indices keeps the earliest option on ties.
    best = min(range(len(distances)), key=distances.__getitem__)
    return answers[best]


# 1. Extract the model's chosen option for every main question, i.e. every
#    prediction whose qid is not an option / shuffle / sub variant.
mainq_qids = [
    pred['qid'] for pred in preds
    if not any(tag in pred['qid'] for tag in ('option', 'shuffle', 'sub'))
]

from tqdm import tqdm

result_for_mainq = []
for main_qid in tqdm(mainq_qids):
    # Look the source row up once instead of re-filtering the frame per field.
    source_rows = original_data[original_data['id_string'] == main_qid]

    # Ambiguous qids (known duplicates from LogiQA) are skipped.
    if len(source_rows) > 1:
        print(f"qid가 original_data에서 두 개 잡히는 경우: {main_qid}")
        continue
    # A qid missing from original_data (e.g. a dataset that was not loaded)
    # used to crash on .item(); skip it with a message instead.
    if source_rows.empty:
        print(f"qid not found in original_data, skipping: {main_qid}")
        continue

    answer_options = [str(answer) for answer in source_rows['answers'].item()]
    pred_row = preds_df[preds_df['qid'] == main_qid]
    output = pred_row['output'].item()
    result_for_mainq.append({
        'qid': main_qid,
        'main_result': mcqa_answer_extraction(output, answer_options, normalize_answer),
        'answer': pred_row['answer'].item(),
    })

result_for_mains = f'{pred_foler}/result_for_mains.json'

data_ids = ['RULE_train', 'logiqa', 'MMLU_val']

# Tag each result with the first dataset id found in its qid. Entries whose
# qid matches none of data_ids get no 'data_id' key.
for entry in result_for_mainq:
    for data_id in data_ids:
        if data_id in entry['qid']:
            entry['data_id'] = data_id
            break

with open(result_for_mains, 'w') as f:
    json.dump(result_for_mainq, f, indent=4)

  0%|          | 0/943 [00:00<?, ?it/s]

100%|██████████| 943/943 [00:07<00:00, 122.60it/s]


In [147]:

# Start here (results): reload the predictions, recompute the per-type EM
# table, then score the RULE sub-questions.

pred_foler = f'/hdd/hjl8708/workspace/experiments/{dataname}'
pred_file: str = f'{pred_foler}/greedy_preds.json'
with open(pred_file, 'r') as f:
    preds = json.load(f)  # list of {'qid': str, 'question': str, 'output': str, 'answer': str}

# golds = read_jsonl(gold_file)
# check_jsonls(preds, golds)

results = gen_eval_new(preds)

import pandas as pd
from tqdm import tqdm

# Per-type results as a DataFrame (one row per question type).
df = pd.DataFrame(results).T

preds_df = pd.DataFrame(preds)

# qids of the sub-question predictions.
subq_qids = [pred['qid'] for pred in preds if 'sub' in pred['qid']]

RULE_subq = pd.read_json('/hdd/hjl8708/workspace/Data/RULE/RULE_subq_all.jsonl', lines=True)

result_for_subq = []
for subq_qid in tqdm(subq_qids):
    # One scan of each frame per qid; the fields are then read from the
    # already-filtered rows.
    source_rows = RULE_subq[RULE_subq['id_string'] == subq_qid]

    # Ambiguous qids are skipped.
    if len(source_rows) > 1:
        print(f"qid가 RULE_subq에서 두 개 잡히는 경우: {subq_qid}")
        continue

    answer_options = [str(answer) for answer in source_rows['answers'].item()]
    main_option_correctness = source_rows['others'].item()['main_option_correctness']
    pred_row = preds_df[preds_df['qid'] == subq_qid]
    output = pred_row['output'].item()
    result_for_subq.append({
        'qid': subq_qid,
        'original_output': output,
        'main_result': mcqa_answer_extraction(output, answer_options, normalize_answer),
        'answer': pred_row['answer'].item(),
        'main_option_correctness': main_option_correctness,
    })

result_for_subs = f'{pred_foler}/result_for_subqs.json'

with open(result_for_subs, 'w') as f:
    json.dump(result_for_subq, f, indent=4)


df_subq_result = pd.DataFrame(result_for_subq)
df_subq_result['EM'] = df_subq_result['main_result'] == df_subq_result['answer']

# 'EM' is computed on the parent frame above, so the filtered views already
# carry it. Assigning it again on the slices only raised
# SettingWithCopyWarning.
df_subq_result_selective = df_subq_result[df_subq_result['main_option_correctness'] == True]
df_subq_result_eliminative = df_subq_result[df_subq_result['main_option_correctness'] == False]

subq_result_score_path = f'{pred_foler}/subq_result_score.json'
subq_result_score = {
    'SubQ EM': df_subq_result.EM.mean(),
    'Selective EM': df_subq_result_selective.EM.mean(),
    'Eliminative EM': df_subq_result_eliminative.EM.mean(),
}

print(f"SubQ EM: {subq_result_score['SubQ EM']}, Selective EM: {subq_result_score['Selective EM']}, Eliminative EM: {subq_result_score['Eliminative EM']}")

with open(subq_result_score_path, 'w') as f:
    json.dump(subq_result_score, f, indent=4)

 18%|█▊        | 534/3003 [00:40<03:06, 13.27it/s]


KeyboardInterrupt: 

## Option consistency 평가

In [148]:
''' 
data_for_mains: [
    {
        'qid': str,
        'main_result': 선택지문장
    }
]
'''
# Reload the per-main-question results from result_for_mains.json.
# Besides 'qid' and 'main_result', the loop in this cell also reads the
# 'answer' and 'data_id' keys of every entry.
result_for_mains = f'{pred_foler}/result_for_mains.json'
with open(result_for_mains, 'r') as f:
    data_for_mains = json.load(f)

def extract_option_from_option_question(query):
    """Return the option text referenced by a yes/no option question.

    Takes everything after the last '・' in ``query``, cuts it at the first
    double quote, and strips surrounding whitespace. If either delimiter is
    absent, the corresponding cut is simply not applied.
    """
    _, _, after_last_bullet = query.rpartition('・')
    option_text, _, _ = after_last_bullet.partition('"')
    return option_text.strip()

def yes_no_from_output(output):
    """Classify a free-text model output as 'yes', 'no' or 'unknown'.

    The "The answer is:" prefix is removed, then the text is lower-cased and
    split into alphabetic words. 'yes' wins if it occurs as a whole word,
    otherwise 'no' if it occurs as a whole word, otherwise 'unknown'.

    Whole-word matching is deliberate: plain substring checks classified
    outputs such as "I don't know" or "cannot tell" as 'no' and "eyes" as
    'yes'.
    """
    import re  # local import so the function does not depend on another cell

    # str.replace returns a new string; the result has to be kept.
    text = output.replace("The answer is:", "").lower()
    words = set(re.findall(r"[a-z]+", text))

    if 'yes' in words:
        return 'yes'
    if 'no' in words:
        return 'no'
    return 'unknown'
        
# Per-dataset tallies of option-level decisions that agree / disagree with the
# main-question answer, plus the same split restricted to main questions the
# model answered correctly.
data_ids_count = {
    data_id: {
        'consistent_count': 0,
        'inconsistent_count': 0,
        'main_correct': {
            'consistent_count': 0,
            'inconsistent_count': 0,
        },
    }
    for data_id in ('RULE_train', 'logiqa', 'MMLU_val')
}

import re
from tqdm import tqdm

# Loop-invariant: only option-question rows are ever inspected.
option_preds_df = preds_df[preds_df['qid'].str.contains('option')]

for main_entry in tqdm(data_for_mains):
    target_main_qid = main_entry['qid']
    target_option = main_entry['main_result']
    target_main_correctness = main_entry['answer'] == target_option

    # Match the main qid literally and reject a following digit. A plain
    # str.contains(qid) is a regex *substring* search, so "RULE_train_1" also
    # selected the option rows of "RULE_train_10", "RULE_train_11", ...
    qid_pattern = re.escape(target_main_qid) + r'(?!\d)'
    option_rows = option_preds_df[option_preds_df['qid'].str.contains(qid_pattern)]

    for question, output in zip(option_rows['question'], option_rows['output']):
        option_q_target = extract_option_from_option_question(question)
        option_q_decision = yes_no_from_output(output)
        # The model is consistent if it says 'yes' exactly for the option it
        # picked in the main question.
        option_q_decision_label = 'yes' if target_option == option_q_target else 'no'
        outcome = 'consistent_count' if option_q_decision == option_q_decision_label else 'inconsistent_count'

        counts = data_ids_count[main_entry['data_id']]
        counts[outcome] += 1
        if target_main_correctness:
            counts['main_correct'][outcome] += 1

# Save the tallies to option_consistency_result.json.

option_consistency_result_path = f'{pred_foler}/option_consistency_result.json'
with open(option_consistency_result_path, 'w') as f:
    json.dump(data_ids_count, f, indent=4)

  0%|          | 0/4039 [00:00<?, ?it/s]

100%|██████████| 4039/4039 [00:59<00:00, 67.39it/s]


In [149]:
# Score computation: turn the saved option-consistency tallies into ratios.

option_consistency_result_path = f'{pred_foler}/option_consistency_result.json'
with open(option_consistency_result_path, 'r') as f:
    option_consistency_result = json.load(f)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or None when the denominator is zero."""
    return numerator / denominator if denominator else None


# Per-dataset consistency ratios. A ratio is None (JSON null) when its
# denominator is zero, e.g. for a dataset with no predictions in this run;
# previously that case raised ZeroDivisionError.
data_ids_scores = {}
for data_id in ('RULE_train', 'logiqa', 'MMLU_val'):
    tally = option_consistency_result[data_id]
    total_consistent = tally['consistent_count']
    total_inconsistent = tally['inconsistent_count']

    correct_consistent = tally['main_correct']['consistent_count']
    correct_inconsistent = tally['main_correct']['inconsistent_count']

    # "Main incorrect" counts are whatever is left after removing main-correct.
    incorrect_consistent = total_consistent - correct_consistent
    incorrect_inconsistent = total_inconsistent - correct_inconsistent

    data_ids_scores[data_id] = {
        'option_consistency_total': _safe_ratio(
            total_consistent, total_consistent + total_inconsistent),
        'option_consistency_main_correct': _safe_ratio(
            correct_consistent, correct_consistent + correct_inconsistent),
        'option_consistency_main_incorrect': _safe_ratio(
            incorrect_consistent, incorrect_consistent + incorrect_inconsistent),
    }

# Save data_ids_scores as option_consistency_scores.json.
option_consistency_scores_path = f'{pred_foler}/option_consistency_scores.json'
with open(option_consistency_scores_path, 'w') as f:
    json.dump(data_ids_scores, f, indent=4)

## Shuffle Consistency

In [150]:
# Reload the per-main-question results from result_for_mains.json; the loop in
# this cell reads the 'qid', 'main_result', 'answer' and 'data_id' keys.
result_for_mains = f'{pred_foler}/result_for_mains.json'
with open(result_for_mains, 'r') as f:
    data_for_mains = json.load(f)


def remove_prefix(text):
    """Return the answer text that follows "The answer is:", if present.

    When the marker occurs, the result is the text between the first and the
    second occurrence of the marker (or the end), cut at the first newline and
    stripped. Without the marker, ``text`` is returned unchanged.
    """
    marker = "The answer is:"
    if marker not in text:
        return text
    after_marker = text.split(marker)[1]
    first_line, _, _ = after_marker.partition("\n")
    return first_line.strip()

# Per-dataset list of shuffle-consistency counts (one entry per main question).
data_ids_count = {
    data_id: {'consistency_scores': []}
    for data_id in ('RULE_train', 'logiqa', 'MMLU_val')
}

import re
from tqdm import tqdm

# Loop-invariant: only shuffle-question rows are ever inspected.
shuffle_preds_df = preds_df[preds_df['qid'].str.contains('shuffle')]

for main_entry in tqdm(data_for_mains):
    target_main_qid = main_entry['qid']
    target_option = main_entry['main_result']

    # Match the main qid literally and reject a following digit. A plain
    # str.contains(qid) is a regex *substring* search, so "RULE_train_1" also
    # selected the shuffle rows of "RULE_train_10", "RULE_train_11", ...
    qid_pattern = re.escape(target_main_qid) + r'(?!\d)'
    shuffle_rows = shuffle_preds_df[shuffle_preds_df['qid'].str.contains(qid_pattern)]

    # NOTE(review): target_option is the raw option text, while the shuffle
    # outputs go through normalize_answer - confirm the two forms are meant to
    # be directly comparable.
    prediction_list = [target_option]
    prediction_list.extend(
        normalize_answer(output, len(target_option)) for output in shuffle_rows['output']
    )

    # Count how often each prediction occurs and keep the highest count.
    # max() over the Counter itself compared the *keys*, i.e. it picked the
    # lexicographically largest prediction instead of the most frequent one.
    prediction_count = Counter(prediction_list)
    data_ids_count[main_entry['data_id']]['consistency_scores'].append(
        max(prediction_count.values())
    )


def _scaled_mean(values):
    """Return 25 * mean(values), or None for an empty list (was ZeroDivisionError)."""
    # Presumably 25 maps a count of 4 (main + 3 shuffles) to 100 - TODO confirm.
    return 25 * sum(values) / len(values) if values else None


scores = {
    data_id: _scaled_mean(bucket['consistency_scores'])
    for data_id, bucket in data_ids_count.items()
}

# Save the scores.
scores_path = f'{pred_foler}/shuffle_consistency_scores.json'
with open(scores_path, 'w') as f:
    json.dump(scores, f, indent=4)



  0%|          | 0/4039 [00:00<?, ?it/s]

100%|██████████| 4039/4039 [00:59<00:00, 68.31it/s]


In [None]:
pred_file

'/hdd/hjl8708/workspace/EM_evaluation/results/setting3_EM_LoRA_after/greedy_preds.json'

# 여기까지

In [10]:
def pr_tmp(preds, i):
    """Print prediction record ``preds[i]`` together with its EM and F1 scores."""
    pred = preds[i]
    prediction, gold = pred["output"], pred["answer"]

    # The label itself is not printed; the call is kept so a record that
    # dtype_processing cannot handle still fails here.
    dtype_processing(pred)
    em_current = metric_match(exact_match_score, prediction, gold)
    f1_current = metric_match(f1_score, prediction, gold)

    report = f"""
{pred["question"]}
pred: {prediction}
gold: {gold}

em: {em_current}
f1: {f1_current}

    """.strip()
    print(report)

i = 3003
pr_tmp(preds, i)


original: The answer is: yes.
extracted: The answer is: yes.
original: yes.
extracted: yes.
original: The answer is: yes.
extracted: The answer is: yes.
original: yes.
extracted: yes.
<Passage>: Trainer: I recently developed an exercise routine that can get anybody to meet his or her goals. The routine combines cardio and bodybuilding during each session for the purpose of losing weight. Every person I' ve trained has lost weight on the program.
The strength of the argument depends on which one of the following?
・ Every client the trainer has worked with has weight loss as a goal.
・ Every client the trainer has worked with has prior experience lifting weights.
・ Every client the trainer has worked with has also adopted a healthy diet.
・ Losing weight is always a healthy outcome.


<Question>: Is the option "・ Every client the trainer has worked with has weight loss as a goal." is a correct answer?
pred: The answer is: yes.
gold: yes.

em: True
f1: 1.0


In [24]:
for i in preds:
    if 'sub' in i['qid']:
        print(i)
        break

{'qid': 'RULE_train_1_sub1', 'question': "<Passage>: Patient: Pharmacists maintain that doctors should not be permitted to sell the medicine that they prescribe because doctors would then be tempted to prescribe unnecessary medicines in order to earn extra income. But pharmacists have a financial interest in having a monopoly on the sale of prescription medicines, so their objection to the sale of medicines by doctors cannot be taken seriously.\n<Question>: What is the rationale for the patient's argument attempting to discredit a position by questioning the motives of the proponents of that position?\nChoose the correct answer from the following options.\n・ Lack of pharmacists' knowledge was not highlighted in the argument against putting them in charge of prescriptions\n・ The patient does not invoke public opinion in their argument but instead highlights the incentive structure which introduces conflicts for pharmacists and doctors.\n・ The patient correctly highlights that the pharma

In [40]:
# Find the row of original_data_RULE whose 'id_string' equals the given qid and
# return 'selective' if row['others']['main_option_correctness'] is truthy, otherwise 'eliminative'.
qid = 'RULE_train_1_sub1'
def get_main_option_correctness(qid):
    """Return 'selective' or 'eliminative' for the RULE sub-question ``qid``.

    Looks up the row of ``original_data_RULE`` whose 'id_string' equals
    ``qid`` (exactly one row is required by ``.item()``) and maps a truthy
    ``others['main_option_correctness']`` to 'selective', anything else to
    'eliminative'.
    """
    id_matches = original_data_RULE['id_string'] == qid
    others = original_data_RULE[id_matches]['others'].item()
    if others['main_option_correctness']:
        return 'selective'
    return 'eliminative'

get_main_option_correctness(qid)

True