## LogiQA data processing

In [None]:
# Data processing
import pandas as pd

# Load the source table from Parquet (presumably the LogiQA test split, judging by the file name).
# Later code in this cell reads its 'context', 'query', 'options' and 'correct_option' columns.
df = pd.read_parquet('data/logiqa_test.parquet') # original (raw) data

def option_processing(options):
    """Render answer options as lettered lines.

    Each option gets a label starting at ``A`` (then ``B``, ``C``, ...) and
    is rendered as ``"<label>: <option>"``; the lines are joined with
    newlines.

    Args:
        options: Iterable of options. It is iterated as-is; a
            string-encoded list is not parsed here.

    Returns:
        A single string with one labelled option per line (empty string
        when there are no options).
    """
    first_label = ord("A")
    labelled_lines = []
    for position, option_text in enumerate(options):
        label = chr(first_label + position)
        labelled_lines.append(f"{label}: {option_text}")
    return "\n".join(labelled_lines)

def process_logiqa_direct(context, query, options):
    """Build the direct-answer prompt for one question.

    The prompt consists of an instruction, the context, the question, the
    lettered options (rendered by ``option_processing``) and a closing
    instruction to output only the label as a single character. Sections
    are separated by blank lines.

    Args:
        context: Passage the question refers to.
        query: The question text.
        options: Iterable of answer options.

    Returns:
        The complete prompt string.
    """
    # Render the options first, matching the original evaluation order.
    rendered_options = option_processing(options)
    sections = (
        "Answer the following question with the given context.",
        f"Context: {context}",
        f"Question: {query}",
        f"Options:\n{rendered_options}",
        "Output only the correct label as a single character. Do not add any extra text.",
    )
    return "\n\n".join(sections)

def process_logiqa_cot(context, query, options):
    """Build the chain-of-thought prompt for one question.

    Same layout as the direct prompt (instruction, context, question,
    lettered options rendered by ``option_processing``), but the closing
    instruction asks for step-by-step reasoning followed by the label on
    the last line in angle brackets.

    Args:
        context: Passage the question refers to.
        query: The question text.
        options: Iterable of answer options.

    Returns:
        The complete prompt string.
    """
    # Render the options first, matching the original evaluation order.
    rendered_options = option_processing(options)
    sections = (
        "Answer the following question with the given context.",
        f"Context: {context}",
        f"Question: {query}",
        f"Options:\n{rendered_options}",
        "Let's think step by step. After reasoning, output the correct label on the last line in this format: <A> / <B> / <C> / <D>.",
    )
    return "\n\n".join(sections)

# Copy of the raw frame with the shared evaluation columns added:
#   answer       - letter label derived from the integer `correct_option` (0 -> A, 1 -> B, ...)
#   qid          - the raw frame's index
#   dataset_name - constant dataset tag
df_input = df.assign(
    answer=df['correct_option'].map(lambda label_idx: chr(65 + label_idx)),
    qid=df.index,
    dataset_name='logiqa',
)

# One frame per prompting style: build the prompt row by row, then keep only
# the unified dataset columns in a fixed order.
df_direct = df_input.assign(
    input=df_input.apply(
        lambda row: process_logiqa_direct(row['context'], row['query'], row['options']),
        axis=1,
    )
)[['qid', 'dataset_name', 'input', 'answer']]

df_cot = df_input.assign(
    input=df_input.apply(
        lambda row: process_logiqa_cot(row['context'], row['query'], row['options']),
        axis=1,
    )
)[['qid', 'dataset_name', 'input', 'answer']]

# Write both variants to CSV without the index column.
df_direct.to_csv('data/logiqa_test_direct.csv', index=False)
df_cot.to_csv('data/logiqa_test_cot.csv', index=False)

## Evaluation (Direct, CoT)

In [65]:
# Evaluation
import pandas as pd

# Direct-prompt run: a row counts as correct only when the raw 'output'
# cell is exactly equal to the gold 'answer' label.
file = 'result/Qwen3-8B-logiqa_test_direct.csv'
df = pd.read_csv(file)
accuracy = df['answer'].eq(df['output']).mean()
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.5484


In [None]:
# evaluation CoT
# Load the saved results of the CoT-prompt run; the rest of this cell reads
# the 'output' and 'answer' columns of this frame.
file = 'result/Qwen3-8B-logiqa_test_cot.csv'
df = pd.read_csv(file)

def extract(str):
    """Return the answer label written between angle brackets, or None.

    Looks for the first occurrence of ``<A>``, ``<B>``, ``<C>`` or ``<D>``
    in the model output and returns the letter inside it.

    Args:
        str: Raw model output. The parameter name shadows the builtin; it
            is kept unchanged so existing callers keep working.

    Returns:
        The label (``'A'``..``'D'``) of the first bracketed match, or
        ``None`` when no such tag exists or the input is not text.
    """
    import re

    try:
        # NOTE(review): this takes the FIRST tag in the text, while the CoT
        # prompt asks for the label on the last line; confirm whether a tag
        # mentioned during reasoning should win over the final one.
        match = re.search(r'<([A-D])>', str)
    except TypeError:
        # Non-text cell (e.g. the NaN pandas produces for an empty CSV
        # field, or None): count it as an extraction failure instead of
        # aborting the whole evaluation.
        return None
    return match.group(1) if match else None
# Pull the bracketed label out of every raw model output (None when absent).
df['extracted_output'] = df['output'].map(extract)

# Rows for which no label could be extracted.
extraction_failed = df['extracted_output'].isna()
print(f"추출 실패 개수: {extraction_failed.sum()} / {len(df)}")

# Accuracy over all rows: a failed extraction never equals the gold answer,
# so it is scored as wrong.
accuracy = df['answer'].eq(df['extracted_output']).mean()
print(f"Accuracy (추출실패 포함): {accuracy:.4f}")

# To inspect the extracted labels, including the failures, run:
# df['extracted_output'].value_counts(dropna=False)

# Accuracy restricted to the rows where a label was extracted.
df_dropna = df[~extraction_failed]
accuracy = df_dropna['answer'].eq(df_dropna['extracted_output']).mean()
print(f"Accuracy (추출성공만): {accuracy:.4f}")


# Qwen3-8B is a reasoning-tuned model, so its behavior here may differ from that of a typical CoT-prompted model.

추출 실패 개수: 86 / 651
Accuracy (추출실패 포함): 0.5223
Accuracy (추출성공만): 0.6018
