## ReClor test의 question type 분류 시키기

In [51]:
import pandas as pd 
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from typing import List
fname = '../../Data/ReClor/ReClor_test.jsonl'
data_dataset = load_dataset("json", data_files=fname, split="train")
data_loader = DataLoader(data_dataset, batch_size=1, shuffle=False)

test_data = pd.read_json(fname, lines=True)

In [54]:
pd.DataFrame(test_data.question_type.value_counts())

Unnamed: 0_level_0,count
question_type,Unnamed: 1_level_1
13,117
0,114
3,113
2,94
8,84
16,73
9,65
7,56
5,46
6,36


In [32]:
def answers_to_string(answers: List) -> str:
    answer_str = ""
    for a, answer in enumerate(answers):
        while type(answer) == tuple:
            answer = answer[0]
        if a == len(answers) - 1:
            answer_str += f"{a+1}: {answer}"
        else:
            answer_str += f"{a+1}: {answer}, "
    return answer_str

def template_to_string(template: str, context: str, question: str, options: List[str]) -> str:
    # option combinate
    option_str = answers_to_string(options)
    return template.format(context=context, question=question, options=option_str)

template = """
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

Instruction: You are given a logical reasoning question and predefined types of questions. Each type has its unique type code such as 'N1'. write the code of the question type of following question.
There are predefined 17 types of questions. The number of the question type and the description of the question type are as follows: 
N1. "Necessary Assumptions": identify the claim that must be true or is required in order for the argument to work.
N2. "Sufficient Assumptions": identify a sufficient assumption, that is, an assumption that, if added to the argument, would make it logically valid.
N3. "Strengthen": identify information that would strengthen an argument
N4. "Weaken": identify information that would weaken an argument
N5. "Evaluation": identify information that would be useful to know to evaluate an argument
N6. "Implication": identify something that follows logically from a set of premises
N7. "Conclusion/Main Point": identify the conclusion/main point of a line of reasoning
N8. "Most Strongly Supported": find the choice that is most strongly supported by a stimulus
N9. "Explain or Resolve": identify information that would explain or resolve a situation
N10. "Principle": identify the principle, or find a situation that conforms to a principle, or match the principles
N11. "Dispute": identify or infer an issue in dispute
N12. "Technique": identify the technique used in the reasoning of an argument
N13. "Role": describe the individual role that a statement is playing in a larger argument
N14. "Identify a Flaw": identify a flaw in an argument’s reasoning
N15. "Match Flaws": find a choice containing an argument that exhibits the same flaws as the passage’s argument
N16. "Match Structures": match the structure of an argument in a choice to the structure of the argument in the passage
N17. "Others": other types of questions which are not included by the above

Following logical reasoning question.
Context: {context}
Question: {question}
Options: {options}
The code of the question type of this question is N
""".strip()

for ix, x in enumerate(data_loader):
    context = x["context"][0] # str
    question = x["question"][0]
    answers = x["answers"] # List 
    answers = [x[0] for x in answers]
    question_type = x["question_type"][0] # int
    id_string = x["id_string"][0] # str
    
    input_string = template_to_string(template, context, question, answers)
    
    

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

Instruction: You are given a logical reasoning question and predefined types of questions. Each type has its unique type code such as 'N1'. write the code of the question type of following question.
There are predefined 17 types of questions. The number of the question type and the description of the question type are as follows: 
N1. "Necessary Assumptions": identify the claim that must be true or is required in order for the argument to work.
N2. "Sufficient Assumptions": identify a sufficient assumption, that is, an assumption that, if added to the argument, would make it logically valid.
N3. "Strengthen": identify information that would strengthen an argument
N4. "Weaken": identify information that would weaken an argument
N5. "Evaluation": identify information that would be useful to know to evaluate an argument
N6. "Implica

In [59]:
# path
import pandas as pd 
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from typing import List
filepath = 'qtype_result/ReClor_test_result.jsonl'

data = pd.read_json(filepath, lines=True)

resp_exs = [data.response.values[x] for x in [0,2,5]]
resp_exs

def response_to_code(response: str):
    # regex를 통해서 'N\d'를 찾아서 숫자를 반환
    import re
    regex = r'N\d\d?'
    match = re.search(regex, response)
    if match:
        return int(match.group()[1:])-1
    else:
        # 문자열에서 나타나는 첫 번째 숫자를 찾는다
        regex = r'\d\d?'
        match = re.search(regex, response)
        return int(match.group())-1
    
def extract_first_num_in_string(string: str):
    result = ''.join([x for x in string if x.isdigit()])
    if result == '':
        return -1
    else: 
        return int(''.join([x for x in string if x.isdigit()]))-1

data['response_code'] = data.response.apply(response_to_code)
data['correctness'] = data.response_code == data.question_type

Unnamed: 0_level_0,count
response_code,Unnamed: 1_level_1
3,148
13,127
7,121
0,116
2,91
8,81
9,56
12,39
6,37
5,35


In [75]:
# data 중에서 question_type이 16이고, response_code가 3인 것 추출
data[(data.question_type == 7) & (data.response_code == 7)].answers.values


array([list(['For at least some people, engaging in exercise can cause their stress levels to be reduced.', 'Most people with high blood pressure can lower their blood pressure by reducing their stress levels.', 'For at least some people, having lower blood pressure has at least some tendency to cause their stress levels to be reduced.', 'Most people who do not exercise regularly have higher stress levels as a result.']),
       list(['Few people who consume vitamin-fortified foods are aware of the recommended daily intake of vitamins A and D.', 'Some people who consume vitamin-fortified foods exceed the recommended daily intake of vitamins A and D.', 'Most people who eat vitamin-fortified foods should not take any vitamin supplements.', 'Some people mistakenly believe it is healthy to consume more than the recommended daily intake of vitamins A and D.']),
       list(['Prior to the twelfth century, the institution of European feudalism functioned without the presence of a dominant cla

In [35]:
data.keys()

Index(['context', 'question', 'answers', 'id_string', 'input_string',
       'response', 'response_code'],
      dtype='object')

In [43]:
data['response_code'].value_counts()


response_code
3     144
7     138
13    135
0     132
2      65
8      60
1      54
5      47
6      42
9      39
11     24
10     23
4      18
12     17
14      3
15      2
Name: count, dtype: int64

In [46]:

# data에서 id_string, response_code만 추출해서 저장
data[['id_string', 'response_code']].to_json('qtype_result/RULE_subq_map.jsonl', orient='records', lines=True)

In [19]:
# data에서 'nums'가 20 이상인 거만 남기기
data_errored = data[data.nums > 20]
data_errored.response.values[5]

IndexError: index 5 is out of bounds for axis 0 with size 5

In [33]:
# data에서 question_type이 13, 0, 3, 1, 4, 2, 7 인 것을 각각 하나씩 랜덤하게 추출
# 그리고 그것들을 하나의 dataframe으로 만들어서 저장

few_shot_data = pd.DataFrame()
for qtype in [13, 0, 3, 1, 4, 2, 7]:
    few_shot_data = pd.concat([few_shot_data, data[data.question_type == qtype].sample(1, random_state=22)])

few_shot_data

Unnamed: 0,context,question,answers,question_type,id_string,input_string,response,response_code,correctness
296,Clothes dryers manufactured by Archway Applian...,The argument is most vulnerable to criticism o...,[Archway's dryers consistently perform well an...,13,test_296,"Below is an instruction that describes a task,...",The code of the question type of this question...,0,False
736,Commercial passenger airplanes can be equipped...,Which one of the following is an assumption on...,[Warnings given by a collision-avoidance syste...,0,test_736,"Below is an instruction that describes a task,...",The code of the question type of this question...,0,True
675,"Jocko, a chimpanzee, was once given a large bu...","Which one of the following, if true, most seri...",[Bananas are a food for which all of the chimp...,3,test_675,"Below is an instruction that describes a task,...",The code of the question type of this question...,3,True
908,No democracy should require national identific...,The conclusion drawn above follows logically i...,[No feature characteristic of totalitarian soc...,1,test_908,"Below is an instruction that describes a task,...",The code of the question type of this question...,1,True
184,Fashion company executive: The number of compe...,To evaluate whether the plan described by the ...,[Why the number of competing brands of clothin...,4,test_184,"Below is an instruction that describes a task,...",The code of the question type of this question...,4,True
725,That sales can be increased by the presence of...,"Which of the following, if true, most strength...",[The departments in the part of the store unde...,2,test_725,"Below is an instruction that describes a task,...",The code of the question type of this question...,2,True
480,Unlike many machines that are perfectly useful...,The information above provides the most suppor...,[In some industries it is in the interest of c...,7,test_480,"Below is an instruction that describes a task,...",The code of the question type of this question...,7,True


In [55]:
data['response_code'].value_counts()

response_code
3     148
13    127
7     121
0     116
2      91
8      81
9      56
12     39
6      37
5      35
1      32
10     31
15     30
14     24
11     17
4      15
Name: count, dtype: int64

In [78]:
# data.correctness에서 True의 비율
acc = data.correctness.value_counts()[0] / data.correctness.value_counts().sum()
print(acc)
# # question type 별 정답률 분석
# for i in range(17):
#     print(f"question_type {i} accuracy: {data[data.question_type == i].correctness.value_counts()[0] / data[data.question_type == i].correctness.value_counts().sum()}")
    
# question type 별 multi-class f1 score 분석
from sklearn.metrics import f1_score
for i in range(17):
    # macro: 단순평균, weighted: 각 클래스에 속하는 표본의 갯수로 가중평균
    print(f"question_type {i} f1 score: {f1_score(data[data.question_type == i].question_type, data[data.question_type == i].response_code, average='weighted')}")
    

0.827
question_type 0 accuracy: 0.9824561403508771
question_type 1 accuracy: 0.9
question_type 2 accuracy: 0.8191489361702128
question_type 3 accuracy: 0.9823008849557522
question_type 4 accuracy: 0.9230769230769231
question_type 5 accuracy: 0.6521739130434783
question_type 6 accuracy: 0.8333333333333334
question_type 7 accuracy: 1.0
question_type 8 accuracy: 0.9285714285714286
question_type 9 accuracy: 0.8
question_type 10 accuracy: 0.8666666666666667
question_type 11 accuracy: 0.5555555555555556
question_type 12 accuracy: 1.0
question_type 13 accuracy: 0.9743589743589743
question_type 14 accuracy: 0.7741935483870968
question_type 15 accuracy: 1.0
question_type 16 accuracy: 1.0
question_type 0 f1 score: 0.9911504424778761
question_type 1 f1 score: 0.9473684210526316
question_type 2 f1 score: 0.9005847953216375
question_type 3 f1 score: 0.9910714285714286
question_type 4 f1 score: 0.9600000000000001
question_type 5 f1 score: 0.7894736842105263
question_type 6 f1 score: 0.90909090909090

In [13]:
# question_type이 11인 것 중, correctness가 False인 것들을 랜덤하게 5개 추출
 
enwkfltm = data[(data.question_type == 11) & (data.correctness == False)].sample(5, random_state=22)
enwkfltm.response.values[4]

'11. "Dispute": identify or infer an issue in dispute'

In [None]:
d

In [57]:
data.question_type.value_counts()

question_type
13    117
0     114
3     113
2      94
8      84
16     73
9      65
7      56
5      46
6      36
11     36
12     32
14     31
1      30
10     30
15     30
4      13
Name: count, dtype: int64

In [48]:
# data['correctness']가 False인 것들
data_false = data[data.correctness == False]
# response, response_code, correctness, question_type
data_false = data_false[['response', 'response_code', 'correctness', 'question_type']]


In [51]:
data_false

Unnamed: 0,response,response_code,correctness,question_type
10,The code of the question type for this logical...,3,False,16
12,The code of the question type for this questio...,10,False,16
13,The code of the question type of this question...,7,False,5
14,The code of the question type of this question...,7,False,6
21,The code of the question type of this question...,3,False,16
...,...,...,...,...
975,The code of the question type of this question...,7,False,5
976,The code of the question type for the provided...,1,False,16
977,The code of the question type of this question...,2,False,0
984,The code of the question type of this question...,7,False,16
