In [63]:
import pandas as pd
import numpy as np
import random

In [64]:
def read_questions_from_file(filename):
    questions = []
    with open(filename, 'r') as file:
        lines = file.readlines()
        num_lines = len(lines)
        i = 0
        while i < num_lines:
            # Skip blank line
            i += 1
            # Extracting correct answer
            correct_answer = lines[i].strip()
            correct_answer = 'option_'+correct_answer
            i += 1
            # Extracting context
            context = lines[i].strip()
            i += 1
            # Extracting question
            question = lines[i].strip()
            i += 1
            # Extracting options
            options = [lines[j].strip() for j in range(i, i+4)]
            i += 4
            # Append to questions list
            questions.append((context, question, *options, correct_answer))
    questions = pd.DataFrame(questions, columns=['context', 'question', 'option_a', 'option_b', 'option_c', 'option_d', 'correct_answer'])
    return questions

We take out all the negation questions when reading in, so we assume that all the correct answers are logical in the context.

In [65]:
train_filename = "./dataset/Train.txt"
train_dataset = read_questions_from_file(train_filename)
train_dataset.head()

Unnamed: 0,context,question,option_a,option_b,option_c,option_d,correct_answer
0,"Some Cantonese don't like chili, so some south...",Which of the following can guarantee the above...,A.Some Cantonese love chili,B.Some people who like peppers are southerners,C.All Cantonese are southerners,D.Some Cantonese like neither peppers nor sweets,option_c
1,Continuous exposure to indoor fluorescent ligh...,Which of the following questions was the initi...,A.Can hospital light therapy be proved to prom...,"B.Which one lives longer, the hamster living u...",C.What kind of illness does the hamster have?,D.Do some hamsters need a period of darkness?,option_a
2,There is no doubt that minors should be prohib...,"In order to evaluate the above argument, which...",A.Does the proportion of underage smokers in t...,B.How much inconvenience does the ban on the u...,C.Whether the proportion of unlicensed drivers...,D.Is the harm of minor smoking really as serio...,option_b
3,A research report states that a special educat...,Which of the following best illustrates the lo...,A.Children's cognitive abilities are constantl...,B.Establishing such education and training pro...,C.Many parents mistakenly believe that early f...,D.Investigators are unaware that they include ...,option_a
4,"The traitor is a traitor, so you are a traitor...",Which of the following makes the same logical ...,A.Literature and art are not worthy of attenti...,B.Non-university can also become an outstandin...,"C.The earth is a sphere, which can be proved f...",D.I saw them coming out of this chemical plant...,option_c


In [66]:
dev_filename = "./dataset/Eval.txt"
test_filename = "./dataset/Test.txt"
dev_dataset = read_questions_from_file(dev_filename)
test_dataset = read_questions_from_file(test_filename)

Then we randomly sample some wrong answers to be not logical, and label the row as 0, while other with correct answers to be 1.

In [67]:
def generate_answer_and_label(row):
    if random.random() < 0.5:  # Randomly assign label 0 or 1
        label = 0
        answer_options = ['option_a', 'option_b', 'option_c', 'option_d']
        answer_options.remove(row['correct_answer'])
        answer = random.choice(answer_options)  # Randomly choose a wrong option
        answer = row[answer]
    else:
        label = 1
        answer = row[row['correct_answer']]
    return pd.Series([answer, label], index=['Answer', 'Label'])

In [68]:
train_dataset[['Answer', 'Label']] = train_dataset.apply(generate_answer_and_label, axis=1)
train_dataset.drop(columns=['option_a', 'option_b', 'option_c', 'option_d', 'correct_answer'], inplace=True)
train_dataset.head()

Unnamed: 0,context,question,Answer,Label
0,"Some Cantonese don't like chili, so some south...",Which of the following can guarantee the above...,D.Some Cantonese like neither peppers nor sweets,0
1,Continuous exposure to indoor fluorescent ligh...,Which of the following questions was the initi...,A.Can hospital light therapy be proved to prom...,1
2,There is no doubt that minors should be prohib...,"In order to evaluate the above argument, which...",B.How much inconvenience does the ban on the u...,1
3,A research report states that a special educat...,Which of the following best illustrates the lo...,B.Establishing such education and training pro...,0
4,"The traitor is a traitor, so you are a traitor...",Which of the following makes the same logical ...,"C.The earth is a sphere, which can be proved f...",1


In [69]:
train_dataset.to_csv('./dataset/train.csv', index=False)

In [70]:
dev_dataset[['Answer', 'Label']] = dev_dataset.apply(generate_answer_and_label, axis=1)
dev_dataset.drop(columns=['option_a', 'option_b', 'option_c', 'option_d', 'correct_answer'], inplace=True)
dev_dataset.to_csv('./dataset/dev.csv', index=False)
test_dataset[['Answer', 'Label']] = test_dataset.apply(generate_answer_and_label, axis=1)
test_dataset.drop(columns=['option_a', 'option_b', 'option_c', 'option_d', 'correct_answer'], inplace=True)
test_dataset.to_csv('./dataset/test_answer.csv', index=False)
test_dataset.drop(columns=['Label'], inplace=True)
test_dataset.to_csv('./dataset/test.csv', index=False)

In [71]:
# merge context and Answer into text column
train_dataset['Merged_Column'] = train_dataset['context'] + ' ' + train_dataset['question'] + ' ' + train_dataset['Answer']
train_dataset.drop(columns=['context', 'question', 'Answer'], inplace=True)
train_dataset.rename(columns={'Merged_Column': 'text'}, inplace=True)
train_dataset.to_csv('./dataset/train_merged_text.csv', index=False)

In [72]:
dev_dataset['Merged_Column'] = dev_dataset['context'] + ' ' + dev_dataset['question'] + ' ' + dev_dataset['Answer']
dev_dataset.drop(columns=['context', 'question', 'Answer'], inplace=True)
dev_dataset.rename(columns={'Merged_Column': 'text'}, inplace=True)
dev_dataset.to_csv('./dataset/dev_merged_text.csv', index=False)

In [73]:
test_dataset['Merged_Column'] = test_dataset['context'] + ' ' + test_dataset['question'] + ' ' + test_dataset['Answer']
test_dataset.drop(columns=['context', 'question', 'Answer'], inplace=True)
test_dataset.rename(columns={'Merged_Column': 'text'}, inplace=True)
test_dataset.to_csv('./dataset/test_merged_text.csv', index=False)