# 2.0 - Format QA jsonl file

In [5]:
import os
import json
import numpy as np

In [6]:
with open(os.path.join(os.getcwd(), '..', 'data', 'orange_qa_full.json'), 'r') as f:
    qa_mcq = json.load(f)

with open(os.path.join(os.getcwd(), '..', 'data', 'orange_connection_questions_MCQ.json'), 'r') as f:
    questions_MCQ = json.load(f)

with open(os.path.join(os.getcwd(), '..', 'data', 'orange_connection_questions_QA.json'), 'r') as f:
    questions_QA = json.load(f)

raw_data = qa_mcq + questions_MCQ + questions_QA
len(raw_data)

2472

In [7]:
SYSTEM_PROMPT_TEMPLATE = """You are a helpful assistant that can answer questions about the Orange Data Mining software."""

QA_USER_PROMPT_TEMPLATE = """Answer the following question based on your knowledge of the Orange Data Mining software.
Make sure you answer the question in a few sentences.

Question: {question}
"""

MCQ_USER_PROMPT_TEMPLATE = """Answer the following question based on your knowledge of the Orange Data Mining software.
Make sure you answer the question with a single letter corresponding to the correct answer.

Question: {question}

Answers:
A: {A}
B: {B}
C: {C}
D: {D}
"""



In [11]:
REPO_PATH = os.path.join(os.getcwd(), '..', 'repositories')
FILE_EXTENSIONS = ["txt", "md", "rst"]

SKIP = ["negative_words_Slolex.txt", "positive_words_Slolex.txt"]

text_files = []
for root, dirs, files in os.walk(REPO_PATH):
    for file in files:
        if True in [file.endswith(ext) for ext in FILE_EXTENSIONS] and file not in SKIP:
            text_files.append(os.path.relpath(os.path.join(root, file), REPO_PATH))
text_files += ["orange_widgets_connections.json"]

def _check_if_correct_type(question):
    keys = question.keys()

    if question['type'] == "MCQ-con":
        _type = "MCQ-con"
        question['type'] = _type
        return question, True

    elif "A" in keys and "B" in keys and "C" in keys and "D" in keys and "correct" in keys:
        _type = "MCQ"
        question['type'] = _type
        return question, True
    
    elif "question" in keys and "answer" in keys:
        _type = "QA"
        question['type'] = _type
        return question, True
    else:
        return None, False

def _check_if_hallucinated_file(question, check_hallucinated_file:bool=True):
    if not check_hallucinated_file:
        return False
    if "file" not in question.keys():
        return False
    file = question['file']
    if file not in text_files:
        return True
    else:
        return False


In [12]:
ALTERNATIVE_SORTING_N_TIMES = 2
CHECK_HALLUCINATED_FILE = False
np.random.seed(42)

full_prompt_messages = []
alternative_sorting = []

i = -1
for question_ in raw_data:

    question, success = _check_if_correct_type(question_.copy())
    hallucinated_file = _check_if_hallucinated_file(question_.copy(), CHECK_HALLUCINATED_FILE)

    if not success:
        print('Skipping question: ', question_['question'])
        continue
    if hallucinated_file:
        print('Skipping hallucinated file: ', question_['file'])
        continue

    i += 1
    if question['type'] == 'QA':
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
            {"role": "user", "content": QA_USER_PROMPT_TEMPLATE.format(question=question['question'])},
            {"role": "assistant", "content": question['answer']}
        ]
        full_prompt_messages.append({
            'id': i,
            "type": question['type'],
            'messages': messages
        })

    elif question['type'] in ['MCQ', 'MCQ-con']:

        for sort_id in range(ALTERNATIVE_SORTING_N_TIMES):
            possible_answers = np.array([letter for letter in list('ABCD') if letter in question.keys()])
            np.random.shuffle(possible_answers)
            possible_answers = possible_answers.tolist()
            correct_answer = possible_answers.index(question['correct'])
            messages = [
                {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
                {"role": "user", "content": MCQ_USER_PROMPT_TEMPLATE.format(question=question['question'], A=question[possible_answers[0]], B=question[possible_answers[1]], C=question[possible_answers[2]], D=question[possible_answers[3]])},
                {"role": "assistant", "content": f"{list("ABCD")[correct_answer]}"}
            ]
            if sort_id == 0:
                full_prompt_messages.append({
                    'id': i,
                    "type": question['type'],
                    'messages': messages
                })
            else:
                alternative_sorting.append({
                    'id': i,
                    "type": question['type'],
                    'messages': messages
                })

    else:
        raise ValueError(f"Unknown question type: {question['type']}")

In [15]:
alternative_sorting[700]

{'id': 1238,
 'type': 'MCQ-con',
 'messages': [{'role': 'system',
   'content': 'You are a helpful assistant that can answer questions about the Orange Data Mining software.'},
  {'role': 'user',
   'content': 'Answer the following question based on your knowledge of the Orange Data Mining software.\nMake sure you answer the question with a single letter corresponding to the correct answer.\n\nQuestion: Which of the following is a valid connection?\n\nAnswers:\nA: Distances --> Differential Expression\nB: Preprocess --> Venn Diagram\nC: ARIMA Model --> Line Chart\nD: Test and Score --> Scatter Plot\n'},
  {'role': 'assistant', 'content': 'C'}]}

In [16]:
print("Questions: ", len(full_prompt_messages))
print("Alternative sorting: ", len(alternative_sorting))

Questions:  2472
Alternative sorting:  1234


## Train Test split

In [18]:
N_TEST_MCQ = 200
N_TEST_MCQ_CON = 200
np.random.seed(1)

full_dataset = np.array(full_prompt_messages.copy())
np.random.shuffle(full_dataset)

test_set_MCQ = [sample for sample in full_dataset if sample['type'] == 'MCQ'][:N_TEST_MCQ]
test_set_MCQ_CON = [sample for sample in full_dataset if sample['type'] == 'MCQ-con'][:N_TEST_MCQ_CON]
test_set_MCQ_ids = [sample['id'] for sample in test_set_MCQ]
test_set_MCQ_CON_ids = [sample['id'] for sample in test_set_MCQ_CON]

train_set_MCQ = [sample for sample in full_dataset if sample['id'] not in test_set_MCQ_ids + test_set_MCQ_CON_ids]
train_set_expanded = train_set_MCQ + [sample for sample in alternative_sorting if sample['id'] not in test_set_MCQ_ids + test_set_MCQ_CON_ids]

print("Test set MCQ: ", len(test_set_MCQ))
print("Test set MCQ-con: ", len(test_set_MCQ_CON))
print("Train set: ", len(train_set_MCQ))
print("Train set expanded: ", len(train_set_expanded))

Test set MCQ:  200
Test set MCQ-con:  200
Train set:  2072
Train set expanded:  2906


In [19]:
def format_input_data(samples):
    return [{"messages": sample['messages']} for sample in samples]

train_set_formatted = format_input_data(train_set_expanded)
test_set_MCQ_formatted = format_input_data(test_set_MCQ)
test_set_MCQ_CON_formatted = format_input_data(test_set_MCQ_CON)

print("Test set MCQ: ", len(test_set_MCQ_formatted))
print("Test set MCQ-con: ", len(test_set_MCQ_CON_formatted))
print("Train set: ", len(train_set_formatted))
print()
print("Example of formatted train set:\n    ", "\n    ".join(str(x) for x in train_set_formatted[0]))


Test set MCQ:  200
Test set MCQ-con:  200
Train set:  2906

Example of formatted train set:
     messages


In [20]:
with open(os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset', 'orange_qa_train.jsonl'), 'w') as f:
    f.write(json.dumps(train_set_formatted, indent=4))

with open(os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset', 'orange_qa_MCQ_test.jsonl'), 'w') as f:
    f.write(json.dumps(test_set_MCQ_formatted, indent=4))

with open(os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset', 'orange_qa_MCQ-con_test.jsonl'), 'w') as f:
    f.write(json.dumps(test_set_MCQ_CON_formatted, indent=4))
