# 2.0 - Format QA jsonl file

In [1]:
import os
import json
import numpy as np

In [2]:
with open(os.path.join(os.getcwd(), '..', 'data', 'orange_qa_full.json'), 'r') as f:
    raw_data = json.load(f)

raw_data[0]

{'file': 'orange3-addons/README.txt',
 'type': 'QA',
 'question': 'How is the list of official add-ons populated in the Orange3 add-on dialog?',
 'answer': 'The list is populated by add-ons listed in the OFFICIAL_ADDONS.txt file.'}

In [3]:
SYSTEM_PROMPT_TEMPLATE = """You are a helpful assistant that can answer questions about the Orange Data Mining software."""

QA_USER_PROMPT_TEMPLATE = """Answer the following question based on your knowledge of the Orange Data Mining software.
Make sure you answer the question in a few sentences.

Question: {question}
"""

MCQ_USER_PROMPT_TEMPLATE = """Answer the following question based on your knowledge of the Orange Data Mining software.
Make sure you answer the question with a single letter corresponding to the correct answer.

Question: {question}

Answers:
A: {A}
B: {B}
C: {C}
D: {D}
"""



In [4]:
REPO_PATH = os.path.join(os.getcwd(), '..', 'repositories')
FILE_EXTENSIONS = ["txt", "md", "rst"]

SKIP = ["negative_words_Slolex.txt", "positive_words_Slolex.txt"]

text_files = []
for root, dirs, files in os.walk(REPO_PATH):
    for file in files:
        if True in [file.endswith(ext) for ext in FILE_EXTENSIONS] and file not in SKIP:
            text_files.append(os.path.relpath(os.path.join(root, file), REPO_PATH))

def _check_if_correct_type(question):
    keys = question.keys()

    if "A" in keys and "B" in keys and "C" in keys and "D" in keys and "correct" in keys:
        _type = "MCQ"
        question['type'] = _type
        return question, True
    
    elif "question" in keys and "answer" in keys:
        _type = "QA"
        question['type'] = _type
        return question, True
    else:
        return None, False

def _check_if_hallucinated_file(question):
    if "file" not in question.keys():
        return False
    file = question['file']
    if file not in text_files:
        return True
    else:
        return False


In [None]:
ALTERNATIVE_SORTING_N_TIMES = 2
np.random.seed(42)

full_prompt_messages = []
alternative_sorting = []

i = -1
for question_ in raw_data:

    question, success = _check_if_correct_type(question_.copy())
    hallucinated_file = _check_if_hallucinated_file(question_.copy())

    if not success:
        print('Skipping question: ', question_['question'])
        continue
    if hallucinated_file:
        print('Skipping hallucinated file: ', question_['file'])
        continue

    i += 1
    if question['type'] == 'QA':
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
            {"role": "user", "content": QA_USER_PROMPT_TEMPLATE.format(question=question['question'])},
            {"role": "assistant", "content": question['answer']}
        ]
        full_prompt_messages.append({
            'id': i,
            "type": question['type'],
            'messages': messages
        })

    elif question['type'] == 'MCQ':

        for sort_id in range(ALTERNATIVE_SORTING_N_TIMES):
            possible_answers = np.array([letter for letter in list('ABCD') if letter in question.keys()])
            np.random.shuffle(possible_answers)
            possible_answers = possible_answers.tolist()
            correct_answer = possible_answers.index(question['correct'])
            messages = [
                {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
                {"role": "user", "content": MCQ_USER_PROMPT_TEMPLATE.format(question=question['question'], A=question[possible_answers[0]], B=question[possible_answers[1]], C=question[possible_answers[2]], D=question[possible_answers[3]])},
                {"role": "assistant", "content": f"{list("ABCD")[correct_answer]}"}
            ]
            if sort_id == 0:
                full_prompt_messages.append({
                    'id': i,
                    "type": question['type'],
                    'messages': messages
                })
            else:
                alternative_sorting.append({
                    'id': i,
                    "type": question['type'],
                    'messages': messages
                })

    else:
        raise ValueError(f"Unknown question type: {question['type']}")

['B' 'D' 'A' 'C']
['B' 'D' 'A' 'C']
['D' 'A' 'B' 'C']
['B' 'A' 'D' 'C']
['C' 'B' 'A' 'D']
['A' 'B' 'C' 'D']
['D' 'C' 'B' 'A']
['A' 'C' 'B' 'D']
['D' 'C' 'A' 'B']
['A' 'C' 'B' 'D']
['B' 'C' 'D' 'A']
['B' 'A' 'D' 'C']
['A' 'D' 'C' 'B']
['A' 'D' 'C' 'B']
['A' 'B' 'D' 'C']
['B' 'C' 'A' 'D']
['B' 'D' 'C' 'A']
['B' 'C' 'D' 'A']
['C' 'D' 'A' 'B']
['A' 'C' 'D' 'B']
['C' 'D' 'B' 'A']
['A' 'D' 'C' 'B']
['D' 'B' 'A' 'C']
['A' 'B' 'D' 'C']
['D' 'C' 'B' 'A']
['A' 'C' 'B' 'D']
['A' 'C' 'D' 'B']
['C' 'A' 'B' 'D']
['A' 'D' 'B' 'C']
['A' 'C' 'B' 'D']
['C' 'A' 'D' 'B']
['A' 'B' 'C' 'D']
['A' 'D' 'C' 'B']
['D' 'C' 'B' 'A']
['C' 'B' 'D' 'A']
['D' 'B' 'A' 'C']
['C' 'D' 'A' 'B']
['B' 'C' 'A' 'D']
['B' 'D' 'C' 'A']
['B' 'C' 'D' 'A']
['C' 'B' 'D' 'A']
['B' 'A' 'C' 'D']
['D' 'B' 'A' 'C']
['B' 'A' 'D' 'C']
['D' 'B' 'A' 'C']
['D' 'A' 'B' 'C']
['B' 'A' 'C' 'D']
['C' 'A' 'B' 'D']
['B' 'A' 'C' 'D']
['D' 'C' 'A' 'B']
['C' 'A' 'B' 'D']
['B' 'D' 'A' 'C']
['D' 'B' 'C' 'A']
['B' 'D' 'A' 'C']
['D' 'A' 'C' 'B']
['D' 'A' '

In [6]:
alternative_sorting[102]

{'id': 210,
 'type': 'MCQ',
 'messages': [{'role': 'system',
   'content': 'You are a helpful assistant that can answer questions about the Orange Data Mining software.'},
  {'role': 'user',
   'content': "Answer the following question based on your knowledge of the Orange Data Mining software.\nMake sure you answer the question with a single letter corresponding to the correct answer.\n\nQuestion: What does 'Replicable training' do in the Gradient Boosting widget?\n\nAnswers:\nA: It allows exporting the model to Python.\nB: It trains the model twice.\nC: It fixes the random seed to enable reproducibility.\nD: It saves the model to a file.\n"},
  {'role': 'assistant', 'content': 'C'}]}

In [7]:
print("Questions: ", len(full_prompt_messages))
print("Alternative sorting: ", len(alternative_sorting))

Questions:  1072
Alternative sorting:  534


## Train Test split

In [8]:
N_TEST = 200
np.random.seed(1)

full_dataset = np.array(full_prompt_messages.copy())
np.random.shuffle(full_dataset)

test_set = [sample for sample in full_dataset if sample['type'] == 'MCQ'][:N_TEST]
test_set_ids = [sample['id'] for sample in test_set]
train_set = [sample for sample in full_dataset if sample['id'] not in test_set_ids]
train_set_expanded = train_set + [sample for sample in alternative_sorting if sample['id'] not in test_set_ids]

print("Test set: ", len(test_set))
print("Train set: ", len(train_set))
print("Train set expanded: ", len(train_set_expanded))

Test set:  200
Train set:  872
Train set expanded:  1206


In [9]:
def format_input_data(samples):
    return [{"messages": sample['messages']} for sample in samples]

train_set_formatted = format_input_data(train_set_expanded)
test_set_formatted = format_input_data(test_set)

print("Test set: ", len(test_set_formatted))
print("Train set: ", len(train_set_formatted))
print()
print("Example of formatted train set:\n    ", "\n    ".join(str(x) for x in train_set_formatted[0]))


Test set:  200
Train set:  1206

Example of formatted train set:
     messages


In [10]:
with open(os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset', 'orange_qa_train.jsonl'), 'w') as f:
    f.write(json.dumps(train_set_formatted, indent=4))

with open(os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset', 'orange_qa_test.jsonl'), 'w') as f:
    f.write(json.dumps(test_set_formatted, indent=4))
