In [None]:
# Switch the working directory to the parent folder; the relative "data/..."
# glob used later in this notebook is resolved from there.
# Not idempotent: re-running this cell moves up one more level.
%cd ..

In [None]:
import os

# Process-level environment settings, applied in a single call:
# tokenizer parallelism on, CUDA devices ordered by PCI bus ID, device 0 only.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "true",
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0",
        # Debugging toggles, left disabled:
        # "CUDA_LAUNCH_BLOCKING": "1",
        # "TORCH_USE_CUDA_DSA": "1",
    }
)

In [None]:
import json
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from glob import glob
import re

In [None]:
def extract_qa(content) -> list[str]:
    """Return the text of every QA block found in ``content``.

    Two shapes are recognised:

    * a complete block, ``<qa> ... </qa>``;
    * an unterminated block that opens with ``<qa>``, contains ``answer``,
      ``option <digit>`` and ``explanation`` in that order, and is cut off by
      the literal ``<|eot_id|>`` token.

    Args:
        content: Raw text to scan.

    Returns:
        The captured block bodies (tags and the ``<|eot_id|>`` terminator
        excluded) in order of appearance, with empty or whitespace-only
        captures dropped.
    """
    # The pipes in ``<|eot_id|>`` must be escaped. Unescaped, they act as regex
    # alternation: the fallback capture then ends at the first ``<`` after
    # "explanation" (truncating explanations such as "latency <5ms") and every
    # stray ``eot_id`` or ``>`` in the text produces a spurious empty match.
    qa_pattern = re.compile(
        r'<qa>(.*?)</qa>|<qa>(.*?answer.*?option \d.*?explanation.*?)<\|eot_id\|>',
        re.DOTALL,
    )

    # findall yields (closed, unterminated) tuples; at most one side is
    # non-empty for a given match, so keep whichever one was captured.
    captured = (
        closed or unterminated
        for closed, unterminated in qa_pattern.findall(content)
    )
    return [block for block in captured if block.strip()]

# Collect every generated-QA text file, one directory level below QCM_Context.
# glob() returns paths in arbitrary order, so sort them to make the order of
# `files` (and everything derived from it) reproducible across runs.
files = sorted(glob("data/generated_qa/QCM_Context/*/*.txt"))

len(files)

In [None]:
# Error counter, initialised to zero (nothing in the cells shown here updates it).
n_error: int = 0
# QA blocks containing this text are discarded by get_questions
# (presumably an unfilled template placeholder - TODO confirm).
ignore_match: str = "Question text goes here"

In [None]:
def parse_qa_content(qa_content: str):
    """Parse the text of one QA block into a dictionary.

    Expected layout: a ``question:`` entry followed by one or more
    ``option <n>:`` lines, an ``answer:`` line terminated by a newline, and an
    ``explanation:`` section running to the end of the text.

    Args:
        qa_content: Text of a single QA block.

    Returns:
        A dict with keys ``"question"`` (str), ``"options"`` (dict mapping the
        option number, as a string, to the option text), ``"answer"`` (str)
        and ``"explanation"`` (str, with any ``<|eot_id|>`` token removed).

    Raises:
        ValueError: If the question, answer or explanation cannot be located.
    """

    def _field(name: str, pattern: str) -> str:
        # Fail with a message naming the missing field rather than the opaque
        # "'NoneType' object has no attribute 'group'".
        found = re.search(pattern, qa_content, re.DOTALL)
        if found is None:
            raise ValueError(
                f"QA block has no parsable {name!r} field: {qa_content[:80]!r}"
            )
        return found.group(1).strip()

    # The question runs until the line break that precedes the first option.
    question = _field("question", r'question:\s*(.*?)(?=\noption)')

    # Each option runs until the next option line or the answer line; strip
    # the text so stray trailing whitespace (e.g. "\r") does not leak through.
    option_pattern = r'option\s+(\d+):\s*(.*?)(?=\noption|\nanswer)'
    options = {
        number: text.strip()
        for number, text in re.findall(option_pattern, qa_content, re.DOTALL)
    }

    answer = _field("answer", r'answer:\s*(.*?)\n')
    explanation = _field("explanation", r'explanation:\s*(.*?)$')

    return {
        "question": question,
        "options": options,
        "answer": answer,
        # Remove the token first and strip afterwards, so whitespace that sat
        # in front of the token does not survive in the result.
        "explanation": explanation.replace("<|eot_id|>", "").strip(),
    }

In [None]:
def get_questions(file: str):
    """Yield one parsed QA dict for each usable QA block in ``file``.

    The file is read in full, QA blocks are pulled out with ``extract_qa``,
    and blocks that are blank or contain the module-level ``ignore_match``
    text are skipped. Every remaining block is parsed with
    ``parse_qa_content``.

    Args:
        file: Path of the text file to read.

    Yields:
        The dict returned by ``parse_qa_content`` for each retained block.
    """
    # Read inside a context manager so the handle is closed immediately,
    # rather than whenever the garbage collector gets to it.
    with open(file, "r") as handle:
        raw_text = handle.read()

    for qa_block in extract_qa(raw_text):
        if qa_block.strip() and ignore_match not in qa_block:
            yield parse_qa_content(qa_block)

In [None]:
# One inner list of parsed QA dicts per input file, in the order of `files`.
questions = [[*get_questions(qa_file)] for qa_file in files]

In [None]:
# Number of per-file question lists built above (one entry per path in `files`).
len(questions)

In [None]:
# Flatten the per-file lists into one list of QA dicts in linear time
# (sum(..., start=[]) copies the accumulated list on every addition).
# Entries that are not lists are kept as they are, so re-running this cell on
# an already-flat `questions` is a no-op instead of an error.
questions = [
    qa
    for entry in questions
    for qa in (entry if isinstance(entry, list) else [entry])
]

In [None]:
# Total number of parsed questions once the per-file lists are flattened.
len(questions)

In [None]:
# Preview the first ten entries of the flattened question list.
questions[:10]

In [None]:
res = extract_qa("""

<qa>
question: What is the purpose of the 5G Multicast-Broadcast User Service architecture in the context of 5G MBS?
option 1: To provide a generic 5G MBS User Service architecture
option 2: To support 5G Media Streaming via eMBMS
option 3: To enable lossless mobility for MBS broadcast
option 4: To define MBS frequency prioritization
option 5: To specify stage 2 5G multicast-broadcast User Services architecture
answer: option 5
explanation: The 5G Multicast-Broadcast User Service architecture is a stage 2 5G multicast-broadcast User Services architecture that defines the MBS User Services network architecture, reference architecture model, and new reference points for MBS User Services.<|eot_id|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|en
""")

In [None]:
# Parse the first block extracted from the sample to inspect the resulting dict.
parse_qa_content(res[0])