In [None]:
# Switch the working directory to the parent folder; the relative "data/..."
# paths used by later cells are resolved from there.
# NOTE: this is not idempotent - every re-run of the cell climbs one more level.
%cd ..

In [None]:
import os

# Environment variables set for this kernel process.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "true",
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)
# Additional switches, currently disabled:
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# os.environ["TORCH_USE_CUDA_DSA"] = "1"

In [None]:
import json
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from glob import glob
import re

In [None]:
def extract_qa(content) -> list[str]:
    """Pull every question/answer snippet out of raw generated text.

    Two layouts are recognised:

    * a closed block, ``<qa>...</qa>``;
    * an unclosed block that is cut off by the literal ``<|eot_id|>`` marker,
      accepted only when it contains "answer", "option <digit>" and
      "explanation" in that order.

    Args:
        content: Text to scan.

    Returns:
        The inner text of each block in order of appearance, with blank
        (whitespace-only) captures removed.
    """
    # The pipes in "<|eot_id|>" must be escaped: left bare they are regex
    # alternation, which ends the unclosed-block capture at the first "<" it
    # meets (truncating explanations such as "a < b") and also matches stray
    # "eot_id" and ">" fragments.
    qa_pattern = re.compile(
        r'<qa>(.*?)</qa>|<qa>(.*?answer.*?option \d.*?explanation.*?)<\|eot_id\|>',
        re.DOTALL,
    )

    # findall yields one (closed, unclosed) tuple per match; at most one side
    # is non-empty.
    snippets = [closed or unclosed for closed, unclosed in qa_pattern.findall(content)]

    return [snippet for snippet in snippets if snippet.strip()]

# glob() returns paths in arbitrary, filesystem-dependent order; sorting makes
# the order of everything derived from `files` reproducible between runs.
files = sorted(glob("data/generated_qa/QCM_Context/*/*.txt"))

len(files)

In [None]:
# Error counter; none of the cells visible in this notebook update it.
n_error: int = 0
# Snippets containing this text are skipped by get_questions
# (presumably an unfilled prompt-template placeholder - TODO confirm).
ignore_match: str = "Question text goes here"

In [None]:
def parse_qa_content(qa_content: str):
    """Split one generated QA snippet into its labelled fields.

    The snippet is expected to look like::

        question: <text>
        option 1: <text>
        option 2: <text>
        ...
        answer: <text>
        explanation: <text>

    Args:
        qa_content: Text of a single question/answer snippet.

    Returns:
        A dict with keys ``"question"`` (str), ``"options"`` (dict mapping the
        option number, as a string, to its text), ``"answer"`` (str) and
        ``"explanation"`` (str). Every text value is whitespace-trimmed and the
        ``<|eot_id|>`` marker is removed from the explanation.

    Raises:
        ValueError: If the question, answer or explanation field is missing.
    """
    # One pattern per mandatory single-valued field; all captures are lazy.
    field_patterns = {
        "question": r'question:\s*(.*?)(?=\noption)',
        "answer": r'answer:\s*(.*?)\n',
        "explanation": r'explanation:\s*(.*?)$',
    }
    option_pattern = r'option\s+(\d+):\s*(.*?)(?=\noption|\nanswer)'

    fields = {}
    for name, pattern in field_patterns.items():
        found = re.search(pattern, qa_content, re.DOTALL)
        if found is None:
            # Name the missing field rather than failing with
            # "'NoneType' object has no attribute 'group'".
            raise ValueError(
                f"no {name!r} field found in QA snippet: {qa_content[:80]!r}"
            )
        fields[name] = found.group(1)

    # Trim option texts like the other fields (e.g. a trailing "\r" or space).
    options = {
        number: text.strip()
        for number, text in re.findall(option_pattern, qa_content, re.DOTALL)
    }

    # Remove the end-of-turn marker before trimming, so whitespace that sat in
    # front of the marker is trimmed as well.
    explanation = fields["explanation"].replace("<|eot_id|>", "").strip()

    return {
        "question": fields["question"].strip(),
        "options": options,
        "answer": fields["answer"].strip(),
        "explanation": explanation,
    }

In [None]:
def get_questions(file : str):
    """Yield the parsed question/answer dicts found in one generated text file.

    Args:
        file: Path of the text file to read.

    Yields:
        One dict per usable snippet, as returned by ``parse_qa_content``.
        Blank snippets and snippets containing ``ignore_match`` are skipped.
    """
    # The context manager closes the handle as soon as the text has been read
    # instead of leaving that to the garbage collector.
    # NOTE(review): the platform default encoding is used - confirm the files
    # were written with it.
    with open(file, "r") as handle:
        text = handle.read()

    for snippet in extract_qa(text):
        if snippet.strip() and ignore_match not in snippet:
            yield parse_qa_content(snippet)

In [None]:
# One inner list of parsed questions per input file, in the order of `files`.
questions = [[*get_questions(path)] for path in files]

In [None]:
len(questions)

In [None]:
# Flatten the per-file lists into a single list of question dicts.
# A comprehension is linear, whereas sum(..., start=[]) copies the growing
# list at every step. Items that are not lists are kept as they are, so
# re-running this cell on the already-flat list is a no-op instead of an error.
questions = [
    question
    for per_file in questions
    for question in (per_file if isinstance(per_file, list) else [per_file])
]

In [None]:
len(questions)

In [None]:
questions[10]

In [None]:
def flatten(raw):
    """Turn one parsed question dict into a single-level record.

    Args:
        raw: Dict with ``"question"``, ``"options"`` (option number -> text),
            ``"answer"`` and ``"explanation"`` keys.

    Returns:
        A dict with ``"question"``, one ``"option <number>"`` key per option,
        ``"answer"`` and ``"explanation"``, in that order. ``"answer"`` is a
        one-character string: the first character of the answer's second
        whitespace-separated token (``"option 3: ..."`` gives ``"3"``). When
        the answer has fewer than two tokens it is the first digit found in
        the answer, or ``""`` when there is none.
    """
    answer_text = raw['answer']
    tokens = answer_text.split()
    if len(tokens) >= 2:
        answer = tokens[1][0]
    else:
        # Short answers such as "3" used to raise IndexError and abort the
        # whole conversion; fall back to the first digit, if any.
        first_digit = re.search(r'\d', answer_text)
        answer = first_digit.group(0) if first_digit else ''

    flat = {'question': raw['question']}
    for number, text in raw["options"].items():
        flat["option " + number] = text
    flat['answer'] = answer
    flat['explanation'] = raw['explanation']
    return flat

# Flat records, one per parsed question, in the same order as `questions`.
question_flatt = [flatten(question) for question in questions]

In [None]:
question_flatt[0]

In [None]:
# Build the table from the flat records in `question_flatt`.
df = pd.DataFrame(question_flatt)


In [None]:
df.sample(10)

In [None]:
df.describe()

In [None]:
# Save the table as CSV; index=False leaves the row index out of the file.
df.to_csv("data/full_data_v2/generated.csv", index=False)

In [None]:
df.shape

In [None]:
import json
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from glob import glob
import re

In [None]:
# Read the training-set JSON and transpose the resulting frame.
# Note: this rebinds `df`, replacing the table built earlier in the notebook.
df = pd.read_json("data/zindi_data/TeleQnA_training.json").transpose()

In [None]:
df.sample(5)

In [None]:
df["category"].value_counts()

In [None]:
# Keep only the two "Standards" categories. The explicit copy makes `df` an
# independent frame, so the later write to df["answer"] does not act on a
# slice of the unfiltered frame (pandas' SettingWithCopyWarning).
df = df[df["category"].isin(["Standards overview", "Standards specifications"])].copy()
df.shape

In [None]:
def _answer_to_index(label):
    """Return the option number of an answer label such as "option 3: ...".

    Values that are not strings are treated as already converted, which lets
    this cell be re-run without failing.
    """
    if isinstance(label, str):
        # First character of the second whitespace-separated token.
        return int(label.split()[1][0])
    return int(label)


# assign() builds a new frame rather than writing into `df`, which may be a
# filtered slice of the frame that was loaded.
df = df.assign(answer=df["answer"].map(_answer_to_index))

In [None]:
# Write the question, the five options, the answer and the explanation
# (in that column order) to the validation CSV, without the row index.
df.loc[
    :,
    ["question", "option 1", "option 2", "option 3", "option 4", "option 5", "answer", "explanation"],
].to_csv("data/full_data_v2/validation.csv", index=False)

In [None]:
df["answer"].value_counts()

In [None]:
# Read "answer" as text so its dtype does not depend on the file's contents:
# type inference yields a numeric column whenever every value looks like a
# number, and numbers never compare equal to the strings "1".."5".
generated = pd.read_csv("data/full_data_v2/generated.csv", dtype={"answer": str})

generated.sample(10)

In [None]:
generated["answer"].value_counts()

In [None]:
# Count only answers that are a valid option number (1-5). Comparing
# numerically works whether the column holds text or numbers; text that is not
# a number becomes NaN and is excluded.
generated[
    pd.to_numeric(generated["answer"], errors="coerce").isin(list(range(1, 6)))
]["answer"].value_counts()

In [None]:
# Save only the rows whose answer is a valid option number (1-5). Comparing
# numerically works whether the column holds text or numbers; text that is not
# a number becomes NaN and is excluded.
# NOTE(review): unlike the other to_csv calls in this notebook, the row index
# is written too (no index=False) - confirm that is intended.
generated[
    pd.to_numeric(generated["answer"], errors="coerce").isin(list(range(1, 6)))
].to_csv("data/full_data_v2/generated_solved.csv")