In [None]:
# Switch the working directory to the parent folder; the relative "data/..."
# paths used in the cells below are resolved from there.
%cd ..

In [None]:
import json
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from glob import glob
import re

In [None]:
def extract_qa(content) -> list[str]:
	"""Return the body of every question/answer block found in ``content``.

	Two shapes are recognised:

	* a complete block, ``<qa>...</qa>``;
	* a block without a closing tag that contains an ``"answer"`` key followed
	  by an ``"option N:`` value and runs up to the literal ``<|eot_id|>``.

	Args:
		content: Raw text to scan.

	Returns:
		The captured bodies in order of appearance, without the surrounding
		tags. Blocks whose body is the empty string are dropped.
	"""
	# The pipes in "<|eot_id|>" have to be escaped. Unescaped they are regex
	# alternation, which split the pattern into four alternatives (the last
	# two being "eot_id" and ">"), produced an empty match for every stray ">"
	# in the text, and cut unclosed blocks at the first "<" they contained.
	qa_pattern = re.compile(
		r'<qa>(.*?)</qa>|<qa>(.*?"answer".*?"option \d:.*?)<\|eot_id\|>',
		re.DOTALL,
	)

	# findall yields one (closed_body, unclosed_body) tuple per match; only the
	# group belonging to the alternative that matched is non-empty.
	bodies = (closed or unclosed for closed, unclosed in qa_pattern.findall(content))
	return [body for body in bodies if body]

# glob returns paths in arbitrary (filesystem-dependent) order; sorting makes
# positional look-ups such as questions[10] and the exported row order
# reproducible between runs.
files = sorted(glob("data/generated_qa/*/*/*.txt"))

In [None]:
# Blocks containing this exact text are skipped by get_questions.
ignore_match = '"question": "Question text goes here"'

# Counter initialised to zero; nothing in the code shown here updates it.
n_error = 0

In [None]:
def get_questions(file : str):
	"""Yield the question dictionaries that can be parsed out of one text file.

	Every block returned by ``extract_qa`` is treated as the inside of a JSON
	object (no surrounding braces). Blocks that are blank, that contain
	``ignore_match``, or that still fail to parse after the clean-ups below
	are skipped.

	Args:
		file: Path of the text file to read.

	Yields:
		dict: The parsed block with an extra ``"source"`` key set to ``file``.
	"""
	# Context manager so the handle is closed as soon as the text is read.
	with open(file, "r") as handle:
		raw = handle.read()

	for block in extract_qa(raw):
		block = block.strip()
		if not block or ignore_match in block:
			continue
		# A trailing comma would be invalid right before the closing brace.
		if block.endswith(","):
			block = block[:-1]
		# Collapse a duplicated "answer" key, and turn the first quote of a
		# doubled `""` into a typographic quote so it no longer ends the string.
		candidate = ("{" + block + "}").replace('"answer": "answer":', '"answer":').replace('""', '”"')
		try:
			parsed = json.loads(candidate)
		except (ValueError, RecursionError):
			# Best effort: blocks that are not valid JSON are dropped.
			# (json.JSONDecodeError is a ValueError subclass.)
			continue
		parsed["source"] = file
		# The yield stays outside the try block so that GeneratorExit and
		# KeyboardInterrupt raised at this point are not swallowed.
		yield parsed

In [None]:
# One list of parsed questions per input file, in the same order as `files`.
questions = [[*get_questions(path)] for path in files]

In [None]:
# Number of per-file question lists (one entry per path in `files`).
len(questions)

In [None]:
# Spot-check the questions parsed from a single file. Index 10 is presumably an
# arbitrary pick; this raises IndexError when fewer than 11 files were found.
questions[10]

In [None]:
# Flatten the per-file lists into one list of questions. The nested
# comprehension is linear in the number of questions, whereas
# sum(..., start=[]) copies the accumulated list at every step (quadratic).
total_questions = [question for per_file in questions for question in per_file]

len(total_questions)

In [None]:
# One row per parsed question; the dictionary keys become the columns, and a
# key that is absent from a question shows up as NaN in that row.
df = pd.DataFrame(total_questions)
df.head()

In [None]:
# Summary statistics for each column of the parsed questions.
df.describe()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Source files of the first ten questions that have no "option 5" value.
df.loc[df["option 5"].isna(), "source"].values[:10]

In [None]:
def _answer_marker(answer) -> str:
	"""Return the first character of the second space-separated token of ``answer``.

	An empty string is returned when there is no second token (for example a
	missing answer, whose string form has no space) or when that token is
	empty, instead of raising IndexError.
	"""
	tokens = str(answer).split(" ")
	return tokens[1][:1] if len(tokens) > 1 else ""


# One marker per question; an empty marker is not a digit, so such rows are
# removed by the `options.str.isdigit()` filter applied further down.
options = df["answer"].apply(_answer_marker)

In [None]:
# Frequency of each extracted marker character.
options.value_counts()

In [None]:
# Rows whose extracted marker is the letter "o" rather than a digit.
df[options == "o"]

In [None]:
# Rows whose extracted marker is the letter "m" rather than a digit.
df[options == "m"]

In [None]:
# Rows whose extracted marker is the letter "a" rather than a digit.
df[options == "a"]

In [None]:
# Rows whose extracted marker is the letter "p" rather than a digit.
df[options == "p"]

In [None]:
# Keep only the rows whose marker is a digit. The copy gives an independent
# frame, so columns added later are not written into a slice of `df`.
df_filtered = df.loc[options.str.isdigit()].copy()
print(df_filtered.shape)

In [None]:
# Number of markers, i.e. number of rows in `df` before filtering.
len(options)

In [None]:
# Convert the marker of every kept answer to an integer option number.
# NOTE: this rebinds `options` from the string markers to integers, so the
# earlier `options.str.isdigit()` filter cannot be re-run after this cell.
options = df_filtered["answer"].apply(
    lambda answer: int(str(answer).split(" ")[1][0])
)
# Assign positionally (plain array) rather than by index alignment.
df_filtered["answer_index"] = options.to_numpy()

df_filtered.sample(5)

In [None]:
# Column names available on the filtered frame.
df_filtered.columns

In [None]:
# Write the question text, the five option columns and the numeric answer
# index to CSV, without the row index.
df_filtered.to_csv(
    "data/full_data/generated_v1.csv",
    columns=['question', 'option 1', 'option 2', 'option 3', 'option 4', 'option 5', 'answer_index'],
    index=False,
)

In [None]:
def _load_json_frame(path: str) -> pd.DataFrame:
	"""Load a JSON file into a DataFrame and return its transpose.

	The file is opened in a context manager so the handle is closed once the
	document has been parsed.
	"""
	with open(path, "r") as handle:
		return pd.DataFrame(json.load(handle)).T


# Presumably each file maps a question id to its record, so that after the
# transpose there is one row per question -- TODO confirm against the data.
teleqna = _load_json_frame("data/TeleQnA.json")
test_qna = _load_json_frame("data/zindi_data/TeleQnA_testing1.json")


teleqna.shape, test_qna.shape

In [None]:
# Five randomly chosen rows (unseeded, so the selection differs between runs).
teleqna.sample(5)

In [None]:
# Option number of each answer: first character of the second
# whitespace-separated token of the answer string, converted to int.
teleqna["answer_index"] = teleqna["answer"].apply(
    lambda answer: int(str(answer).split()[1][0])
)
# Distribution of the option numbers.
teleqna["answer_index"].value_counts()

In [None]:
# Column names available on the TeleQnA frame.
teleqna.columns

In [None]:
# Scratch check of substring membership; evaluates to True.
"]" in "]"

In [None]:
# Number of questions whose text after the last "[" contains a "]", shown
# next to the total number of questions.
(
    teleqna["question"].apply(lambda text: "]" in str(text).split("[")[-1]).sum(),
    len(teleqna),
)

In [None]:
# Rows whose index also appears in test_qna go to test_data.csv; all other
# rows go to train_data.csv. Both files get the same columns and no row index.
in_test = teleqna.index.isin(test_qna.index)

teleqna[in_test].to_csv(
    "data/full_data/test_data.csv",
    columns=['question', 'option 1', 'option 2', 'option 3', 'option 4', 'option 5', 'answer_index'],
    index=False,
)

teleqna[~in_test].to_csv(
    "data/full_data/train_data.csv",
    columns=['question', 'option 1', 'option 2', 'option 3', 'option 4', 'option 5', 'answer_index'],
    index=False,
)