In [None]:
from pydantic import BaseModel, Field
from typing import List, Union, Optional
from glob import glob

# glob() returns paths in arbitrary, filesystem-dependent order; sorting makes
# indexing such as model_outputs[0] and the final dataset row order repeatable.
files = sorted(glob("QuestionsCameroon/*.txt"))
len(files)

In [None]:
def _read_text(path: str) -> str:
	"""Return the whole content of ``path`` and close the file afterwards."""
	# assumes the files are UTF-8 encoded; adjust if they were written otherwise
	with open(path, encoding="utf-8") as handle:
		return handle.read()


# One raw text per entry of `files`, in the same order.
model_outputs = [_read_text(path) for path in files]

len(model_outputs)

In [None]:
# Print the first raw text to see what the input looks like before parsing.
print(model_outputs[0])

In [None]:
# One selectable option of a multiple-choice question: a letter and its text.
class AnswerChoice(BaseModel):
	# Identifier of the option; typed as a plain string, no format is enforced.
	letter: str = Field(
		description="The letter identifier for the answer choice (e.g., 'A', 'B', 'C'...)",
	)
	# Text of the option.
	text: str = Field(
		description="The actual text content of the answer choice",
	)


# A multiple-choice question with its options, the correct letters and an
# explanation. All five fields are required (none declares a default).
class Question(BaseModel):
	# Accepts either an integer or a string identifier.
	question_number: Union[int, str] = Field(
		description="Sequential number identifying the question in the set",
	)
	question_text: str = Field(
		description="The actual text of the question being asked",
	)
	answer_choices: List[AnswerChoice] = Field(
		description="List of possible answer choices for the question",
	)
	# Plain strings; presumably each should equal the `letter` of one of
	# `answer_choices`, but nothing in this model enforces that.
	correct_answers: List[str] = Field(
		description="List of letters corresponding to the correct answer choices. examples : ['A', 'C']",
	)
	explanation: str = Field(
		description="Factual detailed explanation of why the marked answers are correct",
	)


# Top-level container: the shape of one complete JSON payload, i.e. an object
# with a single "questions" array of Question objects.
class QuestionBank(BaseModel):
	questions: List[Question] = Field(
		description="Collection of all questions in the question bank"
	)

In [None]:
import json
import re

def parse_question_bank_from_text(text: str) -> QuestionBank:
	"""Strictly parse one JSON object embedded in ``text`` into a QuestionBank.

	The candidate JSON is everything from the first ``{`` to the last ``}``
	in ``text`` (greedy match across newlines), so surrounding prose is
	tolerated but the JSON itself must be well formed.

	Raises:
		ValueError: if ``text`` contains no ``{...}`` span at all.
		json.JSONDecodeError: if the span is not valid JSON.
		Whatever ``QuestionBank.model_validate`` raises when the decoded
		data does not fit the model.
	"""
	found = re.search(r'\{.*\}', text, re.DOTALL)
	if found is None:
		raise ValueError("No valid JSON found in the provided text")

	# Decode first, then let the pydantic model validate the structure.
	payload = json.loads(found.group())
	return QuestionBank.model_validate(payload)


In [None]:
# Smoke-test the strict parser on the first raw text.
sentence = parse_question_bank_from_text(model_outputs[0])

In [None]:
# Display the questions parsed from the first raw text.
sentence.questions

In [None]:
# Run the strict parser over every raw text.
# sentences: file path -> parsed QuestionBank; fails: file path -> raw text.
sentences = {}
fails = {}
for raw, file in zip(model_outputs, files):
	try:
		sentences[file] = parse_question_bank_from_text(raw)
	except Exception:
		# Any parsing/validation error marks the file as failed. Unlike a bare
		# `except:`, this lets KeyboardInterrupt/SystemExit stop the loop.
		fails[file] = raw

In [None]:
# (number of entries in `sentences`, number of entries in `fails`)
len(sentences), len(fails)

In [None]:
# Re-run the strict parser on one failed file without a try/except, so that
# the underlying exception is raised and displayed.
# Raises KeyError instead if this path is not a key of `fails`.
parse_question_bank_from_text(fails["QuestionsCameroon/0001f179-fb9c-4124-aac2-2d8b34dd4995.txt"])

In [None]:
# Print the raw text of that failed file for inspection.
print(fails["QuestionsCameroon/0001f179-fb9c-4124-aac2-2d8b34dd4995.txt"])

In [None]:
import re
import json
from pydantic import ValidationError

def extract_questions(text: str) -> Optional[QuestionBank]:
	"""Recover question objects one by one from text whose overall JSON is broken.

	Each question object is located with a regular expression and decoded on
	its own, so one malformed object does not prevent the others from being
	read.

	Args:
		text: Raw text containing question objects whose keys appear in the
			order question_number, question_text, answer_choices,
			correct_answers, explanation.

	Returns:
		A ``QuestionBank`` with every matched object that decoded and
		validated (possibly zero questions), or ``None`` as soon as one
		decoded object fails ``Question`` validation.
	"""
	# Matches one object with its five keys in this exact order; the lazy
	# quantifiers plus DOTALL let each value span several lines.
	question_pattern = re.compile(
		r'\{\s*"question_number":\s*(\d+|".+?"),\s*'
		r'"question_text":\s*".+?",\s*'
		r'"answer_choices":\s*\[.*?\],\s*'
		r'"correct_answers":\s*\[.*?\],\s*'
		r'"explanation":\s*".+?"\s*\}',
		re.DOTALL
	)

	questions = []
	for match in question_pattern.finditer(text):
		try:
			question_data = json.loads(match[0])
			questions.append(Question.model_validate(question_data))
		except json.JSONDecodeError:
			# Best effort: a fragment that is not valid JSON is skipped.
			continue
		except ValidationError:
			# A schema mismatch rejects the whole text; callers check for None.
			return None

	return QuestionBank(questions=questions)

In [None]:
# Try the regex-based extractor on the same failed file and print its result.
print(extract_questions(fails["QuestionsCameroon/0001f179-fb9c-4124-aac2-2d8b34dd4995.txt"]))

In [None]:
# Second pass over the strict-parse failures using the regex-based extractor.
# fails_fixed: path -> recovered result; fails_fails: path -> raw text that
# yielded nothing usable.
fails_fixed, fails_fails = {}, {}

for key, raw in fails.items():
	question = extract_questions(raw)
	if not question or not len(question.questions):
		# Either no result at all or a result without any question.
		fails_fails[key] = raw
		continue
	fails_fixed[key] = question

In [None]:
# (files recovered by the second pass, files still failing)
len(fails_fixed), len(fails_fails)

In [None]:
import pandas as pd

# Number of questions recovered for each entry of `fails_fixed`,
# followed by summary statistics of those counts.
fails_fixed_len = list(map(lambda bank: len(bank.questions), fails_fixed.values()))

pd.Series(fails_fixed_len).describe()

In [None]:
# Print the raw text of one file the second pass could not recover.
# Raises KeyError if this path is not a key of `fails_fails`.
print(fails_fails["QuestionsCameroon/02200217-7f9d-4a81-95ad-bd0fc18abd18.txt"])

In [None]:
# Show what the regex-based extractor returns for that same file.
extract_questions(fails_fails["QuestionsCameroon/02200217-7f9d-4a81-95ad-bd0fc18abd18.txt"])

In [None]:
import re
import json

def remove_trailing_commas(json_str: str) -> str:
	"""Drop commas that directly precede a closing ``]`` or ``}``.

	A comma followed only by whitespace and then ``]`` or ``}`` is removed
	together with that whitespace, which turns ``[1, 2, ]`` into ``[1, 2]``.
	Text inside double-quoted JSON strings is copied verbatim, so a value
	such as ``"a, }"`` is not altered (a plain regex substitution would
	rewrite it).

	Args:
		json_str: JSON-like text that may contain trailing commas.

	Returns:
		The text with trailing commas outside of strings removed.
	"""
	pieces = []
	in_string = False
	escaped = False
	size = len(json_str)
	pos = 0
	while pos < size:
		char = json_str[pos]
		if in_string:
			pieces.append(char)
			if escaped:
				# The character after a backslash never terminates the string.
				escaped = False
			elif char == "\\":
				escaped = True
			elif char == '"':
				in_string = False
			pos += 1
			continue
		if char == '"':
			in_string = True
		elif char == ",":
			# Look past whitespace: is the next significant character a closer?
			ahead = pos + 1
			while ahead < size and json_str[ahead].isspace():
				ahead += 1
			if ahead < size and json_str[ahead] in "]}":
				# Skip the comma and the whitespace; resume at the closer.
				pos = ahead
				continue
		pieces.append(char)
		pos += 1
	return "".join(pieces)

# Example usage: a question object with a trailing comma after the last
# answer choice, which json.loads would reject as-is.
json_text = """
{
	"question_number": 1,
	"question_text": "Why was Africa's agricultural sector performance mediocre during the 1970s?",
	"answer_choices": [
		{"letter": "A", "text": "Because of insufficient investment in research on staple crops and root vegetables"},
		{"letter": "B", "text": "Due to lack of technical improvements adapted to African agronomic conditions"},
		{"letter": "C", "text": "Both A and B are correct reasons"}, 
	],
	"correct_answers": ["A", "B"],
	"explanation": "The text states that the agricultural sector performance in Africa during the 1970s was mediocre due to insufficient investment in research on staple crops and root vegetables, as well as lack of technical improvements adapted to African agronomic conditions."
}
"""

# Strip the trailing comma, then decode the cleaned text
cleaned_json_text = remove_trailing_commas(json_text)
parsed_json = json.loads(cleaned_json_text)

# Print the decoded dict to confirm the cleaned text parses
print(parsed_json)

In [None]:
import re
import json
from pydantic import ValidationError


def extract_questions_v2(text: str) -> Optional[QuestionBank]:
	"""Recover question objects one by one, tolerating trailing commas.

	Each question object is located with a regular expression, cleaned with
	``remove_trailing_commas`` and then decoded on its own.

	Args:
		text: Raw text containing question objects whose keys appear in the
			order question_number, question_text, answer_choices,
			correct_answers, explanation.

	Returns:
		A ``QuestionBank`` with every matched object that decoded and
		validated (possibly zero questions), or ``None`` as soon as one
		decoded object fails ``Question`` validation.
	"""
	# Matches one object with its five keys in this exact order; the lazy
	# quantifiers plus DOTALL let each value span several lines.
	question_pattern = re.compile(
		r'\{\s*"question_number":\s*(\d+|".+?"),\s*'
		r'"question_text":\s*".+?",\s*'
		r'"answer_choices":\s*\[.*?\],\s*'
		r'"correct_answers":\s*\[.*?\],\s*'
		r'"explanation":\s*".+?"\s*\}',
		re.DOTALL
	)

	questions = []
	for match in question_pattern.finditer(text):
		try:
			question_data = json.loads(remove_trailing_commas(match[0]))
			questions.append(Question.model_validate(question_data))
		except json.JSONDecodeError:
			# Best effort: a fragment that still is not valid JSON is skipped.
			continue
		except ValidationError:
			# A schema mismatch rejects the whole text; callers check for None.
			return None

	return QuestionBank(questions=questions)

In [None]:
# Third pass: retry the remaining failures with the trailing-comma-tolerant
# extractor. fails_fails_fixed: path -> recovered result;
# fails_fails_fails: path -> raw text that is still unusable.
fails_fails_fixed, fails_fails_fails = {}, {}

for key, raw in fails_fails.items():
	question = extract_questions_v2(raw)
	if not question or not len(question.questions):
		# Either no result at all or a result without any question.
		fails_fails_fails[key] = raw
		continue
	fails_fails_fixed[key] = question

In [None]:
# (files recovered by the third pass, files still failing)
len(fails_fails_fixed), len(fails_fails_fails)

In [None]:
# Print the raw text of one file that no pass could recover.
# Raises KeyError if this path is not a key of `fails_fails_fails`.
print(fails_fails_fails["QuestionsCameroon/0904d146-d52b-4f4d-92fb-b04eecf49aee.txt"])

In [None]:
# Print the raw text of a second file that no pass could recover.
# Raises KeyError if this path is not a key of `fails_fails_fails`.
print(fails_fails_fails["QuestionsCameroon/1188f1f8-49a9-4877-8e59-42c433fc652f.txt"])

In [None]:
import re
import json
import pydantic
import uuid

def extract_questions(text: str) -> List[Question]:
	"""Extract every recoverable question from ``text`` as a flat list.

	NOTE: this definition replaces the earlier ``extract_questions`` in the
	notebook namespace and, unlike it, returns a plain list rather than a
	``QuestionBank``.

	Each question object is located with a regular expression, cleaned with
	``remove_trailing_commas``, decoded and validated on its own. Objects
	that fail JSON decoding or ``Question`` validation are skipped.

	Args:
		text: Raw text containing question objects whose keys appear in the
			order question_number, question_text, answer_choices,
			correct_answers, explanation.

	Returns:
		The validated questions in order of appearance (possibly empty).
		Every ``question_number`` is rewritten to
		``"<uuid4>_<original number>"``, where the UUID is generated once
		per call, so the identifiers differ from one run to the next.
	"""
	# Matches one object with its five keys in this exact order; the lazy
	# quantifiers plus DOTALL let each value span several lines.
	question_pattern = re.compile(
		r'\{\s*"question_number":\s*(\d+|".+?"),\s*'
		r'"question_text":\s*".+?",\s*'
		r'"answer_choices":\s*\[.*?\],\s*'
		r'"correct_answers":\s*\[.*?\],\s*'
		r'"explanation":\s*".+?"\s*\}',
		re.DOTALL
	)

	# One random identifier shared by all questions extracted in this call.
	text_id = str(uuid.uuid4())

	questions = []
	for match in question_pattern.finditer(text):
		try:
			question_data = json.loads(remove_trailing_commas(match[0]))
			question = Question.model_validate(question_data)
			question.question_number = text_id + "_" + str(question.question_number)
		except (json.JSONDecodeError, pydantic.ValidationError):
			# Best effort: drop the objects that cannot be decoded or validated.
			continue
		questions.append(question)

	return questions

In [None]:
# Final pass: run the lenient extractor over every raw text.
# sentences: file path -> list of extracted questions; fails: file path -> raw
# text for any file whose extraction raised.
sentences = {}
fails = {}
for raw, file in zip(model_outputs, files):
	try:
		sentences[file] = extract_questions(raw)
	except Exception:
		# Unexpected errors only. Unlike a bare `except:`, this lets
		# KeyboardInterrupt/SystemExit stop the loop.
		fails[file] = raw

In [None]:
# (number of entries in `sentences`, number of entries in `fails`)
len(sentences), len(fails)

In [None]:
# Flatten the per-file question lists into one list in a single pass
# (sum(..., start=[]) re-copies the growing accumulator at every step).
merged_sentences = [question for extracted in sentences.values() for question in extracted]
len(merged_sentences)

In [None]:
# Display the first merged question as a sanity check.
merged_sentences[0]

In [None]:
from datasets import Dataset
from pydantic import BaseModel, Field
from typing import List

def convert_answer(answer: AnswerChoice):
    """Turn one answer choice into a single-entry ``{letter: text}`` dict."""
    letter, text = answer.letter, answer.text
    return {letter: text}

def convert(answers: list[AnswerChoice]):
    """Merge answer choices into one ``{letter: text}`` dict.

    When the same letter occurs more than once, the text of the last
    occurrence is kept.
    """
    return {choice.letter: choice.text for choice in answers}

# Convert list of dataclass instances to a dataset
def convert_to_dataset(questions: List[Question]) -> Dataset:
    """Build a column-oriented ``Dataset`` from a list of questions.

    The dataset has the columns question_number, question_text,
    answer_choices (a ``{letter: text}`` dict per row), correct_answers and
    explanation, with one row per input question in the same order.
    """
    column_names = (
        "question_number",
        "question_text",
        "answer_choices",
        "correct_answers",
        "explanation",
    )

    # One tuple per question, with fields in the same order as column_names.
    rows = [
        (
            item.question_number,
            item.question_text,
            convert(item.answer_choices),
            item.correct_answers,
            item.explanation,
        )
        for item in questions
    ]

    # Pivot the rows into the dict of lists that Dataset.from_dict takes.
    dataset_dict = {
        name: [row[position] for row in rows]
        for position, name in enumerate(column_names)
    }
    return Dataset.from_dict(dataset_dict)

# Convert the merged question list to a Dataset
questions_dataset = convert_to_dataset(merged_sentences)

# Print the dataset object
print(questions_dataset)

In [None]:
# Preview the first five rows.
questions_dataset[:5]

In [None]:
# Save the dataset locally under time_question_gen/qa_africa.
questions_dataset.save_to_disk("time_question_gen/qa_africa")

In [None]:
# Upload the dataset to the Hub repository alexneakameni/qa_africa.
# NOTE(review): presumably requires being logged in with write access; confirm.
questions_dataset.push_to_hub("alexneakameni/qa_africa")