In [56]:
import re
from tqdm import tqdm
import pandas as pd

In [57]:
# Load the raw extraction output, one record per line.
# Blank lines are skipped: they contain no tags, so every field would come
# back as None further down the pipeline. `questions` is only bound once the
# file has actually been read, so a failed load cannot leave an empty list
# behind for later cells to process silently.
with open('extracted_questions_2013.txt', 'r', encoding='utf-8') as file:
    questions = [line for line in file if line.strip()]

In [58]:
def clean_question(question):
    """Strip a leading question number such as ``"12. "`` from *question*.

    Args:
        question: Question text, or ``None`` when the field is missing.

    Returns:
        The text without a leading ``<digits>.`` prefix (and any whitespace
        directly after it). ``None`` is returned unchanged so that a missing
        field stays missing instead of raising ``TypeError`` inside ``re.sub``.
    """
    if question is None:
        return None
    # Removes any leading number followed by a dot and optional space
    return re.sub(r'^\d+\.\s*', '', question)

def clean_answer(answer):
    """Strip a leading option letter such as ``"B. "`` from *answer*.

    Only an uppercase ``A``-``D`` followed by a dot is treated as a label.

    Args:
        answer: Answer text, or ``None`` when the field is missing.

    Returns:
        The answer without its leading ``<letter>.`` label (and any whitespace
        directly after it). ``None`` is returned unchanged so that a missing
        field stays missing instead of raising ``TypeError`` inside ``re.sub``.
    """
    if answer is None:
        return None
    # Removes any leading capital letter A-D followed by a dot and optional space
    return re.sub(r'^[A-D]\.\s*', '', answer)

def clean_options(options_str):
    """Split a comma-separated options string into a list of clean option texts.

    Each piece is stripped of surrounding whitespace and of a leading
    ``a)``-``d)`` / ``A)``-``D)`` label. Because the split is on every comma,
    an option whose own text contains a comma ends up as several entries.

    Args:
        options_str: Comma-separated options, or ``None`` when the field is
            missing.

    Returns:
        A list of option strings. ``None`` or a blank string yields an empty
        list rather than raising or producing a single empty option.
    """
    if options_str is None or not options_str.strip():
        return []
    # Split by comma, as options are comma-separated
    return [
        re.sub(r'^[a-dA-D]\)\s*', '', option.strip())
        for option in options_str.split(',')
    ]

In [59]:
def extract_field(tag, text):
    """Return the stripped content of ``<tag>`` found in *text*.

    Two attempts are made, in order:

    1. A properly closed ``<tag>...</tag>`` pair.
    2. An unclosed ``<tag>...``: the content runs up to the next thing that
       looks like a tag (``<name>`` or ``</name>``) or to the end of the text.
       A bare ``<`` inside the content (for example ``"BP < 120"``) does not
       end the field.

    Args:
        tag: Tag name without angle brackets; it is matched literally.
        text: Text to search; content may span several lines.

    Returns:
        The content with surrounding whitespace removed, or ``None`` when the
        opening tag does not occur in *text*.
    """
    # Escape the tag so it can never be interpreted as regex syntax.
    opening = re.escape(f'<{tag}>')
    closing = re.escape(f'</{tag}>')

    # Try to find <tag>...</tag>
    match = re.search(f'{opening}(.*?){closing}', text, re.DOTALL)
    if match is None:
        # Fallback: Find <tag>... (up to the next tag-like token or end of string)
        match = re.search(
            opening + r'(.*?)(?=</?[A-Za-z][\w-]*>|$)', text, re.DOTALL
        )
    if match is None:
        return None
    return match.group(1).strip()

def parse_question_string(input_string, paper='NEET-PG-2013'):
    """Parse one raw record into a dict of question fields.

    Args:
        input_string: Raw text holding ``<question>``, ``<options>``,
            ``<answer>`` and ``<solution>`` tags, optionally wrapped in a
            Markdown code fence.
        paper: Label stored under the ``'paper'`` key. Defaults to
            ``'NEET-PG-2013'`` so existing one-argument calls behave as before.

    Returns:
        A dict with the keys ``'question'``, ``'options'``, ``'answer'``,
        ``'solution'`` (each whatever ``extract_field`` returns for that tag)
        and ``'paper'``.
    """
    # Remove code block markers and possible 'xml' at start
    cleaned = re.sub(r'^`{3,}xml|`{3,}$', '', input_string).strip()
    fields = {
        tag: extract_field(tag, cleaned)
        for tag in ('question', 'options', 'answer', 'solution')
    }
    fields['paper'] = paper
    return fields

In [60]:
# Turn every raw line into a record, then normalise its text fields:
# question number, answer label and per-option labels are stripped.
questions_list = []

for question in tqdm(questions):
    fields = parse_question_string(question)
    # Keyword arguments are evaluated left to right, and updating existing
    # keys keeps the record's original key order.
    fields.update(
        question=clean_question(fields['question']),
        answer=clean_answer(fields['answer']),
        options=clean_options(fields['options']),
    )
    questions_list.append(fields)

100%|██████████| 1869/1869 [00:00<00:00, 33579.59it/s]


In [61]:
# Spot-check one parsed record (index 13) to eyeball the cleaning result.
questions_list[13]

{'question': "Billorth's cord are present in which part of spleen?",
 'options': ['White pulp', 'Red pulp', 'Both', 'Capsule'],
 'answer': 'Red pulp',
 'solution': 'Spleen is mainly composed of two parts : 1. White pulp : The white pulp of the spleen is formed of mass of T and B lymphocytes surrounding central artery, arranged as lymphoid nodule. Each nodule is also called Malpigian bodies. Marginal zone surrounds the white pulp and contains antigen presenting cells as macrophages. 2. Red pulp : Red pulp is made up of a mesh of leaky sinusoids (vascular sinuses) through which the red cells are squeezed. Adjacent blood spaces contain blood cells and arranged in cords called splenic cords of billorth.',
 'paper': 'NEET-PG-2013'}

In [62]:
# Build the table in one shot from the list of record dicts;
# each dict key becomes a column.
df = pd.DataFrame(questions_list)

In [63]:
# Preview the first rows to confirm the columns look as expected.
df.head()

Unnamed: 0,question,options,answer,solution,paper
0,First carpal bone to appear is?,"[Trapezium, Capitate, Pisiform, Lunate]",Capitate,The ossification centres in carpal bones appea...,NEET-PG-2013
1,Pharyngeal muscles are derived from which phar...,"[1st, 2nd, 3rd, 5th]",3rd,3rd,NEET-PG-2013
2,Spinal cord develops from ?,"[Neural tube, Mesencephalon, Rhombencephalon, ...",Neural tube,Nervous system develops from ectoderm (neuroec...,NEET-PG-2013
3,Collecting part of kidney develops from ?,"[Pronephrons, Mesonephros, Metanephros, Ureter...",Ureteric bud,Kidneys develop from two sources : Metanephros...,NEET-PG-2013
4,which level the somites initially form ?,"[Thoracic level, Cervical level, Lumbar level,...",Cervical level,The first pair of somites develop a short dist...,NEET-PG-2013


In [64]:
# Persist the cleaned questions; index=False keeps the row index out of the file.
# NOTE(review): 'options' holds Python lists, which are presumably written as
# their text form (e.g. "['a', 'b']") — readers of this CSV would need to parse
# that column back into a list; confirm this is what downstream code expects.
df.to_csv('question_neet_pg_2013.csv', index=False)