In [2]:
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import os
import json
from openai import OpenAI

In [3]:
# Folder that holds the downloaded question-paper files.
dir_path = 'mba-pyq'

papers = os.listdir(dir_path)

# Partition the listing by filename: entries whose name contains 'XAT'
# go to xat_papers, every other entry goes to cat_dilr.
xat_papers = [name for name in papers if 'XAT' in name]
cat_dilr = [name for name in papers if 'XAT' not in name]

In [4]:
def extract_table_html(table_tag):
    """Return the string form of ``table_tag`` (its markup when given a parsed table tag)."""
    return f"{table_tag!s}"

def extract_img_info(img_tag):
    """Collect the 'src', 'alt' and 'title' attributes of an image tag.

    Each value is whatever ``img_tag.get`` returns for that attribute name.
    """
    return {attr: img_tag.get(attr) for attr in ('src', 'alt', 'title')}

def parse_questions_with_media(html):
    """Group top-level question items under the passage paragraph preceding them.

    Only top-level <p>, <table>, <img> and <li> elements of ``html`` are
    examined. Every top-level <p> opens a new record; <table>/<img> elements
    that directly follow it are attached to the record, and each later <li>
    (up to the next top-level <p>) is parsed as a question.

    Returns a list of dicts with the keys 'passage_text', 'tables', 'images'
    and 'questions'. Each question is a dict with 'question', 'options' and
    'answer' (None, or {'choice': ..., 'text': ...}).

    Elements that appear before the first passage paragraph are ignored.
    """
    soup = BeautifulSoup(html, 'html.parser')

    body_elements = list(soup.find_all(['p', 'table', 'img', 'li'], recursive=False))
    if not body_elements:
        # Fallback: filter the direct children of the document by tag name.
        wanted = {'p', 'table', 'img', 'li'}
        body_elements = [node for node in soup.contents if getattr(node, 'name', None) in wanted]

    def starts_passage(node):
        # A passage is a <p> that is not nested inside an <li>.
        return node.name == 'p' and not node.find_parent('li')

    def is_question(node):
        # A question is an <li> that is neither an option nor nested in another <li>.
        if node.name != 'li':
            return False
        if node.find_parent('ol', class_='choice choice1'):
            return False
        return not node.find_parent('li')

    def read_question(item):
        stem = item.find('p')

        choices = []
        choice_list = item.find('ol', class_='choice choice1')
        if choice_list:
            choices = [entry.get_text(strip=True)
                       for entry in choice_list.find_all('li', recursive=False)]

        answer = None
        tip = item.find('span', class_='tooltiptext')
        if tip:
            bold = tip.find('b')
            letter = bold.get_text(strip=True) if bold else None
            remainder = tip.get_text(strip=True)
            if letter and letter in remainder:
                # Keep only what follows the bold label, minus separators.
                remainder = remainder.split(letter, 1)[-1].strip(': .\n')
            answer = {'choice': letter, 'text': remainder}

        return {
            'question': stem.get_text(strip=True) if stem else "",
            'options': choices,
            'answer': answer
        }

    results = []
    current = None            # record being filled; None until the first passage
    collecting_media = False  # True only while directly behind the passage <p>
    for node in body_elements:
        if starts_passage(node):
            current = {
                'passage_text': node.get_text(strip=True),
                'tables': [],
                'images': [],
                'questions': []
            }
            results.append(current)
            collecting_media = True
        elif current is None:
            continue
        elif collecting_media and node.name in {'table', 'img'}:
            if node.name == 'table':
                current['tables'].append(extract_table_html(node))
            else:
                current['images'].append(extract_img_info(node))
        else:
            # First non-media element ends the media run; later tables/images are skipped.
            collecting_media = False
            if is_question(node):
                current['questions'].append(read_question(node))

    return results

In [5]:
# Parse a single XAT paper with parse_questions_with_media as a trial run.
paper = 'XAT-2023-Question-Paper-BDM.html'
paper_path = dir_path + '/' + paper

with open(paper_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Keep only the markup after the last question-list opening tag.
text = text.split('<ol class="ques ques1">')[-1]

# Rebuild the heading text from the filename
# ('XAT 2023 Question Paper - BDM' for this file) so it can be removed.
splits = paper.split('-')
title = ' '.join(splits[:-1]) + ' - ' + splits[-1].split('.')[0]
text = text.replace(title, '').replace('<br/>', '').replace('<h4>', '').replace('</h4>', '')

text = parse_questions_with_media(text)

# Round-trip through JSON to confirm the parsed structure is serialisable.
# json.loads replaces the previous eval(): eval raises NameError on JSON
# literals such as null (a question whose answer is None) and would execute
# the text as Python code.
data = json.loads(json.dumps(text, ensure_ascii=False))
print(type(data))
    

<class 'list'>


In [6]:
# Print the whole parsed structure for a visual check.
print(data)

[{'passage_text': 'Read the following scenario and answer the THREE questions that follow.\n   \n   \n   During the floods of 2018-2019, a group of philanthropists led by Niyabuddin, wished to open free food centre for the needy. Their motto was that “no human should be hungry.” Nothing gives more satisfaction to the philanthropists than to see the hungry eat to the fullest.\n   \n   Post Covid-19, the group started a food centre by the name Win Borne Life Care Food (WBLCF) in a small town called Palakkad. The centre started gaining popularity as the number of people enjoying free meals increased over time. Initially, WBLCF offered a standardized menu consisting of idli, upma, puttu for breakfast, curd rice for lunch, and idli or upma for supper. Six women were employed by WBLCF to prepare all the meals. As the number of diners increased, they started expecting a variety in the menu.\n   \n   At WBLCF, not all the diners eat "free": while two-thirds of diners get free food, one-third w

In [7]:
def extract_table_html(table_tag):
    """Serialise ``table_tag`` by converting it to ``str``."""
    markup = str(table_tag)
    return markup

def extract_img_info(img_tag):
    """Return a dict with the image tag's 'src', 'alt' and 'title' attributes."""
    info = {}
    for attribute in ('src', 'alt', 'title'):
        info[attribute] = img_tag.get(attribute)
    return info

def is_question_li(li):
    """Heuristic test for a question item.

    An <li> counts as a question when it contains both a <p> and an
    <ol class="choice choice1"> somewhere beneath it.
    """
    if li.find('p') is None:
        return False
    return li.find('ol', class_='choice choice1') is not None

def extract_text_with_sup(tag):
    """Extract text from a tag, rendering <sup>n</sup> as ^n and <sub>n</sub> as _n.

    Only string nodes contribute plain text, so text wrapped in inline tags
    such as <b> or <i> is emitted exactly once. The earlier version also added
    ``elem.string`` for every Tag, which repeats the text of any tag that has
    a single string child.

    Args:
        tag: a parsed element exposing ``descendants``; its descendants are
            either string nodes (``str`` subclasses) or tags with ``name``,
            ``parent`` and ``get_text``.

    Returns:
        The concatenated text with leading/trailing whitespace stripped.
        The contents of each <sup>/<sub> are whitespace-stripped individually.
    """
    script_markers = {'sup': '^', 'sub': '_'}

    def inside_script(node):
        # True when an ancestor of node, up to and including `tag`, is <sup>/<sub>.
        ancestor = node.parent
        while ancestor is not None:
            if ancestor.name in script_markers:
                return True
            if ancestor is tag:
                break
            ancestor = ancestor.parent
        return False

    pieces = []
    for elem in tag.descendants:
        if isinstance(elem, str):
            # String nodes are str subclasses; text under a <sup>/<sub> was
            # already emitted together with that tag.
            if not inside_script(elem):
                pieces.append(elem)
        elif elem.name in script_markers:
            # A sup/sub nested in another sup/sub is covered by the outer one.
            if not inside_script(elem):
                pieces.append(script_markers[elem.name] + elem.get_text(strip=True))
    return ''.join(pieces).strip()

def parse_questions_with_media(html):
    """Flatten every question <li> found in ``html`` into one record each.

    An <li> is treated as a question when ``is_question_li`` accepts it.
    Each record holds:
        'question' - text of the first <p>, via extract_text_with_sup
        'tables'   - markup of <table> elements that are direct children of the <li>
        'images'   - attribute dicts of <img> elements that are direct children
        'options'  - text of the direct <li> children of the options list
        'answer'   - None, or {'choice': <text of first <b>>, 'text': <tooltip text>}

    Note: the <b> tags inside the answer tooltip are removed from the parsed
    tree while the answer is read.
    """
    def read_answer(item):
        tip = item.find('span', class_='tooltiptext')
        if not tip:
            return None
        bold = tip.find('b')
        letter = bold.get_text(strip=True) if bold else None
        # Detach every <b> so the remaining tooltip text excludes the label.
        for bold_tag in tip.find_all('b'):
            bold_tag.extract()
        body = extract_text_with_sup(tip)
        if letter and letter in body:
            body = body.split(letter, 1)[-1].strip(': .\n')
        return {'choice': letter, 'text': body}

    soup = BeautifulSoup(html, 'html.parser')
    records = []

    for item in soup.find_all('li'):
        # Option items and other non-question <li> elements are skipped.
        if is_question_li(item):
            stem = item.find('p')
            choice_list = item.find('ol', class_='choice choice1')
            records.append({
                'question': extract_text_with_sup(stem) if stem else "",
                'tables': [extract_table_html(tbl) for tbl in item.find_all('table', recursive=False)],
                'images': [extract_img_info(pic) for pic in item.find_all('img', recursive=False)],
                'options': (
                    [extract_text_with_sup(choice) for choice in choice_list.find_all('li', recursive=False)]
                    if choice_list else []
                ),
                'answer': read_answer(item)
            })

    return records

In [8]:
# Run the per-question parser on the XAT 2018 QADI paper.
paper = 'XAT-2018-Question-Paper-QADI.html'
paper_path = f"{dir_path}/{paper}"

with open(paper_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Keep only what follows the last question-list opening tag.
text = text.rpartition('<ol class="ques ques1">')[2]

# Heading text derived from the filename: the dash-separated parts joined
# by spaces, then ' - ' and the last part without its extension.
splits = paper.split('-')
title = f"{' '.join(splits[:-1])} - {splits[-1].split('.')[0]}"

# Strip the heading and the layout-only markup before parsing.
for noise in (title, '<br/>', '<h4>', '</h4>'):
    text = text.replace(noise, '')

result = parse_questions_with_media(text)

In [9]:
# Print the parsed question records for a visual check.
print(result)

[{'question': 'Find the value of the expression: 10 + 10\n    ^3\n    + 10\n    ^6\n    + 10\n    ^9', 'tables': [], 'images': [], 'options': ['1010101010', '1001000010', '1001000110', '1001001010', '100010001010'], 'answer': {'choice': 'Choice D', 'text': '1001001010'}}, {'question': 'Abdul, Bimal, Charlie and Dilbar can finish a task in 10, 12, 15 and 18 days respectively. They can either choose to work or remain absent on a particular day. If 50 percent of the total work gets completed after 3 days, then, which of the following options is possible?', 'tables': [], 'images': [], 'options': ['Each of them worked for exactly 2 days.', 'Bimal and Dilbar worked for 1 day each, Charlie worked for 2 days and Abdul worked for all 3 days.', 'Abdul and Charlie worked for 2 days each, Dilbar worked for 1 day and Bimal worked for all 3 days.', 'Abdul and Dilbar worked for 2 days each, Charlie worked for 1 day and Bimal worked for all 3 days.', 'Abdul and Charlie worked for 1 day each, Bimal wor

In [10]:
# Show the cleaned markup of the XAT 2019 VALR paper to inspect its structure.
paper = 'XAT-2019-Question-Paper-VALR.html'
paper_path = f"{dir_path}/{paper}"

with open(paper_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Keep only what follows the last question-list opening tag.
text = text.rpartition('<ol class="ques ques1">')[2]

# Heading text derived from the filename: the dash-separated parts joined
# by spaces, then ' - ' and the last part without its extension.
splits = paper.split('-')
title = f"{' '.join(splits[:-1])} - {splits[-1].split('.')[0]}"

# Strip the heading and the layout-only markup.
for noise in (title, '<br/>', '<h4>', '</h4>'):
    text = text.replace(noise, '')

print(text)


  <li>
   
    
   
   <p>
    Choose the option that would fill in the blanks meaningfully in the sentence(s) below:
    
    
    ______ the importance of ‘horizontal stratification’ ______ higher education is widely acknowledged, ______ attention has been applied to horizontal stratification ______ compulsory schooling.
   </p>
   <div class="section group">
    <div class="col span_3_of_4">
     <ol class="choice choice1">
      <li>
       Whereas; with: too less; to
      </li>
      <li>
       While; within: far less; within
      </li>
      <li>
       While; without: further less; within
      </li>
      <li>
       While; on: far fewer; about
      </li>
      <li>
       Whereas; about: for less; of
      </li>
     </ol>
    </div>
    <div class="col span_1_of_4">
     <div class="btn-group">
      <div class="tooltip">
       <button class="button" style="padding-top:12px; padding-bottom:13px">
        Correct Answer
       </button>
       <span class="tooltiptext">


In [20]:
# Relative path of the extracted markdown file for the CAT 1991 paper.
extracted_path = "extracted/CAT-1991-Question-Paper-with-Solution/auto/CAT-1991-Question-Paper-with-Solution.md"

def improved_split_question_types(markdown_text):
    """Cut the markdown into sections, one per question-range header.

    A header is a line of the form ``Q<n> - <m> :`` where the range separator
    may be an en dash, a hyphen or the word 'to', optionally preceded by '#'
    and optionally followed by ', ...' before the colon. The split happens
    in front of each header, so the header stays with its section.

    Returns the non-empty sections with surrounding whitespace removed. Any
    text before the first header becomes the first section.
    """
    header_lookahead = re.compile(
        r'(?=^#?\s*Q\s*\d+\s*(?:–|-|to)\s*\d+(?:,.*?)?\s*:)',
        re.MULTILINE,
    )
    cleaned = []
    for chunk in header_lookahead.split(markdown_text):
        chunk = chunk.strip()
        if chunk:
            cleaned.append(chunk)
    return cleaned

def extract_instruction_img_questions(section_text):
    """Split one section into its instruction, image paths and questions.

    The instruction is everything before the first line that starts with
    ``<number>. ``. Image paths are the targets of markdown image links
    (``![alt](path)``) found in the instruction. Each question block starts
    at a line beginning with ``<number>. `` (leading spaces/tabs allowed) and
    is split into the question text and its ``(a)``, ``(b)``, ... options.

    Args:
        section_text: markdown text of a single section.

    Returns:
        dict with
            'instruction': str,
            'img_paths':   list of str,
            'questions':   list of {'number': str, 'question': str, 'options': list of str}.
        When no numbered question is found, the whole section is returned as
        the instruction and 'questions' is empty.

    Option handling:
        * An option label is a single lowercase letter in parentheses that is
          not glued to a preceding letter or digit, so ``f(x)`` is not
          mistaken for a label.
        * An option runs up to the next label (or the end of the block), so
          option text may itself contain parentheses, e.g. ``5 (approx)``.
        * A label with no text after it yields an empty string, which keeps
          option positions aligned with their letters.
    """
    image_re = re.compile(r'!\[.*?\]\((.*?)\)')
    option_label_re = re.compile(r'(?<![A-Za-z0-9])\(([a-z])\)')

    first_question = re.search(r'^\s*(\d+)\.\s', section_text, flags=re.MULTILINE)
    if not first_question:
        # No question found, treat the whole section as instruction.
        instruction = section_text.strip()
        return {
            'instruction': instruction,
            'img_paths': image_re.findall(instruction),
            'questions': []
        }

    instruction = section_text[:first_question.start()].strip()
    question_area = section_text[first_question.start():].strip()
    img_paths = image_re.findall(instruction)

    questions = []
    # Same notion of "question start" as the search above: optional
    # indentation, digits, a dot and whitespace at the beginning of a line.
    for qblock in re.split(r'\n(?=[ \t]*\d+\.\s)', question_area):
        qblock = qblock.strip()
        if not qblock:
            continue
        parsed = re.match(r'(\d+)\.\s*(.*)', qblock, re.DOTALL)
        if not parsed:
            continue
        qnum = parsed.group(1)
        body = parsed.group(2)

        labels = list(option_label_re.finditer(body))

        # Question text is whatever precedes the first option label.
        qtext = body[:labels[0].start()].strip() if labels else body.strip()

        options = []
        for position, label in enumerate(labels):
            if position + 1 < len(labels):
                option_end = labels[position + 1].start()
            else:
                option_end = len(body)
            options.append(body[label.end():option_end].strip())

        questions.append({
            'number': qnum,
            'question': qtext,
            'options': options
        })

    return {
        'instruction': instruction,
        'img_paths': img_paths,
        'questions': questions
    }


# Read the extracted markdown and cut it into the question part and the
# answers part at the heading marker passed to split() below.
with open(extracted_path, 'r', encoding='utf-8') as f:
    markdown_content = f.read()
    # NOTE(review): unpacking into two names requires the marker to occur
    # exactly once in the file; any other count raises ValueError.
    questions, answers = markdown_content.split('# 	')
# Break the question part into sections, one per question-range header.
sections = improved_split_question_types(questions)

In [26]:
# Try the section parser on the third-from-last section.
# NOTE(review): this rebinds `questions`, which the loading cell assigned to
# the question half of the markdown text.
questions = extract_instruction_img_questions(sections[-3])

In [22]:
# Display the list of sections produced by improved_split_question_types.
sections

['# Question Paper with Solutions  \n\n# CAT 1991  \n\nhttps://bodheeprep.com  \n\n500 hours of online CAT coaching content $4 0 0 0 +$ online CAT preparation videos $4 0 0 0 +$ questions as a part of online CAT course   \n60 Live online Sessions   \nWeekly doubt clearing sessions  \n\nGet FREE Trial  \n\n# Click to join our CAT prep Groups  \n\nCAT Prep Whatsapp Group  \n\n# \x10\x0e\x07\x08\x11\x12\x13\x04\x14\x04\x05',
 'Q1 – 11 : From the statements in questions choose the one that expresses the idea most correctly.  \n\n1. (a) The best part of the programme is the dances. (b) The best part of the programme are the dances. (c) The best part of the programme are the dance. (d) The best parts of the programme is the dances.  \n\n2. (a) The professor, as well as the students, was pleased with their results. (b) The professor, as well as the students, were pleased with their results. (c) The professor as well as the students were pleased with their results. (d) The professor as well as

In [37]:
# Parse every section except the first one (index 0 is skipped).
all_questions = [
    extract_instruction_img_questions(section)
    for section in sections[1:]
]

In [38]:
# Display the parsed sections (instruction, image paths, questions).
all_questions

[{'instruction': 'Q1 – 11 : From the statements in questions choose the one that expresses the idea most correctly.',
  'img_paths': [],
  'questions': [{'number': '1',
    'question': '',
    'options': ['The best part of the programme is the dances.',
     'The best part of the programme are the dances.',
     'The best part of the programme are the dance.',
     'The best parts of the programme is the dances.']},
   {'number': '2',
    'question': '',
    'options': ['The professor, as well as the students, was pleased with their results.',
     'The professor, as well as the students, were pleased with their results.',
     'The professor as well as the students were pleased with their results.',
     'The professor as well as the students were pleased with their results.']},
   {'number': '3',
    'question': '',
    'options': ['He was unwilling to testify, he was afraid of the defendant.',
     'Because he was afraid of the defendant, he was unwilling to testify.',
     'He was 

In [33]:
# 1. Build answer_key: {question number (int) -> answer text, lower-cased}.
answer_key = {}
# Find the HTML table with answer keys. re.search returns only the first
# <table>...</table> span in `answers`.
anskey_table = re.search(r'<table>(.*?)</table>', answers, re.DOTALL)
if anskey_table:
    soup = BeautifulSoup(anskey_table.group(0), 'html.parser')
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        # Cells are consumed in (question number, answer) pairs along the row.
        for i in range(0, len(cells) - 1, 2):
            qnum = cells[i].get_text(strip=True)
            ans = cells[i+1].get_text(strip=True)
            # Pairs whose first cell is not purely digits, or whose answer is empty, are skipped.
            if qnum.isdigit() and ans:
                answer_key[int(qnum)] = ans.lower()

# 2. Extract explanations for each question (1-175)
explanations = {}
# Regex for standard explanations (e.g. "1. a Explanation..."): number, dot,
# one letter a-d (either case), then text up to the next such line or the end.
# NOTE(review): nothing requires a word boundary after the letter, so a line
# such as "3. Because ..." would also match with 'b' as the answer - confirm
# against the data.
exp_pattern = re.compile(r'(\d+)\.\s*([a-dA-D])\s*(.*?)(?=\n\d+\.\s*[a-dA-D]|\Z)', re.DOTALL)
for match in exp_pattern.finditer(answers):
    qnum = int(match.group(1))
    ans = match.group(2).lower()
    explanation = match.group(3).strip()
    # Extract image paths: only links with an empty alt text and a target under images/.
    images = re.findall(r'!\[\]\((images/[^\)]+)\)', explanation)
    # Remove those image links from the explanation text.
    explanation_clean = re.sub(r'!\[\]\(images/[^\)]+\)', '', explanation).strip()
    explanations[qnum] = {
        'answer': ans,
        'explanation': explanation_clean,
        'images': images
    }

# 3. Special handling for questions 176-180
# Find the special block for 176-180: the text between the
# '# For questions 176 to 180:' heading and the next 'CAT Prep Whatsapp Group'.
special_block = re.search(r'# For questions 176 to 180:(.*?)(CAT Prep Whatsapp Group)', answers, re.DOTALL)
if special_block:
    html_tables = re.findall(r'<table>(.*?)</table>', special_block.group(1), re.DOTALL)
    # The second table contains the answers/explanations for 176-180
    if len(html_tables) >= 2:
        # findall captured only the inner markup, so the <table> wrapper is restored.
        soup = BeautifulSoup('<table>' + html_tables[1] + '</table>', 'html.parser')
        # Flatten all text in table cells
        cell_texts = [td.get_text(separator=' ', strip=True) for td in soup.find_all('td')]
        # Map each question number to its answer/explanation. Entries written
        # here replace any entry step 2 stored under the same question number.
        for cell in cell_texts:
            # Look for pattern like "177. d Fertilizers production in 1988 = 3.5x + k"
            # (re.match anchors at the start of the cell; '.' does not cross newlines here).
            m = re.match(r'(\d+)\.\s*([a-dA-D])\s*(.*)', cell)
            if m:
                qnum = int(m.group(1))
                ans = m.group(2).lower()
                explanation = m.group(3).strip()
                explanations[qnum] = {
                    'answer': ans,
                    'explanation': explanation,
                    'images': []
                }
            # Some explanations are split or run together; handle those
            else:
                # Try to find all question-answer-explanation in a single cell.
                # NOTE(review): [^\.]+ ends each explanation at the first
                # period, so text such as "3.5x" is cut short - confirm this is acceptable.
                for m in re.finditer(r'(\d+)\.\s*([a-dA-D])\s*([^\.]+)', cell):
                    qnum = int(m.group(1))
                    ans = m.group(2).lower()
                    explanation = m.group(3).strip()
                    explanations[qnum] = {
                        'answer': ans,
                        'explanation': explanation,
                        'images': []
                    }

# 4. Ensure all questions in answer key are present: one result per
# answer-key question, in ascending question order.
results = []
for qnum in sorted(answer_key.keys()):
    ans = answer_key[qnum]
    if qnum in explanations:
        # `entry` is the same dict object stored in explanations, so the
        # assignments below also modify explanations[qnum].
        entry = explanations[qnum]
        # Use answer from answer key for consistency
        entry['answer'] = ans
    else:
        entry = {
            'answer': ans,
            'explanation': '[No explanation found]',
            'images': []
        }
    entry['question'] = qnum
    results.append(entry)

In [39]:
# Hand-written entry for question 143, which is apparently missing from the
# results built from the answer key.
missing_entry = {
    'answer': 'c',
    'explanation': 'The passage states that no man of sound mind and with his eyes open should be hindered from obtaining money.',
    'images': [],
    'question': 143,
}

# Insert only when no entry for that question exists, so re-running this cell
# does not add a duplicate. The position is the first entry with a larger
# question number (index 142 when questions 1-142 are all present), which
# keeps `results` ordered by question.
if all(entry['question'] != missing_entry['question'] for entry in results):
    insert_at = next(
        (pos for pos, entry in enumerate(results) if entry['question'] > missing_entry['question']),
        len(results),
    )
    results.insert(insert_at, missing_entry)

In [40]:
# Display the per-question answer/explanation entries.
results

[{'answer': 'a',
  'explanation': 'The subject here is ‘the best part’, which is singular and should therefore be followed by a singular verb.',
  'images': [],
  'question': 1},
 {'answer': 'a',
  'explanation': 'When using ‘as well as’ to introduce a complex subject, the phrase should be set off by commas, and the verb agrees with the main subject, which in this case is ‘the professor’.',
  'images': [],
  'question': 2},
 {'answer': 'b',
  'explanation': 'As the first part of the sentence provides the reason for his being unwilling to testify, ‘because’ should be used to introduce it. Moreover a comma should always be used to separate two distinct phrases in a sentence.',
  'images': [],
  'question': 3},
 {'answer': 'b',
  'explanation': 'The pronoun should remain consistent throughout the sentence.',
  'images': [],
  'question': 4},
 {'answer': 'b',
  'explanation': 'When ‘either’ and ‘neither’ are followed by ‘or’ and ‘nor’ respectively, the verb depends on the noun following ‘o

In [16]:
# Model name and endpoint of the OpenAI-compatible server used for extraction.
llm_id = 'gemma-3-4b-it'
server_url = 'http://127.0.0.1:1234/v1'
# NOTE(review): the api_key looks like a placeholder for a local server - confirm it is not a real credential.
gen_client = OpenAI(api_key="lm-studio", base_url=server_url)

# System message sent with every request.
system_prompt = dict(
    role="system",
    content="You are an AI data extraction assistant. You help the user extract the relevant data from the text provided",
)

# User message template; {markdown} is filled in through str.format.
user_prompt = """
You are a AI based question extraction helper. You will be provided with raw markdown that contains the results of performing OCR on a pdf.
It has a lot of types of questions with instructions, all questions have a list of options. Your task is to extract the different questions from the markdown.
Before each type of question will be an instruction about them. Extract the instructions, questions and options from the text, do not miss anything
Extract the questions as it is, do not change the language or words.

Provided the answer in the format : List[Dictionary]
Each dictionary will have the following parameters : Instruction : str, Questions : List[Dictionary]
Each Question will have the parameters : question : str, options : List[str]

Here is the provided markdown : 
{markdown}
"""

In [17]:
# Ask the model to extract the questions of the last section.
user_message = {
    "role": "user",
    "content": user_prompt.format(markdown=sections[-1]),
}

context = gen_client.chat.completions.create(
    model=llm_id,
    messages=[system_prompt, user_message],
    temperature=0.3,
)

APIConnectionError: Connection error.

In [None]:
# Print the text of the first choice in the completion response.
print(context.choices[0].message.content)

```json
[
  {
    "Instruction": "Study the following graph and answer questions that follow.",
    "Questions": [
      {
        "question": "The sum of food and fertilizer production has shown a constant value for how many years?",
        "options": [
          "(a) None of the years",
          "(b) 2",
          "(c) 4",
          "(d) 5"
        ]
      },
      {
        "question": "If in 1988, the sum of the food and fertilizer production was 170 million tonnes, the value of food production must have been (approximately, in million tonnes) …",
        "options": [
          "(a) 90",
          "(b) 70",
          "(c) 100",
          "(d) Insufficient data"
        ]
      },
      {
        "question": "From its apparent behaviour, the food production in year 1992 can be expected to …",
        "options": [
          "(a) go up",
          "(b) go down",
          "(c) remain the same as previous year.",
          "(d) nothing can be said."
        ]
      },
      {
       