In [3]:
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import os
import json
from openai import OpenAI

In [2]:
# Directory whose entries are listed and bucketed below.
dir_path = 'mba-pyq'

papers = os.listdir(dir_path)

# File names containing "XAT" go to xat_papers; every other entry lands in cat_dilr.
xat_papers = []
cat_dilr = []

for paper in papers:
    (xat_papers if 'XAT' in paper else cat_dilr).append(paper)

In [16]:
def extract_table_html(table_tag):
    """Return the string form of ``table_tag`` (its HTML markup for a parsed table element)."""
    markup = str(table_tag)
    return markup

def extract_img_info(img_tag):
    """Collect the ``src``, ``alt`` and ``title`` attributes of an image element.

    Each value is whatever ``img_tag.get(<attribute>)`` returns, so a missing
    attribute shows up as ``None`` for objects whose ``get`` defaults to ``None``.
    """
    return {attr: img_tag.get(attr) for attr in ('src', 'alt', 'title')}

def parse_questions_with_media(html):
    """Group top-level questions under the passage paragraph that precedes them.

    ``html`` is parsed with ``html.parser`` and only its top-level ``<p>``,
    ``<table>``, ``<img>`` and ``<li>`` nodes are examined.  Every top-level
    ``<p>`` opens a passage; ``<table>``/``<img>`` nodes directly after it are
    attached to that passage, and each ``<li>`` met before the next passage
    paragraph is parsed as a question.  Nodes located before the first passage
    paragraph are ignored.

    Returns:
        A list of dicts with the keys ``passage_text``, ``tables``, ``images``
        and ``questions``.  Each question is a dict with ``question`` (text of
        the first ``<p>``), ``options`` (texts of the direct ``<li>`` children
        of ``<ol class="choice choice1">``) and ``answer`` (``None`` when no
        ``tooltiptext`` span exists, else ``{'choice': ..., 'text': ...}``).
    """
    wanted_names = ['p', 'table', 'img', 'li']
    media_names = {'table', 'img'}

    def starts_passage(node):
        # A passage is a <p> that does not live inside a list item.
        return node.name == 'p' and not node.find_parent('li')

    def is_top_question(node):
        # Only <li> nodes that are neither answer options nor nested list items count.
        if node.name != 'li':
            return False
        if node.find_parent('ol', class_='choice choice1'):
            return False
        return not node.find_parent('li')

    def read_answer(item):
        tip = item.find('span', class_='tooltiptext')
        if not tip:
            return None
        bold = tip.find('b')
        letter = bold.get_text(strip=True) if bold else None
        body = tip.get_text(strip=True)
        # The tooltip text still contains the bold label; keep only what follows it.
        if letter and letter in body:
            body = body.split(letter, 1)[-1].strip(': .\n')
        return {'choice': letter, 'text': body}

    def read_question(item):
        stem = item.find('p')
        choice_list = item.find('ol', class_='choice choice1')
        choices = []
        if choice_list:
            choices = [
                choice.get_text(strip=True)
                for choice in choice_list.find_all('li', recursive=False)
            ]
        return {
            'question': stem.get_text(strip=True) if stem else "",
            'options': choices,
            'answer': read_answer(item),
        }

    soup = BeautifulSoup(html, 'html.parser')

    nodes = list(soup.find_all(wanted_names, recursive=False))
    if not nodes:
        # Fallback: filter the direct children by tag name instead.
        nodes = [
            child for child in soup.contents
            if getattr(child, 'name', None) in {'p', 'table', 'img', 'li'}
        ]

    parsed = []
    total = len(nodes)
    pos = 0
    while pos < total:
        head = nodes[pos]
        pos += 1
        if not starts_passage(head):
            continue

        record = {
            'passage_text': head.get_text(strip=True),
            'tables': [],
            'images': [],
            'questions': [],
        }

        # Media placed immediately after the passage paragraph belongs to the passage.
        while pos < total and nodes[pos].name in media_names:
            node = nodes[pos]
            if node.name == 'table':
                record['tables'].append(extract_table_html(node))
            else:
                record['images'].append(extract_img_info(node))
            pos += 1

        # Everything up to the next passage paragraph is scanned for questions.
        while pos < total and not starts_passage(nodes[pos]):
            if is_top_question(nodes[pos]):
                record['questions'].append(read_question(nodes[pos]))
            pos += 1

        parsed.append(record)

    return parsed

In [23]:
paper = 'XAT-2023-Question-Paper-BDM.html'
paper_path = dir_path + '/' + paper

# Read the file up front so the handle is closed before any parsing happens.
with open(paper_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Keep only the markup that follows the last question-list opener.
text = text.split('<ol class="ques ques1">')[-1]
# Rebuild the title string from the file name ("A-B-C.html" -> "A B - C") so it can be stripped.
splits = paper.split('-')
title = ' '.join(splits[:-1]) + ' - ' + splits[-1].split('.')[0]
text = text.replace(title, '').replace('<br/>', '').replace('<h4>', '').replace('</h4>', '')
text = parse_questions_with_media(text)
# Round-trip through JSON to obtain plain Python containers.  json.loads is
# used instead of eval(): JSON spells None/True/False as null/true/false,
# which eval() cannot resolve (a question without an answer raised NameError),
# and eval() would execute arbitrary expressions embedded in the data.
data = json.loads(json.dumps(text, ensure_ascii=False))
print(type(data))
    

<class 'list'>


In [24]:
# Dump the JSON round-tripped parse result for inspection.
print(data)

[{'passage_text': 'Read the following scenario and answer the THREE questions that follow.\n   \n   \n   During the floods of 2018-2019, a group of philanthropists led by Niyabuddin, wished to open free food centre for the needy. Their motto was that “no human should be hungry.” Nothing gives more satisfaction to the philanthropists than to see the hungry eat to the fullest.\n   \n   Post Covid-19, the group started a food centre by the name Win Borne Life Care Food (WBLCF) in a small town called Palakkad. The centre started gaining popularity as the number of people enjoying free meals increased over time. Initially, WBLCF offered a standardized menu consisting of idli, upma, puttu for breakfast, curd rice for lunch, and idli or upma for supper. Six women were employed by WBLCF to prepare all the meals. As the number of diners increased, they started expecting a variety in the menu.\n   \n   At WBLCF, not all the diners eat "free": while two-thirds of diners get free food, one-third w

In [13]:
def extract_table_html(table_tag):
    """Convert ``table_tag`` to text via ``str`` and return the result."""
    html_string = str(table_tag)
    return html_string

def extract_img_info(img_tag):
    """Return a dict holding the ``src``, ``alt`` and ``title`` values read through ``img_tag.get``."""
    info = {}
    for key in ('src', 'alt', 'title'):
        info[key] = img_tag.get(key)
    return info

def is_question_li(li):
    """Heuristically decide whether a list item is a question.

    An item qualifies when it contains both a ``<p>`` and an
    ``<ol class="choice choice1">`` somewhere beneath it.
    """
    if li.find('p') is None:
        return False
    return li.find('ol', class_='choice choice1') is not None

def extract_text_with_sup(tag):
    """Extract the text of ``tag``, writing ``<sup>n</sup>`` as ``^n`` and ``<sub>n</sub>`` as ``_n``.

    The element tree is walked child by child so every text node is emitted
    exactly once.  (Iterating ``descendants`` and reading ``.string`` on tags
    repeats the text of any tag that wraps a single string, e.g. ``<b>y</b>``,
    and of text nested more than one level inside ``<sup>``/``<sub>``.)

    Args:
        tag: Element exposing ``children``; child elements expose ``name`` and
            ``get_text``.  Children without a ``name`` (or with ``name`` set to
            ``None``) are treated as text nodes.

    Returns:
        The concatenated text with leading and trailing whitespace removed.
    """
    markers = {'sup': '^', 'sub': '_'}
    pieces = []

    def walk(node):
        for child in node.children:
            name = getattr(child, 'name', None)
            if name in markers:
                # The whole super/subscript collapses to marker + stripped text;
                # do not descend, otherwise its content would be repeated.
                pieces.append(markers[name] + child.get_text(strip=True))
            elif name is None:
                # Plain text node.
                pieces.append(str(child))
            else:
                walk(child)

    walk(tag)
    return ''.join(pieces).strip()

def parse_questions_with_media(html):
    """Parse every question ``<li>`` found in ``html`` into a dict.

    A list item is treated as a question when ``is_question_li`` accepts it.
    For each one the result holds:

    * ``question`` - text of the first ``<p>`` (via ``extract_text_with_sup``)
    * ``tables`` / ``images`` - ``<table>`` / ``<img>`` nodes that are direct
      children of the item
    * ``options`` - text of the direct ``<li>`` children of the first
      ``<ol class="choice choice1">``
    * ``answer`` - ``None`` without a ``tooltiptext`` span, otherwise
      ``{'choice': <text of first <b>>, 'text': <remaining tooltip text>}``

    Note: ``<b>`` tags are removed from the tooltip of the parsed tree while
    the answer is read.
    """
    def read_answer(item):
        tip = item.find('span', class_='tooltiptext')
        if not tip:
            return None
        bold = tip.find('b')
        letter = bold.get_text(strip=True) if bold else None
        # Drop the bold label(s) so only the answer body is left in the tooltip.
        for label in tip.find_all('b'):
            label.extract()
        body = extract_text_with_sup(tip)
        if letter and letter in body:
            body = body.split(letter, 1)[-1].strip(': .\n')
        return {'choice': letter, 'text': body}

    def read_question(item):
        stem = item.find('p')
        stem_text = extract_text_with_sup(stem) if stem else ""

        table_markup = [
            extract_table_html(node) for node in item.find_all('table', recursive=False)
        ]
        image_info = [
            extract_img_info(node) for node in item.find_all('img', recursive=False)
        ]

        choice_list = item.find('ol', class_='choice choice1')
        choices = []
        if choice_list:
            choices = [
                extract_text_with_sup(choice)
                for choice in choice_list.find_all('li', recursive=False)
            ]

        return {
            'question': stem_text,
            'tables': table_markup,
            'images': image_info,
            'options': choices,
            'answer': read_answer(item),
        }

    soup = BeautifulSoup(html, 'html.parser')
    # Option items fail the is_question_li heuristic and are skipped here.
    return [read_question(item) for item in soup.find_all('li') if is_question_li(item)]

In [14]:
paper = 'XAT-2018-Question-Paper-QADI.html'
paper_path = '/'.join([dir_path, paper])

with open(paper_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Keep only what follows the last question-list opener.
text = text.rpartition('<ol class="ques ques1">')[2]

# "A-B-C.html" -> "A B - C": the title string to strip from the markup.
splits = paper.split('-')
title = '{} - {}'.format(' '.join(splits[:-1]), splits[-1].partition('.')[0])

text = (
    text.replace(title, '')
    .replace('<br/>', '')
    .replace('<h4>', '')
    .replace('</h4>', '')
)
result = parse_questions_with_media(text)

In [15]:
# Dump the parsed question list for inspection.
print(result)

[{'question': 'Find the value of the expression: 10 + 10\n    ^3\n    + 10\n    ^6\n    + 10\n    ^9', 'tables': [], 'images': [], 'options': ['1010101010', '1001000010', '1001000110', '1001001010', '100010001010'], 'answer': {'choice': 'Choice D', 'text': '1001001010'}}, {'question': 'Abdul, Bimal, Charlie and Dilbar can finish a task in 10, 12, 15 and 18 days respectively. They can either choose to work or remain absent on a particular day. If 50 percent of the total work gets completed after 3 days, then, which of the following options is possible?', 'tables': [], 'images': [], 'options': ['Each of them worked for exactly 2 days.', 'Bimal and Dilbar worked for 1 day each, Charlie worked for 2 days and Abdul worked for all 3 days.', 'Abdul and Charlie worked for 2 days each, Dilbar worked for 1 day and Bimal worked for all 3 days.', 'Abdul and Dilbar worked for 2 days each, Charlie worked for 1 day and Bimal worked for all 3 days.', 'Abdul and Charlie worked for 1 day each, Bimal wor

In [25]:
paper = 'XAT-2019-Question-Paper-VALR.html'
paper_path = f"{dir_path}/{paper}"

with open(paper_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Discard everything up to and including the last question-list opener.
text = text.rpartition('<ol class="ques ques1">')[2]

# Title string derived from the file name, removed from the markup below.
splits = paper.split('-')
title = ' '.join(splits[:-1]) + ' - ' + splits[-1].partition('.')[0]

text = text.replace(title, '')
text = text.replace('<br/>', '')
text = text.replace('<h4>', '')
text = text.replace('</h4>', '')

# Show the cleaned markup to inspect its structure.
print(text)


  <li>
   
    
   
   <p>
    Choose the option that would fill in the blanks meaningfully in the sentence(s) below:
    
    
    ______ the importance of ‘horizontal stratification’ ______ higher education is widely acknowledged, ______ attention has been applied to horizontal stratification ______ compulsory schooling.
   </p>
   <div class="section group">
    <div class="col span_3_of_4">
     <ol class="choice choice1">
      <li>
       Whereas; with: too less; to
      </li>
      <li>
       While; within: far less; within
      </li>
      <li>
       While; without: further less; within
      </li>
      <li>
       While; on: far fewer; about
      </li>
      <li>
       Whereas; about: for less; of
      </li>
     </ol>
    </div>
    <div class="col span_1_of_4">
     <div class="btn-group">
      <div class="tooltip">
       <button class="button" style="padding-top:12px; padding-bottom:13px">
        Correct Answer
       </button>
       <span class="tooltiptext">


In [30]:
# Path of the markdown file that is read and split into sections later in this cell.
extracted_path = "extracted/CAT-1991-Question-Paper-with-Solution/auto/CAT-1991-Question-Paper-with-Solution.md"

def split_question_types(markdown_text):
    """Split markdown into sections, each starting at a question-range instruction line.

    An instruction line starts with ``Q``, followed by two numbers separated by
    an en dash, a hyphen or ``to``, and then a colon (e.g. ``Q101 – 155: ...``
    or ``Q 12 to 22: ...``).  Text before the first such line forms its own
    section.

    Returns:
        The non-empty sections with surrounding whitespace stripped.
    """
    # Zero-width lookahead: the split happens *before* the instruction line,
    # so the line stays at the top of its section.  MULTILINE lets ^ and $
    # anchor on individual lines.
    header = re.compile(r'(?=^Q\s*\d+\s*(?:–|-|to)\s*\d+\s*:.*$)', re.MULTILINE)

    sections = []
    for chunk in header.split(markdown_text):
        chunk = chunk.strip()
        if chunk:
            sections.append(chunk)
    return sections

def extract_questions_from_section(section_text):
    """Extract the numbered questions and their options from one section.

    The section is cut wherever a new line starts with ``<digits>.`` followed
    by whitespace.  Chunks that do not begin with such a number (for example
    the instruction line of the section) are skipped.

    Options are recognised as ``(x) text`` where ``x`` is a single lowercase
    letter.  Limitations of that pattern: option text ends at the next
    parenthesis of any kind, and a parenthesised single lowercase letter
    inside the question text (e.g. ``f(x)``) is also taken as an option label.

    Args:
        section_text: Text of one section.

    Returns:
        A list of dicts with ``number`` (the question number as a string),
        ``question`` (the text before the first option, or the whole text when
        there are no options) and ``options`` (list of stripped option texts).
    """
    question_start = re.compile(r'\n(?=\d+\.\s)')  # new question number at line start
    numbered = re.compile(r'(\d+)\.\s*(.*)', re.DOTALL)
    opt_pattern = re.compile(r'\(([a-z])\)\s*([^()]+)')

    questions = []
    for qblock in question_start.split(section_text.strip()):
        m = numbered.match(qblock.strip())
        if not m:
            continue
        qnum, rest = m.groups()

        # One scan yields both the option texts and where the question text ends.
        opt_matches = list(opt_pattern.finditer(rest))
        question_end = opt_matches[0].start() if opt_matches else len(rest)

        questions.append({
            'number': qnum,
            'question': rest[:question_end].strip(),
            'options': [opt.group(2).strip() for opt in opt_matches]
        })
    return questions

# Load the extracted markdown (text mode is open()'s default) ...
with open(extracted_path, encoding='utf-8') as f:
    markdown_content = f.read()

# ... and cut it into one chunk per question-range instruction.
sections = split_question_types(markdown_content)

In [35]:
# Parse the second-to-last section into question dicts.
questions = extract_questions_from_section(sections[-2])

In [36]:
# Display the parsed questions.
questions

[{'number': '166',
  'question': 'Which share showed the greatest percentage increase in market value in any month during the entire period?',
  'options': ['A', 'B', 'C', 'D']},
 {'number': '167',
  'question': 'In which month was the greatest absolute change in market value for any share recorded?',
  'options': ['March', 'April', 'May', 'June']},
 {'number': '168',
  'question': 'In which month was the greatest percentage increase in market value for any share recorded?',
  'options': ['February', 'March', 'April', 'May']},
 {'number': '169',
  'question': 'An individual wishes to sell 1 share of C and 1 share of D to buy 1 share of A at the end of a month. At which month-end would the individual’s loss from this decision, due to share value changes, be the most?',
  'options': ['February', 'March', 'April', 'June']},
 {'number': '170',
  'question': 'An individual decides to sell 1 share of C and 1 share of D to buy 1 share of A at the end of the month. What can be the individual’s

In [21]:
# Model identifier and endpoint of the OpenAI-compatible server on localhost.
llm_id = 'gemma-3-4b-it'
server_url = 'http://127.0.0.1:1234/v1'
gen_client = OpenAI(api_key="lm-studio", base_url=server_url)

# System message sent at the start of every conversation.
system_prompt = dict(
    role="system",
    content="You are an AI data extraction assistant. You help the user extract the relevant data from the text provided",
)

# User message template; the {markdown} placeholder is filled in with str.format.
user_prompt = """
You are a AI based question extraction helper. You will be provided with raw markdown that contains the results of performing OCR on a pdf.
It has a lot of types of questions with instructions, all questions have a list of options. Your task is to extract the different questions from the markdown.
Before each type of question will be an instruction about them. 
Some instruction/questions will have image paths attached like : ![](images/cd1de4c1186edf8240cc4acc770c034a3b75c348e7272ed3d5d3e6f31b9337ac.jpg). 
Make sure to extract the img path as it is aloing with the instruction/question. Failure to extract them will lead to incomlete questions.
Extract the questions as it is, do not change the language or words.

Provided the answer in the format : List[Dictionary]
Each dictionary will have the following parameters : Instruction : str, Questions : List[Dictionary]
Each Question will have the parameters : question : str, options : List[str]

Here is the provided markdown : 
{markdown}
"""

In [22]:
# Ask the model to extract the questions contained in the last section.
context = gen_client.chat.completions.create(
    model=llm_id,
    temperature=0.3,
    messages=[
        system_prompt,
        {"role": "user", "content": user_prompt.format(markdown=sections[-1])},
    ],
)

In [23]:
# Print the message text of the first choice in the completion response.
print(context.choices[0].message.content)

```json
[
  {
    "Instruction": "Study the following graph and answer questions that follow. The x – axis denotes the years from 1983 to 1991.",
    "Questions": [
      {
        "question": "The sum of food and fertilizer production has shown a constant value for how many years?",
        "options": [
          "(a) None of the years",
          "(b) 2",
          "(c) 4",
          "(d) 5"
        ]
      },
      {
        "question": "If in 1988, the sum of the food and fertilizer production was 170 million tonnes, the value of food production must have been (approximately, in million tonnes) …",
        "options": [
          "(a) 90",
          "(b) 70",
          "(c) 100",
          "(d) Insufficient data"
        ]
      },
      {
        "question": "From its apparent behaviour, the food production in year 1992 can be expected to …",
        "options": [
          "(a) go up",
          "(b) go down",
          "(c) remain the same as previous year.",
          "(d) nothin