In [25]:
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import os
import json

In [6]:
# Folder whose directory listing supplies the paper file names.
dir_path = 'mba-pyq'
papers = os.listdir(dir_path)

# Partition the listing by name: entries containing 'XAT' go to xat_papers,
# every other entry goes to cat_dilr.
xat_papers, cat_dilr = [], []
for paper in papers:
    (xat_papers if 'XAT' in paper else cat_dilr).append(paper)

In [27]:
def extract_table_html(table_tag):
    """Return the string form of ``table_tag`` (the table's HTML for a parsed tag)."""
    markup = str(table_tag)
    return markup

def extract_img_info(img_tag):
    """Collect the ``src``, ``alt`` and ``title`` attributes of an image tag.

    Each value is whatever ``img_tag.get`` returns for that attribute name,
    so an attribute that is not present maps to ``None``.
    """
    return {attr: img_tag.get(attr) for attr in ('src', 'alt', 'title')}

def parse_questions_with_media(html):
    """Group each top-level passage with the media and questions that follow it.

    The markup is parsed with BeautifulSoup's ``html.parser`` and only the
    document's direct ``<p>``, ``<table>``, ``<img>`` and ``<li>`` children
    are walked, in order. A ``<p>`` opens a passage; the run of tables/images
    right after it is attached to that passage; every ``<li>`` seen before
    the next ``<p>`` is read as a question.

    Args:
        html: Markup string to parse.

    Returns:
        A list of dicts with keys ``passage_text``, ``tables`` (HTML strings),
        ``images`` (dicts from ``extract_img_info``) and ``questions``. Each
        question is a dict with ``question``, ``options`` and ``answer``;
        ``answer`` is ``None`` when no ``span.tooltiptext`` is found,
        otherwise ``{'choice': ..., 'text': ...}``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Direct children only; nested tags are reached later through their <li>.
    nodes = list(soup.find_all(['p', 'table', 'img', 'li'], recursive=False))
    if not nodes:
        # Fallback: scan the raw child list and keep just the tags of interest.
        nodes = [child for child in soup.contents if getattr(child, 'name', None) in {'p', 'table', 'img', 'li'}]

    def starts_passage(node):
        # A <p> that does not sit inside an <li> opens a new passage.
        return node.name == 'p' and not node.find_parent('li')

    def is_question(node):
        # Question items are <li> tags that are neither answer options nor nested in another <li>.
        if node.name != 'li':
            return False
        if node.find_parent('ol', class_='choice choice1'):
            return False
        return not node.find_parent('li')

    def read_answer(node):
        tooltip = node.find('span', class_='tooltiptext')
        if not tooltip:
            return None
        bold = tooltip.find('b')
        letter = bold.get_text(strip=True) if bold else None
        remainder = tooltip.get_text(strip=True)
        if letter and letter in remainder:
            # Drop everything up to and including the bold label, then trim separators.
            remainder = remainder.split(letter, 1)[-1].strip(': .\n')
        return {'choice': letter, 'text': remainder}

    def read_question(node):
        stem = node.find('p')
        choice_list = node.find('ol', class_='choice choice1')
        options = []
        if choice_list:
            options = [item.get_text(strip=True) for item in choice_list.find_all('li', recursive=False)]
        return {
            'question': stem.get_text(strip=True) if stem else "",
            'options': options,
            'answer': read_answer(node)
        }

    results = []
    total = len(nodes)
    cursor = 0
    while cursor < total:
        head = nodes[cursor]
        if not starts_passage(head):
            cursor += 1
            continue

        # Media run immediately after the passage paragraph.
        tables, images = [], []
        cursor += 1
        while cursor < total and nodes[cursor].name in {'table', 'img'}:
            media = nodes[cursor]
            if media.name == 'table':
                tables.append(extract_table_html(media))
            else:
                images.append(extract_img_info(media))
            cursor += 1

        # Everything up to the next passage paragraph belongs to this passage.
        questions = []
        while cursor < total and not starts_passage(nodes[cursor]):
            if is_question(nodes[cursor]):
                questions.append(read_question(nodes[cursor]))
            cursor += 1

        results.append({
            'passage_text': head.get_text(strip=True),
            'tables': tables,
            'images': images,
            'questions': questions
        })

    return results

In [40]:
paper = 'XAT-2019-Question-Paper-GK.html'
paper_path = dir_path + '/' + paper

with open(paper_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Keep only the markup after the last question-list opener.
text = text.split('<ol class="ques ques1">')[-1]

# Title string derived from the file name ('XAT 2019 Question Paper - GK');
# it is stripped from the markup together with <br/> and <h4> tags.
splits = paper.split('-')
title = ' '.join(splits[:-1]) + ' - ' + splits[-1].split('.')[0]
text = text.replace(title, '').replace('<br/>', '').replace('<h4>', '').replace('</h4>', '')
text = parse_questions_with_media(text)

# Round-trip through JSON with json.loads rather than eval(): JSON text is
# not Python source, so a missing answer (None -> "null") made eval() raise
# NameError, and eval() would also execute anything embedded in scraped text.
data = json.loads(json.dumps(text, indent=2, ensure_ascii=False))
print(type(data))
    

<class 'list'>


In [55]:
def extract_table_html(table_tag):
    """Convert ``table_tag`` to a string and return it."""
    html_text = str(table_tag)
    return html_text

def extract_img_info(img_tag):
    """Return a dict holding the tag's ``src``, ``alt`` and ``title`` values.

    Values come from ``img_tag.get``; attributes that are not set yield ``None``.
    """
    info = {}
    for attribute in ('src', 'alt', 'title'):
        info[attribute] = img_tag.get(attribute)
    return info

def is_question_li(li):
    """Heuristically decide whether ``li`` is a question item.

    An item counts as a question when it contains both a ``<p>`` and an
    ``<ol class="choice choice1">`` somewhere beneath it.

    Returns:
        ``True`` when both are found, ``False`` otherwise.
    """
    if li.find('p') is None:
        return False
    return li.find('ol', class_='choice choice1') is not None

def parse_questions_with_media(html):
    """Build one record per question ``<li>`` found anywhere in ``html``.

    The markup is parsed with BeautifulSoup's ``html.parser``; every ``<li>``
    accepted by ``is_question_li`` becomes a record.

    Args:
        html: Markup string to parse.

    Returns:
        A list of dicts with keys ``question`` (text of the first ``<p>``),
        ``tables`` and ``images`` (from ``<table>``/``<img>`` tags that are
        direct children of the ``<li>``), ``options`` (text of the direct
        ``<li>`` children of the choice list) and ``answer`` (``None`` when
        no ``span.tooltiptext`` exists, else ``{'choice': ..., 'text': ...}``).
    """
    soup = BeautifulSoup(html, 'html.parser')
    records = []

    # Option items are <li> tags too, so filter down to likely questions.
    for item in filter(is_question_li, soup.find_all('li')):
        stem = item.find('p')
        choice_list = item.find('ol', class_='choice choice1')
        tooltip = item.find('span', class_='tooltiptext')

        answer = None
        if tooltip:
            bold = tooltip.find('b')
            label = bold.get_text(strip=True) if bold else None
            remainder = tooltip.get_text(strip=True)
            if label and label in remainder:
                # Drop everything up to and including the bold label, then trim separators.
                remainder = remainder.split(label, 1)[-1].strip(': .\n')
            answer = {'choice': label, 'text': remainder}

        records.append({
            'question': stem.get_text(strip=True) if stem else "",
            'tables': [extract_table_html(tbl) for tbl in item.find_all('table', recursive=False)],
            'images': [extract_img_info(pic) for pic in item.find_all('img', recursive=False)],
            'options': [opt.get_text(strip=True) for opt in choice_list.find_all('li', recursive=False)] if choice_list else [],
            'answer': answer
        })

    return records

In [62]:
paper = 'XAT-2018-Question-Paper-GK.html'
paper_path = f"{dir_path}/{paper}"

with open(paper_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Keep only the markup after the last question-list opener.
text = text.split('<ol class="ques ques1">')[-1]

# Title string derived from the file name ('XAT 2018 Question Paper - GK').
splits = paper.split('-')
title = f"{' '.join(splits[:-1])} - {splits[-1].split('.')[0]}"

# Strip the title plus <br/> and <h4> tags before parsing.
text = (
    text.replace(title, '')
    .replace('<br/>', '')
    .replace('<h4>', '')
    .replace('</h4>', '')
)
result = parse_questions_with_media(text)

In [63]:
# Print the full list of parsed question records for a visual check.
print(result)

[{'question': 'Which of the following is not a Cryptocurrency?', 'tables': [], 'images': [], 'options': ['Bitcoin', 'Laxmicoin', 'Etherium', 'Paypal', 'Litecoin'], 'answer': {'choice': 'Choice D', 'text': 'Paypal'}}, {'question': 'Who won the Nobel Prize for Economics in 2017?', 'tables': [], 'images': [], 'options': ['Richard Thaler', 'Raghuram Rajan', 'Jean Tirole', 'Bengt Holmstrom', 'Oliver Hart'], 'answer': {'choice': 'Choice A', 'text': 'Richard Thaler'}}, {'question': 'What is Showrooming?', 'tables': [], 'images': [], 'options': ['An individual browses the sites of e-retailers for products and then ends up purchasing the product from some brick and mortar store', 'An individual visiting a brick and mortar store to have a look and feel of the product and then ordering the same product through some e-retailers.', 'An individual visiting a big box retailer to have a look and feel of the product and then purchasing the same product from nearby kirana (Mom and Pop) store.', 'An e-re