In [1]:
import re
import fitz  # PyMuPDF

def extract_text_chunks(pdf_path):
    """
    Split a PDF's text into classified chunks tagged with their unit/section.

    Every non-blank line of every page is classified, in this priority order:

    * starts with ``Unit <digits>``               -> becomes the current unit
      (no chunk is emitted)
    * starts with ``<digits>.<digits> – `` (en dash) -> becomes the current
      section (no chunk is emitted)
    * starts with ``<word> Energy`` or contains ``Velocity`` -> "definition"
    * looks like ``<token> = ...``                -> "equation"
    * contains ``Example``                        -> "example"
    * anything else                               -> "paragraph"; consecutive
      paragraph lines under the same unit and section are joined with a
      single space.

    Args:
        pdf_path: Path of the PDF to read; passed straight to ``fitz.open``.

    Returns:
        list[dict]: Chunks in reading order, each with the keys ``"unit"``,
        ``"section"``, ``"type"`` and ``"content"``. ``"unit"`` and
        ``"section"`` are ``None`` until a matching header has been seen.
    """
    chunks = []
    current_unit, current_section = None, None

    def add_chunk(chunk_type, content):
        # Reads the unit/section that are current at call time.
        chunks.append({
            "unit": current_unit,
            "section": current_section,
            "type": chunk_type,
            "content": content,
        })

    # The context manager releases the PDF handle even if parsing raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for line in page.get_text("text").split("\n"):
                content = line.strip()
                if not content:
                    # Blank lines (including the empty string left after the
                    # final "\n") carry no text; skipping them avoids empty
                    # paragraph chunks and stray spaces inside paragraphs.
                    continue

                if re.match(r"^Unit \d+", line):  # Unit header
                    current_unit = content
                elif re.match(r"^\d+\.\d+ – ", line):  # Section header
                    current_section = content
                elif re.match(r"^\w+ Energy", line) or "Velocity" in line:  # Concept definition
                    add_chunk("definition", content)
                elif re.match(r"^\s*[A-Za-z0-9]+\s*=\s*.*$", line):  # Equation
                    add_chunk("equation", content)
                elif "Example" in line:  # Example problem
                    add_chunk("example", content)
                else:  # General text (paragraphs)
                    last = chunks[-1] if chunks else None
                    # Only extend the previous paragraph when it belongs to the
                    # same unit and section; otherwise text following a new
                    # header would keep the previous header's tags.
                    if (
                        last is not None
                        and last["type"] == "paragraph"
                        and last["unit"] == current_unit
                        and last["section"] == current_section
                    ):
                        last["content"] += " " + content
                    else:
                        add_chunk("paragraph", content)

    return chunks

In [None]:
import os
import re

import fitz  # PyMuPDF

def extract_text_and_figures(pdf_path, image_output_folder="figures"):
    """
    Extract classified text chunks and embedded images from a PDF.

    Text handling: every non-blank line of every page is classified, in this
    priority order:

    * starts with ``Unit <digits>``               -> becomes the current unit
      (no chunk is emitted)
    * starts with ``<digits>.<digits> – `` (en dash) -> becomes the current
      section (no chunk is emitted)
    * starts with ``<word> Energy`` or contains ``Velocity`` -> "definition"
    * looks like ``<token> = ...``                -> "equation"
    * contains ``Example``                        -> "example"
    * anything else                               -> "paragraph"; consecutive
      paragraph lines under the same unit and section are joined with a
      single space.

    Image handling: after a page's text, each image listed by
    ``page.get_images(full=True)`` is written to
    ``<image_output_folder>/figure_<page>_<index>.<ext>`` and recorded as a
    "figure" chunk. Captions are not extracted.

    Args:
        pdf_path: Path of the PDF to read; passed straight to ``fitz.open``.
        image_output_folder: Directory that receives the image files. It is
            created (including parents) if it does not exist yet.

    Returns:
        list[dict]: Chunks in reading order. Text chunks have the keys
        ``"unit"``, ``"section"``, ``"type"`` and ``"content"``; figure chunks
        have ``"unit"``, ``"section"``, ``"type"``, ``"image_path"`` and the
        1-based ``"page"`` number.
    """
    chunks = []
    current_unit, current_section = None, None

    def add_text_chunk(chunk_type, content):
        # Reads the unit/section that are current at call time.
        chunks.append({
            "unit": current_unit,
            "section": current_section,
            "type": chunk_type,
            "content": content,
        })

    # Without this, writing the first image fails with FileNotFoundError
    # whenever the output folder has not been created beforehand.
    os.makedirs(image_output_folder, exist_ok=True)

    # The context manager releases the PDF handle even if parsing raises.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            for line in page.get_text("text").split("\n"):
                content = line.strip()
                if not content:
                    # Blank lines (including the empty string left after the
                    # final "\n") carry no text; skipping them avoids empty
                    # paragraph chunks and stray spaces inside paragraphs.
                    continue

                if re.match(r"^Unit \d+", line):  # Unit header
                    current_unit = content
                elif re.match(r"^\d+\.\d+ – ", line):  # Section header
                    current_section = content
                elif re.match(r"^\w+ Energy", line) or "Velocity" in line:  # Concept definition
                    add_text_chunk("definition", content)
                elif re.match(r"^\s*[A-Za-z0-9]+\s*=\s*.*$", line):  # Equation
                    add_text_chunk("equation", content)
                elif "Example" in line:  # Example problem
                    add_text_chunk("example", content)
                else:  # General text (paragraphs)
                    last = chunks[-1] if chunks else None
                    # Only extend the previous paragraph when it belongs to the
                    # same unit and section; otherwise text following a new
                    # header would keep the previous header's tags.
                    if (
                        last is not None
                        and last["type"] == "paragraph"
                        and last["unit"] == current_unit
                        and last["section"] == current_section
                    ):
                        last["content"] += " " + content
                    else:
                        add_text_chunk("paragraph", content)

            # Save the images embedded on this page.
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # Image reference number
                img_data = doc.extract_image(xref)
                img_bytes = img_data["image"]
                img_ext = img_data["ext"]

                img_filename = f"{image_output_folder}/figure_{page_num+1}_{img_index}.{img_ext}"
                with open(img_filename, "wb") as img_file:
                    img_file.write(img_bytes)

                chunks.append({
                    "unit": current_unit,
                    "section": current_section,
                    "type": "figure",
                    "image_path": img_filename,
                    "page": page_num + 1
                })

    return chunks

In [2]:
def extract_equations(text):
    """
    Pull equation-like lines out of a block of text.

    A line counts as an equation when, after optional leading spaces/tabs, it
    starts with a letter or digit and contains an ``=`` somewhere later on the
    same line. Each match is stripped of surrounding whitespace and stored
    verbatim under the ``"latex"`` key; no conversion to LaTeX is performed
    here.

    Args:
        text: The text to scan. Empty or ``None`` input yields an empty list.

    Returns:
        list[dict]: One dict per matching line, in order of appearance, with
        the keys ``"type"`` (always ``"equation"``), ``"latex"`` (the stripped
        line), ``"mathml"`` and ``"vector_embedding"`` (both ``None``
        placeholders).
    """
    if not text:
        return []

    # Line-anchored with MULTILINE so equations on the first line, or on a
    # last line without a trailing newline, are found too. The escapes are
    # single backslashes inside a raw string; doubling them would make the
    # regex look for literal backslash characters instead of whitespace.
    equation_pattern = r"^[ \t]*[A-Za-z0-9]+.*=.*$"
    equations = re.findall(equation_pattern, text, flags=re.MULTILINE)

    return [
        {
            "type": "equation",
            "latex": eq.strip(),  # Raw equation text, stored under the "latex" key
            "mathml": None,  # Placeholder for MathML conversion if needed
            "vector_embedding": None,  # Placeholder for future semantic search
        }
        for eq in equations
    ]

In [None]:
from sympy import sympify
from sympy.printing.mathml import mathml

def convert_latex_to_mathml(latex_eq):
    """
    Convert an expression string to MathML for web rendering.

    The string is parsed with ``sympify`` and the resulting expression is
    rendered with ``mathml``.

    NOTE(review): despite the function name, ``sympify`` parses SymPy/Python
    style expressions (e.g. ``"m*a"``), not LaTeX markup, and SymPy documents
    that it evaluates its input -- do not pass untrusted text. Inputs it
    cannot parse (which likely includes strings with a bare ``=``) yield
    ``None``.

    Args:
        latex_eq: The expression string to convert.

    Returns:
        str | None: The MathML markup, or ``None`` when parsing or rendering
        fails.
    """
    try:
        sympy_expr = sympify(latex_eq)
        return mathml(sympy_expr)
    except Exception:
        # Best-effort conversion: report failure as None. Catching Exception
        # rather than using a bare ``except`` lets KeyboardInterrupt and
        # SystemExit propagate, so the notebook can still be interrupted.
        return None