In [102]:
import os
import fitz  # PyMuPDF
from PyPDF2 import PdfMerger
import utils
from tqdm import tqdm
import re
from openai import OpenAI
import json
import pdfplumber
import nltk

nltk.download("punkt")
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from dotenv import load_dotenv

[nltk_data] Downloading package punkt to /home/jd_wsl0/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jd_wsl0/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [95]:
import logging

# Configure the root logger once for the whole notebook (INFO and above).
# NOTE: once the root logger has a handler, any later logging.basicConfig(...)
# call without force=True does nothing.
logging.basicConfig(level=logging.INFO)

In [118]:
import importlib
import utils
# Re-execute the already-imported utils module so that edits made to utils.py
# are picked up without restarting the kernel.
importlib.reload(utils)

[nltk_data] Downloading package punkt to /home/jd_wsl0/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jd_wsl0/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


<module 'utils' from '/home/jd_wsl0/llm-z-camp/PhysBot/utils.py'>

In [8]:
# Input folder holding the chapter PDFs and the path of the combined output PDF.
pdf_folder = "data/chapters/"  # Replace with actual folder path
output_pdf = "data/Bare_Essentials_Only.pdf"

# Extraction step is currently disabled; uncomment to regenerate output_pdf.
# essentials = utils.extract_first_page(pdf_folder, output_pdf)

Bare Essentials pages saved to data/Bare_Essentials_Only.pdf


In [24]:
# Populate the process environment from the local .env file.
load_dotenv(".env")  # Point this at the right path if .env lives in a parent folder

# The OpenAI key must come from the environment; it is never hardcoded here.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Fail fast when the key is missing or empty.
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    raise ValueError("OPENAI_API_KEY is not set. Check your .env file.")

In [51]:
# Extract concepts from the Bare Essentials PDF and save them to output_json_path.
# NOTE(review): the logged output of this cell shows OpenAI chat-completions
# requests, so re-running it presumably incurs API calls — confirm in
# utils.process_bare_essentials before re-executing.
pdf_path = "data/Bare_Essentials_Only.pdf"  # Replace with actual path
output_json_path = "data/json/bare_essentials_concepts.json"
concepts_json = utils.process_bare_essentials(pdf_path, output_json_path)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"                 | 0/35 [00:00<?, ?section/s]
INFO:root:Raw OpenAI response: [{"concept": "Physics", "definition": "The study of matter, energy, and their interactions.", "equation": "", "notes": []}, {"concept": "Four Fundamental Interactions", "definition": "Interactions that describe the universe with different relative strengths.", "equation": "", "notes": ["Strong Nuclear Interaction: Short-range interaction between quarks and gluons responsible for keeping quarks together in nucleons, and nucleons together in nuclei.", "Weak Nuclear Interaction: Short-range interaction responsible for nuclear radioactivity.", "Gravitational Interaction: Long-range, attractive interaction between massive particles.", "Electromagnetic Interaction: Long-range interaction between electrically charged particles."]}, {"concept": "Standard Model", "definition": "The most widely accepted fundamental model for our un

Extracted concepts saved to data/json/bare_essentials_concepts.json





In [52]:
# Load the extracted concepts and pretty-print them for a visual check.
# The encoding is explicit so decoding does not depend on the platform locale
# (the other JSON reads in this notebook also use UTF-8).
with open("data/json/bare_essentials_concepts.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(json.dumps(data, indent=4))

[
    {
        "unit": null,
        "section": "Bare Essentials",
        "concept": "Physics",
        "definition": "The study of matter, energy, and their interactions.",
        "equation": "$$",
        "notes": []
    },
    {
        "unit": null,
        "section": "Bare Essentials",
        "concept": "Four Fundamental Interactions",
        "definition": "Interactions that describe the universe with different relative strengths.",
        "equation": "$$",
        "notes": [
            "Strong Nuclear Interaction: Short-range interaction between quarks and gluons responsible for keeping quarks together in nucleons, and nucleons together in nuclei.",
            "Weak Nuclear Interaction: Short-range interaction responsible for nuclear radioactivity.",
            "Gravitational Interaction: Long-range, attractive interaction between massive particles.",
            "Electromagnetic Interaction: Long-range interaction between electrically charged particles."
        ]
    }

In [69]:
# Matches either a LaTeX control sequence (group 1: a backslash followed by a
# command name or by one single character, e.g. "\approx", "\\", "\{") or a
# doubled character from the known extraction-duplicate set (group 2).
# Matching control sequences first keeps command names such as \approx, \ell
# or \ll from being "de-duplicated" into invalid commands.
_DUPLICATE_OR_COMMAND = re.compile(
    r"(\\(?:[a-zA-Z]+|.))|([a-zA-Z𝒓𝚫𝑲𝒗𝑭𝒂𝒑𝒎𝑻𝑼𝝉𝒓𝑳𝜽𝜌𝑮𝒈])\2"
)

# A LaTeX row break ("\\") together with any spaces/tabs directly around it.
_ROW_BREAK = re.compile(r"[ \t]*\\\\[ \t]*")


def clean_latex_equation(equation):
    """
    Normalize a raw equation string into inline LaTeX (``$...$``).

    Steps, applied in this order:
    1. Empty input, whitespace-only input and the placeholders "$$", "$ $",
       "$$ $$" yield "".
    2. Surrounding whitespace is trimmed, then the text is wrapped in ``$...$``
       unless it already starts with ``$``.
    3. Doubled characters (ASCII letters and the listed math-italic/bold
       symbols, e.g. "𝒓𝒓" → "𝒓") are collapsed, except inside LaTeX command
       names.
    4. A letter directly followed by digits becomes a subscript (x1 → x_{1}).
    5. A numeric exponent is braced (x^2 → x^{2}).
    6. Simple inline fractions become ``\\frac`` (a/b → \\frac{a}{b}).
    7. Row breaks ("\\\\") are surrounded by exactly one space on each side.

    Parameters:
        equation (str | None): Raw equation text.

    Returns:
        str: The cleaned equation, or "" when there is nothing to clean.
    """
    if not equation:
        return ""

    # Trim first so that " $x$ " is recognised as already wrapped instead of
    # being wrapped a second time.
    equation = equation.strip()
    if not equation or equation in ["$$", "$ $", "$$ $$"]:
        return ""

    # Ensure LaTeX math wrapping
    if not equation.startswith("$"):
        equation = f"${equation}$"

    # Remove duplicated characters (e.g., "𝒓𝒓" → "𝒓") while leaving control
    # sequences untouched.
    equation = _DUPLICATE_OR_COMMAND.sub(
        lambda m: m.group(1) or m.group(2), equation
    )

    # Convert trailing digits and plain exponents to braced LaTeX form
    equation = re.sub(r"([a-zA-Z])(\d+)", r"\1_{\2}", equation)  # x1 → x_{1}
    equation = re.sub(r"(\w+)\^(\d+)", r"\1^{\2}", equation)  # x^2 → x^{2}

    # Convert inline fractions (a/b → \frac{a}{b})
    equation = re.sub(r"(\w+)\s*/\s*(\w+)", r"\\frac{\1}{\2}", equation)

    # Normalize spacing around matrix row breaks. Existing spaces are absorbed,
    # so cleaning an already-cleaned equation does not keep adding padding.
    equation = _ROW_BREAK.sub(lambda _m: " \\\\ ", equation)

    return equation.strip()


def post_process_json(json_file, output_file):
    """
    Clean the "equation" field of every entry in a JSON file and save a copy.

    Parameters:
        json_file (str): Path of the JSON file to read (UTF-8).
        output_file (str): Path the cleaned JSON is written to (UTF-8,
            4-space indent, non-ASCII characters kept as-is).

    Entries with no "equation" key, or with a falsy equation value, are
    written back unchanged. A confirmation line is printed when done.
    """
    with open(json_file, "r", encoding="utf-8") as source:
        records = json.load(source)

    for record in records:
        # Nothing to clean when the key is absent or the value is empty.
        if "equation" not in record or not record["equation"]:
            continue
        record["equation"] = clean_latex_equation(record["equation"])

    with open(output_file, "w", encoding="utf-8") as target:
        json.dump(records, target, indent=4, ensure_ascii=False)

    print(f"Post-processing complete. Cleaned JSON saved to {output_file}")



In [75]:
# Example usage: clean the equations of the extracted concepts and write the
# result to a separate file, leaving the raw extraction untouched.
post_process_json("data/json/bare_essentials_concepts.json", "data/json/cleaned_essentials_equations.json")

Post-processing complete. Cleaned JSON saved to data/json/cleaned_essentials_equations.json


In [77]:
# Placeholder strings that carry no actual equation.
_EMPTY_EQUATIONS = ("$$", "$ $", "$$ $$", "$")

# Equations must start and end with a single `$`; the body may span lines.
_INLINE_MATH = re.compile(r"^\$(?!\$)[\s\S]*[^$]\$$")

# Opening bracket -> closing bracket; counts of each pair must agree.
_BRACKET_PAIRS = {"{": "}", "[": "]", "(": ")"}

# \frac, \begin and \end must be followed by `{` (whitespace allowed).
# The (?![a-zA-Z]) guard keeps longer command names from matching as a prefix.
_MISSING_BRACE = re.compile(r"\\(?:frac|begin|end)(?![a-zA-Z])(?!\s*\{)")

# \left / \right take a delimiter, not `{`, so they are checked for pairing
# instead. The guard excludes \leftarrow, \rightarrow and similar commands.
_LEFT_DELIM = re.compile(r"\\left(?![a-zA-Z])")
_RIGHT_DELIM = re.compile(r"\\right(?![a-zA-Z])")


def _find_equation_issues(concept, equation):
    """Return the problem descriptions for one non-empty, stripped equation."""
    issues = []

    # 1. Empty or meaningless equation, else 2. missing `$...$` math mode.
    if equation in _EMPTY_EQUATIONS or not equation.strip("$").strip():
        issues.append(f"Empty equation found for '{concept}': {equation}")
    elif not _INLINE_MATH.match(equation):
        issues.append(f"Equation not properly formatted for '{concept}': {equation}")

    # 3. Unmatched brackets (reported once, whichever pair is unbalanced).
    if any(equation.count(o) != equation.count(c) for o, c in _BRACKET_PAIRS.items()):
        issues.append(f"Unmatched brackets in equation for '{concept}': {equation}")

    # 4. Suspicious command usage (reported once per equation).
    unpaired_delims = len(_LEFT_DELIM.findall(equation)) != len(_RIGHT_DELIM.findall(equation))
    if _MISSING_BRACE.search(equation) or unpaired_delims:
        issues.append(f"Potential LaTeX syntax issue in '{concept}': {equation}")

    return issues


def validate_equations(json_file, log_file="invalid_equations.log"):
    """
    Scans a cleaned JSON file and logs any ill-formed equations for further debugging.

    Issues are appended to ``log_file`` through a file handler that exists only
    for the duration of the call. This works even when the root logger has
    already been configured, a situation in which ``logging.basicConfig`` would
    silently ignore the requested file. The log file is only opened when at
    least one issue is found. Records still propagate to the root logger.

    Parameters:
        json_file (str): Path to the cleaned JSON file (a list of dict entries).
        log_file (str): Path to the log file to store any detected issues.

    Returns:
        None. A one-line summary is printed.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    invalid_equations = []

    for entry in data:
        concept = entry.get("concept", "Unknown Concept")
        raw_equation = entry.get("equation")

        # JSON null / missing key: there is no equation to validate.
        if raw_equation is None:
            continue

        # Anything other than text cannot be a LaTeX equation.
        if not isinstance(raw_equation, str):
            invalid_equations.append(
                f"Equation not properly formatted for '{concept}': {raw_equation}"
            )
            continue

        equation = raw_equation.strip()
        if equation:
            invalid_equations.extend(_find_equation_issues(concept, equation))

    # Log issues found
    if invalid_equations:
        logger = logging.getLogger("equation_validation")
        logger.setLevel(logging.WARNING)
        handler = logging.FileHandler(log_file, encoding="utf-8")
        handler.setLevel(logging.WARNING)
        handler.setFormatter(
            logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        )
        logger.addHandler(handler)
        try:
            logger.warning(f"{len(invalid_equations)} ill-formed equations detected.")
            for eq in invalid_equations:
                logger.warning(eq)
        finally:
            # Detach and close so repeated calls neither duplicate lines nor
            # leak open file handles.
            logger.removeHandler(handler)
            handler.close()
        print(f"Validation complete. Found {len(invalid_equations)} issues. Check '{log_file}' for details.")
    else:
        print("Validation complete. No issues detected.")

In [78]:
# Check the cleaned equations; any problems are reported in invalid_equations.log.
validate_equations("data/json/cleaned_essentials_equations.json", log_file="invalid_equations.log")

Validation complete. No issues detected.


In [119]:
# Try the text-chunk extraction on a single chapter PDF; the chunks are also
# written to the JSON file given as output_json.
pdf_path = "data/chapters/Unit 119 - Newtons Laws.pdf"

text_extract_test = utils.extract_text_chunks(pdf_path, output_json="data/json/Unit_119_text.json")

Text extraction complete! Data saved to data/json/Unit_119_text.json


In [120]:
# Load the extracted text chunks and pretty-print them for a visual check.
# The encoding is explicit so decoding does not depend on the platform locale
# (the other JSON reads in this notebook also use UTF-8).
with open("data/json/Unit_119_text.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(json.dumps(data, indent=4))

[
    {
        "unit": null,
        "section": null,
        "chunk_type": "paragraph",
        "content": "119 \u2013 Newton \u2019 s Laws Newton \u2019 s laws of motion give us another way to analyze physical systems . By using Newton \u2019 s laws , we can directly relate how the net force on a set of objects is related to acceleration . Then , using kinematic relationships studied in the next two units , we will be able to directly relate the acceleration of objects to their change in velocity and position . The Bare Essentials \uf0b7 Isaac Newton proposed three \u2018 laws \u2019 for classical mechanics \uf0b7 Note : Newton \u2019 s laws and free-body diagrams are done with that hold for inertial , non-relativistic systems . respect to individual objects . This is quite different than the conservation of momentum and energy where the equations \uf0b7 Newton \u2019 s 1st law states that the velocity of an object does not were for the entire system ! change if it is not interactin