In [4]:
import re
import json
from typing import List, Dict
import glob 

In [7]:
def extract_chunks(tex_text: str) -> List[Dict[str, str]]:
    """
    Parses LaTeX content and extracts sections, text blocks, and equations.
    Returns a list of dictionaries with keys: section, text, equation
    """
    sections = re.split(r'\\section\*?\{(.+?)\}', tex_text)
    chunks = []

    for i in range(1, len(sections), 2):
        section_title = sections[i].strip()
        content = sections[i + 1]

        # Extract equations from equation and align environments
        eq_pattern = r'\\begin\{(?:equation|align)\*?\}(.*?)\\end\{(?:equation|align)\*?\}'
        equations = re.findall(eq_pattern, content, re.DOTALL)
        text_blocks = re.split(eq_pattern, content, flags=re.DOTALL)

        # Combine text blocks and equations into chunks
        eq_idx = 0
        for idx, text in enumerate(text_blocks):
            # Even index = text, odd = equation (because of how re.split works with capture group)
            if idx % 2 == 0:
                chunk = {
                    "section": section_title,
                    "text": re.sub(r'\\\w+', '', text.strip()),
                    "equation": equations[eq_idx].strip() if eq_idx < len(equations) else ""
                }
                chunks.append(chunk)
            else:
                eq_idx += 1

    return chunks

def save_chunks_to_json(chunks: List[Dict[str, str]], output_path: str = "chunks.json") -> None:
    with open(output_path, 'w') as f:
        json.dump(chunks, f, indent=2)

# if __name__ == "__main__":
tex_files = glob.glob("*.tex")
for texfile in tex_files:
    with open("ICML_2016_deep_learning_uncertainty.tex", "r") as f:
        tex_content = f.read()
    parsed_chunks = extract_chunks(tex_content)
    save_chunks_to_json(parsed_chunks)


In [10]:
# Unit test 
# test_tex_parser.py
import unittest
from tex_parser import extract_chunks

class TestTexParser(unittest.TestCase):

    def test_extract_chunks_basic(self):
        sample_tex = r"""
        \section{Introduction}
        This is an intro paragraph.
        \begin{equation}
        E = mc^2
        \end{equation}
        More explanation here.

        \section{Cooling Function}
        The cooling rate is defined as:
        \begin{align*}
        \Lambda(T) &= n_e n_H \Lambda_T \\
        \end{align*}
        It depends on temperature.
        """
        chunks = extract_chunks(sample_tex)
        self.assertEqual(len(chunks), 4)
        self.assertEqual(chunks[0]['section'], 'Introduction')
        self.assertIn('intro paragraph', chunks[0]['text'])
        self.assertEqual(chunks[1]['equation'], 'E = mc^2')
        self.assertEqual(chunks[2]['section'], 'Cooling Function')
        self.assertIn('cooling rate', chunks[2]['text'])
        self.assertIn('temperature', chunks[3]['text'])

if __name__ == "__main__":
    unittest.main()


E
ERROR: /home/100/jt4478/ (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute '/home/100/jt4478/'

----------------------------------------------------------------------
Ran 1 test in 0.001s

FAILED (errors=1)


SystemExit: True

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
