In [8]:
import re

def process_latex_file(input_file, output_file):
    # Read the file content
    with open(input_file, 'r') as file:
        content = file.read()
    
    # Process duplicate chapter sections
    # We'll use a regex to find all chapter headings and keep only the first occurrence of each
    seen_chapters = set()
    
    def chapter_replacer(match):
        chapter_title = match.group(1)
        if chapter_title in seen_chapters:
            return ""  # Remove duplicate chapter heading
        else:
            seen_chapters.add(chapter_title)
            return match.group(0)  # Keep the first occurrence
    
    # Replace chapter headings (only keeping first occurrence of each)
    chapter_pattern = r'\\section\*\{(Chapter \d+[^}]*)\}'
    content = re.sub(chapter_pattern, chapter_replacer, content)
    
    # Convert other section commands to plain text
    other_section_pattern = r'\\section\*\{([^}]*)\}'
    content = re.sub(other_section_pattern, r'\1', content)
    
    # Write the modified content to the output file
    with open(output_file, 'w') as file:
        file.write(content)

# Usage
input_file = 'latex/latex.tex'
output_file = 'latex/latex_modified.tex'
process_latex_file(input_file, output_file)

In [17]:
import re
import os

def split_latex(input_file, output_dir):
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Find chapters with their positions
    chapter_pattern = re.compile(r'\\section\*\{Chapter\s+(\d+)[^\}]*\}')
    chapters = list(chapter_pattern.finditer(content))
    # Append end of document sentinel
    chapters.append(re.Match)  # placeholder for end

    for i, chap_match in enumerate(chapters[:-1]):
        chap_num = chap_match.group(1)
        start_idx = chap_match.start()
        end_idx = chapters[i+1].start() if isinstance(chapters[i+1], re.Match) else len(content)
        chapter_text = content[start_idx:end_idx]

        # Extract examples
        example_pattern = re.compile(
            r'(Example\s+(\d+)\.\s*)(.+?)(?=(Example\s+\d+\.|\\section|\Z))',
            re.DOTALL)
        for ex_full, ex_num in example_pattern.findall(chapter_text):
            ex_text = ex_full + example_pattern.sub('', '', 1)  # capture full block
            filename = f'example{chap_num}.{ex_num}.tex'
            with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as out:
                out.write(ex_text.strip() + "\n")

        # Extract problem statements
        problems_section = re.search(r'\\section\*\{PROBLEMS\}', chapter_text)
        solutions_section = re.search(r'\\section\*\{SOLUTIONS\}', chapter_text)
        statement_block = chapter_text[problems_section.end():solutions_section.start()] if problems_section and solutions_section else ""
        solution_block = chapter_text[solutions_section.end():] if solutions_section else ""

        problem_stat_pattern = re.compile(r'(Problem\s+(\d+)\.\s*)(.+?)(?=(Problem\s+\d+\.|\\section|\Z))', re.DOTALL)
        solution_pattern = re.compile(r'(Problem\s+(\d+)\.\s*Solution:\s*)(.+?)(?=(Problem\s+\d+\.|\\section|\Z))', re.DOTALL)

        statements = {num: full.strip() for full, num, _ in problem_stat_pattern.findall(statement_block)}
        solutions = {num: full.strip() for full, num, _ in solution_pattern.findall(solution_block)}

        for prob_num, prob_text in statements.items():
            sol_text = solutions.get(prob_num, "")
            filename = f'problem{chap_num}.{prob_num}.tex'
            with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as out:
                out.write((prob_text + "\n\n" + sol_text).strip() + "\n")

if __name__ == "__main__":
    input_tex = "latex/latex.tex"       # path to your main LaTeX file
    out_dir = "split_output"     # directory to save individual files
    split_latex(input_tex, out_dir)



ValueError: too many values to unpack (expected 2)

In [16]:
import re

def process_latex_file(input_file, output_file):
    # Read the file content
    with open(input_file, 'r') as file:
        content = file.read()
    
    # Process duplicate chapter sections
    # We'll use a regex to find all chapter headings and keep only the first occurrence of each
    seen_chapters = set()
    
    def chapter_replacer(match):
        chapter_title = match.group(1)
        if chapter_title in seen_chapters:
            return ""  # Remove duplicate chapter heading
        else:
            seen_chapters.add(chapter_title)
            print(seen_chapters)
            return match.group(0)  # Keep the first occurrence
    
    # Replace chapter headings (only keeping first occurrence of each)
    chapter_pattern = r'\\section\*\{(Chapter \d+[^}]*)\}'
    content = re.sub(chapter_pattern, chapter_replacer, content)
    
    # Convert other section commands to plain text
    other_section_pattern = r'\\section\*\{([^}]*)\}'
    content = re.sub(other_section_pattern, r'\1', content)
    
    # Write the modified content to the output file
    with open(output_file, 'w') as file:
        file.write(content)

# Usage
input_file = 'latex/latex.tex'
output_file = 'latex/latex_modified.tex'
process_latex_file(input_file, output_file)

{'Chapter 1 Draw the Auxiliary Lines with Medians'}
{'Chapter 2 Draw the Auxiliary Lines with the Midlines', 'Chapter 1 Draw the Auxiliary Lines with Medians'}
{'Chapter 3 Draw the Auxiliary lines with Angle Bisectors', 'Chapter 2 Draw the Auxiliary Lines with the Midlines', 'Chapter 1 Draw the Auxiliary Lines with Medians'}
{'Chapter 3 Draw the Auxiliary lines with Angle Bisectors', 'Chapter 2 Draw the Auxiliary Lines with the Midlines', 'Chapter 4 Draw the Auxiliary lines with Perpendicular Lines', 'Chapter 1 Draw the Auxiliary Lines with Medians'}
{'Chapter 5 Draw the Auxiliary Lines with Parallel Lines', 'Chapter 4 Draw the Auxiliary lines with Perpendicular Lines', 'Chapter 1 Draw the Auxiliary Lines with Medians', 'Chapter 3 Draw the Auxiliary lines with Angle Bisectors', 'Chapter 2 Draw the Auxiliary Lines with the Midlines'}
{'Chapter 5 Draw the Auxiliary Lines with Parallel Lines', 'Chapter 4 Draw the Auxiliary lines with Perpendicular Lines', 'Chapter 1 Draw the Auxiliary Lin

In [18]:
# Import required libraries
import re
import os

# Define the function to extract text between triple hash marks
def extract_hashed_sections(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            
        # Find all text between ### and ###
        pattern = r'###\s*(.*?)\s*###'
        matches = re.findall(pattern, content)
        
        return matches
    except Exception as e:
        print(f"Error: {e}")
        return []

# Set file path
file_path = "latex/latex_modified.tex"

# Extract and display the sections
sections = extract_hashed_sections(file_path)

# Print results
for i, section in enumerate(sections, 1):
    print(f"Section {i}: {section}")

print(f"\nFound {len(sections)} section(s) enclosed in triple hash marks.")

Section 1: CHAPTER 1 EXAMPLE 1-1
Section 2: CHAPTER 1 PROBLEMS 1-1
Section 3: CHAPTER 1 SOLUTIONS 1-1
Section 4: CHAPTER 1 EXAMPLE 1-2
Section 5: CHAPTER 1 PROBLEMS 1-2
Section 6: CHAPTER 1 SOLUTIONS 1-2
Section 7: CHAPTER 2 EXAMPLE 1-1
Section 8: CHAPTER 2 PROBLEMS 1-1
Section 9: CHAPTER 2 SOLUTIONS 1-1
Section 10: CHAPTER 3 EXAMPLE 1-1
Section 11: CHAPTER 3 PROBLEMS 1-1
Section 12: CHAPTER 3 SOLUTIONS 1-1
Section 13: CHAPTER 4 EXAMPLE 1-1
Section 14: CHAPTER 4 PROBLEMS 1-1
Section 15: CHAPTER 1 SOLUTIONS 1-1
Section 16: CHAPTER 5 EXAMPLE 1-1
Section 17: CHAPTER 5 PROBLEMS 1-1
Section 18: CHAPTER 5 SOLUTIONS 1-1
Section 19: CHAPTER 6 EXAMPLE 1-1
Section 20: CHAPTER 6 PROBLEMS 1-1
Section 21: CHAPTER 6 SOLUTIONS 1-1
Section 22: CHAPTER 6 EXAMPLE 1-2
Section 23: CHAPTER 6 PROBLEMS 1-2
Section 24: CHAPTER 6 SOLUTIONS 1-2
Section 25: CHAPTER 6 EXAMPLE 1-3
Section 26: CHAPTER 6 PROBLEMS 1-3
Section 27: CHAPTER 6 SOLUTIONS 1-3
Section 28: CHAPTER 6 EXAMPLE 1-4
Section 29: CHAPTER 6 PROBLEMS

In [44]:
import re
import os
import sys

def create_directory_if_not_exists(directory):
    """Create directory if it doesn't exist."""
    if not os.path.exists(directory):
        os.makedirs(directory)

def create_tex_file(filename, content, title, chapter_num, set_num, item_type):
    """Create a TeX file with the given content."""
    # Add basic LaTeX structure
    tex_content = f"""\\documentclass{{article}}
\\usepackage{{amsmath}}
\\usepackage{{amssymb}}
\\usepackage{{graphicx}}
\\graphicspath{{{{images/}}}}
\\usepackage{{hyperref}}
\\usepackage[version=4]{{mhchem}}

\\title{{{title} from Chapter {chapter_num}, Set {set_num}}}
\\author{{Yongcheng Chen, Ph.D.}}
\\date{{}}

\\begin{{document}}
\\maketitle

{content}
\\end{{document}}
"""
    
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(tex_content)
    print(f"Created file: {filename}")

def extract_and_process_sections(file_paths):
    """Extract sections marked with triple hash marks and process them."""
    # Dictionary to store all content from all files
    all_content = ""
    
    # Read all files and concatenate their content
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                all_content += file.read() + "\n\n"
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
    
    # Find all section markers and extract content
    # The pattern captures: chapter number, section type, and set number
    pattern = r'###\s*CHAPTER\s+(\d+)\s+(EXAMPLE|PROBLEMS|SOLUTIONS)\s+1-(\d+)\s*###(.*?)(?=###|$)'
    matches = re.findall(pattern, all_content, re.DOTALL)
    
    # Print all matches found to help diagnose issues
    print(f"Found {len(matches)} section matches in total.")
    
    # Dictionary to store organized content
    organized_content = {}
    
    # Process each match
    for chapter_num, section_type, set_num, section_content in matches:
        # Create keys for organization
        chapter_key = f"chapter_{chapter_num}"
        set_key = f"set_{set_num}"
        
        # Initialize nested dictionaries if needed
        if chapter_key not in organized_content:
            organized_content[chapter_key] = {}
        
        if set_key not in organized_content[chapter_key]:
            organized_content[chapter_key][set_key] = {}
        
        # Store the content
        organized_content[chapter_key][set_key][section_type] = section_content.strip()
    
    # Count sets per chapter for diagnostic purposes
    sets_by_chapter = {}
    for chapter_key, chapter_data in organized_content.items():
        chapter_num = chapter_key.split('_')[1]
        sets_by_chapter[chapter_num] = len(chapter_data)
    
    print("\nSets found per chapter:")
    for chapter_num, set_count in sorted(sets_by_chapter.items()):
        print(f"Chapter {chapter_num}: {set_count} sets")
    
    # Process the organized content
    for chapter_key, chapter_data in organized_content.items():
        chapter_num = chapter_key.split('_')[1]
        print(f"\nProcessing Chapter {chapter_num}:")
        
        for set_key, set_data in chapter_data.items():
            set_num = set_key.split('_')[1]
            print(f"  Processing Set {set_num}:")
            
            # Create directories
            examples_dir = f"{chapter_key}/examples/set_{set_num}"
            problems_dir = f"{chapter_key}/problems/set_{set_num}"
            create_directory_if_not_exists(examples_dir)
            create_directory_if_not_exists(problems_dir)
            
            # Process EXAMPLES
            if 'EXAMPLE' in set_data:
                example_content = set_data['EXAMPLE']
                
                # Extract individual examples
                example_pattern = re.compile(r'Example\s+(\d+)\.\s+(.*?)(?=Example\s+\d+\.|$)', re.DOTALL)
                examples = example_pattern.findall(example_content)
                
                if not examples:
                    # If no individual examples found, treat the whole content as one example
                    examples = [('1', example_content)]
                
                for example_num, example_text in examples:
                    example_filename = f"{examples_dir}/example_{chapter_num}.{set_num}_{example_num}.tex"
                    create_tex_file(
                        example_filename, 
                        example_text, 
                        f"Example {example_num}", 
                        chapter_num, 
                        set_num,
                        "Example"
                    )
                    print(f"    Created Example {example_num}")
            else:
                print(f"    No EXAMPLE section found for Chapter {chapter_num}, Set {set_num}")
            
            # Check if we have both PROBLEMS and SOLUTIONS
            if 'PROBLEMS' in set_data and 'SOLUTIONS' in set_data:
                problems_content = set_data['PROBLEMS']
                solutions_content = set_data['SOLUTIONS']
                
                # Extract individual problems
                problem_pattern = re.compile(r'Problem\s+(\d+)\.\s+(.*?)(?=Problem\s+\d+\.|$)', re.DOTALL)
                problems = problem_pattern.findall(problems_content)
                
                # Extract individual solutions
                solution_pattern = re.compile(r'Problem\s+(\d+)\.\s+Solution:(.*?)(?=Problem\s+\d+\.|$)', re.DOTALL)
                solutions = {num: solution.strip() for num, solution in solution_pattern.findall(solutions_content)}
                
                if not solutions:  # Try alternative pattern if no solutions found
                    solution_pattern = re.compile(r'Problem\s+(\d+)\.\s+Solution\.(.*?)(?=Problem\s+\d+\.|$)', re.DOTALL)
                    solutions = {num: solution.strip() for num, solution in solution_pattern.findall(solutions_content)}
                
                # Match problems with solutions
                for problem_num, problem_text in problems:
                    problem_solution = solutions.get(problem_num, "Solution not available.")
                    
                    # Create combined problem+solution file
                    problem_filename = f"{problems_dir}/problem_{chapter_num}.{set_num}_{problem_num}.tex"
                    
                    combined_content = f"""\\section*{{Problem}}
{problem_text.strip()}

\\section*{{Solution}}
{problem_solution}
"""
                    
                    create_tex_file(
                        problem_filename, 
                        combined_content, 
                        f"Problem {problem_num}", 
                        chapter_num, 
                        set_num,
                        "Problem"
                    )
                    print(f"    Created Problem {problem_num} with Solution")
            else:
                print(f"    Missing PROBLEMS or SOLUTIONS for Chapter {chapter_num}, Set {set_num}")
    
    return True

def main(input_files):
    """Main function to process the input files."""
    if extract_and_process_sections(input_files):
        print("\nProcessing complete!")
    else:
        print("\nProcessing failed.")
# This is a notebook cell, so we'll run the main function directly
# rather than using the if __name__ == "__main__" pattern

# Define default input file
input_files = ["latex/latex_modified.tex"]

# Run the main processing function
main(input_files)

Found 30 section matches in total.

Sets found per chapter:
Chapter 1: 2 sets
Chapter 2: 1 sets
Chapter 3: 1 sets
Chapter 4: 1 sets
Chapter 5: 1 sets
Chapter 6: 4 sets

Processing Chapter 1:
  Processing Set 1:
Created file: chapter_1/examples/set_1/example_1.1_1.tex
    Created Example 1
Created file: chapter_1/examples/set_1/example_1.1_2.tex
    Created Example 2
Created file: chapter_1/examples/set_1/example_1.1_3.tex
    Created Example 3
Created file: chapter_1/examples/set_1/example_1.1_4.tex
    Created Example 4
Created file: chapter_1/examples/set_1/example_1.1_5.tex
    Created Example 5
Created file: chapter_1/examples/set_1/example_1.1_8.tex
    Created Example 8
Created file: chapter_1/examples/set_1/example_1.1_9.tex
    Created Example 9
Created file: chapter_1/examples/set_1/example_1.1_10.tex
    Created Example 10
Created file: chapter_1/examples/set_1/example_1.1_11.tex
    Created Example 11
Created file: chapter_1/examples/set_1/example_1.1_12.tex
    Created Exam

In [None]:
mak#!/usr/bin/env python3
import os
import re
import json
import glob
import base64
from pathlib import Path

def extract_content_from_latex(latex_file):
    """
    Extract problem statement, images, and solution from a LaTeX file.
    Returns a tuple of (problem_text, problem_images, solution_steps, solution_images)
    """
    with open(latex_file, 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Extract problem statement (everything between \section*{Problem} and next \section or \end)
    problem_match = re.search(r'\\section\*\{Problem\}(.*?)(?=\\section|\\\end\{document\})', content, re.DOTALL)
    if not problem_match:
        print(f"Warning: Could not find problem statement in {latex_file}")
        problem_text = ""
    else:
        problem_text = problem_match.group(1).strip()
    
    # Extract problem images
    problem_images = []
    if problem_match:
        img_matches = re.findall(r'\\includegraphics(?:\[.*?\])?\{(.*?)\}', problem_match.group(1))
        problem_images = [os.path.basename(img_path) for img_path in img_matches]
    
    # Extract solution (everything between \section*{Solution} and end of document)
    solution_match = re.search(r'\\section\*\{Solution\}(.*?)(?=\\end\{document\})', content, re.DOTALL)
    if not solution_match:
        print(f"Warning: Could not find solution in {latex_file}")
        solution_text = ""
    else:
        solution_text = solution_match.group(1).strip()
    
    # Extract solution images
    solution_images = []
    if solution_match:
        img_matches = re.findall(r'\\includegraphics(?:\[.*?\])?\{(.*?)\}', solution_match.group(1))
        solution_images = [os.path.basename(img_path) for img_path in img_matches]
    
    # Clean up LaTeX commands from problem text
    problem_text = clean_latex(problem_text)
    
    # Split solution into steps and clean up LaTeX commands
    solution_steps = split_solution_into_steps(solution_text)
    
    return problem_text, problem_images, solution_steps, solution_images

def clean_latex(text):
    """
    Clean LaTeX text by removing unwanted commands and normalizing spacing.
    """
    # Remove image references (we'll handle them separately)
    text = re.sub(r'\\includegraphics(?:\[.*?\])?\{.*?\}', '', text)
    
    # Convert \\ to newlines
    text = re.sub(r'\\\\', '\n', text)
    
    # Preserve math environment
    text = re.sub(r'\\triangle', '\\\\triangle', text)
    text = re.sub(r'\\angle', '\\\\angle', text)
    
    # Remove other LaTeX commands
    text = re.sub(r'\\begin\{.*?\}|\\end\{.*?\}', '', text)
    text = re.sub(r'\\.*?(?:\[.*?\])?\{(.*?)\}', r'\1', text)
    
    # Normalize spacing
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    
    return text

def split_solution_into_steps(solution_text):
    """
    Split the solution text into logical steps for the THOUGHT structure.
    """
    # Remove image references (we'll handle them separately)
    text = re.sub(r'\\includegraphics(?:\[.*?\])?\{.*?\}', '', solution_text)
    
    # Try to split by paragraphs first
    paragraphs = re.split(r'\n\s*\n', text)
    
    # Remove any empty paragraphs
    paragraphs = [p.strip() for p in paragraphs if p.strip()]
    
    # If there's only one paragraph, try to split by sentences
    if len(paragraphs) <= 1 and paragraphs:
        sentences = re.split(r'(?<=[.!?])\s+', paragraphs[0])
        paragraphs = [s.strip() for s in sentences if s.strip()]
    
    # Clean each paragraph
    clean_paragraphs = [clean_latex(p) for p in paragraphs]
    
    return clean_paragraphs

def image_to_base64(image_path):
    """
    Convert an image file to base64 encoding.
    """
    try:
        with open(image_path, 'rb') as img_file:
            return base64.b64encode(img_file.read()).decode('utf-8')
    except Exception as e:
        print(f"Error reading image {image_path}: {e}")
        return ""

def create_trace_json(problem_dir, output_file=None):
    """
    Create a trace JSON file for all problems in the given directory.
    """
    traces = []
    
    # Find all problem files
    problem_files = glob.glob(os.path.join(problem_dir, "**", "problem_*.tex"), recursive=True)
    
    # Process each problem file
    for problem_file in problem_files:
        print(f"Processing {problem_file}...")
        
        # Extract content from LaTeX file
        problem_text, problem_images, solution_steps, solution_images = extract_content_from_latex(problem_file)
        
        # Skip if problem text is empty
        if not problem_text:
            print(f"Skipping {problem_file} - no problem text found")
            continue
        
        # Create trace structure
        trace = {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": problem_text
                }
            ]
        }
        
        # Add problem images
        for img in problem_images:
            # Find the image in the latex/images directory
            img_path = find_image_path(img)
            if img_path:
                trace["content"].append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"[input image: {img}]"
                        # For actual base64: "url": f"data:image/jpeg;base64,{image_to_base64(img_path)}"
                    }
                })
        
        # Create assistant response
        assistant_response = {
            "role": "assistant",
            "content": []
        }
        
        # Add solution steps as THOUGHT items
        for i, step in enumerate(solution_steps):
            if step:  # Skip empty steps
                assistant_response["content"].append({
                    "type": "text",
                    "text": f"THOUGHT {i}: {step}"
                })
                
                # Add solution images after appropriate steps
                if i < len(solution_images) and solution_images:
                    img = solution_images[i]
                    img_path = find_image_path(img)
                    if img_path:
                        assistant_response["content"].append({
                            "type": "image_url",
                            "image_url": {
                                "url": f"[solution image: {img}]"
                                # For actual base64: "url": f"data:image/jpeg;base64,{image_to_base64(img_path)}"
                            }
                        })
        
        # Add the trace
        traces.append([trace, assistant_response])
    
    # Write the traces to a JSON file
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(traces, f, indent=2)
        print(f"Wrote {len(traces)} traces to {output_file}")
    
    return traces

def find_image_path(image_name):
    """
    Find the full path to an image file based on the filename.
    """
    # Check in the latex/images directory
    base_path = os.path.join("latex", "images")
    
    # If the image name already has a path, extract just the filename
    image_name = os.path.basename(image_name)
    
    if os.path.exists(os.path.join(base_path, image_name)):
        return os.path.join(base_path, image_name)
    
    # Try finding with different extensions
    for ext in ['.jpg', '.png', '.pdf']:
        if os.path.exists(os.path.join(base_path, image_name + ext)):
            return os.path.join(base_path, image_name + ext)
    
    print(f"Warning: Image {image_name} not found in {base_path}")
    return None

def process_chapter(chapter_dir):
    """
    Process all problems in a chapter directory.
    """
    # Create output directory if it doesn't exist
    output_dir = "traces"
    os.makedirs(output_dir, exist_ok=True)
    
    # Extract chapter number from directory name
    chapter_match = re.search(r'chapter_(\d+)', chapter_dir)
    if not chapter_match:
        print(f"Could not extract chapter number from {chapter_dir}")
        return
    
    chapter_num = chapter_match.group(1)
    output_file = os.path.join(output_dir, f"chapter_{chapter_num}_traces.json")
    
    # Process the chapter
    create_trace_json(chapter_dir, output_file)

def main():
    """
    Main function to process all chapters or a specific chapter.
    """
    # Ask user which chapter to process
    chapter_dirs = glob.glob("chapter_*")
    if not chapter_dirs:
        print("No chapter directories found!")
        return
    
    print("Available chapters:")
    for i, chapter in enumerate(sorted(chapter_dirs)):
        print(f"{i+1}: {chapter}")
    print(f"{len(chapter_dirs)+1}: Process all chapters")
    
    try:
        choice = int(input("Enter your choice (1-{}): ".format(len(chapter_dirs)+1)))
        
        if choice == len(chapter_dirs) + 1:
            # Process all chapters
            for chapter in chapter_dirs:
                process_chapter(chapter)
        elif 1 <= choice <= len(chapter_dirs):
            # Process selected chapter
            process_chapter(sorted(chapter_dirs)[choice-1])
        else:
            print("Invalid choice!")
    except ValueError:
        print("Please enter a valid number!")

if __name__ == "__main__":
    main()

In [45]:
import re
import os
import sys
import shutil
from pathlib import Path

def create_directory_if_not_exists(directory):
    """Create directory if it doesn't exist."""
    if not os.path.exists(directory):
        os.makedirs(directory)

def process_images(content, folder_path, is_solution=False):
    """
    Extract image paths from content and copy them to the destination folder.
    Returns updated content with new image paths.
    """
    # Create images directory in the folder
    images_dir = os.path.join(folder_path, 'images')
    create_directory_if_not_exists(images_dir)
    
    # Pattern to find includegraphics commands
    img_pattern = r'\\includegraphics(?:\[.*?\])?\{(.*?)\}'
    
    # Find all image paths
    image_paths = re.findall(img_pattern, content)
    
    # Process each image
    for i, img_path in enumerate(image_paths):
        # Extract filename from path
        img_filename = os.path.basename(img_path)
        
        # Determine new filename based on whether it's in a solution or problem
        if is_solution:
            new_filename = f"intermediate_image_{i+1}.png"
        else:
            new_filename = f"input_image.png"
        
        # Source image path (assuming images are in a folder called 'images')
        source_path = os.path.join('images', img_filename)
        
        # Destination path
        dest_path = os.path.join(images_dir, new_filename)
        
        # Copy the image if it exists
        if os.path.exists(source_path):
            shutil.copy(source_path, dest_path)
            print(f"    Copied image: {source_path} -> {dest_path}")
        else:
            print(f"    Warning: Image not found: {source_path}")
        
        # Replace the path in the content
        content = content.replace(f'{{{img_path}}}', f'{{images/{new_filename}}}')
    
    return content

def create_tex_file(folder_path, content, title):
    """Create a TeX file with the given content."""
    # Add basic LaTeX structure
    tex_content = f"""\\documentclass{{article}}
\\usepackage{{amsmath}}
\\usepackage{{amssymb}}
\\usepackage{{graphicx}}
\\usepackage{{hyperref}}
\\usepackage[version=4]{{mhchem}}

\\title{{{title}}}
\\date{{}}

\\begin{{document}}
\\maketitle

{content}
\\end{{document}}
"""
    
    # Create the tex file
    tex_file = os.path.join(folder_path, "main.tex")
    with open(tex_file, 'w', encoding='utf-8') as f:
        f.write(tex_content)
    print(f"Created file: {tex_file}")

def extract_and_process_sections(file_paths):
    """Extract sections marked with triple hash marks and process them."""
    # Dictionary to store all content from all files
    all_content = ""
    
    # Read all files and concatenate their content
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                all_content += file.read() + "\n\n"
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
    
    # Find all section markers and extract content
    # The pattern captures: chapter number, section type, and set number
    pattern = r'###\s*CHAPTER\s+(\d+)\s+(EXAMPLE|PROBLEMS|SOLUTIONS)\s+1-(\d+)\s*###(.*?)(?=###|$)'
    matches = re.findall(pattern, all_content, re.DOTALL)
    
    # Print all matches found to help diagnose issues
    print(f"Found {len(matches)} section matches in total.")
    
    # Dictionary to store organized content
    organized_content = {}
    
    # Process each match
    for chapter_num, section_type, set_num, section_content in matches:
        # Create keys for organization
        chapter_key = f"chapter_{chapter_num}"
        set_key = f"set_{set_num}"
        
        # Initialize nested dictionaries if needed
        if chapter_key not in organized_content:
            organized_content[chapter_key] = {}
        
        if set_key not in organized_content[chapter_key]:
            organized_content[chapter_key][set_key] = {}
        
        # Store the content
        organized_content[chapter_key][set_key][section_type] = section_content.strip()
    
    # Process the organized content
    for chapter_key, chapter_data in organized_content.items():
        chapter_num = chapter_key.split('_')[1]
        print(f"\nProcessing Chapter {chapter_num}:")
        
        for set_key, set_data in chapter_data.items():
            set_num = set_key.split('_')[1]
            print(f"  Processing Set {set_num}:")
            
            # Process EXAMPLES
            if 'EXAMPLE' in set_data:
                example_content = set_data['EXAMPLE']
                
                # Extract individual examples
                example_pattern = re.compile(r'Example\s+(\d+)\.\s+(.*?)(?=Example\s+\d+\.|$)', re.DOTALL)
                examples = example_pattern.findall(example_content)
                
                if not examples:
                    # If no individual examples found, treat the whole content as one example
                    examples = [('1', example_content)]
                
                for example_num, example_text in examples:
                    # Create folder for this example
                    example_folder = f"chapter_{chapter_num}_example_{set_num}_{example_num}"
                    create_directory_if_not_exists(example_folder)
                    
                    # Process images in the example
                    processed_text = process_images(example_text, example_folder)
                    
                    # Create the TeX file
                    create_tex_file(
                        example_folder, 
                        processed_text, 
                        f"Example {example_num}"
                    )
                    print(f"    Created Example {example_num}")
            
            # Check if we have both PROBLEMS and SOLUTIONS
            if 'PROBLEMS' in set_data and 'SOLUTIONS' in set_data:
                problems_content = set_data['PROBLEMS']
                solutions_content = set_data['SOLUTIONS']
                
                # Extract individual problems
                problem_pattern = re.compile(r'Problem\s+(\d+)\.\s+(.*?)(?=Problem\s+\d+\.|$)', re.DOTALL)
                problems = problem_pattern.findall(problems_content)
                
                # Extract individual solutions
                solution_pattern = re.compile(r'Problem\s+(\d+)\.\s+Solution:(.*?)(?=Problem\s+\d+\.|$)', re.DOTALL)
                solutions = {num: solution.strip() for num, solution in solution_pattern.findall(solutions_content)}
                
                if not solutions:  # Try alternative pattern if no solutions found
                    solution_pattern = re.compile(r'Problem\s+(\d+)\.\s+Solution\.(.*?)(?=Problem\s+\d+\.|$)', re.DOTALL)
                    solutions = {num: solution.strip() for num, solution in solution_pattern.findall(solutions_content)}
                
                # Match problems with solutions
                for problem_num, problem_text in problems:
                    problem_solution = solutions.get(problem_num, "Solution not available.")
                    
                    # Create folder for this problem
                    problem_folder = f"chapter_{chapter_num}_problem_{set_num}_{problem_num}"
                    create_directory_if_not_exists(problem_folder)
                    
                    # Process images in the problem statement
                    processed_problem = process_images(problem_text, problem_folder)
                    
                    # Process images in the solution
                    processed_solution = process_images(problem_solution, problem_folder, is_solution=True)
                    
                    # Combined content
                    combined_content = f"""\\section*{{Problem}}
{processed_problem.strip()}

\\section*{{Solution}}
{processed_solution}
"""
                    
                    # Create the TeX file
                    create_tex_file(
                        problem_folder, 
                        combined_content, 
                        f"Problem {problem_num}"
                    )
                    print(f"    Created Problem {problem_num} with Solution")
    
    return True

def main(input_files):
    """Main function to process the input files."""
    if extract_and_process_sections(input_files):
        print("\nProcessing complete!")
    else:
        print("\nProcessing failed.")

# This is a Jupyter notebook, so we'll run the main function directly
input_files = ["latex/latex_modified.tex"]
main(input_files)

Found 30 section matches in total.

Processing Chapter 1:
  Processing Set 1:
Created file: chapter_1_example_1_1/main.tex
    Created Example 1
Created file: chapter_1_example_1_2/main.tex
    Created Example 2
Created file: chapter_1_example_1_3/main.tex
    Created Example 3
Created file: chapter_1_example_1_4/main.tex
    Created Example 4
Created file: chapter_1_example_1_5/main.tex
    Created Example 5
Created file: chapter_1_example_1_8/main.tex
    Created Example 8
Created file: chapter_1_example_1_9/main.tex
    Created Example 9
Created file: chapter_1_example_1_10/main.tex
    Created Example 10
Created file: chapter_1_example_1_11/main.tex
    Created Example 11
Created file: chapter_1_example_1_12/main.tex
    Created Example 12
Created file: chapter_1_example_1_13/main.tex
    Created Example 13
Created file: chapter_1_problem_1_1/main.tex
    Created Problem 1 with Solution
Created file: chapter_1_problem_1_2/main.tex
    Created Problem 2 with Solution
Created file: c

In [52]:
import re
import os
import sys
import shutil
from pathlib import Path

def create_directory_if_not_exists(directory):
    """Create directory if it doesn't exist."""
    if not os.path.exists(directory):
        os.makedirs(directory)

def process_images(content, folder_path):
    """
    Extract image paths from content and copy them to the destination folder.
    Returns updated content with new image paths and fixed LaTeX syntax.
    """
    # Create images directory in the folder
    images_dir = os.path.join(folder_path, 'images')
    create_directory_if_not_exists(images_dir)
    
    # Fix invalid LaTeX options in includegraphics commands
    # Replace 'max width=' with 'width='
    content = re.sub(r'\\includegraphics\[max width=([^]]*?)\]', r'\\includegraphics[width=\1]', content)
    
    # Remove 'center' from options and add \centering before
    content = re.sub(r'\\includegraphics\[(.*?),\s*center\s*(.*?)\]', r'\\centering\n\\includegraphics[\1\2]', content)
    content = re.sub(r'\\includegraphics\[center,\s*(.*?)\]', r'\\centering\n\\includegraphics[\1]', content)
    content = re.sub(r'\\includegraphics\[center\]', r'\\centering\n\\includegraphics', content)
    
    # Pattern to find includegraphics commands
    img_pattern = r'\\includegraphics(?:\[.*?\])?\{(.*?)\}'
    
    # Find all image paths
    image_paths = re.findall(img_pattern, content)
    
    # Process each image
    for img_path in image_paths:
        # Extract filename from path
        img_filename = os.path.basename(img_path)
        
        # Remove the prefix from the filename
        new_filename = re.sub(r'2025_04_17_97bc1f7e44d93c271a88g-', '', img_filename)

        if not img_filename.lower().endswith('.jpg'):
            img_filename += '.jpg'
        
        # Ensure new filename also has .jpg extension
        if not new_filename.lower().endswith('.jpg'):
            new_filename += '.jpg'
        
        # Source image path (assuming images are in a folder called 'images')
        source_path = os.path.join('/Users/leon66/Desktop/VLM Reasoning/VLM Reasoning Repo/data/raw_cot/Science/geometry/aux_lines/latex/images', img_filename)

        print(f"Looking for image at: {source_path}")
        print(f"File exists: {os.path.exists(source_path)}")
        
        # Destination path
        dest_path = os.path.join(images_dir, new_filename)
        
        # Copy the image if it exists
        if os.path.exists(source_path):
            shutil.copy(source_path, dest_path)
            print(f"    Copied image: {source_path} -> {dest_path}")
        else:
            print(f"    Warning: Image not found: {source_path}")
        
        # Replace the path in the content
        content = content.replace(f'{{{img_path}}}', f'{{images/{new_filename}}}')
    
    return content

def create_tex_file(folder_path, content, title):
    """Create a TeX file with the given content."""
    # Add basic LaTeX structure
    tex_content = f"""\\documentclass{{article}}
\\usepackage{{amsmath}}
\\usepackage{{amssymb}}
\\usepackage{{graphicx}}
\\usepackage{{hyperref}}
\\usepackage[version=4]{{mhchem}}

\\title{{{title}}}
\\date{{}}

\\begin{{document}}
\\maketitle

{content}
\\end{{document}}
"""
    
    # Create the tex file
    tex_file = os.path.join(folder_path, "main.tex")
    with open(tex_file, 'w', encoding='utf-8') as f:
        f.write(tex_content)
    print(f"Created file: {tex_file}")

def extract_and_process_sections(file_paths):
    """Extract sections marked with triple hash marks and process them."""
    # Dictionary to store all content from all files
    all_content = ""
    
    # Read all files and concatenate their content
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                all_content += file.read() + "\n\n"
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
    
    # Find all section markers and extract content
    # The pattern captures: chapter number, section type, and set number
    pattern = r'###\s*CHAPTER\s+(\d+)\s+(EXAMPLE|PROBLEMS|SOLUTIONS)\s+1-(\d+)\s*###(.*?)(?=###|$)'
    matches = re.findall(pattern, all_content, re.DOTALL)
    
    # Print all matches found to help diagnose issues
    print(f"Found {len(matches)} section matches in total.")
    
    # Dictionary to store organized content
    organized_content = {}
    
    # Process each match
    for chapter_num, section_type, set_num, section_content in matches:
        # Create keys for organization
        chapter_key = f"chapter_{chapter_num}"
        set_key = f"set_{set_num}"
        
        # Initialize nested dictionaries if needed
        if chapter_key not in organized_content:
            organized_content[chapter_key] = {}
        
        if set_key not in organized_content[chapter_key]:
            organized_content[chapter_key][set_key] = {}
        
        # Store the content
        organized_content[chapter_key][set_key][section_type] = section_content.strip()
    
    # Process the organized content
    for chapter_key, chapter_data in organized_content.items():
        chapter_num = chapter_key.split('_')[1]
        print(f"\nProcessing Chapter {chapter_num}:")
        
        for set_key, set_data in chapter_data.items():
            set_num = set_key.split('_')[1]
            print(f"  Processing Set {set_num}:")
            
            # Process EXAMPLES
            if 'EXAMPLE' in set_data:
                example_content = set_data['EXAMPLE']
                
                # Extract individual examples
                example_pattern = re.compile(r'Example\s+(\d+)\.\s+(.*?)(?=Example\s+\d+\.|$)', re.DOTALL)
                examples = example_pattern.findall(example_content)
                
                if not examples:
                    # If no individual examples found, treat the whole content as one example
                    examples = [('1', example_content)]
                
                for example_num, example_text in examples:
                    # Create folder for this example
                    example_folder = f"chapter_{chapter_num}_example_{set_num}_{example_num}"
                    create_directory_if_not_exists(example_folder)
                    
                    # Process images in the example
                    processed_text = process_images(example_text, example_folder)
                    
                    # Create the TeX file
                    create_tex_file(
                        example_folder, 
                        processed_text, 
                        f"Example {example_num}"
                    )
                    print(f"    Created Example {example_num}")
            
            # Check if we have both PROBLEMS and SOLUTIONS
            if 'PROBLEMS' in set_data and 'SOLUTIONS' in set_data:
                problems_content = set_data['PROBLEMS']
                solutions_content = set_data['SOLUTIONS']
                
                # Extract individual problems
                problem_pattern = re.compile(r'Problem\s+(\d+)\.\s+(.*?)(?=Problem\s+\d+\.|$)', re.DOTALL)
                problems = problem_pattern.findall(problems_content)
                
                # Extract individual solutions
                solution_pattern = re.compile(r'Problem\s+(\d+)\.\s+Solution:(.*?)(?=Problem\s+\d+\.|$)', re.DOTALL)
                solutions = {num: solution.strip() for num, solution in solution_pattern.findall(solutions_content)}
                
                if not solutions:  # Try alternative pattern if no solutions found
                    solution_pattern = re.compile(r'Problem\s+(\d+)\.\s+Solution\.(.*?)(?=Problem\s+\d+\.|$)', re.DOTALL)
                    solutions = {num: solution.strip() for num, solution in solution_pattern.findall(solutions_content)}
                
                # Match problems with solutions
                for problem_num, problem_text in problems:
                    problem_solution = solutions.get(problem_num, "Solution not available.")
                    
                    # Create folder for this problem
                    problem_folder = f"chapter_{chapter_num}_problem_{set_num}_{problem_num}"
                    create_directory_if_not_exists(problem_folder)
                    
                    # Process images in the problem statement
                    processed_problem = process_images(problem_text, problem_folder)
                    
                    # Process images in the solution
                    processed_solution = process_images(problem_solution, problem_folder)
                    
                    # Combined content
                    combined_content = f"""\\section*{{Problem}}
{processed_problem.strip()}

\\section*{{Solution}}
{processed_solution}
"""
                    
                    # Create the TeX file
                    create_tex_file(
                        problem_folder, 
                        combined_content, 
                        f"Problem {problem_num}"
                    )
                    print(f"    Created Problem {problem_num} with Solution")
    
    return True

def main(input_files):
    """Main function to process the input files."""
    if extract_and_process_sections(input_files):
        print("\nProcessing complete!")
    else:
        print("\nProcessing failed.")

# This is a Jupyter notebook, so we'll execute the main function directly
input_files = ["latex/latex_modified.tex"]
main(input_files)

Found 30 section matches in total.

Processing Chapter 1:
  Processing Set 1:
Looking for image at: /Users/leon66/Desktop/VLM Reasoning/VLM Reasoning Repo/data/raw_cot/Science/geometry/aux_lines/latex/images/2025_04_17_97bc1f7e44d93c271a88g-009(4).jpg
File exists: True
    Copied image: /Users/leon66/Desktop/VLM Reasoning/VLM Reasoning Repo/data/raw_cot/Science/geometry/aux_lines/latex/images/2025_04_17_97bc1f7e44d93c271a88g-009(4).jpg -> chapter_1_example_1_1/images/009(4).jpg
Looking for image at: /Users/leon66/Desktop/VLM Reasoning/VLM Reasoning Repo/data/raw_cot/Science/geometry/aux_lines/latex/images/2025_04_17_97bc1f7e44d93c271a88g-009(3).jpg
File exists: True
    Copied image: /Users/leon66/Desktop/VLM Reasoning/VLM Reasoning Repo/data/raw_cot/Science/geometry/aux_lines/latex/images/2025_04_17_97bc1f7e44d93c271a88g-009(3).jpg -> chapter_1_example_1_1/images/009(3).jpg
Created file: chapter_1_example_1_1/main.tex
    Created Example 1
Looking for image at: /Users/leon66/Desktop/V

In [19]:
import os
import glob

# Get all directories in current folder that start with "chapter"
chapter_dirs = [d for d in os.listdir('.') if os.path.isdir(d) and d.startswith('chapter')]

# Common image file extensions
image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']

# Count images in each chapter directory
for chapter_dir in sorted(chapter_dirs):
    image_count = 0
    
    # Search for image files in directory and its subdirectories
    for ext in image_extensions:
        image_files = glob.glob(os.path.join(chapter_dir, '**', f'*{ext}'), recursive=True)
        image_count += len(image_files)
    
    print(f"{chapter_dir}: {image_count} images")

chapter_1_example_1_1: 2 images
chapter_1_example_1_10: 2 images
chapter_1_example_1_11: 2 images
chapter_1_example_1_12: 1 images
chapter_1_example_1_13: 2 images
chapter_1_example_1_2: 2 images
chapter_1_example_1_3: 2 images
chapter_1_example_1_4: 2 images
chapter_1_example_1_5: 1 images
chapter_1_example_1_8: 2 images
chapter_1_example_1_9: 2 images
chapter_1_example_2_1: 2 images
chapter_1_example_2_2: 2 images
chapter_1_example_2_3: 2 images
chapter_1_example_2_4: 2 images
chapter_1_example_2_5: 2 images
chapter_1_example_2_6: 2 images
chapter_1_example_2_7: 1 images
chapter_1_problem_1_1: 2 images
chapter_1_problem_1_2: 2 images
chapter_1_problem_1_3: 2 images
chapter_1_problem_1_4: 4 images
chapter_1_problem_1_5: 2 images
chapter_1_problem_1_6: 2 images
chapter_1_problem_1_7: 2 images
chapter_1_problem_1_8: 3 images
chapter_1_problem_1_9: 2 images
chapter_1_problem_2_1: 2 images
chapter_1_problem_2_2: 1 images
chapter_1_problem_2_3: 2 images
chapter_1_problem_2_4: 3 images
chap

In [20]:
import os
import glob

# Get all directories in current folder that start with "chapter"
chapter_dirs = [d for d in os.listdir('.') if os.path.isdir(d) and d.startswith('chapter')]

# Common image file extensions
image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']

# Count images in each chapter directory
for chapter_dir in sorted(chapter_dirs):
    image_count = 0
    
    # Search for image files in directory and its subdirectories
    for ext in image_extensions:
        image_files = glob.glob(os.path.join(chapter_dir, '**', f'*{ext}'), recursive=True)
        image_count += len(image_files)
    
    # Only print directories that don't have exactly 2 images
    if image_count != 2:
        print(f"{chapter_dir}: {image_count} images")

chapter_1_example_1_12: 1 images
chapter_1_example_1_5: 1 images
chapter_1_example_2_7: 1 images
chapter_1_problem_1_4: 4 images
chapter_1_problem_1_8: 3 images
chapter_1_problem_2_2: 1 images
chapter_1_problem_2_4: 3 images
chapter_1_problem_2_5: 1 images
chapter_1_problem_2_6: 1 images
chapter_1_problem_2_7: 12 images
chapter_2_example_1_12: 3 images
chapter_2_example_1_3: 3 images
chapter_2_example_1_8: 3 images
chapter_2_example_1_9: 3 images
chapter_2_problem_1_11: 8 images
chapter_2_problem_1_2: 3 images
chapter_2_problem_1_4: 3 images
chapter_2_problem_1_7: 3 images
chapter_3_example_1_10: 0 images
chapter_3_example_1_12: 3 images
chapter_3_example_1_2: 1 images
chapter_3_example_1_4: 4 images
chapter_3_example_1_6: 3 images
chapter_3_example_1_7: 3 images
chapter_3_example_1_9: 3 images
chapter_3_problem_1_11: 1 images
chapter_3_problem_1_12: 7 images
chapter_3_problem_1_5: 3 images
chapter_4_example_1_1: 1 images
chapter_4_example_1_15: 1 images
chapter_4_example_1_18: 3 image

In [22]:
import os
import glob
import re
import shutil

# Get all directories in current folder that start with "chapter"
chapter_dirs = [d for d in os.listdir('.') if os.path.isdir(d) and d.startswith('chapter')]

# Common image file extensions
image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']

for chapter_dir in sorted(chapter_dirs):
    # Find all image files in this directory
    all_images = []
    for ext in image_extensions:
        images = glob.glob(os.path.join(chapter_dir, '**', f'*{ext}'), recursive=True)
        all_images.extend(images)
    
    # Only process directories with exactly 2 images
    if len(all_images) != 2:
        continue
        
    print(f"Processing {chapter_dir} (has exactly 2 images)")
    
    # Find and process the main.tex file
    tex_files = glob.glob(os.path.join(chapter_dir, '**', '*.tex'), recursive=True)
    for tex_file in tex_files:
        with open(tex_file, 'r') as f:
            content = f.read()
        
        # Find image references
        img_refs = re.findall(r'\\includegraphics.*?{(.*?)}', content)
        
        if len(img_refs) == 2:
            # Save original image paths
            original_paths = [os.path.join(os.path.dirname(tex_file), ref) for ref in img_refs]
            
            # Build new image names
            img_dir = os.path.dirname(img_refs[0])
            new_img1 = f"{img_dir}/problem_image_1.jpg"
            new_img2 = f"{img_dir}/reasoning_image_1.jpg"
            
            # Replace in the LaTeX content
            updated_content = content.replace(img_refs[0], new_img1)
            updated_content = updated_content.replace(img_refs[1], new_img2)
            
            # Write the updated content back
            with open(tex_file, 'w') as f:
                f.write(updated_content)
            
            # Rename the actual image files if they exist
            for i, orig_path in enumerate(original_paths):
                # Get components to build proper paths
                if os.path.exists(orig_path):
                    new_name = os.path.join(os.path.dirname(orig_path), 
                                          "input_image.jpg" if i == 0 else "intermediate_image_1.jpg")
                    shutil.copy2(orig_path, new_name)
                    print(f"  Renamed {orig_path} → {new_name}")
                    # Remove the original file
                    os.remove(orig_path)
                    print(f"  Removed original file: {orig_path}")
                else:
                    print(f"  Warning: Image file {orig_path} not found")
            
            print(f"  Updated {tex_file}")

Processing chapter_1_example_1_1 (has exactly 2 images)
  Renamed chapter_1_example_1_1/images/009(4).jpg → chapter_1_example_1_1/images/input_image.jpg
  Removed original file: chapter_1_example_1_1/images/009(4).jpg
  Renamed chapter_1_example_1_1/images/009(3).jpg → chapter_1_example_1_1/images/intermediate_image_1.jpg
  Removed original file: chapter_1_example_1_1/images/009(3).jpg
  Updated chapter_1_example_1_1/main.tex
Processing chapter_1_example_1_10 (has exactly 2 images)
  Renamed chapter_1_example_1_10/images/012(1).jpg → chapter_1_example_1_10/images/input_image.jpg
  Removed original file: chapter_1_example_1_10/images/012(1).jpg
  Renamed chapter_1_example_1_10/images/012(2).jpg → chapter_1_example_1_10/images/intermediate_image_1.jpg
  Removed original file: chapter_1_example_1_10/images/012(2).jpg
  Updated chapter_1_example_1_10/main.tex
Processing chapter_1_example_1_11 (has exactly 2 images)
  Renamed chapter_1_example_1_11/images/012(3).jpg → chapter_1_example_1_11

In [23]:
import os
import glob
import re
import shutil

# Get all directories in current folder that start with "chapter"
chapter_dirs = [d for d in os.listdir('.') if os.path.isdir(d) and d.startswith('chapter')]

for chapter_dir in sorted(chapter_dirs):
    # Check if both new image names exist in the directory
    input_images = glob.glob(os.path.join(chapter_dir, '**', 'input_image.jpg'), recursive=True)
    intermediate_images = glob.glob(os.path.join(chapter_dir, '**', 'intermediate_image_1.jpg'), recursive=True)
    
    # Only process directories that have both new image names
    if not (input_images and intermediate_images):
        continue
    
    print(f"Processing {chapter_dir} (has both new image names)")
    
    # Rename input_image_1.jpg to problem_image_1.jpg
    for img_path in input_images:
        new_path = os.path.join(os.path.dirname(img_path), 'problem_image_1.jpg')
        try:
            shutil.move(img_path, new_path)
            print(f"  Renamed: {img_path} → {new_path}")
        except Exception as e:
            print(f"  Error renaming {img_path}: {e}")
    
    # Rename intermediate_image_1.jpg to reasoning_image_1.jpg
    for img_path in intermediate_images:
        new_path = os.path.join(os.path.dirname(img_path), 'reasoning_image_1.jpg')
        try:
            shutil.move(img_path, new_path)
            print(f"  Renamed: {img_path} → {new_path}")
        except Exception as e:
            print(f"  Error renaming {img_path}: {e}")
    
    # Find all image files in this directory
    all_images = []
    for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
        images = glob.glob(os.path.join(chapter_dir, '**', f'*{ext}'), recursive=True)
        all_images.extend(images)
    
    # Find and delete old image files (any that aren't our new image names)
    for img_path in all_images:
        filename = os.path.basename(img_path)
        if filename != 'problem_image_1.jpg' and filename != 'reasoning_image_1.jpg':
            try:
                os.remove(img_path)
                print(f"  Removed old image: {img_path}")
            except Exception as e:
                print(f"  Error removing {img_path}: {e}")
    
    print(f"  Cleanup completed for {chapter_dir}")

Processing chapter_1_example_1_1 (has both new image names)
  Renamed: chapter_1_example_1_1/images/input_image.jpg → chapter_1_example_1_1/images/problem_image_1.jpg
  Renamed: chapter_1_example_1_1/images/intermediate_image_1.jpg → chapter_1_example_1_1/images/reasoning_image_1.jpg
  Cleanup completed for chapter_1_example_1_1
Processing chapter_1_example_1_10 (has both new image names)
  Renamed: chapter_1_example_1_10/images/input_image.jpg → chapter_1_example_1_10/images/problem_image_1.jpg
  Renamed: chapter_1_example_1_10/images/intermediate_image_1.jpg → chapter_1_example_1_10/images/reasoning_image_1.jpg
  Cleanup completed for chapter_1_example_1_10
Processing chapter_1_example_1_11 (has both new image names)
  Renamed: chapter_1_example_1_11/images/input_image.jpg → chapter_1_example_1_11/images/problem_image_1.jpg
  Renamed: chapter_1_example_1_11/images/intermediate_image_1.jpg → chapter_1_example_1_11/images/reasoning_image_1.jpg
  Cleanup completed for chapter_1_example_

In [12]:
import os
import re
import glob

# Find all LaTeX files
latex_files = glob.glob("**/*.tex", recursive=True)

# Pattern to match text in parentheses after begin document or section problem
pattern1 = r"\\begin{document}\s*\((.*?)\)"
pattern2 = r"\\section\*{Problem}\s*\((.*?)\)"

# Store the found matches to show the user
to_be_deleted = []

for file_path in latex_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            content = file.read()
            
            # Check for matches
            matches1 = re.findall(pattern1, content, re.DOTALL)
            matches2 = re.findall(pattern2, content, re.DOTALL)
            
            if matches1 or matches2:
                to_be_deleted.append((file_path, matches1 + matches2))
                
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

# Display what would be deleted
print(f"Found {len(to_be_deleted)} files with text to delete:")
for file_path, matches in to_be_deleted:
    print(f"\nFile: {file_path}")
    for match in matches:
        print(f"  • ({match})")

Found 63 files with text to delete:

File: chapter_1_example_2_7/main.tex
  • (1975 AMC)

File: chapter_6_problem_3_9/main.tex
  • (1996 Mathcounts National Sprint Problem 25)

File: chapter_3_example_1_3/main.tex
  • (AMC)

File: chapter_4_example_1_19/main.tex
  • (2003 AIME 2 Problem 11)

File: chapter_2_problem_1_1/main.tex
  • (Phillips Academy Prize Exam)

File: chapter_4_example_1_20/main.tex
  • (1994 Canadian Mathematical Olympiad)

File: chapter_5_example_1_21/main.tex
  • (1985 Yangzhou Math Contest, 1994 Canadian Mathematical Olympiad)

File: chapter_2_example_1_1/main.tex
  • (AMC)

File: chapter_3_problem_1_2/main.tex
  • (AMC)

File: chapter_5_example_1_19/main.tex
  • (2002 AIME II)

File: chapter_1_problem_2_6/main.tex
  • (AMC)

File: chapter_6_example_3_8/main.tex
  • (1994 China Middle School Math Contest)

File: chapter_5_example_1_10/main.tex
  • (AMC)

File: chapter_6_example_3_11/main.tex
  • (2009 AMC 10 A Problem 21)

File: chapter_6_example_3_9/main.tex
  • (

In [13]:
import os
import re
import glob

# Find all LaTeX files
latex_files = glob.glob("**/*.tex", recursive=True)

# Pattern to match and replace text in parentheses
pattern1 = r"(\\begin{document})\s*\(.*?\)\s*"
pattern2 = r"(\\section\*{Problem})\s*\(.*?\)\s*"

# Replacement patterns - keep the command but remove parenthetical text
replacement1 = r"\1\n"  # Keep \begin{document} and add a newline
replacement2 = r"\1\n"  # Keep \section*{Problem} and add a newline

deletion_count = 0
deletion_details = []

for file_path in latex_files:
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            
            # Store original content for comparison
            original = content
            
            # Apply replacements
            modified = re.sub(pattern1, replacement1, content, flags=re.DOTALL)
            modified = re.sub(pattern2, replacement2, modified, flags=re.DOTALL)
            
            # If changes were made, write back to file
            if modified != original:
                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(modified)
                deletion_count += 1
                
                # Extract what was deleted for reporting
                deleted1 = re.findall(pattern1, content, re.DOTALL)
                deleted2 = re.findall(pattern2, content, re.DOTALL)
                deletion_details.append((file_path, deleted1 + deleted2))
                
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# Report results
print(f"Successfully cleaned {deletion_count} files:")
for file_path, _ in deletion_details:
    print(f"- {file_path}")

print("\nDeletion completed successfully.")

Successfully cleaned 63 files:
- chapter_1_example_2_7/main.tex
- chapter_6_problem_3_9/main.tex
- chapter_3_example_1_3/main.tex
- chapter_4_example_1_19/main.tex
- chapter_2_problem_1_1/main.tex
- chapter_4_example_1_20/main.tex
- chapter_5_example_1_21/main.tex
- chapter_2_example_1_1/main.tex
- chapter_3_problem_1_2/main.tex
- chapter_5_example_1_19/main.tex
- chapter_1_problem_2_6/main.tex
- chapter_6_example_3_8/main.tex
- chapter_5_example_1_10/main.tex
- chapter_6_example_3_11/main.tex
- chapter_6_example_3_9/main.tex
- chapter_5_example_1_18/main.tex
- chapter_6_example_3_10/main.tex
- chapter_6_problem_4_8/main.tex
- chapter_5_example_1_3/main.tex
- chapter_6_problem_4_7/main.tex
- chapter_5_example_1_2/main.tex
- chapter_6_example_4_7/main.tex
- chapter_6_example_4_9/main.tex
- chapter_5_problem_1_5/main.tex
- chapter_4_example_1_1/main.tex
- chapter_4_example_1_8/main.tex
- chapter_1_problem_1_1/main.tex
- chapter_6_example_4_8/main.tex
- chapter_6_example_4_6/main.tex
- ch

In [14]:
import os
import glob
import re

# Find all .tex files
tex_files = glob.glob("**/*.tex", recursive=True)
replacement_count = 0

for tex_file in tex_files:
    try:
        with open(tex_file, 'r', encoding='utf-8') as file:
            content = file.read()
            
        # Store original for comparison
        original = content
        
        # Replace old image references with new ones
        content = content.replace('images/input_image.jpg', 'images/problem_image_1.jpg')
        content = content.replace('images/intermediate_image_1.jpg', 'images/reasoning_image_1.jpg')
        
        # If changes were made, write back to file
        if content != original:
            with open(tex_file, 'w', encoding='utf-8') as file:
                file.write(content)
            
            print(f"Updated: {tex_file}")
            
            # Show specific replacements made
            if 'images/input_image.jpg' in original:
                print(f"  • Replaced: images/input_image.jpg → images/problem_image_1.jpg")
            if 'images/intermediate_image_1.jpg' in original:
                print(f"  • Replaced: images/intermediate_image_1.jpg → images/reasoning_image_1.jpg")
                
            replacement_count += 1
    
    except Exception as e:
        print(f"Error processing {tex_file}: {e}")

print(f"\nCompleted! Updated {replacement_count} LaTeX files with new image references.")

Updated: chapter_6_problem_1_4/main.tex
  • Replaced: images/input_image.jpg → images/problem_image_1.jpg
  • Replaced: images/intermediate_image_1.jpg → images/reasoning_image_1.jpg
Updated: chapter_6_problem_3_9/main.tex
  • Replaced: images/input_image.jpg → images/problem_image_1.jpg
  • Replaced: images/intermediate_image_1.jpg → images/reasoning_image_1.jpg
Updated: chapter_6_problem_1_3/main.tex
  • Replaced: images/input_image.jpg → images/problem_image_1.jpg
  • Replaced: images/intermediate_image_1.jpg → images/reasoning_image_1.jpg
Updated: chapter_2_problem_1_9/main.tex
  • Replaced: images/input_image.jpg → images/problem_image_1.jpg
  • Replaced: images/intermediate_image_1.jpg → images/reasoning_image_1.jpg
Updated: chapter_6_problem_3_7/main.tex
  • Replaced: images/input_image.jpg → images/problem_image_1.jpg
  • Replaced: images/intermediate_image_1.jpg → images/reasoning_image_1.jpg
Updated: chapter_3_example_1_3/main.tex
  • Replaced: images/input_image.jpg → images