## 압축해제해야 될 때

### 수식 추출하는 부분 다시 짜야함.
더 robust한 규칙이 필요.


In [4]:
import os
import tarfile
import gzip
import re
import io
import chardet
import json

def extract_tar_files(src_folder):
    tar_files = [os.path.join(src_folder, file) for file in os.listdir(src_folder) if file.endswith('.tar')]
    print(f"<tar_files> \n{tar_files}")
    return tar_files

def extract_gz_from_tar(tar_file):
    extracted_files = []
    with tarfile.open(tar_file, 'r') as tar:
        for member in tar.getmembers():
            if member.isfile() and member.name.endswith('.gz'):
                extracted_files.append(tar.extractfile(member).read())
    print(f"gz files : \n {extracted_files}")
    return extracted_files

def detect_encoding(data):
    result = chardet.detect(data)
    encoding = result['encoding']
    if encoding is None:
        return 'utf-8'  # 기본 인코딩 설정
    return encoding

def extract_tex_from_gz(gz_data):
    tex_content = ""
    with gzip.open(io.BytesIO(gz_data), 'rb') as f:
        raw_data = f.read()
        encoding = detect_encoding(raw_data)
        try:
            tex_content = raw_data.decode(encoding)
        except (UnicodeDecodeError, TypeError):
            tex_content = raw_data.decode('latin1')  # utf-8 디코딩 실패시 기본적으로 latin1 사용
    return tex_content

def extract_equations_with_context(tex_content):
    """1. LaTeX Data with forward, backward English data collect<br>
    detecting formula criterion:
    1) between \\begin{equation} and \\end{equation}.
    2) between $ and $.
    3) between $$ and $$."""
    
    equations_with_context = []
    i = 0

    def clean_context(context):
        """ Remove non-English words and special characters from the context """
        # Keep only letters, digits, and basic punctuation
        cleaned_context = re.sub(r'[^a-zA-Z0-9\s.,]', '', context)
        # Split into words and keep only English words (simple check)
        words = cleaned_context.split()
        english_words = [word for word in words if re.match(r'^[a-zA-Z0-9]+$', word)]
        return ' '.join(english_words)

    while i < len(tex_content):
        context_before = tex_content[max(0, i-100):i].strip()
        equation = ""
        context_after = ""
        
        if tex_content[i:i+2] == '$$':
            # Find the end of the double dollar sign equation
            end = tex_content.find('$$', i + 2)
            if end == -1:
                break
            equation = tex_content[i:end+2]
            context_after = tex_content[end+2:end+102].strip()
            i = end + 2
        elif tex_content[i] == '$':
            # Find the end of the single dollar sign equation
            end = tex_content.find('$', i + 1)
            if end == -1:
                break
            equation = tex_content[i:end+1]
            context_after = tex_content[end+1:end+101].strip()
            i = end + 1
        elif tex_content[i:i+16] == '\\begin{equation}':
            # Find the end of the equation environment
            end = tex_content.find('\\end{equation}', i + 16)
            if end == -1:
                break
            equation = tex_content[i:end+14]
            context_after = tex_content[end+14:end+114].strip()
            i = end + 14
        else:
            i += 1
            continue

        # Clean and limit context_before to 10 words or fewer
        context_before = clean_context(context_before)
        context_before_words = context_before.split()
        if len(context_before_words) > 10:
            context_before = ' '.join(context_before_words[-10:])

        # Clean and limit context_after to 10 words or fewer
        context_after = clean_context(context_after)
        context_after_words = context_after.split()
        if len(context_after_words) > 10:
            context_after = ' '.join(context_after_words[:10])
        
        # Ensure context_after does not contain another equation
        if '$' in context_after:
            context_after = context_after.split('$')[0].strip()
        if '\\begin{equation}' in context_after:
            context_after = context_after.split('\\begin{equation}')[0].strip()
                
        # Ensure the equation is 100 characters or fewer
        if len(equation) <= 100:
            equations_with_context.append({
                'context_before': context_before,
                'equation': equation,
                'context_after': context_after
            })
    
    return equations_with_context

def save_results_to_jsonl(results, output_file):
    with open(output_file, 'a', encoding='utf-8') as file:  # 'a' 모드를 사용하여 파일에 추가
        for result in results:
            file.write(json.dumps(result) + '\n')

def process_gz_file(gz_data, output_file):
    tex_content = extract_tex_from_gz(gz_data)
    equations_with_context = extract_equations_with_context(tex_content)
    if len(equations_with_context) > 10000:
        return
    save_results_to_jsonl(equations_with_context, output_file)

def main(src_folder, output_file):
    tar_files = extract_tar_files(src_folder)
    
    for tar_file in tar_files:
        gz_files = extract_gz_from_tar(tar_file)
        for gz_data in gz_files:
            process_gz_file(gz_data, output_file)

if __name__ == "__main__":
    src_folder = 'downloads2/src'  # src 폴더의 경로를 적절히 설정하세요.
    output_file = 'zip_files_extract_latex_serial_0626_0012.jsonl'
    main(src_folder, output_file)


<tar_files> 
['downloads2/src\\arXiv_src_0001_001.tar', 'downloads2/src\\arXiv_src_0002_001.tar', 'downloads2/src\\arXiv_src_0003_001.tar', 'downloads2/src\\arXiv_src_0004_001.tar', 'downloads2/src\\arXiv_src_0005_001.tar', 'downloads2/src\\arXiv_src_0006_001.tar', 'downloads2/src\\arXiv_src_0007_001.tar', 'downloads2/src\\arXiv_src_0008_001.tar', 'downloads2/src\\arXiv_src_0009_001.tar', 'downloads2/src\\arXiv_src_0010_001.tar']


## Tex 파일만 있을 때

In [2]:
import os
import re
import json

def extract_equations_with_context(tex_content):
    # Define a pattern to match both inline and displayed equations
    
    """1. LaTeX Data with forward, backward English data collect<br>
    detecting formula criterion:
    1) between \\begin{equation} and \\end{equation}.
    2) between $ and $.
    3) between $$ and $$."""

    pattern = r'(?P<context_before>.*?)(?P<equation>\$.*?\$|\$\$.*?\$\$|\\begin{equation}.*?\\end{equation})(?P<context_after>.*?)($|\n)'
    matches = re.finditer(pattern, tex_content, re.DOTALL)
    equations_with_context = []

    def clean_context(context):
        """ Remove non-English words and special characters from the context """
        # Keep only letters, digits, and basic punctuation
        cleaned_context = re.sub(r'[^a-zA-Z0-9\s.,]', '', context)
        # Split into words and keep only English words (simple check)
        words = cleaned_context.split()
        english_words = [word for word in words if re.match(r'^[a-zA-Z0-9]+$', word)]
        return ' '.join(english_words)

    for match in matches:
        context_before = match.group('context_before').strip()
        equation = match.group('equation').strip()
        context_after = match.group('context_after').strip()

        # Clean and limit context_before to 10 words or fewer
        context_before = clean_context(context_before)
        context_before_words = context_before.split()
        if len(context_before_words) > 10:
            context_before = ' '.join(context_before_words[-10:])
        
        # Clean and limit context_after to 10 words or fewer
        context_after = clean_context(context_after)
        context_after_words = context_after.split()
        if len(context_after_words) > 10:
            context_after = ' '.join(context_after_words[:10])
        
        # Ensure context_after does not contain another equation
        if '$' in context_after:
            context_after = context_after.split('$')[0].strip()
        if '\\begin{equation}' in context_after:
            context_after = context_after.split('\\begin{equation}')[0].strip()
                
        # If context_after is empty, include the next line's content
        if not context_after and '\n' in tex_content:
            remaining_text = tex_content.split(equation, 1)[1].strip()
            context_after = clean_context(remaining_text.split('\n', 1)[0].strip())
            if '$' in context_after:
                context_after = context_after.split('$')[0].strip()
            if '\\begin{equation}' in context_after:
                context_after = context_after.split('\\begin{equation}')[0].strip()

        # Re-limit context_after to 10 words or fewer
        context_after_words = context_after.split()
        if len(context_after_words) > 10:
            context_after = ' '.join(context_after_words[:10])

        # Ensure the equation is 100 characters or fewer
        if len(equation) <= 100:
            equations_with_context.append({
                'context_before': context_before,
                'equation': equation,
                'context_after': context_after
            })
    
    return equations_with_context

# Path to the arxiv_papers_tex folder
base_path = 'C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex'


# List to store all equations with context
all_equations_with_context = []

# Function to read file with multiple encoding attempts
def read_file_with_multiple_encodings(file_path):
    encodings = ['utf-8', 'latin-1', 'iso-8859-1']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except (UnicodeDecodeError, FileNotFoundError):
            continue
    raise UnicodeDecodeError(f"Failed to read file {file_path} with available encodings.")

# Iterate over each folder (paper) in the base directory
for paper_folder in os.listdir(base_path):
    paper_folder_path = os.path.join(base_path, paper_folder)
    
    if os.path.isdir(paper_folder_path):
        print(f"Processing folder: {paper_folder_path}")
        
        # Iterate over each file in the paper folder
        for tex_file in os.listdir(paper_folder_path):
            if tex_file.endswith('.tex') or tex_file.endswith('.TEX'):
                tex_file_path = os.path.join(paper_folder_path, tex_file)
                print(f"  Reading file: {tex_file_path}")
                
                # Read the content of the tex file with multiple encoding attempts
                try:
                    tex_content = read_file_with_multiple_encodings(tex_file_path)
                except UnicodeDecodeError as e:
                    print(f"    Could not read file {tex_file_path}: {e}")
                    continue
                
                # Extract equations with context
                equations_with_context = extract_equations_with_context(tex_content)
                print(f"    Extracted {len(equations_with_context)} equations from {tex_file_path}")
                
                # Add the paper number to each extracted entry
                for entry in equations_with_context:
                    entry['paper_number'] = paper_folder
                
                # Append to the list
                all_equations_with_context.extend(equations_with_context)

# Define the output file path
output_file_path = 'equations_with_context_math_1,2,3-0625-2342.jsonl'

# Write the results to a jsonl file
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    for entry in all_equations_with_context:
        json.dump(entry, outfile)
        outfile.write('\n')

print(f"Total equations extracted: {len(all_equations_with_context)}")
print(f"Results written to {output_file_path}")


Processing folder: C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0002
  Reading file: C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0002\sparsity-certifying.tex
    Extracted 247 equations from C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0002\sparsity-certifying.tex
Processing folder: C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0010
  Reading file: C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0010\my_macros.TEX
    Extracted 2 equations from C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0010\my_macros.TEX
  Reading file: C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0010\PartialCubes.TEX
    Extracted 541 equations from C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0010\PartialCubes.TEX
Processing folder: C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0014
  Reading file: C:/Use

## Serial Search Method

In [3]:
import os
import re
import json

def extract_equations_with_context(tex_content):
    """1. LaTeX Data with forward, backward English data collect<br>
    detecting formula criterion:
    1) between \\begin{equation} and \\end{equation}.
    2) between $ and $.
    3) between $$ and $$."""
    
    equations_with_context = []
    i = 0

    def clean_context(context):
        """ Remove non-English words and special characters from the context """
        # Keep only letters, digits, and basic punctuation
        cleaned_context = re.sub(r'[^a-zA-Z0-9\s.,]', '', context)
        # Split into words and keep only English words (simple check)
        words = cleaned_context.split()
        english_words = [word for word in words if re.match(r'^[a-zA-Z0-9]+$', word)]
        return ' '.join(english_words)

    while i < len(tex_content):
        context_before = tex_content[max(0, i-100):i].strip()
        equation = ""
        context_after = ""
        
        if tex_content[i:i+2] == '$$':
            # Find the end of the double dollar sign equation
            end = tex_content.find('$$', i + 2)
            if end == -1:
                break
            equation = tex_content[i:end+2]
            context_after = tex_content[end+2:end+102].strip()
            i = end + 2
        elif tex_content[i] == '$':
            # Find the end of the single dollar sign equation
            end = tex_content.find('$', i + 1)
            if end == -1:
                break
            equation = tex_content[i:end+1]
            context_after = tex_content[end+1:end+101].strip()
            i = end + 1
        elif tex_content[i:i+16] == '\\begin{equation}':
            # Find the end of the equation environment
            end = tex_content.find('\\end{equation}', i + 16)
            if end == -1:
                break
            equation = tex_content[i:end+14]
            context_after = tex_content[end+14:end+114].strip()
            i = end + 14
        else:
            i += 1
            continue

        # Clean and limit context_before to 10 words or fewer
        context_before = clean_context(context_before)
        context_before_words = context_before.split()
        if len(context_before_words) > 10:
            context_before = ' '.join(context_before_words[-10:])

        # Clean and limit context_after to 10 words or fewer
        context_after = clean_context(context_after)
        context_after_words = context_after.split()
        if len(context_after_words) > 10:
            context_after = ' '.join(context_after_words[:10])
        
        # Ensure context_after does not contain another equation
        if '$' in context_after:
            context_after = context_after.split('$')[0].strip()
        if '\\begin{equation}' in context_after:
            context_after = context_after.split('\\begin{equation}')[0].strip()
                
        # Ensure the equation is 100 characters or fewer
        if len(equation) <= 100:
            equations_with_context.append({
                'context_before': context_before,
                'equation': equation,
                'context_after': context_after
            })
    
    return equations_with_context

# Path to the arxiv_papers_tex folder
base_path = 'C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex'


# List to store all equations with context
all_equations_with_context = []

# Function to read file with multiple encoding attempts
def read_file_with_multiple_encodings(file_path):
    encodings = ['utf-8', 'latin-1', 'iso-8859-1']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except (UnicodeDecodeError, FileNotFoundError):
            continue
    raise UnicodeDecodeError(f"Failed to read file {file_path} with available encodings.")

# Iterate over each folder (paper) in the base directory
for paper_folder in os.listdir(base_path):
    paper_folder_path = os.path.join(base_path, paper_folder)
    
    if os.path.isdir(paper_folder_path):
        print(f"Processing folder: {paper_folder_path}")
        
        # Iterate over each file in the paper folder
        for tex_file in os.listdir(paper_folder_path):
            if tex_file.endswith('.tex') or tex_file.endswith('.TEX'):
                tex_file_path = os.path.join(paper_folder_path, tex_file)
                print(f"  Reading file: {tex_file_path}")
                
                # Read the content of the tex file with multiple encoding attempts
                try:
                    tex_content = read_file_with_multiple_encodings(tex_file_path)
                except UnicodeDecodeError as e:
                    print(f"    Could not read file {tex_file_path}: {e}")
                    continue
                
                # Extract equations with context
                equations_with_context = extract_equations_with_context(tex_content)
                print(f"    Extracted {len(equations_with_context)} equations from {tex_file_path}")
                
                # Add the paper number to each extracted entry
                for entry in equations_with_context:
                    entry['paper_number'] = paper_folder
                
                # Append to the list
                all_equations_with_context.extend(equations_with_context)

# Define the output file path
output_file_path = 'equations_with_context_math_serial-0626-0007.jsonl'

# Write the results to a jsonl file
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    for entry in all_equations_with_context:
        json.dump(entry, outfile)
        outfile.write('\n')

print(f"Total equations extracted: {len(all_equations_with_context)}")
print(f"Results written to {output_file_path}")


Processing folder: C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0002
  Reading file: C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0002\sparsity-certifying.tex
    Extracted 469 equations from C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0002\sparsity-certifying.tex
Processing folder: C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0010
  Reading file: C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0010\my_macros.TEX
    Extracted 2 equations from C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0010\my_macros.TEX
  Reading file: C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0010\PartialCubes.TEX
    Extracted 1708 equations from C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0010\PartialCubes.TEX
Processing folder: C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex\0704.0014
  Reading file: C:/Us

## 회의 후 robust 알고리즘

In [27]:
import os
import re
import json
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

def replace_user_defined_commands(content):
    """Replace user-defined commands in LaTeX content with their definitions."""
    macro_patterns = [
        r'\\def\\(\w+)\{(.+?)\}', 
        r'\\newcommand\\(\w+)(\[[0-9]+\])?\{(.+?)\}', 
        r'\\renewcommand\\(\w+)(\[[0-9]+\])?\{(.+?)\}'
    ]
    
    macros = {}
    
    for pattern in macro_patterns:
        for match in re.findall(pattern, content, re.DOTALL):
            if len(match) == 3:
                macro_name, _, macro_definition = match
            else:
                macro_name, macro_definition = match
            # Escape backslashes and other regex special characters in the definition
            macro_definition = re.escape(macro_definition)
            macros[macro_name] = macro_definition

    # Replace all macro instances in the content
    for macro_name, macro_definition in macros.items():
        macro_pattern = re.compile(r'\\' + macro_name + r'\b')
        content = macro_pattern.sub(macro_definition, content)

    return content

def extract_equations_with_context(tex_content):
    """Extract equations and context from LaTeX content."""
    
    equations_with_context = []
    i = 0

    def clean_context(context):
        """Remove non-English words and special characters from the context."""
        # Keep only letters, digits, and basic punctuation
        cleaned_context = re.sub(r'[^a-zA-Z0-9\s.,]', '', context)
        # Tokenize into words and keep only English words (simple check)
        words = word_tokenize(cleaned_context)
        english_words = [word for word in words if re.match(r'^[a-zA-Z0-9]+$', word)]
        return ' '.join(english_words)

    # Replace user-defined commands with their definitions
    tex_content = replace_user_defined_commands(tex_content)

    tex_words = word_tokenize(tex_content)
    while i < len(tex_words):
        context_before = " ".join(tex_words[max(0, i-30):i]).strip()
        equation = ""
        context_after = ""
        eq_type = -1

        if tex_words[i] == '$' and i + 1 < len(tex_words) and tex_words[i + 1] == '$':
            # Find the end of the double dollar sign equation
            end = i + 2
            while end < len(tex_words) and not (tex_words[end] == '$' and end + 1 < len(tex_words) and tex_words[end + 1] == '$'):
                end += 1
            if end >= len(tex_words):
                break
            equation = " ".join(tex_words[i:end+2])
            context_after = " ".join(tex_words[end+2:end+32]).strip()
            eq_type = 1
            i = end + 2
        elif tex_words[i] == '$':
            # Find the end of the single dollar sign equation
            end = i + 1
            while end < len(tex_words) and tex_words[end] != '$':
                end += 1
            if end >= len(tex_words):
                break
            equation = " ".join(tex_words[i:end+1])
            context_after = " ".join(tex_words[end+1:end+31]).strip()
            eq_type = 0
            i = end + 1
        elif tex_words[i] == '\\begin' and i + 1 < len(tex_words) and tex_words[i + 1] == '{equation}':
            # Find the end of the equation environment
            end = i + 2
            while end < len(tex_words) and not (tex_words[end] == '\\end' and end + 1 < len(tex_words) and tex_words[end + 1] == '{equation}'):
                end += 1
            if end >= len(tex_words):
                break
            equation = " ".join(tex_words[i:end+2])
            context_after = " ".join(tex_words[end+2:end+32]).strip()
            eq_type = 2
            i = end + 2
        else:
            i += 1
            continue

        # Clean and limit context_before to 30 words or fewer
        context_before_words = word_tokenize(context_before)
        if len(context_before_words) > 30:
            context_before = ' '.join(context_before_words[-30:])

        # Clean and limit context_after to 30 words or fewer
        context_after_words = word_tokenize(context_after)
        if len(context_after_words) > 30:
            context_after = ' '.join(context_after_words[:30])
        
        # Ensure context_before does not contain another equation
        unwanted_tokens = ['$', '\\end{equation}', '}', '{', ']', '[', ')', '(', '\\rm', ',', '.', '\\vtl', '\\it']

        for token in unwanted_tokens:
            if token in context_before:
                context_before = context_before.split(token)[-1].strip()

        # Ensure context_after does not contain another equation
        if '$' in context_after:
            context_after = context_after.split('$')[0].strip()
        if '\\begin{equation}' in context_after:
            context_after = context_after.split(r'\\begin{equation}')[0].strip()
        if '\\' in context_after:
            context_after = context_after.split('\\')[0].strip()
                
        # Ensure the equation is 100 characters or fewer
        if len(equation) <= 100:
            equations_with_context.append({
                'context_before': context_before,
                'equation': equation,
                'context_after': context_after,
                'eq_type': eq_type
            })
    
    return equations_with_context

# Path to the arxiv_papers_tex folder
base_path = r'C:\Users\wjdrb\Downloads\drive-download-20240708T045529Z-001'
#base_path = r'C:\Users\wjdrb\vscode_code\MathBridge_new\data\test_file'

# List to store all equations with context
all_equations_with_context = []

# Function to read file with multiple encoding attempts
def read_file_with_multiple_encodings(file_path):
    encodings = ['utf-8', 'latin-1', 'iso-8859-1']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except (UnicodeDecodeError, FileNotFoundError):
            continue
    raise UnicodeDecodeError(f"Failed to read file {file_path} with available encodings.")

# Iterate over each folder (paper) in the base directory
for paper_folder in os.listdir(base_path):
    paper_folder_path = os.path.join(base_path, paper_folder) #2301 folder
    
    if os.path.isdir(paper_folder_path):
        print(f"Processing folder: {paper_folder_path}")
        
        # Iterate over each file in the paper folder
        for tex_file in os.listdir(paper_folder_path):
            if tex_file.endswith('.tex') or tex_file.endswith('.TEX'):
                tex_file_path = os.path.join(paper_folder_path, tex_file)
                print(f"  Reading file: {tex_file_path}")
                
                # Read the content of the tex file with multiple encoding attempts
                try:
                    tex_content = read_file_with_multiple_encodings(tex_file_path)
                except UnicodeDecodeError as e:
                    print(f"    Could not read file {tex_file_path}: {e}")
                    continue
                
                # Extract equations with context
                equations_with_context = extract_equations_with_context(tex_content)
                print(f"    Extracted {len(equations_with_context)} equations from {tex_file_path}")
                
                # Add the paper number to each extracted entry
                for entry in equations_with_context:
                    entry['paper_number'] = tex_file.split(".tex")[0]
                
                # Append to the list
                all_equations_with_context.extend(equations_with_context)

# Define the output file path
output_file_path = 'equations_with_context_0709_Hyungondata.jsonl'

# Write the results to a jsonl file
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    for entry in all_equations_with_context:
        json.dump(entry, outfile)
        outfile.write('\n')

print(f"Total equations extracted: {len(all_equations_with_context)}")
print(f"Results written to {output_file_path}")


Processing folder: C:\Users\wjdrb\vscode_code\MathBridge_new\data\test_file\2301
  Reading file: C:\Users\wjdrb\vscode_code\MathBridge_new\data\test_file\2301\2301.00001.tex
    Extracted 0 equations from C:\Users\wjdrb\vscode_code\MathBridge_new\data\test_file\2301\2301.00001.tex
  Reading file: C:\Users\wjdrb\vscode_code\MathBridge_new\data\test_file\2301\2301.00002.tex
    Extracted 10 equations from C:\Users\wjdrb\vscode_code\MathBridge_new\data\test_file\2301\2301.00002.tex
Processing folder: C:\Users\wjdrb\vscode_code\MathBridge_new\data\test_file\2302
  Reading file: C:\Users\wjdrb\vscode_code\MathBridge_new\data\test_file\2302\2302.00001.tex
    Extracted 0 equations from C:\Users\wjdrb\vscode_code\MathBridge_new\data\test_file\2302\2302.00001.tex
  Reading file: C:\Users\wjdrb\vscode_code\MathBridge_new\data\test_file\2302\2302.00002.tex
    Extracted 10 equations from C:\Users\wjdrb\vscode_code\MathBridge_new\data\test_file\2302\2302.00002.tex
Processing folder: C:\Users\wjdr

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wjdrb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
