# LaTeXpOsEd: Parsing Stage

In this stage, the downloaded LaTeX files are merged into a single JSON Lines (`.jsonl`) file that contains only the filtered comments. The comments are cleaned by removing boilerplate and very short comments, normalizing excessive whitespace, and collapsing long runs of separator characters, among other steps.

Before running this notebook:

- Complete: [1_scrape.ipynb](1_scrape.ipynb)

In [None]:
%pip install -q urlextract tiktoken

In [None]:
import os
import json
import re
import tiktoken
from tqdm import tqdm
from collections import Counter

In [None]:
# Folder that is scanned for the per-paper `.json` files.
PAPERS_FOLDER = 'data/final'
# Text file holding the frequently repeated (boilerplate) comments, one per line.
COMMON_COMMENTS_TXT = 'tmp/common_comments.txt'
# Output file: one JSON object per line with a paper's name and its merged comments.
COMMENTS_JSONL = 'data/paper_comments.jsonl'

In [None]:
def extract_latex_comment_blocks(latex_code: str) -> list:
    """Return the bodies of all ``\\begin{comment} ... \\end{comment}`` blocks.

    Matching is case-insensitive and spans multiple lines. Each body is
    stripped of surrounding whitespace; bodies that are empty after
    stripping are left out.
    """
    block_pattern = re.compile(
        r'\\begin\{comment\}(.*?)\\end\{comment\}',
        re.DOTALL | re.IGNORECASE,
    )
    bodies = []
    for raw_body in block_pattern.findall(latex_code):
        trimmed = raw_body.strip()
        if trimmed:
            bodies.append(trimmed)
    return bodies

def extract_latex_comment_lines(latex_code: str) -> list:
    """Collect the ``%`` line comments found in a LaTeX source string.

    For every line outside a verbatim-like environment (``verbatim``,
    ``lstlisting``, ``Verbatim``) the text after the first unescaped ``%``
    is taken, stripped, and appended to the result. At most one comment is
    taken per line, and a bare ``%`` yields an empty string.

    A ``%`` counts as escaped only when it is preceded by an odd number of
    backslashes: ``\\%`` is a literal percent sign, whereas ``\\\\%`` is a
    line break followed by a real comment.

    Lines containing ``\\begin{...}`` or ``\\end{...}`` of a verbatim-like
    environment are skipped entirely, including any comment on them.

    Args:
        latex_code: Full LaTeX source text.

    Returns:
        The extracted comment strings in order of appearance.
    """
    # Regex for detecting start and end of verbatim-like environments
    verbatim_start = re.compile(r'\\begin\{(verbatim|lstlisting|Verbatim)\}')
    verbatim_end = re.compile(r'\\end\{(verbatim|lstlisting|Verbatim)\}')

    comments = []
    in_verbatim = False

    for line in latex_code.splitlines():
        # Handle entering or leaving verbatim
        if verbatim_start.search(line):
            in_verbatim = True
        if verbatim_end.search(line):
            in_verbatim = False
            continue
        if in_verbatim:
            continue

        # Find first unescaped `%`
        percent_at = line.find('%')
        while percent_at != -1:
            # Count the run of backslashes directly before the `%`; an even
            # run (including zero) means the backslashes escape each other
            # and the `%` really starts a comment.
            run_start = percent_at
            while run_start > 0 and line[run_start - 1] == '\\':
                run_start -= 1
            if (percent_at - run_start) % 2 == 0:
                # Extract comment without `%` and surrounding whitespace
                comments.append(line[percent_at + 1:].strip())
                break
            percent_at = line.find('%', percent_at + 1)

    return comments

def extract_latex_comments(latex_code: str) -> list:
    """Return every non-empty comment found in ``latex_code``.

    Line comments come first, followed by the bodies of ``comment``
    environments. Entries that are empty or whitespace-only are dropped.
    """
    line_comments = extract_latex_comment_lines(latex_code)
    block_comments = extract_latex_comment_blocks(latex_code)
    return [text for text in line_comments + block_comments if text.strip()]

In [None]:
# Load all json files in folder
# Each iteration rebinds `files`, so after the loop only the last parsed
# file is still referenced.
for paper_name in os.listdir(PAPERS_FOLDER):
    if not paper_name.endswith('.json'):
        continue
    filepath = os.path.join(PAPERS_FOLDER, paper_name)
    with open(filepath, 'r', encoding='utf-8') as f:
        files = json.load(f)

In [None]:
# Iterator class for more convenient iteration of the dataset
class PaperContentIterator():
    """Single-pass iterator over the ``.json`` files of a folder.

    Each step yields ``(file_name, parsed_json)``. The file list is captured
    once at construction time, in the order ``os.listdir`` returns it. The
    position is never reset, so an exhausted instance stays exhausted;
    create a new instance to iterate again.
    """

    def __init__(self, papers_folder: str):
        self.papers_folder = papers_folder
        self.paper_files = [
            entry for entry in os.listdir(papers_folder) if entry.endswith('.json')
        ]
        self.iteration_count = len(self.paper_files)
        self.current_paper_index = 0

    def __len__(self):
        """Total number of ``.json`` files found at construction time."""
        return self.iteration_count

    def __iter__(self):
        return self

    def __next__(self) -> tuple[str, dict[str, str]]:
        """Load and return the next ``(file_name, parsed_json)`` pair."""
        position = self.current_paper_index
        if not position < self.iteration_count:
            raise StopIteration
        paper_name = self.paper_files[position]
        paper_path = os.path.join(self.papers_folder, paper_name)
        with open(paper_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
        # Advance only after a successful load, so a failed read leaves the
        # position unchanged.
        self.current_paper_index = position + 1
        return (paper_name, parsed)

In [None]:
# Calculate statistics
# Running totals over every file of every paper: number of files, tokens in
# the full sources, tokens in the extracted comments, and number of comments.
file_count = total_tokens = comment_tokens = ext_comment_count = 0

encoder = tiktoken.get_encoding("cl100k_base")

paper_iterator = PaperContentIterator(PAPERS_FOLDER)
with tqdm(total=len(paper_iterator)) as pbar:
    for name, files in paper_iterator:
        if files is not None:
            for file in files:
                file_count += 1
                latex_source = files[file]
                ext_comments = extract_latex_comments(latex_source)
                ext_comment_count += len(ext_comments)
                # Per tiktoken's documentation, disallowed_special=() disables
                # the error otherwise raised for special-token text.
                total_tokens += len(encoder.encode(latex_source, disallowed_special=()))
                joined_comments = '\n'.join(ext_comments)
                comment_tokens += len(encoder.encode(joined_comments, disallowed_special=()))
            pbar.update(1)
            pbar.set_description(f"Processing files={file_count} total_tokens={total_tokens} comment_tokens={comment_tokens} ext_comment_count={ext_comment_count}")
        else:
            # Nothing to count for this paper; just advance the progress bar.
            pbar.update(1)

file_count, total_tokens, comment_tokens, ext_comment_count

In [None]:
# Get most common comments to filter out boilerplate
# Counts how many times each comment string is seen; starts out empty.
comment_counter = Counter()

def cleanup_comments(comments: list[str]) -> list[str]:
    """Normalize comments and drop the very short ones.

    Each comment is stripped, every whitespace run is collapsed to a single
    space, and the result is kept only if it is longer than 5 characters
    (which also removes empty comments).
    """
    kept = []
    for raw in comments:
        normalized = re.sub(r'\s+', ' ', raw.strip())
        if len(normalized.strip()) > 5:
            kept.append(normalized)
    return kept

# Tally the cleaned comments of every file of every paper.
paper_iterator = PaperContentIterator(PAPERS_FOLDER)
with tqdm(total=len(paper_iterator), desc="Extracting comments") as pbar:
    for name, files in paper_iterator:
        if files is not None:
            for file in files:
                comments = extract_latex_comments(files[file])
                cleaned_comments = cleanup_comments(comments)
                comment_counter.update(cleaned_comments)
        # The progress bar advances once per paper, whether or not it had content.
        pbar.update(1)

comment_counter.most_common(20)

In [None]:
# Save common comments (that appear more than 10 times) to a file
# open(..., 'w') does not create missing parent folders, so make sure the
# target directory exists before writing.
common_comments_dir = os.path.dirname(COMMON_COMMENTS_TXT)
if common_comments_dir:
    os.makedirs(common_comments_dir, exist_ok=True)
with open(COMMON_COMMENTS_TXT, 'w', encoding='utf-8') as f:
    # One comment per line.
    f.writelines(
        f"{comment}\n"
        for comment, count in comment_counter.items()
        if count > 10
    )

In [None]:
# Merge into one big comments file
# Running total of the comments that are kept, summed over all files.
remaining_comment_count = 0

def cleanup_comments(comments: list[str]) -> list[str]:
    """Normalize comments, drop very short ones, and shorten separators.

    Steps, in order: strip, collapse whitespace runs to one space, discard
    anything of 5 characters or fewer, then shrink runs of four or more
    ``%``, ``-`` or ``=`` down to three. The length check happens before the
    separators are shortened.
    """
    separator_rules = (
        (r'%{4,}', '%%%'),
        (r'-{4,}', '---'),
        (r'={4,}', '==='),
    )
    cleaned = []
    for raw in comments:
        text = re.sub(r'\s+', ' ', raw.strip())
        if len(text.strip()) <= 5:
            continue
        for pattern, replacement in separator_rules:
            text = re.sub(pattern, replacement, text)
        cleaned.append(text)
    return cleaned

# Read the saved common comments back as a set of stripped, non-empty lines.
with open(COMMON_COMMENTS_TXT, 'r', encoding='utf-8') as f:
    boilerplate = {entry for entry in map(str.strip, f) if entry}

def filter_boilerplate(comments: list[str], boilerplate: set[str]) -> list[str]:
    """Return the comments that are not in ``boilerplate``, order preserved."""
    kept = []
    for candidate in comments:
        if candidate in boilerplate:
            continue
        kept.append(candidate)
    return kept

# Write one JSON line per paper containing its filtered, cleaned comments.
paper_iterator = PaperContentIterator(PAPERS_FOLDER)
with tqdm(total=len(paper_iterator), desc="Extracting comments") as pbar:
    with open(COMMENTS_JSONL, 'w', encoding='utf-8') as out_file:
        for name, files in paper_iterator:
            if files is None:
                pbar.update(1)
                continue
            paper_comments = []
            for file in files:
                comments = extract_latex_comments(files[file])
                # The boilerplate entries were saved stripped and with
                # whitespace collapsed, so bring the comments into the same
                # form before comparing; otherwise comments with irregular
                # or multi-line whitespace never match.
                comments = [re.sub(r'\s+', ' ', comment.strip()) for comment in comments]
                cleaned_comments = filter_boilerplate(comments, boilerplate)
                cleaned_comments = cleanup_comments(cleaned_comments)
                remaining_comment_count += len(cleaned_comments)
                paper_comments.extend(cleaned_comments)
            # Join once per paper so comments from different files are also
            # separated by a newline instead of being fused together.
            merged_comments = '\n'.join(paper_comments)
            out_file.write(json.dumps({"name": name, "comments": merged_comments}) + '\n')
            pbar.update(1)

remaining_comment_count