In [2]:
import re

def load_tex_file(filepath):
    """Read a UTF-8 text file and return its lines, each stripped of surrounding whitespace.

    Every line in the file produces exactly one list entry, in file order;
    blank lines become empty strings.
    """
    stripped_lines = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# Matches one math span: $$...$$, $...$, \[...\] or \(...\).
# The $$ alternative must come first; otherwise a bare "$$" is consumed as an
# empty $...$ span and the display-math body is treated as ordinary text.
_MATH_SPAN_PATTERN = re.compile(r'(\$\$.*?\$\$|\$.*?\$|\\\[.*?\\\]|\\\(.*?\\\))')


def process_line(line):
    r"""Lowercase the text of ``line`` while leaving math spans untouched.

    A math span is ``$$...$$``, ``$...$``, ``\[...\]`` or ``\(...\)`` that
    opens and closes within this one line. A delimiter without a matching
    closer on the same line is treated as ordinary text and lowercased with
    the rest. Escaped dollars (``\$``) are not given special treatment.

    Args:
        line: A single line of TeX source.

    Returns:
        The line with all non-math text lowercased and math spans verbatim.
    """
    # Splitting on a pattern with one capture group alternates the result:
    # even indices hold the text between matches, odd indices hold the
    # matched math spans. Using the index (rather than inspecting how a part
    # starts) keeps text such as "$5 ..." from being mistaken for math.
    parts = _MATH_SPAN_PATTERN.split(line)
    return ''.join(
        part if index % 2 else part.lower()
        for index, part in enumerate(parts)
    )

def process_corpus(corpus):
    """Run ``process_line`` over every entry of ``corpus`` and return the results as a list."""
    processed = []
    for entry in corpus:
        processed.append(process_line(entry))
    return processed

# Usage example: read the TeX source file, then lowercase its non-math text.
# `processed_corpus` is consumed by a later cell to train the tokenizer.
filepath = '2301.01754.tex'
corpus = load_tex_file(filepath)
processed_corpus = process_corpus(corpus)


In [3]:
import re
from typing import List
import math
from collections import Counter
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

class TeXBLEU:
    """BLEU-style similarity score for LaTeX strings.

    A BPE tokenizer is trained on the supplied corpus at construction time.
    Scoring tokenizes both strings, prefixes every token with its index in
    the sequence, and combines clipped n-gram precisions with a brevity
    penalty. Because of the index prefix, an n-gram only counts as a match
    when it occurs at the same token offset in reference and candidate.
    """

    def __init__(self, corpus: List[str], vocab_size: int = 30000):
        """Train the tokenizer used for scoring.

        Args:
            corpus: Text lines the BPE tokenizer is trained on.
            vocab_size: Vocabulary size handed to the BPE trainer.
        """
        self.tokenizer = self._train_bpe_tokenizer(corpus, vocab_size)

    def _train_bpe_tokenizer(self, corpus: List[str], vocab_size: int) -> "Tokenizer":
        """Build a whitespace-pre-tokenized BPE tokenizer and train it on ``corpus``."""
        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                             vocab_size=vocab_size)
        tokenizer.pre_tokenizer = Whitespace()
        tokenizer.train_from_iterator(corpus, trainer)
        return tokenizer

    def preprocess_latex(self, text: str) -> str:
        """Normalize spacing before tokenization.

        Inserts a space before every backslash that is not itself preceded by
        a backslash, then collapses each whitespace run to a single space and
        strips the ends.
        """
        # Add space before '\'
        text = re.sub(r'(?<![\\])(\\)', r' \1', text)
        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def add_positional_encoding(self, tokens: List[str]) -> List[str]:
        """Prefix each token with its zero-based index, as ``"<index>:<token>"``."""
        return [f"{i}:{token}" for i, token in enumerate(tokens)]

    def calculate_texbleu(self, reference: str, candidate: str, max_n: int = 4) -> float:
        """Score ``candidate`` against ``reference``.

        Args:
            reference: The ground-truth LaTeX string.
            candidate: The LaTeX string being evaluated.
            max_n: Largest n-gram order included in the geometric mean.

        Returns:
            The brevity penalty times the geometric mean of the 1..max_n
            clipped n-gram precisions. The result is 0.0 when any of those
            precisions is zero, which includes an empty candidate and a
            candidate with fewer than ``max_n`` tokens.
        """
        ref_tokens = self.tokenizer.encode(self.preprocess_latex(reference)).tokens
        cand_tokens = self.tokenizer.encode(self.preprocess_latex(candidate)).tokens

        ref_tokens = self.add_positional_encoding(ref_tokens)
        cand_tokens = self.add_positional_encoding(cand_tokens)

        bp = self._brevity_penalty(ref_tokens, cand_tokens)

        scores = [
            self._modified_precision(ref_tokens, cand_tokens, n)
            for n in range(1, max_n + 1)
        ]

        # log(0) is undefined, so a single zero precision zeroes the score.
        if 0 in scores:
            return 0.0

        return bp * math.exp(sum(math.log(s) for s in scores) / max_n)

    def _brevity_penalty(self, ref_tokens: List[str], cand_tokens: List[str]) -> float:
        """Return the brevity penalty for a candidate of the given length.

        The penalty is 1.0 when the candidate is longer than the reference,
        ``exp(1 - r/c)`` otherwise, and 0.0 for an empty candidate.
        """
        r = len(ref_tokens)
        c = len(cand_tokens)

        # An empty candidate would divide by zero below; it scores nothing.
        if c == 0:
            return 0.0
        if c > r:
            return 1.0
        return math.exp(1 - r / c)

    def _modified_precision(self, ref_tokens: List[str], cand_tokens: List[str], n: int) -> float:
        """Return the clipped n-gram precision of the candidate for order ``n``.

        Each candidate n-gram counts at most as often as it appears in the
        reference. Returns 0.0 when the candidate has no n-grams of this order.
        """
        ref_ngrams = Counter(self._get_ngrams(ref_tokens, n))
        cand_ngrams = Counter(self._get_ngrams(cand_tokens, n))

        if not cand_ngrams:
            return 0.0

        # Counter returns 0 for n-grams missing from the reference.
        clipped_total = sum(
            min(count, ref_ngrams[ngram]) for ngram, count in cand_ngrams.items()
        )
        return clipped_total / sum(cand_ngrams.values())

    def _get_ngrams(self, tokens: List[str], n: int) -> List[tuple]:
        """Return all contiguous length-``n`` windows of ``tokens`` as tuples, in order."""
        return [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]

# Example usage
# NOTE(review): this sample list rebinds `corpus` (previously the lines loaded
# from the .tex file) but is never used below -- the tokenizer is trained on
# `processed_corpus` from the earlier cell. Confirm whether it can be dropped
# or should be given its own name.
corpus = [
    "This is a sample LaTeX document.",
    "\\begin{document} Hello, world! \\end{document}",
    "E = mc^2",
    "\\frac{1}{2}",
    "\\sum_{i=1}^n i = \\frac{n(n+1)}{2}"
]

# Train the BPE tokenizer on the preprocessed .tex lines from the earlier cell.
texbleu = TeXBLEU(processed_corpus)

# candidate1 drops the two `\,` thin spaces from the reference;
# candidate2 keeps the body but uses the `align` environment instead of `equation`.
reference = "\\begin{equation} f(x) = \\int_{-\\infty}^\\infty \\hat f(\\xi)\\,e^{2 \\pi i \\xi x} \\,d\\xi \\end{equation}"
candidate1 = "\\begin{equation} f(x) = \\int_{-\\infty}^\\infty \\hat f(\\xi) e^{2 \\pi i \\xi x} d\\xi \\end{equation}"
candidate2 = "\\begin{align} f(x) = \\int_{-\\infty}^\\infty \\hat f(\\xi)\\,e^{2 \\pi i \\xi x} \\,d\\xi \\end{align}"

score1 = texbleu.calculate_texbleu(reference, candidate1)
score2 = texbleu.calculate_texbleu(reference, candidate2)

print(f"TeXBLEU score for candidate1: {score1:.4f}")
print(f"TeXBLEU score for candidate2: {score2:.4f}")

TeXBLEU score for candidate1: 0.5888
TeXBLEU score for candidate2: 0.9195


\begin{equation} f(x) = \int_{-\infty}^\infty \hat f(\xi)\,e^{2 \pi i \xi x} \,d\xi \end{equation}

\begin{equation} f(x) = \int_{-\infty}^\infty \hat f(\xi) e^{2 \pi i \xi x} d\xi \end{equation}

\begin{align} f(x) = \int_{-\infty}^\infty \hat f(\xi)\,e^{2 \pi i \xi x} \,d\xi \end{align}