In [1]:
import os
import tarfile
import gzip
import re
import io
import chardet
import json

def extract_tar_files(src_folder):
    """Return the paths of the ``.tar`` entries directly inside *src_folder*.

    Args:
        src_folder: Directory to scan (not recursive).

    Returns:
        A sorted list of ``os.path.join(src_folder, name)`` for every entry
        whose name ends with ``.tar``. The list is also printed to stdout.

    Raises:
        OSError: If *src_folder* cannot be listed.
    """
    # os.listdir() yields entries in arbitrary, platform-dependent order;
    # sorting keeps the processing order (and the appended output) reproducible.
    tar_files = sorted(
        os.path.join(src_folder, name)
        for name in os.listdir(src_folder)
        if name.endswith('.tar')
    )
    print(f"<tar_files> \n{tar_files}")
    return tar_files

def extract_gz_from_tar(tar_file):
    """Read every regular ``.gz`` member of a tar archive into memory.

    Args:
        tar_file: Path of the tar archive to open.

    Returns:
        A list of ``bytes`` objects, one per regular-file member whose name
        ends with ``.gz``, in archive order. The data is still gzip-compressed.

    Raises:
        tarfile.TarError / OSError: If the archive cannot be opened or read.
    """
    extracted_files = []
    member_names = []
    with tarfile.open(tar_file, 'r') as tar:
        for member in tar:
            if not (member.isfile() and member.name.endswith('.gz')):
                continue
            # extractfile() returns a file object; close it once it is read.
            with tar.extractfile(member) as stream:
                extracted_files.append(stream.read())
            member_names.append(member.name)
    # Log the member names only: printing the payload list would dump the
    # raw compressed bytes of every file to stdout.
    print(f"gz files : \n {member_names}")
    return extracted_files

def detect_encoding(data):
    """Return the encoding chardet reports for *data*, or 'utf-8' if none.

    Args:
        data: Raw bytes to inspect.

    Returns:
        The value of the 'encoding' entry in chardet's detection result, or
        'utf-8' when that entry is None.
    """
    guessed = chardet.detect(data)['encoding']
    # Fall back to a default encoding when chardet cannot decide.
    return 'utf-8' if guessed is None else guessed

def extract_tex_from_gz(gz_data):
    """Decompress gzip bytes and decode them to text.

    The encoding is chosen by ``detect_encoding``; if decoding with it fails,
    the bytes are decoded as latin1, which accepts every byte value.

    Args:
        gz_data: Gzip-compressed bytes.

    Returns:
        The decompressed content as a string.

    Raises:
        OSError / EOFError: If *gz_data* is not valid gzip data.
    """
    with gzip.open(io.BytesIO(gz_data), 'rb') as f:
        raw_data = f.read()

    encoding = detect_encoding(raw_data)
    try:
        return raw_data.decode(encoding)
    except (UnicodeDecodeError, LookupError, TypeError):
        # LookupError: the detected encoding name has no Python codec.
        # UnicodeDecodeError: the detection was wrong for some bytes.
        # In both cases fall back to latin1, which never fails.
        return raw_data.decode('latin1')

def extract_equations_with_context(tex_content):
    """Collect short LaTeX formulas together with the English words around them.

    A formula is any of:
      1) the text between \\begin{equation} and \\end{equation},
      2) the text between $ and $,
      3) the text between $$ and $$.
    The delimiters are kept in the extracted formula. A dollar sign preceded
    by an odd number of backslashes is a literal dollar in LaTeX and is never
    treated as a delimiter.

    Args:
        tex_content: LaTeX source as a string.

    Returns:
        A list of dicts with the keys 'context_before', 'equation' and
        'context_after', in document order. Only formulas of at most 100
        characters are reported. Each context is built from up to 100
        characters next to the formula, reduced to purely alphanumeric ASCII
        words and limited to 10 words; 'context_after' stops where the next
        formula begins. Scanning stops at the first opening delimiter that
        has no matching closing delimiter.
    """
    context_chars = 100
    context_words = 10
    max_equation_chars = 100
    env_begin = '\\begin{equation}'
    env_end = '\\end{equation}'
    opener_pattern = re.compile(r'\$|\\begin\{equation\}')

    def english_words(fragment):
        """Return the purely alphanumeric ASCII words of *fragment*."""
        # Keep only letters, digits, whitespace and basic punctuation ...
        stripped = re.sub(r'[^a-zA-Z0-9\s.,]', '', fragment)
        # ... then drop every word that still carries punctuation.
        return [word for word in stripped.split()
                if re.match(r'^[a-zA-Z0-9]+$', word)]

    def is_escaped(pos):
        """Return True if tex_content[pos] follows an odd run of backslashes."""
        backslashes = 0
        while pos - backslashes > 0 and tex_content[pos - backslashes - 1] == '\\':
            backslashes += 1
        return backslashes % 2 == 1

    def next_opener(start):
        """Return the match of the next unescaped formula opener, or None."""
        match = opener_pattern.search(tex_content, start)
        while match is not None and match.group() == '$' and is_escaped(match.start()):
            match = opener_pattern.search(tex_content, match.start() + 1)
        return match

    def find_closer(closer, start):
        """Return the index of the next unescaped *closer*, or -1."""
        pos = tex_content.find(closer, start)
        while pos != -1 and closer[0] == '$' and is_escaped(pos):
            pos = tex_content.find(closer, pos + 1)
        return pos

    equations_with_context = []
    opening = next_opener(0)
    while opening is not None:
        start = opening.start()
        if opening.group() != '$':
            open_len, closer = len(env_begin), env_end
        elif tex_content.startswith('$$', start):
            # Test '$$' before '$' so display math is not split in two.
            open_len, closer = 2, '$$'
        else:
            open_len, closer = 1, '$'

        close = find_closer(closer, start + open_len)
        if close == -1:
            break
        end = close + len(closer)
        equation = tex_content[start:end]

        # The next opener both bounds context_after and is where the scan
        # resumes, so it is looked up once and reused.
        following = next_opener(end)

        if len(equation) <= max_equation_chars:
            before = english_words(tex_content[max(0, start - context_chars):start])

            # Cut the raw text at the next formula *before* cleaning it:
            # cleaning removes the delimiters that mark where it starts.
            after_stop = end + context_chars
            if following is not None:
                after_stop = min(after_stop, following.start())
            after = english_words(tex_content[end:after_stop])

            equations_with_context.append({
                'context_before': ' '.join(before[-context_words:]),
                'equation': equation,
                'context_after': ' '.join(after[:context_words]),
            })

        opening = following

    return equations_with_context

def save_results_to_jsonl(results, output_file):
    """Append each item of *results* to *output_file* as one JSON line.

    Args:
        results: Iterable of JSON-serialisable objects.
        output_file: Path of the JSON Lines file; it is created if missing.
    """
    # Append mode, so repeated calls accumulate into the same file.
    with open(output_file, 'a', encoding='utf-8') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in results)

def process_gz_file(gz_data, output_file):
    """Extract formulas from one gzip payload and append them to *output_file*.

    Args:
        gz_data: Gzip-compressed bytes of a LaTeX source.
        output_file: Path of the JSON Lines file to append to.

    Nothing is written when more than 10000 formulas are found.
    """
    records = extract_equations_with_context(extract_tex_from_gz(gz_data))
    if len(records) <= 10000:
        save_results_to_jsonl(records, output_file)

def main(src_folder, output_file):
    """Process every ``.gz`` member of every tar archive in *src_folder*.

    Args:
        src_folder: Directory that holds the ``.tar`` archives.
        output_file: Path of the JSON Lines file the results are appended to.
    """
    for archive_path in extract_tar_files(src_folder):
        for payload in extract_gz_from_tar(archive_path):
            process_gz_file(payload, output_file)

if __name__ == "__main__":
    # Set src_folder to the directory that holds the downloaded archives.
    src_folder = 'downloads2/src'
    output_file = 'zip_files_extract_latex_serial_0626_0012.jsonl'
    main(src_folder=src_folder, output_file=output_file)


<tar_files> 
['downloads2/src\\arXiv_src_0001_001.tar', 'downloads2/src\\arXiv_src_0002_001.tar', 'downloads2/src\\arXiv_src_0003_001.tar', 'downloads2/src\\arXiv_src_0004_001.tar', 'downloads2/src\\arXiv_src_0005_001.tar', 'downloads2/src\\arXiv_src_0006_001.tar', 'downloads2/src\\arXiv_src_0007_001.tar', 'downloads2/src\\arXiv_src_0008_001.tar', 'downloads2/src\\arXiv_src_0009_001.tar', 'downloads2/src\\arXiv_src_0010_001.tar']
