# In [28]:
import os
import tarfile
import gzip
import re
import io
import chardet
import json

def extract_tar_files(src_folder):
    """Return the paths of the ``.tar`` entries directly inside *src_folder*.

    Args:
        src_folder: Directory to list (not searched recursively).

    Returns:
        A list of ``src_folder``-joined paths, one per directory entry whose
        name ends with ``.tar``, in the order ``os.listdir`` yields them.
    """
    tar_paths = []
    for entry_name in os.listdir(src_folder):
        if entry_name.endswith('.tar'):
            tar_paths.append(os.path.join(src_folder, entry_name))
    return tar_paths

def extract_gz_from_tar(tar_file):
    """Read every regular ``.gz`` member of a tar archive into memory.

    Args:
        tar_file: Path of the tar archive to open for reading.

    Returns:
        A list of ``bytes`` objects holding the raw (still gzip-compressed)
        content of each regular-file member whose name ends with ``.gz``,
        in archive order.
    """
    with tarfile.open(tar_file, 'r') as archive:
        return [
            archive.extractfile(entry).read()
            for entry in archive.getmembers()
            # Only regular files: extractfile() has no content for other types.
            if entry.isfile() and entry.name.endswith('.gz')
        ]

def detect_encoding(data):
    """Return the encoding name that ``chardet.detect`` reports for *data*.

    Args:
        data: The raw bytes to inspect.

    Returns:
        The ``'encoding'`` entry of the detection result, or ``'utf-8'`` when
        that entry is ``None``.
    """
    guessed = chardet.detect(data)['encoding']
    # Default encoding when the detector gives no answer.
    return 'utf-8' if guessed is None else guessed

def extract_tex_from_gz(gz_data):
    """Decompress gzip bytes and decode the payload to text.

    The encoding is chosen by ``detect_encoding``; if decoding with it fails,
    the payload is decoded as latin-1 instead, which accepts any byte
    sequence.

    Args:
        gz_data: Gzip-compressed bytes.

    Returns:
        The decompressed payload as ``str``.
    """
    with gzip.open(io.BytesIO(gz_data), 'rb') as f:
        raw_data = f.read()

    encoding = detect_encoding(raw_data)
    try:
        return raw_data.decode(encoding)
    except (UnicodeDecodeError, LookupError, TypeError):
        # UnicodeDecodeError: the detected encoding does not fit the bytes.
        # LookupError: the detected name has no codec in this Python; without
        # catching it the fallback below would never run.
        # Fall back to latin-1 when decoding with the detected encoding fails.
        return raw_data.decode('latin1')

def extract_equations_with_context(tex_content):
    """Collect ``$...$`` equations from TeX source together with nearby text.

    Args:
        tex_content: The TeX source as a single string.

    Returns:
        A list of dicts in document order, each with the keys:

        * ``'context_before'``: stripped text between the end of the previous
          match (or the start of the input) and this equation.
        * ``'equation'``: the text from one ``$`` through the next ``$``; it
          may span several lines.
        * ``'context_after'``: stripped text following the equation on the
          same line, cut at the next ``$``. When that is empty and the input
          contains a newline, the first line of the text after the equation
          (leading whitespace skipped) is used instead, also cut at ``$``.

    Note:
        A match consumes the rest of its line, so further equations on that
        same line are not reported as separate entries.
    """
    # Anchoring the pattern on '$' (instead of a leading lazy '.*?' group)
    # keeps scanning linear: the text before each equation is sliced from the
    # previous match end rather than re-scanned from every start position.
    pattern = re.compile(
        r'(?P<equation>\$.*?\$)(?P<context_after>[^\n]*)', re.DOTALL
    )
    has_newline = '\n' in tex_content  # loop-invariant, computed once

    equations_with_context = []
    previous_end = 0
    for match in pattern.finditer(tex_content):
        context_before = tex_content[previous_end:match.start()].strip()
        previous_end = match.end()
        equation = match.group('equation')

        # Make sure context_after does not contain another equation.
        context_after = match.group('context_after').split('$', 1)[0].strip()

        # If context_after is empty, fall back to the next line's content.
        if not context_after and has_newline:
            # Use this match's own offset: searching for the equation text
            # would find its first occurrence in the document, which is the
            # wrong place whenever the same equation appears more than once.
            following = tex_content[match.end('equation'):].lstrip()
            next_line = following.split('\n', 1)[0]
            context_after = next_line.split('$', 1)[0].strip()

        equations_with_context.append({
            'context_before': context_before,
            'equation': equation,
            'context_after': context_after
        })
    return equations_with_context

def main(src_folder):
    """Extract equations with context from every archive under *src_folder*.

    For each ``.tar`` file found by ``extract_tar_files``, every ``.gz``
    member is decompressed and decoded, and its equations are collected.

    Args:
        src_folder: Directory containing the ``.tar`` archives.

    Returns:
        One flat list with the equation records of all processed members, in
        processing order.
    """
    return [
        record
        for archive_path in extract_tar_files(src_folder)
        for gz_blob in extract_gz_from_tar(archive_path)
        for record in extract_equations_with_context(extract_tex_from_gz(gz_blob))
    ]

def save_results_to_jsonl(results, output_file):
    """Write *results* to *output_file* in JSON Lines form.

    Args:
        results: Iterable of JSON-serializable objects.
        output_file: Path of the file to create or overwrite (UTF-8).

    Each item is serialized with ``json.dumps`` and written on its own line.
    """
    with open(output_file, 'w', encoding='utf-8') as handle:
        # The generator is created after the file is opened, so the file is
        # created/truncated before iteration over the results starts.
        handle.writelines(json.dumps(record) + '\n' for record in results)

# Script entry point: extract the equations from every archive under
# src_folder and write them to output_file, one JSON object per line.
if __name__ == "__main__":
    src_folder = 'downloads/src'  # Set this to the path of your src folder.
    output_file = 'output.jsonl'
    equations_with_context = main(src_folder)
    save_results_to_jsonl(equations_with_context, output_file)
