## 압축해제해야 될 때

In [1]:
import os
import tarfile
import gzip
import re
import io
import chardet
import json

def extract_tar_files(src_folder):
    """Collect the paths of the ``.tar`` archives directly inside *src_folder*.

    Despite the name, nothing is unpacked here: the folder is listed, entries
    whose name ends with ``.tar`` are joined with *src_folder*, and the
    resulting list is printed and returned.
    """
    tar_files = []
    for entry in os.listdir(src_folder):
        if entry.endswith('.tar'):
            tar_files.append(os.path.join(src_folder, entry))
    print(f"<tar_files> \n{tar_files}")
    return tar_files

def extract_gz_from_tar(tar_file):
    """Read every ``.gz`` regular-file member of *tar_file* into memory.

    Args:
        tar_file: Path of the tar archive to open.

    Returns:
        A list holding the raw (still gzip-compressed) bytes of each member
        whose name ends with ``.gz``, in archive order.
    """
    extracted_files = []
    member_names = []
    with tarfile.open(tar_file, 'r') as tar:
        for member in tar.getmembers():
            if not (member.isfile() and member.name.endswith('.gz')):
                continue
            reader = tar.extractfile(member)
            if reader is None:
                # extractfile() returns None for members without file data.
                continue
            with reader:
                extracted_files.append(reader.read())
            member_names.append(member.name)
    # Log the member names only: printing the payloads would dump every
    # archive's compressed bytes to stdout.
    print(f"gz files : \n {member_names}")
    return extracted_files

def detect_encoding(data):
    """Guess the text encoding of *data* using ``chardet``.

    Returns the ``'encoding'`` entry of ``chardet.detect(data)``, or
    ``'utf-8'`` as the default when that entry is ``None``.
    """
    guessed = chardet.detect(data)['encoding']
    return 'utf-8' if guessed is None else guessed

def extract_tex_from_gz(gz_data):
    """Decompress gzip bytes and decode the result to text.

    Args:
        gz_data: The gzip-compressed payload as ``bytes``.

    Returns:
        The decompressed content decoded with the encoding reported by
        ``detect_encoding``; if that decoding fails, decoded as ``latin1``.
    """
    with gzip.open(io.BytesIO(gz_data), 'rb') as f:
        raw_data = f.read()
    encoding = detect_encoding(raw_data)
    try:
        return raw_data.decode(encoding)
    except (UnicodeDecodeError, LookupError, TypeError):
        # LookupError covers a detected encoding name that Python has no
        # codec for; without it the latin1 fallback below was never reached
        # and the whole run aborted. latin1 maps every byte, so it cannot fail.
        return raw_data.decode('latin1')

def extract_equations_with_context(tex_content):
    """Extract short inline ``$...$`` equations with the words around them.

    The regular expression captures up to 200 characters before an equation,
    the equation itself (text between two ``$``), and up to 200 characters
    after it, ending at a newline or at the end of the text.

    Args:
        tex_content: The TeX source as a string.

    Returns:
        A list of dicts with the keys ``'context_before'`` (at most the last
        10 words before the equation), ``'equation'`` (the equation wrapped in
        ``<equation>...</equation>``) and ``'context_after'`` (at most 10
        words, cut at the next ``$``). Equations longer than 100 characters
        are omitted.
    """
    pattern = r'(?P<context_before>.{0,200}?)(?P<equation>\$.*?\$)(?P<context_after>.{0,200}?)($|\n)'
    max_words = 10
    max_equation_length = 100

    equations_with_context = []
    for match in re.finditer(pattern, tex_content, re.DOTALL):
        equation = match.group('equation').strip()
        # Over-long equations are never emitted, so skip them before doing
        # any of the context work.
        if len(equation) > max_equation_length:
            continue

        context_before = match.group('context_before').strip()
        context_after = match.group('context_after').strip()

        # Keep only the last 10 words before the equation.
        before_words = context_before.split()
        if len(before_words) > max_words:
            context_before = ' '.join(before_words[-max_words:])

        # Keep only the first 10 words after the equation.
        after_words = context_after.split()
        if len(after_words) > max_words:
            context_after = ' '.join(after_words[:max_words])

        # The trailing context must not contain another equation.
        if '$' in context_after:
            context_after = context_after.split('$')[0].strip()

        # If nothing is left, fall back to the next non-empty line.
        # Slice from THIS match's equation end: splitting the document on the
        # equation text would pick the first occurrence of a repeated
        # equation (e.g. ``$x$``) and return the wrong context.
        if not context_after and '\n' in tex_content:
            remaining_text = tex_content[match.end('equation'):].strip()
            context_after = remaining_text.split('\n', 1)[0].strip()
            if '$' in context_after:
                context_after = context_after.split('$')[0].strip()

        # The fallback line may be long, so apply the 10-word limit again.
        after_words = context_after.split()
        if len(after_words) > max_words:
            context_after = ' '.join(after_words[:max_words])

        equations_with_context.append({
            'context_before': context_before,
            'equation': f'<equation>{equation}</equation>',
            'context_after': context_after
        })
    return equations_with_context

def save_results_to_jsonl(results, output_file):
    """Append each item of *results* to *output_file* as one JSON line.

    The file is opened in append mode with UTF-8 encoding, so repeated calls
    accumulate lines instead of overwriting earlier output.
    """
    with open(output_file, 'a', encoding='utf-8') as sink:
        sink.writelines(json.dumps(item) + '\n' for item in results)

def process_gz_file(gz_data, output_file):
    """Extract equations from one gzip payload and append them to *output_file*.

    Payloads yielding more than 10000 equations are dropped entirely and
    nothing is written for them.
    """
    found = extract_equations_with_context(extract_tex_from_gz(gz_data))
    if len(found) <= 10000:
        save_results_to_jsonl(found, output_file)

def main(src_folder, output_file):
    """Process every ``.gz`` member of every tar archive found in *src_folder*.

    Results are appended to *output_file* one payload at a time.
    """
    for tar_file in extract_tar_files(src_folder):
        for gz_data in extract_gz_from_tar(tar_file):
            process_gz_file(gz_data, output_file)

if __name__ == "__main__":
    # Set src_folder to the directory that holds the tar archives.
    src_folder, output_file = 'downloads2/src', 'output_example.jsonl'
    main(src_folder, output_file)


<tar_files> 
['downloads2/src\\arXiv_src_0001_001.tar', 'downloads2/src\\arXiv_src_0002_001.tar', 'downloads2/src\\arXiv_src_0003_001.tar', 'downloads2/src\\arXiv_src_0004_001.tar', 'downloads2/src\\arXiv_src_0005_001.tar', 'downloads2/src\\arXiv_src_0006_001.tar', 'downloads2/src\\arXiv_src_0007_001.tar', 'downloads2/src\\arXiv_src_0008_001.tar', 'downloads2/src\\arXiv_src_0009_001.tar', 'downloads2/src\\arXiv_src_0010_001.tar']


## Tex 파일만 있을 때

In [3]:
import os
import re
import json

def extract_equations_with_context(tex_content):
    """Extract short inline ``$...$`` equations with the words around them.

    The regular expression captures up to 200 characters before an equation,
    the equation itself (text between two ``$``), and up to 200 characters
    after it, ending at a newline or at the end of the text.

    Args:
        tex_content: The TeX source as a string.

    Returns:
        A list of dicts with the keys ``'context_before'`` (at most the last
        10 words before the equation), ``'equation'`` (the equation wrapped in
        ``<equation>...</equation>``) and ``'context_after'`` (at most 10
        words, cut at the next ``$``). Equations longer than 100 characters
        are omitted.
    """
    pattern = r'(?P<context_before>.{0,200}?)(?P<equation>\$.*?\$)(?P<context_after>.{0,200}?)($|\n)'
    max_words = 10
    max_equation_length = 100

    equations_with_context = []
    for match in re.finditer(pattern, tex_content, re.DOTALL):
        equation = match.group('equation').strip()
        # Over-long equations are never emitted, so skip them before doing
        # any of the context work.
        if len(equation) > max_equation_length:
            continue

        context_before = match.group('context_before').strip()
        context_after = match.group('context_after').strip()

        # Keep only the last 10 words before the equation.
        before_words = context_before.split()
        if len(before_words) > max_words:
            context_before = ' '.join(before_words[-max_words:])

        # Keep only the first 10 words after the equation.
        after_words = context_after.split()
        if len(after_words) > max_words:
            context_after = ' '.join(after_words[:max_words])

        # The trailing context must not contain another equation.
        if '$' in context_after:
            context_after = context_after.split('$')[0].strip()

        # If nothing is left, fall back to the next non-empty line.
        # Slice from THIS match's equation end: splitting the document on the
        # equation text would pick the first occurrence of a repeated
        # equation (e.g. ``$x$``) and return the wrong context.
        if not context_after and '\n' in tex_content:
            remaining_text = tex_content[match.end('equation'):].strip()
            context_after = remaining_text.split('\n', 1)[0].strip()
            if '$' in context_after:
                context_after = context_after.split('$')[0].strip()

        # The fallback line may be long, so apply the 10-word limit again.
        after_words = context_after.split()
        if len(after_words) > max_words:
            context_after = ' '.join(after_words[:max_words])

        equations_with_context.append({
            'context_before': context_before,
            'equation': f'<equation>{equation}</equation>',
            'context_after': context_after
        })
    return equations_with_context

# Root directory that is scanned for papers: each sub-directory is treated as
# one paper and its name is stored as the entry's 'paper_number'.
base_path = 'C:/Users/wjdrb/vscode_code/MathBridge_new/data/arxiv_papers_tex'


# Accumulates the extracted equation entries from every .tex file; the whole
# list is written out once the scan has finished.
all_equations_with_context = []

def read_file_with_multiple_encodings(file_path):
    """Return the text of *file_path*, trying UTF-8 first and then Latin-1.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content decoded with the first encoding that succeeds.

    Raises:
        UnicodeDecodeError: If no attempt succeeds. This is also raised when
            the file does not exist, so callers that only catch this type
            still skip the file; the underlying error is chained as the
            cause.
    """
    # 'iso-8859-1' is an alias of 'latin-1', so listing both tried the same
    # codec twice.
    encodings = ('utf-8', 'latin-1')
    last_error = None
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError as exc:
            last_error = exc
        except FileNotFoundError as exc:
            # A missing file cannot be fixed by another encoding.
            last_error = exc
            break
    # UnicodeDecodeError takes (encoding, object, start, end, reason); calling
    # it with a single message raises TypeError instead of the intended error.
    raise UnicodeDecodeError(
        encodings[-1], b'', 0, 1,
        f"Failed to read file {file_path} with available encodings."
    ) from last_error

# Walk every paper folder under base_path and collect the equations found in
# its .tex / .TEX files, tagging each entry with the folder name.
for paper_folder in os.listdir(base_path):
    paper_folder_path = os.path.join(base_path, paper_folder)
    if not os.path.isdir(paper_folder_path):
        continue

    for tex_file in os.listdir(paper_folder_path):
        if not tex_file.endswith(('.tex', '.TEX')):
            continue
        tex_file_path = os.path.join(paper_folder_path, tex_file)

        # Files that cannot be decoded are reported and skipped.
        try:
            tex_content = read_file_with_multiple_encodings(tex_file_path)
        except UnicodeDecodeError as e:
            print(f"Could not read file {tex_file_path}: {e}")
            continue

        equations_with_context = extract_equations_with_context(tex_content)
        for entry in equations_with_context:
            entry['paper_number'] = paper_folder
        all_equations_with_context.extend(equations_with_context)

# Destination file: one JSON object per line.
output_file_path = 'equations_with_context_math.jsonl'

# Overwrite the file with every collected entry.
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    for entry in all_equations_with_context:
        outfile.write(json.dumps(entry) + '\n')
