## arXiv prohibits accessing papers by crawling

# 논문아이디 추출하는 코드

In [32]:
import os
import requests
from bs4 import BeautifulSoup
import csv
def get_arxiv_papers_link(category="math.AG", max_papers=5, save_dir="./arxiv_papers", version="v1", timeout=30):
    """Collect arXiv paper IDs from a category's "current" listing page.

    The listing page ``https://arxiv.org/list/<category>/current`` is fetched
    and every ``<a title="Abstract">`` link (at most ``max_papers`` of them)
    is reduced to the last path segment of its ``href`` with ``version``
    appended.

    Args:
        category: arXiv category code used in the listing URL.
        max_papers: Upper bound on the number of abstract links collected.
        save_dir: Directory that is created if it does not exist. Nothing in
            this function writes into it; the creation is kept so existing
            callers that rely on the directory being present still work.
        version: Suffix appended to every ID. The listing does not expose the
            real version here, so "v1" is an assumption inherited from the
            original code.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of ID strings. An empty list is returned when the request
        fails or the page does not answer with HTTP 200, so callers can
        iterate over the result unconditionally.
    """
    base_url = "https://arxiv.org"
    category_url = f"{base_url}/list/{category}/current"

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    try:
        response = requests.get(category_url, timeout=timeout)
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return []

    if response.status_code != 200:
        print("Failed to retrieve the webpage")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    paper_links = soup.find_all("a", title="Abstract", limit=max_papers)

    # Anchors without an href are skipped instead of raising KeyError.
    html_link = [
        link["href"].rsplit("/", 1)[-1] + version
        for link in paper_links
        if link.get("href")
    ]
    print(html_link)
    return html_link

# 논문 id가 주어졌을 때 html에서 앞텍스트, 수식부분, 뒷텍스트만 추출하는 코드

In [29]:
import requests
from bs4 import BeautifulSoup
import csv

def download_math_expressions_unique(arxiv_id, file_name='math_expressions.csv', timeout=30):
    """Save the distinct math expressions of an arXiv HTML paper to a CSV file.

    The page ``https://arxiv.org/html/<arxiv_id>`` is fetched, the ``alttext``
    attribute of every ``<math>`` tag is collected, duplicates are dropped and
    the remaining expressions are written one per row.

    Args:
        arxiv_id: Identifier placed in the URL (e.g. ``'2403.14579v1'``).
        file_name: Path of the CSV file to write.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``None`` after the file has been written, or ``-1`` when the page
        does not answer with HTTP 200.
    """
    url = f'https://arxiv.org/html/{arxiv_id}'

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Same message as download_math_expressions_with_context uses.
        print('Failed to retrieve the page')
        return -1

    soup = BeautifulSoup(response.text, 'html.parser')
    math_tags = soup.find_all('math', alttext=True)

    # dict.fromkeys removes duplicates while keeping first-appearance order,
    # so repeated runs on the same page produce the same file.
    unique_expressions = list(
        dict.fromkeys(tag['alttext'].strip() for tag in math_tags)
    )

    with open(file_name, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Header row: a single cell holding one space, as in the original output.
        writer.writerow([' '])
        writer.writerows([expression] for expression in unique_expressions)

    print(f'formula saved: {file_name}')

"""# 사용 예시
arxiv_id = '2403.14579v1' # 이 부분을 원하는 arXiv ID로 변경하세요.
download_math_expressions_unique(arxiv_id)"""


"# 사용 예시\narxiv_id = '2403.14579v1' # 이 부분을 원하는 arXiv ID로 변경하세요.\ndownload_math_expressions_unique(arxiv_id)"

In [38]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import os

def download_math_expressions_with_context(arxiv_id, file_name='math_expressions.jsonl', timeout=30):
    """Save each math expression of an arXiv HTML paper with its neighbouring text.

    The page ``https://arxiv.org/html/<arxiv_id>`` is fetched. For every
    ``<math>`` tag carrying an ``alttext`` attribute, one record is built with
    the keys ``'Previous Text'``, ``'Math Expression'`` and ``'Next Text'``.
    The text fields hold the stripped previous/next sibling when that sibling
    is a string node, and ``''`` otherwise. Duplicate records are dropped and
    the rest are written as JSON Lines.

    Args:
        arxiv_id: Identifier placed in the URL (e.g. ``'2406.00228v1'``).
        file_name: Path of the JSONL file to write. Its parent directory is
            created when the path contains one.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``None`` after the file has been written, or ``-1`` when the page
        does not answer with HTTP 200.
    """
    url = f'https://arxiv.org/html/{arxiv_id}'

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print('Failed to retrieve the page')
        return -1

    soup = BeautifulSoup(response.text, 'html.parser')
    math_tags = soup.find_all('math', alttext=True)

    # dict.fromkeys de-duplicates the (previous, expression, next) triples
    # while keeping the order in which they appear in the document.
    unique_triples = dict.fromkeys(
        (
            _sibling_text(tag.previous_sibling),
            tag['alttext'].strip(),
            _sibling_text(tag.next_sibling),
        )
        for tag in math_tags
    )
    unique_expressions = [
        {
            'Previous Text': previous_text,
            'Math Expression': alttext,
            'Next Text': next_text,
        }
        for previous_text, alttext, next_text in unique_triples
    ]

    # os.path.dirname returns '' for a bare file name (including the default),
    # and os.makedirs('') raises, so only create a directory when one is named.
    directory = os.path.dirname(file_name)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(file_name, 'w', encoding='utf-8') as file:
        file.writelines(
            json.dumps(expression, ensure_ascii=False) + '\n'
            for expression in unique_expressions
        )

    print(f'Formulae saved: {file_name}')


def _sibling_text(node):
    """Return the stripped text of a string sibling node, or '' for anything else."""
    return node.strip() if isinstance(node, str) else ''

### 하나 테스트

In [40]:
# Single-paper test: fetch one arXiv HTML page and write its formulas,
# together with the neighbouring text, to a per-paper JSONL file.
# NOTE(review): `id` shadows the Python builtin of the same name.
id = '2406.00228v1'
download_math_expressions_with_context(id, file_name=f"arxiv_formulas_jsonl_240618/{id}_tex.jsonl")

Formulae saved: arxiv_formulas_jsonl_240618/2406.00228v1_tex.jsonl


In [42]:
# Download formulas for every paper ID found on the default category listing.
# `or []` keeps the loop from crashing when the listing helper returns None.
for paper_id in get_arxiv_papers_link() or []:
    try:
        download_math_expressions_with_context(paper_id, file_name=f"arxiv_formulas_jsonl_240618/{paper_id}_tex.jsonl")
    except Exception as e:
        # Best-effort batch: report which paper failed and why, then continue.
        print(f"Failed to process {paper_id}: {e}")


['2406.00228v1', '2406.00230v1', '2406.00395v1', '2406.00400v1', '2406.00463v1']
Formulae saved: arxiv_formulas_jsonl_240618/2406.00228v1_tex.jsonl
Formulae saved: arxiv_formulas_jsonl_240618/2406.00230v1_tex.jsonl
Formulae saved: arxiv_formulas_jsonl_240618/2406.00395v1_tex.jsonl
Formulae saved: arxiv_formulas_jsonl_240618/2406.00400v1_tex.jsonl
Formulae saved: arxiv_formulas_jsonl_240618/2406.00463v1_tex.jsonl


### jsonl 파일 표로 보기

In [44]:
import pandas as pd
import json

def jsonl_to_dataframe(file_name):
    """Load a JSON Lines file into a pandas DataFrame.

    Each non-blank line of the UTF-8 file is parsed with ``json.loads`` and
    becomes one row of the result.

    Args:
        file_name: Path of the JSONL file to read.

    Returns:
        A ``pandas.DataFrame`` built from the parsed records (empty when the
        file contains no records).
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        # Blank or whitespace-only lines are skipped; json.loads would reject
        # them with a JSONDecodeError.
        records = [json.loads(line) for line in file if line.strip()]

    return pd.DataFrame(records)

# Example usage: load one per-paper JSONL file. The bare `df` on the last
# line makes the notebook display the resulting table.
df = jsonl_to_dataframe('arxiv_formulas_jsonl_240618/2406.00463v1_tex.jsonl')
df

Unnamed: 0,Previous Text,Math Expression,Next Text
0,puisque la variété,\tilde{X},est rationnelle sur
1,avec,c\in F^{\times},.
2,", donc",\mathbb{Z}[\omega]^{+}\oplus\mathbb{Z}[\omega]...,. Ainsi
3,,(1-\sigma)(xe_{1}+ye_{2})=xe_{1}+ye_{2}+xe_{1}...,
4,". Dans le premier cas, l’algèbre",A,"est constante, et donc nulle dans"
...,...,...,...
1984,avec,div_{X_{K}}(f)=N_{K/k}(\Delta),sur la classe de
1985,est définie par,\Phi=0,.
1986,,u\in\mathbb{R}(\mathbb{P}^{1})\subset\mathbb{R...,
1987,", on peut aussi déduire",(iv),de la manière suivante.\nLa fibration en coniq...


# jsonl 파일들 하나로 합치기

In [45]:
import glob

def merge_jsonl_files(input_directory, output_file):
    """Concatenate every ``*.jsonl`` file of a directory into one file.

    Files are merged in sorted path order so the result is reproducible.
    The output file itself is excluded from the inputs, which matters when
    it lives inside ``input_directory``. Every copied line is terminated
    with a newline, so a source file that lacks a trailing newline cannot
    fuse its last record with the first record of the next file.

    Args:
        input_directory: Directory searched (non-recursively) for ``*.jsonl``.
        output_file: Path of the merged file; it is overwritten.
    """
    output_path = os.path.realpath(output_file)
    jsonl_files = sorted(
        path
        for path in glob.glob(f'{input_directory}/*.jsonl')
        if os.path.realpath(path) != output_path
    )

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for file_name in jsonl_files:
            with open(file_name, 'r', encoding='utf-8') as infile:
                for line in infile:
                    outfile.write(line if line.endswith('\n') else line + '\n')

    print(f'Merged {len(jsonl_files)} files into {output_file}')

# Example usage: concatenate every .jsonl file in the per-paper output
# directory into a single merged file.
merge_jsonl_files('arxiv_formulas_jsonl_240618', 'merged_arxiv_formulas.jsonl')


Merged 5 files into merged_arxiv_formulas.jsonl
