In [1]:
from bs4 import BeautifulSoup
import requests
import os
from tqdm.notebook import tqdm
from PyPDF2 import PdfReader
import requests
from requests.exceptions import RequestException
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
import time
import threading
import json
import re
from datetime import datetime

In [2]:
# --- Search scope -----------------------------------------------------------
# Submission-date window and arXiv classification used in the search URL.
BEGIN_DATE = '2024-5-1'
END_DATE = '2024-5-14'
# Other classifications previously used here: 'computer_science', 'mathematics'
TYPE = 'physics'

# --- Result paging ----------------------------------------------------------
PAGE_SIZE = '100'   # results requested per search page (sent as the `size` query value)
START = '0'         # offset of the first result
PAGE_NUM = 12       # maximum number of search pages to request

# --- Text sampling ----------------------------------------------------------
max_sample = 1000   # cap on the number of texts written to the JSON file
begin = 2000        # first character (inclusive) kept from each cleaned text
end = 7000          # last character (exclusive) kept from each cleaned text
pad = 1000          # extra characters extracted on top of begin + end

# --- Output locations -------------------------------------------------------
now = datetime.now()
# Download folder is timestamped, so each run writes into its own directory.
temp_folder = 'arxiv' + now.strftime("%Y-%m-%d-%H-%M-%S")
json_save_path = f"data/arxiv_pdfs_{TYPE}_{BEGIN_DATE.replace('-', '')}to{END_DATE.replace('-', '')}.json"

In [3]:
# Walk the arXiv advanced-search result pages and collect every "pdf" link.
# Stops early on the first page that yields no links or whose request fails.
path_list = []

page_size = int(PAGE_SIZE)
first_offset = int(START)

for page_index in range(PAGE_NUM):
    # Use a dedicated loop variable so the START config constant is not overwritten.
    page_start = str(first_offset + page_index * page_size)

    url = f'https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=&terms-0-field=title&classification-{TYPE}=y&classification-include_cross_list=exclude&date-year=&date-filter_by=date_range&date-from_date={BEGIN_DATE}&date-to_date={END_DATE}&date-date_type=submitted_date_first&abstracts=hide&size={PAGE_SIZE}&order=-announced_date_first&start={page_start}'
    print(f'url:{url}')

    try:
        # A timeout keeps a stalled connection from hanging the cell forever;
        # raise_for_status keeps an HTTP error page from being parsed as
        # "zero results" and silently ending the crawl.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except RequestException as exc:
        print(f"start:{page_start} request failed: {exc}")
        break

    soup = BeautifulSoup(response.content, 'html.parser')
    papers = soup.find_all('li', class_='arxiv-result')

    # Look up each paper's "pdf" anchor once, then keep the ones that exist.
    pdf_anchors = (paper.find('a', string='pdf') for paper in papers)
    pdf_links = [anchor['href'] for anchor in pdf_anchors if anchor is not None]

    print(f"start:{page_start} success:{len(pdf_links)}")

    path_list += pdf_links

    # An empty page means the result set is exhausted.
    if len(pdf_links) == 0:
        break

print(f'total:{len(path_list)}')

url:https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=&terms-0-field=title&classification-physics=y&classification-include_cross_list=exclude&date-year=&date-filter_by=date_range&date-from_date=2024-5-1&date-to_date=2024-5-14&date-date_type=submitted_date_first&abstracts=hide&size=100&order=-announced_date_first&start=0
start:0 success:100
url:https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=&terms-0-field=title&classification-physics=y&classification-include_cross_list=exclude&date-year=&date-filter_by=date_range&date-from_date=2024-5-1&date-to_date=2024-5-14&date-date_type=submitted_date_first&abstracts=hide&size=100&order=-announced_date_first&start=100
start:100 success:100
url:https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=&terms-0-field=title&classification-physics=y&classification-include_cross_list=exclude&date-year=&date-filter_by=date_range&date-from_date=2024-5-1&date-to_date=2024-5-14

In [14]:
def download_pdf(pdf_url, path, total_size, max_retries=3, timeout=10, update_progress=None):
    """
    Downloads a single PDF into a folder, retrying on request errors.

    The response is streamed into a temporary ``.part`` file and only renamed
    to its final ``<name>.pdf`` once the transfer has completed, so an
    interrupted download never leaves a truncated PDF behind.

    :param pdf_url: URL of the PDF; its last path segment becomes the file name.
    :param path: Existing directory the file is written into.
    :param total_size: Unused; kept so existing callers keep working.
    :param max_retries: Maximum number of download attempts.
    :param timeout: Timeout in seconds passed to ``requests.get``.
    :param update_progress: Optional zero-argument callable, invoked exactly
        once when this function finishes (success, failure, or error).
    :return: True if the file was downloaded, False otherwise.
    """
    pdf_name = pdf_url.split('/')[-1]
    file_path = os.path.join(path, f'{pdf_name}.pdf')
    partial_path = file_path + '.part'

    succeeded = False
    try:
        for attempt in range(1, max_retries + 1):
            try:
                with requests.get(pdf_url, stream=True, timeout=timeout) as r:
                    r.raise_for_status()
                    with open(partial_path, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
                # Publish the file only after every chunk has been written.
                os.replace(partial_path, file_path)
                succeeded = True
                break
            except RequestException:
                # Pause between attempts, but not after the final one.
                if attempt < max_retries:
                    time.sleep(1)
    finally:
        if not succeeded and os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass  # best-effort cleanup; the leftover is not a .pdf file
        # Always report completion so the progress count matches the task count.
        if update_progress:
            update_progress()
    return succeeded

def progress_monitor(total_tasks):
    """
    Renders a tqdm bar that mirrors a task counter kept on this function.

    Reads ``progress_monitor.completed_tasks`` about every half second and
    keeps doing so until ``progress_monitor.finished`` becomes truthy; both
    attributes must be set by the caller before this function runs.

    :param total_tasks: Total number of tasks shown as the bar's maximum.
    """
    bar = tqdm(total=total_tasks, desc="downloading", leave=True)

    def sync_bar():
        # Copy the shared counter into the bar and redraw it.
        bar.n = progress_monitor.completed_tasks
        bar.refresh()

    while True:
        if progress_monitor.finished:
            break
        sync_bar()
        time.sleep(0.5)

    # One last sync so the bar shows the final count before closing.
    sync_bar()
    bar.close()

def download_pdfs_concurrently(pdf_links, path, num_threads=10):
    """
    Downloads many PDFs in parallel while a monitor thread shows progress.

    :param pdf_links: Iterable-with-length of PDF URLs to download.
    :param path: Target directory; created if it does not exist.
    :param num_threads: Number of worker threads in the pool.
    """
    os.makedirs(path, exist_ok=True)
    progress_monitor.completed_tasks = 0
    progress_monitor.finished = False

    # The counter is incremented from several worker threads; `+=` on an
    # attribute is a read-modify-write, so guard it to avoid lost updates.
    counter_lock = threading.Lock()

    def update_progress():
        with counter_lock:
            progress_monitor.completed_tasks += 1

    monitor_thread = threading.Thread(target=progress_monitor, args=(len(pdf_links),))
    monitor_thread.start()

    failed = 0
    try:
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(download_pdf, url, path, 0, update_progress=update_progress) for url in pdf_links]

            for future in as_completed(futures):
                try:
                    if not future.result():
                        failed += 1
                except Exception:
                    # A worker raised instead of returning; count it as failed.
                    failed += 1
    finally:
        # Always release the monitor thread, even if waiting was interrupted;
        # otherwise it would poll forever.
        progress_monitor.finished = True
        monitor_thread.join()

    if failed:
        print(f"{failed} of {len(pdf_links)} downloads failed")
    

# Download every PDF link gathered in path_list into the run-specific temp_folder.
download_pdfs_concurrently(path_list, temp_folder)


downloading:   0%|          | 0/1200 [00:00<?, ?it/s]

In [15]:
# Processing data
def extract_first_n_chars_from_pdfs(folder_path, n):
    """
    Extracts up to the first ``n`` characters of text from each PDF in a folder.

    Files are processed in sorted name order so the returned list (and any
    later truncation of it) is the same on every run. Files that cannot be
    read are reported and skipped.

    :param folder_path: Directory containing the ``.pdf`` files.
    :param n: Maximum number of characters kept per PDF.
    :return: List of extracted strings, one per readable PDF.
    """
    extracted_texts = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    pdf_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.pdf'))
    for filename in tqdm(pdf_names):
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'rb') as file:
                reader = PdfReader(file)
                text = ""
                for page in reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text
                        # Stop reading pages once enough text is collected.
                        if len(text) >= n:
                            break
                extracted_texts.append(text[:n])
        except Exception as e:
            print(f"Error reading {filename}: {e}")
    return extracted_texts


def save_list_as_json(file_path, string_list):
    """
    Saves a list of strings as a JSON file.

    The parent directory of ``file_path`` is created if it does not exist,
    so the save cannot fail at the end of a long run because of a missing
    output folder.

    :param file_path: Path where the JSON file will be saved.
    :param string_list: List of strings to be saved in the JSON file.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(string_list, file, ensure_ascii=False, indent=4)


def replace_multiple_spaces_with_single(input_string):
    """
    Collapses every run of space characters into a single space.

    Only the space character is affected; tabs and newlines are left alone.

    :param input_string: Text to normalize.
    :return: The text with each run of spaces replaced by one space.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', input_string)

        
# The [begin:end) slice taken below must have positive width.
assert begin < end


def _normalize_text(text):
    """Drops un-encodable characters, removes newlines and collapses space runs."""
    text = text.encode('utf-8', 'ignore').decode('utf-8')
    text = text.replace('\n', '')
    return replace_multiple_spaces_with_single(text)


# Extract more characters than the window needs (begin + end + pad), since
# normalization shortens the text before it is sliced.
raw_texts = extract_first_n_chars_from_pdfs(temp_folder, begin + end + pad)

# Normalize each text, then keep only the [begin:end) character window.
windowed_texts = [_normalize_text(raw)[begin: end] for raw in raw_texts]

# Only retain samples whose count of ASCII letters exceeds 60% of the window length.
n = (end - begin) * 0.6
letter_rich_texts = [
    sample for sample in windowed_texts
    if len(re.findall(r'[a-zA-Z]', sample)) > n
]

# Cap the dataset size.
extracted_texts = letter_rich_texts[:max_sample]

print(len(extracted_texts))

# Save as a JSON file for faster loading in the future if needed.
save_list_as_json(json_save_path, extracted_texts)


  0%|          | 0/1200 [00:00<?, ?it/s]

Error reading 2404.11961.pdf: [Errno 22] Invalid argument
1000
