In [2]:
pip install paperscraper


Collecting paperscraper
  Downloading paperscraper-0.2.14-py3-none-any.whl.metadata (13 kB)
Collecting arxiv>=1.4.2 (from paperscraper)
  Downloading arxiv-2.1.3-py3-none-any.whl.metadata (6.1 kB)
Collecting pymed-paperscraper (from paperscraper)
  Downloading pymed_paperscraper-0.0.1-py3-none-any.whl.metadata (3.3 kB)
Collecting scholarly>=1.0.0 (from paperscraper)
  Downloading scholarly-1.7.11-py3-none-any.whl.metadata (7.4 kB)
Collecting matplotlib-venn (from paperscraper)
  Downloading matplotlib-venn-1.1.1.tar.gz (40 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting impact-factor>=1.1.0 (from paperscraper)
  Downloading impact_factor-1.1.2-py3-none-any.whl.metadata (3.3 kB)
Collecting thefuzz (from paperscraper)
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pytest (from paperscraper)
  Downloading pytest-8.3.3-py3-none-any.

# COMPUTER SCIENCE 

In [4]:
import os
from paperscraper.arxiv import get_and_dump_arxiv_papers
from paperscraper.pdf import save_pdf_from_dump

# Define the years and base path
years = ['2024', '2023', '2022', '2021', '2020']
base_path = "/Users/kuntal/Documents/Github/arxiv scraper"
pdf_directory = os.path.join(base_path, 'pdfs')

# Create a directory named 'pdfs' if it doesn't exist
os.makedirs(pdf_directory, exist_ok=True)

for year in years:
    # Define the query for the specific year
    query = [['cs'], [year]]

    # Set the metadata file for each year
    metadata_filepath = os.path.join(base_path, f'arxiv_cs_{year}.jsonl')

    # Fetch and dump arXiv papers metadata
    get_and_dump_arxiv_papers(query, output_filepath=metadata_filepath)

    # Download and save PDFs to the 'pdfs' directory
    save_pdf_from_dump(metadata_filepath, pdf_path=pdf_directory, key_to_save='doi')

    print(f"Papers for {year} have been saved to the '{pdf_directory}' directory.")


Processing (all:cs) AND (all:2024): 694it [00:30, 22.72it/s]
  soup = BeautifulSoup(response.text, features="lxml")
Processing paper 694/694: 100%|██████████| 694/694 [27:55<00:00,  2.41s/it]


Papers for 2024 have been saved to the '/Users/kuntal/Documents/Github/arxiv scraper/pdfs' directory.


Processing (all:cs) AND (all:2023): 692it [00:30, 22.94it/s]
  soup = BeautifulSoup(response.text, features="lxml")
Processing paper 45/692:   6%|▋         | 44/692 [01:41<24:53,  2.30s/it]  


KeyboardInterrupt: 

In [None]:
pip install arxiv


In [None]:
import arxiv
import time

# Define search parameters
search_query = "cat:cs"  # Topic of interest
start_date = "2024-01-01"
end_date = "2024-11-20"


# Initialize pagination parameters
max_results_per_call = 100  # Maximum results per API call
start_index = 0             # Starting index for pagination
all_results = []            # List to store all retrieved results


while True:
    # Perform search with pagination
    search = arxiv.Search(
        query=search_query,
        max_results=max_results_per_call,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Ascending,
        start=start_index,
        date_from=start_date,
        date_to=end_date
    )

    # Retrieve results
    results = list(search.results())

    if not results:
        # No more results to fetch
        break

    # Append retrieved results to the list
    all_results.extend(results)

    # Update start_index for next batch
    start_index += max_results_per_call

    # Respect arXiv's rate limits by adding a delay between requests
    time.sleep(3)  # 3-second delay

print(f"Total papers retrieved: {len(all_results)}")


# api

In [14]:
import os
import requests
import time
from datetime import datetime
import feedparser

# Define search parameters
search_query = "cat:cs.*"  # Computer Science category
year = 2024
max_results_per_call = 100  # Maximum results per API call
total_papers_to_download = 3000  # Total number of papers to download
base_url = "http://export.arxiv.org/api/query?"

# Directory structure
base_dir = "pdfs"
year_dir = os.path.join(base_dir, str(year))
os.makedirs(year_dir, exist_ok=True)

# Function to fetch papers from arXiv API
def fetch_papers(start_index, max_results):
    query = f"search_query={search_query}&start={start_index}&max_results={max_results}&sortBy=submittedDate&sortOrder=descending"
    response = requests.get(base_url + query)
    response.raise_for_status()
    return feedparser.parse(response.text)

# Function to download PDF
def download_pdf(pdf_url, pdf_filename):
    try:
        print(f"Downloading {pdf_url}...")
        response = requests.get(pdf_url)
        response.raise_for_status()
        with open(pdf_filename, "wb") as pdf_file:
            pdf_file.write(response.content)
        print(f"Saved to {pdf_filename}")
    except Exception as e:
        print(f"Failed to download {pdf_url}: {e}")

# Main loop to fetch and download papers
downloaded_count = 0
start_index = 0

while downloaded_count < total_papers_to_download:
    try:
        feed = fetch_papers(start_index, max_results_per_call)
        entries = feed.entries

        if not entries:
            print("No more entries found or received an empty page.")
            break

        for entry in entries:
            published_year = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ").year
            if published_year == year:
                pdf_url = entry.link.replace("abs", "pdf") + ".pdf"
                pdf_filename = os.path.join(year_dir, f"{entry.id.split('/')[-1]}.pdf")
                if not os.path.exists(pdf_filename):
                    download_pdf(pdf_url, pdf_filename)
                    downloaded_count += 1
                    if downloaded_count >= total_papers_to_download:
                        break

        start_index += max_results_per_call
        time.sleep(3)  # Pause to respect arXiv's rate limits

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}. Retrying after a pause.")
        time.sleep(10)  # Wait before retrying

print(f"Downloaded {downloaded_count} papers to {year_dir}")


Downloading http://arxiv.org/pdf/2411.13553v1.pdf...
Saved to pdfs/2024/2411.13553v1.pdf
No more entries found or received an empty page.
Downloaded 1 papers to pdfs/2024


In [17]:
#  arXiv's OAI-PMH

In [18]:
import os
import requests
import time
from datetime import datetime
import feedparser

# Define search parameters
search_query = "cat:cs.*"  # Computer Science category
year = 2024
max_results_per_call = 100  # Maximum results per API call
total_papers_to_download = 3000  # Total number of papers to download
base_url = "http://export.arxiv.org/api/query?"

# Directory structure
base_dir = "pdfs"
year_dir = os.path.join(base_dir, str(year))
os.makedirs(year_dir, exist_ok=True)

# Function to fetch papers from arXiv API
def fetch_papers(start_index, max_results):
    query = f"search_query={search_query}&start={start_index}&max_results={max_results}&sortBy=submittedDate&sortOrder=descending"
    response = requests.get(base_url + query)
    response.raise_for_status()
    return feedparser.parse(response.text)

# Function to download PDF
def download_pdf(pdf_url, pdf_filename):
    try:
        print(f"Downloading {pdf_url}...")
        response = requests.get(pdf_url)
        response.raise_for_status()
        with open(pdf_filename, "wb") as pdf_file:
            pdf_file.write(response.content)
        print(f"Saved to {pdf_filename}")
    except Exception as e:
        print(f"Failed to download {pdf_url}: {e}")

# Main loop to fetch and download papers
downloaded_count = 0
start_index = 0

while downloaded_count < total_papers_to_download:
    try:
        feed = fetch_papers(start_index, max_results_per_call)
        entries = feed.entries

        if not entries:
            print("No more entries found or received an empty page.")
            break

        for entry in entries:
            published_year = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ").year
            if published_year == year:
                pdf_url = entry.link.replace("abs", "pdf") + ".pdf"
                pdf_filename = os.path.join(year_dir, f"{entry.id.split('/')[-1]}.pdf")
                if not os.path.exists(pdf_filename):
                    download_pdf(pdf_url, pdf_filename)
                    downloaded_count += 1
                    if downloaded_count >= total_papers_to_download:
                        break

        start_index += max_results_per_call
        time.sleep(3)  # Pause to respect arXiv's rate limits

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}. Retrying after a pause.")
        time.sleep(10)  # Wait before retrying

print(f"Downloaded {downloaded_count} papers to {year_dir}")


Downloading http://arxiv.org/pdf/2411.13552v1.pdf...


KeyboardInterrupt: 

 # arXiv's OAI-PMH


In [20]:
pip install sickle requests

Collecting sickle
  Downloading Sickle-0.7.0-py3-none-any.whl.metadata (4.5 kB)
Downloading Sickle-0.7.0-py3-none-any.whl (12 kB)
    extract-msg (<=0.29.*)
                 ~~~~~~~^[0m[33m
[0mInstalling collected packages: sickle
Successfully installed sickle-0.7.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [22]:
from sickle import Sickle
from sickle.models import Record
import time
import requests
import os

# Initialize the Sickle client with the arXiv OAI-PMH endpoint
sickle = Sickle('http://export.arxiv.org/oai2')

# Define the parameters for the ListRecords request
params = {
    'metadataPrefix': 'arXiv',
    'set': 'cs',          # Computer Science category
    'from': '2024-01-01',
    'until': '2024-12-31'
}

# Define the base directory where PDFs will be saved
base_dir = '/Users/kuntal/Documents/Github/arxiv scraper/pdfs'

# Create the year directory inside the base directory
year = '2024'
year_dir = os.path.join(base_dir, year)
os.makedirs(year_dir, exist_ok=True)

# Retrieve records from the OAI-PMH interface
records = sickle.ListRecords(**params)

# Counter for tracking the number of papers downloaded
paper_count = 0

# Iterate over the records and download PDFs
for record in records:
    if isinstance(record, Record):
        metadata = record.metadata
        arxiv_id = metadata.get('id')[0]

        # Download the PDF
        pdf_url = f"http://arxiv.org/pdf/{arxiv_id}.pdf"
        try:
            response = requests.get(pdf_url, timeout=10)
            response.raise_for_status()
            # Replace slashes in arXiv ID to avoid directory issues
            pdf_filename = f"{arxiv_id.replace('/', '_')}.pdf"
            pdf_path = os.path.join(year_dir, pdf_filename)
            with open(pdf_path, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded PDF: {pdf_path}")
            paper_count += 1
        except requests.exceptions.RequestException as e:
            print(f"Failed to download PDF for {arxiv_id}: {e}")

        # Be polite and avoid overwhelming the server
        time.sleep(3)
    else:
        print("No more records.")
        break

print(f"Total papers downloaded: {paper_count}")


Downloaded PDF: /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/0705.1329.pdf
Downloaded PDF: /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/0710.3901.pdf
Downloaded PDF: /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/0802.3284.pdf
Downloaded PDF: /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/0802.3300.pdf
Downloaded PDF: /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/0802.3414.pdf
Downloaded PDF: /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/0803.0966.pdf
Downloaded PDF: /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/0806.1636.pdf
Downloaded PDF: /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/0808.0163.pdf
Downloaded PDF: /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/0808.0521.pdf
Downloaded PDF: /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/0811.1449.pdf
Downloaded PDF: /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/0901.0044.pdf
Downloaded PDF: /Users/kuntal/Documents/Github/arxiv scraper/pdfs

KeyboardInterrupt: 

# only 2 column format papers

In [2]:
from sickle import Sickle
from sickle.models import Record
import time
import requests
import os
from PyPDF2 import PdfReader

# Initialize the Sickle client with the arXiv OAI-PMH endpoint
sickle = Sickle('http://export.arxiv.org/oai2')

# Define the parameters for the ListRecords request
params = {
    'metadataPrefix': 'arXiv',
    'set': 'cs',          # Computer Science category
    'from': '2024-01-01',
    'until': '2024-12-31'
}

# Define the base directory where PDFs will be saved
base_dir = '/Users/kuntal/Documents/Github/arxiv_scraper/pdfs-2columns-format'

# Create the year directory inside the base directory
year = '2024'
year_dir = os.path.join(base_dir, year)
os.makedirs(year_dir, exist_ok=True)

# Function to check if a PDF is two-column formatted
def is_two_column(pdf_path):
    try:
        reader = PdfReader(pdf_path)
        first_page = reader.pages[0]  # Only check the first page
        # Analyze text positions or layout to detect two-column structure
        # Placeholder: Actual implementation depends on library capabilities
        text_positions = first_page.extract_text()  # Example; refine as needed
        # Logic to analyze column layout
        return "two-column criteria" in text_positions  # Replace with actual logic
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
        return False

# Retrieve records from the OAI-PMH interface
records = sickle.ListRecords(**params)

# Counter for tracking the number of papers downloaded
paper_count = 0

# Iterate over the records and download PDFs
for record in records:
    if isinstance(record, Record):
        metadata = record.metadata
        arxiv_id = metadata.get('id')[0]

        # Download the PDF temporarily for checking
        pdf_url = f"http://arxiv.org/pdf/{arxiv_id}.pdf"
        temp_pdf_path = os.path.join(year_dir, f"temp_{arxiv_id.replace('/', '_')}.pdf")
        try:
            response = requests.get(pdf_url, timeout=10)
            response.raise_for_status()
            with open(temp_pdf_path, 'wb') as f:
                f.write(response.content)

            # Check if the PDF is two-column formatted
            if is_two_column(temp_pdf_path):
                pdf_filename = f"{arxiv_id.replace('/', '_')}.pdf"
                final_pdf_path = os.path.join(year_dir, pdf_filename)
                os.rename(temp_pdf_path, final_pdf_path)
                print(f"Downloaded two-column PDF: {final_pdf_path}")
                paper_count += 1
            else:
                os.remove(temp_pdf_path)  # Delete the non-matching PDF
                print(f"Skipped non-two-column PDF: {arxiv_id}")

        except requests.exceptions.RequestException as e:
            print(f"Failed to download PDF for {arxiv_id}: {e}")
            if os.path.exists(temp_pdf_path):
                os.remove(temp_pdf_path)

        # Be polite and avoid overwhelming the server
        time.sleep(3)
    else:
        print("No more records.")
        break

print(f"Total two-column papers downloaded: {paper_count}")


Skipped non-two-column PDF: 0705.1329
Skipped non-two-column PDF: 0710.3901
Skipped non-two-column PDF: 0802.3284
Skipped non-two-column PDF: 0802.3300
Skipped non-two-column PDF: 0802.3414
Skipped non-two-column PDF: 0803.0966
Skipped non-two-column PDF: 0806.1636
Skipped non-two-column PDF: 0808.0163
Skipped non-two-column PDF: 0808.0521
Skipped non-two-column PDF: 0811.1449
Skipped non-two-column PDF: 0901.0044
Skipped non-two-column PDF: 0901.1988
Skipped non-two-column PDF: 0903.2016
Skipped non-two-column PDF: 0903.4826
Skipped non-two-column PDF: 0904.3742
Skipped non-two-column PDF: 0905.3108
Skipped non-two-column PDF: 0911.0105
Skipped non-two-column PDF: 0911.5246
Skipped non-two-column PDF: 1004.3702
Skipped non-two-column PDF: 1005.1871
Skipped non-two-column PDF: 1005.2465
Skipped non-two-column PDF: 1005.4648
Skipped non-two-column PDF: 1006.2883
Skipped non-two-column PDF: 1008.2715
Skipped non-two-column PDF: 1011.2973
Skipped non-two-column PDF: 1101.0350
Failed to do

unknown widths : 
[0, IndirectObject(114, 0, 4911694160)]
unknown widths : 
[0, IndirectObject(132, 0, 4911694160)]
unknown widths : 
[0, IndirectObject(127, 0, 4911694160)]
unknown widths : 
[0, IndirectObject(138, 0, 4911694160)]
unknown widths : 
[0, IndirectObject(118, 0, 4911694160)]
unknown widths : 
[0, IndirectObject(137, 0, 4911694160)]
unknown widths : 
[0, IndirectObject(122, 0, 4911694160)]
unknown widths : 
[0, IndirectObject(142, 0, 4911694160)]


Skipped non-two-column PDF: 1301.3870
Skipped non-two-column PDF: 1301.4016


unknown widths : 
[0, IndirectObject(151, 0, 4912374288)]
unknown widths : 
[0, IndirectObject(167, 0, 4912374288)]
unknown widths : 
[0, IndirectObject(161, 0, 4912374288)]
unknown widths : 
[0, IndirectObject(146, 0, 4912374288)]
unknown widths : 
[0, IndirectObject(139, 0, 4912374288)]
unknown widths : 
[0, IndirectObject(156, 0, 4912374288)]
unknown widths : 
[0, IndirectObject(171, 0, 4912374288)]
unknown widths : 
[0, IndirectObject(166, 0, 4912374288)]
unknown widths : 
[0, IndirectObject(147, 0, 4912374288)]


Skipped non-two-column PDF: 1301.6714
Skipped non-two-column PDF: 1303.1778
Skipped non-two-column PDF: 1303.2033
Skipped non-two-column PDF: 1303.2967
Skipped non-two-column PDF: 1303.4315
Skipped non-two-column PDF: 1304.5774
Skipped non-two-column PDF: 1304.7435
Skipped non-two-column PDF: 1305.5617
Skipped non-two-column PDF: 1306.1138
Skipped non-two-column PDF: 1308.0497
Skipped non-two-column PDF: 1308.2910
Skipped non-two-column PDF: 1309.0193
Skipped non-two-column PDF: 1309.7583
Skipped non-two-column PDF: 1310.4149
Skipped non-two-column PDF: 1311.2191
Skipped non-two-column PDF: 1311.3269
Skipped non-two-column PDF: 1311.4566
Skipped non-two-column PDF: 1312.2169
Skipped non-two-column PDF: 1312.3092
Skipped non-two-column PDF: 1312.5572
Skipped non-two-column PDF: 1401.2411
Skipped non-two-column PDF: 1401.3801
Skipped non-two-column PDF: 1402.5593
Skipped non-two-column PDF: 1403.1076
Skipped non-two-column PDF: 1403.2001
Skipped non-two-column PDF: 1403.3369
Skipped non-

KeyboardInterrupt: 