# Extract Research Papers from arXiv API



In [None]:
import requests
import pandas as pd
from xml.etree import ElementTree as ET

# arXiv API endpoint; it ends with "?" so query parameters can be appended to it.
arxiv_base_url = "http://export.arxiv.org/api/query?"

# Query settings: category filter, paging offset, and page size.
search_query = "cat:cs.*"
start = 0
max_results = 400

# Accumulator for paper records.
papers_data = []

# Function to fetch data from arXiv API
def fetch_arxiv_data(start, max_results):
    """Fetch one page of results from the arXiv API.

    The request URL is built from the module-level ``arxiv_base_url`` and
    ``search_query`` plus the paging arguments.

    Args:
        start: Zero-based offset of the first result to return.
        max_results: Maximum number of results to request.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (a message describing the failure is printed).
    """
    url = f"{arxiv_base_url}search_query={search_query}&start={start}&max_results={max_results}"
    try:
        # Without a timeout, requests can wait forever on a stalled connection.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors: print, return None.
        print(f"Error fetching data: {exc}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Error fetching data: {response.status_code}")
    return None

# Function to parse the arXiv response and extract metadata
def _child_text(element, tag, namespaces):
    """Return the text of the first *tag* child of *element*, or None if absent."""
    child = element.find(tag, namespaces)
    return child.text if child is not None else None


def parse_arxiv_response(response_text):
    """Parse an Atom feed and return one metadata dict per ``entry``.

    Args:
        response_text: The Atom XML document as a string.

    Returns:
        A list of dicts with the keys ``title``, ``id``, ``published``,
        ``summary``, ``authors`` (names joined with ``", "``),
        ``categories`` (the ``term`` of the first ``category`` element only)
        and ``pdf_url`` (``href`` of the first link whose ``title`` contains
        "pdf"). A value is ``None`` when the corresponding element is missing,
        so one incomplete entry does not abort the whole parse.

    Raises:
        xml.etree.ElementTree.ParseError: If *response_text* is not
            well-formed XML.
    """
    root = ET.fromstring(response_text)
    # Map the empty prefix to the Atom namespace so plain tag names match.
    namespaces = {'': 'http://www.w3.org/2005/Atom'}
    papers = []

    for entry in root.findall('entry', namespaces):
        pdf_url = None
        # The PDF link is identified by its title attribute.
        for link in entry.findall('link', namespaces):
            if 'pdf' in link.get('title', '').lower():
                pdf_url = link.get('href')
                break

        author_names = (
            _child_text(author, 'name', namespaces)
            for author in entry.findall('author', namespaces)
        )
        category = entry.find('category', namespaces)

        papers.append({
            'title': _child_text(entry, 'title', namespaces),
            'id': _child_text(entry, 'id', namespaces),
            'published': _child_text(entry, 'published', namespaces),
            'summary': _child_text(entry, 'summary', namespaces),
            # Skip authors without a usable name instead of failing the join.
            'authors': ', '.join(name for name in author_names if name),
            'categories': category.get('term') if category is not None else None,
            'pdf_url': pdf_url,
        })

    return papers

# Run the pipeline: request one page of results, parse it, and export it.
response_text = fetch_arxiv_data(start, max_results)
if response_text:
    papers = parse_arxiv_response(response_text)
    papers_df = pd.DataFrame(data=papers)

    # Write the parsed records to CSV without the DataFrame index column.
    papers_df.to_csv("arxiv_papers_with_pdf.csv", index=False)
    print("Data saved to arxiv_papers_with_pdf.csv")


Data saved to arxiv_papers_with_pdf.csv


In [None]:
import os
import json
import requests
import fitz  # PyMuPDF for PDF text extraction
import pandas as pd
from tqdm import tqdm

# Function to download a PDF file
def download_pdf(pdf_url, save_dir="pdfs"):
    """Download *pdf_url* into *save_dir* and return the local file path.

    The local file name is the last path segment of the URL. A file that
    already exists under that name is reused without a new request.

    Args:
        pdf_url: URL of the PDF. Anything that is not a non-empty string
            (e.g. ``None`` or a NaN read back from a CSV) is treated as
            "no URL".
        save_dir: Directory to store downloads in; created if missing.

    Returns:
        The path of the local file, or ``None`` when there is no URL or the
        download failed (a failure message is printed).
    """
    # Guard first: calling .split() on a non-string would raise AttributeError.
    if not isinstance(pdf_url, str) or not pdf_url:
        return None

    os.makedirs(save_dir, exist_ok=True)  # Create directory if not exists
    pdf_filename = os.path.join(save_dir, pdf_url.split('/')[-1])  # Extract filename from URL

    if os.path.exists(pdf_filename):  # Skip if already downloaded
        return pdf_filename

    try:
        # Bound the wait so one stalled download cannot hang the whole run.
        response = requests.get(pdf_url, timeout=60)
    except requests.RequestException:
        print(f"❌ Failed to download {pdf_url}")
        return None

    if response.status_code != 200:
        print(f"❌ Failed to download {pdf_url}")
        return None

    # Write to a temporary name and rename afterwards, so an interrupted write
    # never leaves a truncated file that a later call would treat as cached.
    partial_path = pdf_filename + ".part"
    with open(partial_path, "wb") as f:
        f.write(response.content)
    os.replace(partial_path, pdf_filename)
    return pdf_filename

# Function to extract full text from a PDF
def extract_full_text(pdf_path):
    """Return the plain text of every page in the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, joined with newlines.
    """
    # Use the document as a context manager so it is closed even when a page
    # fails to extract; otherwise each processed PDF leaks an open handle.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)  # Merge all pages

# Load the CSV file
csv_file = "arxiv_papers_with_pdf.csv"
df = pd.read_csv(csv_file)

# Create a list to store extracted data
papers_data = []

# Extract text for each PDF and store in JSON
for index, row in tqdm(df.iterrows(), total=len(df)):
    pdf_url = row["pdf_url"]
    # An empty CSV cell is read back as NaN (a float), not as a string.
    has_pdf_url = isinstance(pdf_url, str) and bool(pdf_url)

    paper_info = {
        "title": row["title"],
        "id": row["id"],
        # Store None rather than NaN so the JSON output stays standard.
        "pdf_url": pdf_url if has_pdf_url else None,
        "full_text": ""  # Default empty if extraction fails
    }

    pdf_path = download_pdf(pdf_url) if has_pdf_url else None  # Download the PDF
    if pdf_path:
        try:
            paper_info["full_text"] = extract_full_text(pdf_path)
        except Exception as exc:
            # Best-effort boundary: one unreadable PDF must not abort the run
            # and discard the papers already processed. Report and continue.
            print(f"❌ Failed to extract text from {pdf_path}: {exc}")

    papers_data.append(paper_info)  # Append to list

# Save as JSON
json_file = "arxiv_papers_with_full_text.json"
with open(json_file, "w", encoding="utf-8") as f:
    json.dump(papers_data, f, indent=4, ensure_ascii=False)

print(f"✅ JSON file saved as {json_file}")


  1%|          | 3/400 [00:00<01:37,  4.08it/s]

❌ Failed to download http://arxiv.org/pdf/cs/0701021v2


 15%|█▌        | 61/400 [00:06<00:45,  7.43it/s]

❌ Failed to download http://arxiv.org/pdf/1012.4170v2


 23%|██▎       | 91/400 [00:12<01:59,  2.58it/s]

❌ Failed to download http://arxiv.org/pdf/0911.2829v1


 26%|██▌       | 102/400 [00:35<21:37,  4.35s/it]

❌ Failed to download http://arxiv.org/pdf/1009.3306v1


 35%|███▍      | 139/400 [00:44<01:33,  2.78it/s]

❌ Failed to download http://arxiv.org/pdf/1108.3558v2


 36%|███▌      | 142/400 [01:05<12:31,  2.91s/it]

❌ Failed to download http://arxiv.org/pdf/1202.4535v1


 36%|███▋      | 145/400 [01:10<09:01,  2.12s/it]

MuPDF error: syntax error: could not parse color space (142 0 R)

MuPDF error: syntax error: could not parse color space (142 0 R)

MuPDF error: syntax error: could not parse color space (237 0 R)

MuPDF error: syntax error: could not parse color space (237 0 R)



 42%|████▏     | 167/400 [02:02<09:04,  2.34s/it]

❌ Failed to download http://arxiv.org/pdf/1904.06159v1


 53%|█████▎    | 212/400 [05:04<12:40,  4.05s/it]

❌ Failed to download http://arxiv.org/pdf/2112.14770v1


 55%|█████▍    | 218/400 [05:42<12:47,  4.21s/it]

❌ Failed to download http://arxiv.org/pdf/1001.4573v1


 55%|█████▍    | 219/400 [05:45<11:49,  3.92s/it]

❌ Failed to download http://arxiv.org/pdf/2211.10675v1


 56%|█████▌    | 222/400 [05:53<09:52,  3.33s/it]

❌ Failed to download http://arxiv.org/pdf/2404.13672v1


 66%|██████▋   | 266/400 [07:22<04:58,  2.23s/it]

❌ Failed to download http://arxiv.org/pdf/2206.01250v1


 75%|███████▌  | 301/400 [08:28<02:33,  1.55s/it]

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations

MuPDF error: unsupported error: cannot create appearance stream for Screen annotations



 79%|███████▉  | 315/400 [09:51<19:57, 14.09s/it]

❌ Failed to download http://arxiv.org/pdf/1405.2281v1


 80%|████████  | 320/400 [09:58<04:49,  3.62s/it]

MuPDF error: syntax error: could not parse color space (327 0 R)

MuPDF error: syntax error: could not parse color space (370 0 R)

MuPDF error: syntax error: could not parse color space (456 0 R)

MuPDF error: syntax error: could not parse color space (479 0 R)

MuPDF error: syntax error: could not parse color space (550 0 R)

MuPDF error: syntax error: could not parse color space (674 0 R)

MuPDF error: syntax error: could not parse color space (692 0 R)

MuPDF error: syntax error: could not parse color space (727 0 R)

MuPDF error: syntax error: could not parse color space (769 0 R)

MuPDF error: syntax error: could not parse color space (802 0 R)

MuPDF error: syntax error: could not parse color space (840 0 R)

MuPDF error: syntax error: could not parse color space (858 0 R)

MuPDF error: syntax error: could not parse color space (898 0 R)

MuPDF error: syntax error: could not parse color space (914 0 R)



 89%|████████▉ | 355/400 [11:14<01:17,  1.72s/it]

❌ Failed to download http://arxiv.org/pdf/1511.02528v1


 92%|█████████▎| 370/400 [12:01<01:35,  3.18s/it]

❌ Failed to download http://arxiv.org/pdf/2309.07166v1


100%|██████████| 400/400 [13:18<00:00,  2.00s/it]


✅ JSON file saved as arxiv_papers_with_full_text.json
