# Paper Downloads

In [1]:
"""
Search and download papers from Semantic Scholar.

Usage:
    python semantic_scholar_search.py "deep learning biology" 5 ./output/
"""

import requests
import os
import sys
import json
from tqdm import tqdm

BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search"

def search_semantic_scholar(query, limit=10, timeout=30):
    """Query the Semantic Scholar paper-search endpoint and return the matches.

    Args:
        query: Free-text search string, sent as the ``query`` parameter.
        limit: Maximum number of results to request.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout unless one is given, so a stalled connection would
            otherwise block indefinitely.

    Returns:
        The list stored under the ``"data"`` key of the JSON response, or an
        empty list when the response carries no such key.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    params = {
        "query": query,
        "limit": limit,
        "fields": "title,authors,abstract,url,year,externalIds,isOpenAccess,openAccessPdf"
    }
    response = requests.get(BASE_URL, params=params, timeout=timeout)
    response.raise_for_status()
    # Fall back to an empty list instead of raising KeyError when the
    # payload has no "data" entry.
    return response.json().get("data", [])

def save_metadata(papers, outdir):
    """Serialize ``papers`` as indented JSON to ``papers_metadata.json``.

    Args:
        papers: JSON-serializable object (here, the list of paper records).
        outdir: Existing directory that receives the metadata file.
    """
    target = os.path.join(outdir, "papers_metadata.json")
    with open(target, mode="w") as sink:
        json.dump(papers, fp=sink, indent=2)
    print(f"[+] Saved metadata to {target}")

def download_pdfs(papers, outdir, timeout=60):
    """Download the open-access PDF of each paper into ``outdir``.

    Files are named ``paper_<n>.pdf`` where ``n`` is the 1-based position of
    the paper in ``papers``. Papers without a PDF URL are reported and
    skipped; a failed download is reported and does not stop the loop.

    Args:
        papers: Sequence of paper dicts; the PDF link is read from
            ``paper["openAccessPdf"]["url"]``.
        outdir: Existing directory that receives the PDF files.
        timeout: Seconds to wait on each PDF request before giving up.
    """
    # Wrap the sequence itself (not the enumerate iterator) so tqdm can read
    # its length and render a real progress bar with a total.
    progress = tqdm(papers, desc="Downloading Papers")
    for index, paper in enumerate(progress, start=1):
        # "openAccessPdf" may be present with a null value; `or {}` covers
        # that case as well as a missing key.
        pdf_url = (paper.get("openAccessPdf") or {}).get("url")
        if not pdf_url:
            print(f"[!] No open access PDF for paper {index}")
            continue

        filename = f"paper_{index}.pdf"
        try:
            response = requests.get(pdf_url, timeout=timeout)
            response.raise_for_status()
            with open(os.path.join(outdir, filename), "wb") as f:
                f.write(response.content)
        except Exception as e:
            # Deliberately broad: this is a best-effort bulk download, so any
            # single failure is logged and the remaining papers still run.
            print(f"[!] Failed to download paper {index}: {e}")
        else:
            print(f"[+] Downloaded: {filename}")


In [8]:
# Replace spaces and common punctuation in the first paper's title with "_".
papers[0]['title'].translate(str.maketrans(dict.fromkeys((' ', '.', '-', ',', ':'), '_')))

'Hypertrophic_Scars_and_Keloids__Advances_in_Treatment_and_Review_of_Established_Therapies'

In [2]:
# Directory that receives the metadata JSON and the downloaded PDFs.
output_dir = "downloads"
# Search string sent to Semantic Scholar.
query = "keloids"
# Number of results requested from the search endpoint.
num_papers = 100
# Create the output directory up front; exist_ok avoids an error on re-runs.
os.makedirs(output_dir, exist_ok=True)
# Fetch the records, persist them, then try to download each PDF.
papers = search_semantic_scholar(query, num_papers)
save_metadata(papers, output_dir)
download_pdfs(papers, output_dir)

[+] Saved metadata to downloads/papers_metadata.json


Downloading Papers: 2it [00:00, 10.49it/s]

[!] No open access PDF for paper 1
[!] Failed to download paper 2: 403 Client Error: Forbidden for url: https://journals.lww.com/10.1097/PRS.0000000000008667
[!] No open access PDF for paper 3


Downloading Papers: 4it [00:00,  8.23it/s]

[!] Failed to download paper 4: 403 Client Error: Forbidden for url: https://pmc.ncbi.nlm.nih.gov/articles/PMC8975835


Downloading Papers: 5it [00:01,  3.57it/s]

[+] Downloaded: paper_5.pdf


Downloading Papers: 6it [00:01,  3.58it/s]

[!] Failed to download paper 6: 403 Client Error: Forbidden for url: https://pmc.ncbi.nlm.nih.gov/articles/PMC9797913


Downloading Papers: 7it [00:01,  3.85it/s]

[!] Failed to download paper 7: 403 Client Error: Forbidden for url: https://journals.lww.com/10.1097/CM9.0000000000002093
[!] No open access PDF for paper 8
[!] Failed to download paper 9: 403 Client Error: Forbidden for url: https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/exd.14121


Downloading Papers: 10it [00:03,  2.13it/s]

[+] Downloaded: paper_10.pdf


Downloading Papers: 11it [00:05,  1.32it/s]

[+] Downloaded: paper_11.pdf


Downloading Papers: 12it [00:05,  1.55it/s]

[!] Failed to download paper 12: 403 Client Error: Forbidden for url: https://pmc.ncbi.nlm.nih.gov/articles/PMC7940466


Downloading Papers: 13it [00:06,  1.69it/s]

[+] Downloaded: paper_13.pdf


Downloading Papers: 14it [00:07,  1.11it/s]

[+] Downloaded: paper_14.pdf
[!] Failed to download paper 15: 403 Client Error: Forbidden for url: https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/exd.14414


Downloading Papers: 16it [00:11,  1.24s/it]

[+] Downloaded: paper_16.pdf


Downloading Papers: 17it [00:13,  1.44s/it]

[+] Downloaded: paper_17.pdf


Downloading Papers: 18it [00:15,  1.75s/it]

[+] Downloaded: paper_18.pdf
[!] No open access PDF for paper 19


Downloading Papers: 20it [00:20,  2.06s/it]

[+] Downloaded: paper_20.pdf


Downloading Papers: 21it [00:22,  1.84s/it]

[+] Downloaded: paper_21.pdf


Downloading Papers: 22it [00:23,  1.73s/it]

[+] Downloaded: paper_22.pdf


Downloading Papers: 23it [00:24,  1.47s/it]

[+] Downloaded: paper_23.pdf


Downloading Papers: 24it [00:24,  1.23s/it]

[+] Downloaded: paper_24.pdf


Downloading Papers: 25it [00:32,  3.09s/it]

[+] Downloaded: paper_25.pdf
[!] No open access PDF for paper 26


Downloading Papers: 27it [00:35,  2.26s/it]

[+] Downloaded: paper_27.pdf
[!] No open access PDF for paper 28


Downloading Papers: 29it [00:44,  3.22s/it]

[+] Downloaded: paper_29.pdf
[!] No open access PDF for paper 30


Downloading Papers: 31it [00:46,  2.43s/it]

[+] Downloaded: paper_31.pdf
[!] No open access PDF for paper 32


Downloading Papers: 34it [00:48,  1.63s/it]

[+] Downloaded: paper_33.pdf
[!] Failed to download paper 34: 403 Client Error: Forbidden for url: https://journals.lww.com/10.1097/GOX.0000000000002582


Downloading Papers: 35it [00:51,  1.91s/it]

[+] Downloaded: paper_35.pdf
[!] No open access PDF for paper 36
[!] Failed to download paper 37: 403 Client Error: Forbidden for url: https://onlinelibrary.wiley.com/doi/pdf/10.1111/jocd.12828
[!] No open access PDF for paper 38
[!] No open access PDF for paper 39


Downloading Papers: 40it [00:52,  1.28it/s]

[!] Failed to download paper 40: 403 Client Error: Forbidden for url: https://journals.lww.com/01720096-201903000-00015


Downloading Papers: 41it [00:52,  1.37it/s]

[!] Failed to download paper 41: 403 Client Error: Forbidden for url: https://oamjms.eu/index.php/mjms/article/view/oamjms.2019.099


Downloading Papers: 42it [00:54,  1.11it/s]

[+] Downloaded: paper_42.pdf


Downloading Papers: 43it [00:55,  1.04it/s]

[+] Downloaded: paper_43.pdf
[!] No open access PDF for paper 44
[!] No open access PDF for paper 45
[!] No open access PDF for paper 46


Downloading Papers: 48it [00:55,  2.33it/s]

[!] Failed to download paper 47: 403 Client Error: Forbidden for url: https://pmc.ncbi.nlm.nih.gov/articles/PMC7949502
[!] Failed to download paper 48: 403 Client Error: Forbidden for url: https://www.karger.com/Article/Pdf/491924


Downloading Papers: 49it [00:57,  1.63it/s]

[+] Downloaded: paper_49.pdf
[!] No open access PDF for paper 50


Downloading Papers: 51it [00:57,  2.23it/s]

[+] Downloaded: paper_51.pdf
[!] No open access PDF for paper 52


Downloading Papers: 53it [00:59,  1.46it/s]

[+] Downloaded: paper_53.pdf


Downloading Papers: 54it [01:03,  1.27s/it]

[+] Downloaded: paper_54.pdf


Downloading Papers: 55it [01:06,  1.60s/it]

[+] Downloaded: paper_55.pdf


Downloading Papers: 56it [01:06,  1.33s/it]

[!] Failed to download paper 56: 403 Client Error: Forbidden for url: https://journals.lww.com/00029330-201707200-00012
[!] No open access PDF for paper 57
[!] No open access PDF for paper 58
[!] No open access PDF for paper 59
[!] No open access PDF for paper 60


Downloading Papers: 61it [01:10,  1.07it/s]

[+] Downloaded: paper_61.pdf


Downloading Papers: 62it [01:12,  1.06s/it]

[+] Downloaded: paper_62.pdf


Downloading Papers: 65it [01:12,  1.51it/s]

[+] Downloaded: paper_63.pdf
[!] No open access PDF for paper 64
[!] Failed to download paper 65: 403 Client Error: Forbidden for url: https://downloads.hindawi.com/archive/2016/5162394.pdf
[!] No open access PDF for paper 66


Downloading Papers: 67it [01:16,  1.01s/it]

[+] Downloaded: paper_67.pdf
[!] No open access PDF for paper 68


Downloading Papers: 69it [01:16,  1.28it/s]

[+] Downloaded: paper_69.pdf
[!] No open access PDF for paper 70
[!] No open access PDF for paper 71
[!] Failed to download paper 72: 403 Client Error: Forbidden for url: https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/1346-8138.14110


Downloading Papers: 73it [01:24,  1.29s/it]

[+] Downloaded: paper_73.pdf
[!] No open access PDF for paper 74


Downloading Papers: 75it [01:24,  1.00it/s]

[!] Failed to download paper 75: 403 Client Error: Forbidden for url: https://pmc.ncbi.nlm.nih.gov/articles/PMC7949688


Downloading Papers: 76it [01:24,  1.13it/s]

[!] Failed to download paper 76: 403 Client Error: Forbidden for url: https://journals.lww.com/01720096-201705000-00021


Downloading Papers: 77it [01:31,  1.94s/it]

[+] Downloaded: paper_77.pdf


Downloading Papers: 78it [01:37,  2.76s/it]

[+] Downloaded: paper_78.pdf
[!] No open access PDF for paper 79
[!] No open access PDF for paper 80
[!] No open access PDF for paper 81
[!] No open access PDF for paper 82
[!] No open access PDF for paper 83
[!] Failed to download paper 84: 403 Client Error: Forbidden for url: https://downloads.hindawi.com/journals/bmri/2016/5893481.pdf
[!] No open access PDF for paper 85
[!] No open access PDF for paper 86
[!] No open access PDF for paper 87


Downloading Papers: 88it [01:37,  1.36it/s]

[!] Failed to download paper 88: 403 Client Error: Forbidden for url: https://pmc.ncbi.nlm.nih.gov/articles/PMC7950168


Downloading Papers: 89it [01:38,  1.30it/s]

[+] Downloaded: paper_89.pdf
[!] No open access PDF for paper 90
[!] No open access PDF for paper 91


Downloading Papers: 92it [01:39,  1.75it/s]

[!] Failed to download paper 92: 403 Client Error: Forbidden for url: https://pmc.ncbi.nlm.nih.gov/articles/PMC7949878
[!] No open access PDF for paper 93
[!] No open access PDF for paper 94


Downloading Papers: 95it [01:40,  2.10it/s]

[!] Failed to download paper 95: 403 Client Error: Forbidden for url: https://journals.lww.com/01720096-201506000-00014
[!] No open access PDF for paper 96
[!] No open access PDF for paper 97
[!] No open access PDF for paper 98


Downloading Papers: 99it [01:41,  2.27it/s]

[+] Downloaded: paper_99.pdf


Downloading Papers: 100it [01:42,  1.03s/it]

[+] Downloaded: paper_100.pdf





In [4]:
# Write the records held in `papers` to papers_metadata.json inside output_dir.
save_metadata(papers, output_dir)

[+] Saved metadata to downloads/papers_metadata.json


In [5]:
# Attempt the PDF download for every entry in `papers`, saving into output_dir.
download_pdfs(papers, output_dir)

[!] No open access PDF for paper 1
[!] Failed to download paper 2: 403 Client Error: Forbidden for url: https://journals.lww.com/10.1097/PRS.0000000000008667
[!] No open access PDF for paper 3
[!] Failed to download paper 4: 403 Client Error: Forbidden for url: https://pmc.ncbi.nlm.nih.gov/articles/PMC8975835
[+] Downloaded: paper_5.pdf
[!] Failed to download paper 6: 403 Client Error: Forbidden for url: https://pmc.ncbi.nlm.nih.gov/articles/PMC9797913
[!] Failed to download paper 7: 403 Client Error: Forbidden for url: https://journals.lww.com/10.1097/CM9.0000000000002093
[!] No open access PDF for paper 8
[!] Failed to download paper 9: 403 Client Error: Forbidden for url: https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/exd.14121
[+] Downloaded: paper_10.pdf


In [None]:
def main(argv=None):
    """Command-line entry point: search, save metadata, then download PDFs.

    Args:
        argv: Optional list of the three CLI arguments
            ``[query, num_papers, output_dir]``. When omitted,
            ``sys.argv[1:]`` is used, so a plain ``main()`` call behaves as
            a normal script invocation.

    Exits with status 1 after printing the usage line when the argument
    count is wrong or ``num_papers`` is not a positive integer.
    """
    usage = "Usage: python semantic_scholar_search.py \"<query>\" <num_papers> <output_dir>"
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 3:
        print(usage)
        sys.exit(1)

    query, raw_count, output_dir = args
    # Report a bad count as a usage error instead of an int() traceback.
    try:
        num_papers = int(raw_count)
    except ValueError:
        num_papers = 0
    if num_papers < 1:
        print(f"[!] <num_papers> must be a positive integer, got {raw_count!r}")
        print(usage)
        sys.exit(1)

    os.makedirs(output_dir, exist_ok=True)
    papers = search_semantic_scholar(query, num_papers)
    save_metadata(papers, output_dir)
    download_pdfs(papers, output_dir)

# Run the CLI only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
