# Semantic Scholar API
100 requests per 5 minutes
The API allows up to 100 requests per 5 minutes. 

To access a higher rate limit, complete the form to request authentication for your project.

The maximum `limit` per request is 100, so every 5 minutes we can gather at most 100 × 100 = 10,000 results.

Drawback of Semantic Scholar: the search endpoint requires a search term, meaning the topic/industry/niche has to be chosen up front.

In [5]:
import requests
import json
import pandas as pd
import time
import string
import json
import os
from dotenv import load_dotenv
load_dotenv()


True

In [25]:
# Parameters for retrieving data
amount_of_data = 100  # Number of records to fetch; in the visible cells only the commented-out filtering example reads it
start_date = 2019  # First publication year to fetch (inclusive)
end_date = 2023  # Last publication year to fetch (inclusive)
journal = ["Nat. Chem. Biol"]  # Venue filter; only the first entry is sent as the 'venue' parameter
# research_field = ['Computer Science'] # Research field(s) to query (as a list)
alphabet_letters = ['I'] # Search terms sent as 'query', one pass per entry; use list(string.ascii_uppercase) for A-Z
initial_offset = 0  # Offset added to every page's pagination offset
number_of_publications_per_request = 100  # Page size used to split a result total into batches

# Gentle fetching
sleep_duration = 1  # Sleep duration in seconds after each saved batch and each merged year
retry_sleep_duration = 1  # Sleep duration in seconds before retrying a failed request
max_retries = 100  # Maximum number of attempts per page request before giving up

# Define the API endpoint
api_url = 'https://api.semanticscholar.org/graph/v1/paper/search'
length_df = 0  # Running total of rows fetched across all batches

In [21]:
# Retrieve a single page of results for debugging purposes
params = {
    'query': "A",  # Fixed search term used only for this debugging request
    'venue': journal[0],
    'year': 2013,
    'limit': 100,  # Number of results per request
    'offset': 0,  # Offset for pagination
    # 'fields': 'title,authors,abstract,citationCount,year',
    'fields': 'externalIds,title,abstract,citationCount,referenceCount,influentialCitationCount,fieldsOfStudy,s2FieldsOfStudy,publicationTypes,publicationDate,publicationVenue,year',
}

# Make the GET request; the timeout keeps a stalled connection from hanging the cell
response = requests.get(
    api_url,
    params=params,
    headers={'X-API-KEY': os.getenv("SEMANTICSCHOLAR_API_KEY")},
    timeout=30,
)

# Only display a payload that belongs to this request: on failure, reset `data`
# so a result left over from an earlier run is not mistaken for the current one.
if response.status_code == 200:
    data = response.json()
else:
    data = None
    print("Error:", response.status_code, response.text)
data

{'total': 250,
 'offset': 0,
 'next': 100,
 'data': [{'paperId': '61d3ee2ffb35b66d7a0f3f3bce2439074c1751ce',
   'externalIds': {'MAG': '2147806662',
    'DOI': '10.1261/rna.035667.112',
    'CorpusId': 4786570,
    'PubMed': '23249747'},
   'publicationVenue': {'id': 'c999d5bf-6e3d-40df-a5a0-d32a59714a2d',
    'name': 'RNA: A publication of the RNA Society',
    'type': 'journal',
    'alternate_names': ['RNA', 'RNA publ RNA Soc'],
    'issn': '1355-8382',
    'url': 'http://uk.cambridge.org/journals/rna/rnaifc.htm',
    'alternate_urls': ['http://rnajournal.cshlp.org/',
     'https://rnajournal.cshlp.org/']},
   'title': 'Circular RNAs are abundant, conserved, and associated with ALU repeats.',
   'abstract': 'Circular RNAs composed of exonic sequence have been described in a small number of genes. Thought to result from splicing errors, circular RNA species possess no known function. To delineate the universe of endogenous circular RNAs, we performed high-throughput sequencing (RNA-s

In [27]:
# Fetching Data
data_dir = './data/'
os.makedirs(data_dir, exist_ok=True)  # to_csv does not create missing directories


def _search_with_retries(query_params):
    """Run one paper-search request and return its decoded JSON body.

    The request is attempted up to ``max_retries`` times, sleeping
    ``retry_sleep_duration`` seconds between attempts. Non-200 replies and
    network-level failures both count as a failed attempt.

    Args:
        query_params: Query-string parameters for the search endpoint.

    Returns:
        The decoded JSON payload, or ``None`` when every attempt failed.
    """
    auth_headers = {'x-api-key': os.getenv("SEMANTICSCHOLAR_API_KEY")}
    for attempt in range(1, max_retries + 1):
        try:
            reply = requests.get(api_url, params=query_params, headers=auth_headers, timeout=30)
        except requests.RequestException as exc:
            # Connection problems are retried like HTTP errors instead of aborting the run
            print(f"Error (Attempt {attempt}):", exc)
        else:
            if reply.status_code == 200:
                return reply.json()
            print(f"Error (Attempt {attempt}):", reply.status_code)
        if attempt < max_retries:
            print("Retrying after sleep: " + str(retry_sleep_duration) + "s")
            time.sleep(retry_sleep_duration)
    print("Fetch Failed")
    return None


def _merge_year_files(year):
    """Combine, then delete, every CSV in ``data_dir`` whose name starts with ``<year>_``.

    Args:
        year: Publication year encoded as the filename prefix.

    Returns:
        One DataFrame holding the rows of all matching files, or an empty
        DataFrame when no file matches.
    """
    frames = []
    for filename in os.listdir(data_dir):
        if not filename.endswith(".csv"):
            continue
        try:
            file_year = int(filename.split("_")[0])
        except ValueError:
            # Not one of our "<year>_..." files; leave it alone
            continue
        if file_year != year:
            continue
        path = data_dir + filename
        frames.append(pd.read_csv(path))
        # Remove the per-batch file once its rows are held in memory
        os.remove(path)
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a DataFrame inside the loop
    return pd.concat(frames, ignore_index=True)


year_list = list(range(end_date, start_date - 1, -1))  # Newest year first
retries_retry_exhausted = False
for year in year_list:
    print("Year", year)
    for letter in alphabet_letters:
        # First request (limit 1) only reads the total number of matches, to size the batch loop
        data = _search_with_retries({
            'query': letter,  # Current search term
            'venue': journal[0],
            'year': year,
            'limit': 1,  # Number of results per request
            'offset': 0,  # Offset for pagination
            'fields': 'title,authors,abstract,citationCount,year',
        })
        time.sleep(sleep_duration)
        if data is None:
            # Without a fresh total the batch plan would be built from stale data
            retries_retry_exhausted = True
            break

        # Calculate the batches required to get every publication for that search term
        num_batches, remainder = divmod(data.get('total', 0), number_of_publications_per_request)
        batches = [number_of_publications_per_request] * num_batches
        if remainder > 0:
            batches.append(remainder)

        print("For the letter: ", letter, " batches: ", batches)

        for i, batch in enumerate(batches):
            offset = initial_offset + number_of_publications_per_request * i  # Current page offset

            print("Letter: ", letter, " Batch: ", str(batch), " Iteration: ", i)
            data = _search_with_retries({
                'query': letter,  # Current search term
                'venue': journal[0],
                'year': year,
                'limit': batch,  # Number of results per request
                'offset': offset,  # Offset for pagination
                # 'fields': 'externalIds,title,authors,abstract,citationCount,referenceCount,influentialCitationCount,fieldsOfStudy,s2FieldsOfStudy,publicationTypes,publicationDate,citations,references,publicationVenue,year',
                'fields': 'externalIds,title,abstract,citationCount,referenceCount,influentialCitationCount,fieldsOfStudy,s2FieldsOfStudy,publicationTypes,publicationDate,publicationVenue,year',
            })
            if data is None:
                retries_retry_exhausted = True
                break

            # Create a DataFrame for this page of results
            df = pd.DataFrame(data.get('data', []))
            length_df += len(df)  # Update the total number of fetched rows

            # Save each iteration into a separate file - to keep some of the data in case of an error.
            # The filename holds the year, venue, lower-cased search term and offset.
            # An empty page is not written: a column-less CSV cannot be read back during the merge.
            if not df.empty:
                df.to_csv(data_dir + str(year) + "_" + journal[0] + "_" + letter.replace(" ", "_").lower() + "_" + str(offset) + '.csv', index=False)

            # Sleep for a specified duration between requests
            time.sleep(sleep_duration)

        if retries_retry_exhausted:
            # Stop requesting further search terms once a fetch has failed for good
            break
    if retries_retry_exhausted:
        # Leave the per-batch files in place and stop; the failed year is not merged
        break

    # Merge this year's per-batch files into a single DataFrame
    merged_data = _merge_year_files(year)
    if 'paperId' not in merged_data.columns:
        # Nothing was fetched for this year, so there is nothing to de-duplicate or save
        print(f"No data fetched for year {year}; skipping merge")
        continue

    # Remove rows with duplicate 'paperId' values and count how many were dropped
    rows_before = len(merged_data)
    merged_data = merged_data.drop_duplicates(subset='paperId', keep='first')
    remaining_count = len(merged_data)
    dropped_count = rows_before - remaining_count

    # Print the counts
    print(f"Removed {dropped_count} rows. Rows remaining: {remaining_count}")
    merged_data.to_csv(data_dir + str(year) + "_" + journal[0] + "_" + str(remaining_count) + 'no_cit_no_ref_no_auth.csv', index=False)
    time.sleep(sleep_duration)


# Print the total length of unfiltered data after all iterations
print("Length of the data", length_df)


Year 2023


For the letter:  I  batches:  [100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,

KeyboardInterrupt: 

In [72]:
# Preview the first rows of the merged data, transposed so each record is a column
merged_data.head().transpose()

Unnamed: 0,0,1,2,3,4
paperId,636d37634070bc542aaacd57879ab3ad6e066f1d,b108cd62fd9c8c6c2a44b9e9b17146e507b7c928,fb0d1d801638eda679f8e95f849680321654f62c,5174ab7da437427c09963fa2778aab2e8c5a17c0,3d5ab41844ebcbd4f842b784d837be8b46db4201
externalIds,"{'DOI': '10.1126/science.adc8714', 'CorpusId':...","{'ArXiv': '2303.17484', 'DOI': '10.1126/scienc...","{'DOI': '10.1126/science.abq5693', 'CorpusId':...","{'DOI': '10.1126/science.abn2937', 'CorpusId':...","{'DOI': '10.1126/science.ade4884', 'CorpusId':..."
publicationVenue,"{'id': 'f59506a8-d8bb-4101-b3d4-c4ac3ed03dad',...","{'id': 'f59506a8-d8bb-4101-b3d4-c4ac3ed03dad',...","{'id': 'f59506a8-d8bb-4101-b3d4-c4ac3ed03dad',...","{'id': 'f59506a8-d8bb-4101-b3d4-c4ac3ed03dad',...","{'id': 'f59506a8-d8bb-4101-b3d4-c4ac3ed03dad',..."
title,Developmental strategies underlying gigantism ...,A cosmic stream of atomic carbon gas connected...,DNA methylation networks underlying mammalian ...,Leveraging base-pair mammalian constraint to u...,Widespread diversity deficits of coral reef sh...
abstract,"In amniotes, the predominant developmental str...",The growth of galaxies in the early Universe i...,"Using DNA methylation profiles (n = 15,456) fr...",Thousands of genomic regions have been associa...,A global survey of coral reefs reveals that ov...
year,2023,2023,2023,2023,2023
referenceCount,89,93,85,0,38
citationCount,2,2,2,1,1
influentialCitationCount,0,0,0,0,0
fieldsOfStudy,['Medicine'],"['Physics', 'Medicine']",['Medicine'],['Medicine'],['Medicine']


In [8]:
# Preview the first rows of df, transposed so each record is a column
df.head().transpose()

Unnamed: 0,0,1,2,3,4
paperId,5db4916130c2001927bbaccd5a3afb3872d8e6ac,765fc7d33627119d652113e6d065d047f8f6f9bf,fc975f2cf0473cdffeef8a8093f61e212f8edf6d,fc5e34c8d420cc464beb769eab57cf6baabb11af,a4d4027465ecdae907ed897d733609e0f49cd980
title,Authors’ 2015 additions to the IOC consensus s...,High Order Exponential Integrators for Nonline...,Classification of 4-dimensional homogeneous we...,Marginal and Irrelevant Disorder in Einstein-M...,Five-parameter class of solutions to the vacuu...
abstract,"In April 2014, the International Olympic Commi...",This article deals with the numerical integrat...,,A. M. G. thanks Hong Liu and Elias Kiritsis fo...,We present a new five-parameter class of Ricci...
year,2015,2015,2015,2015,2015
citationCount,59,31,15,13,11
authors,"[{'authorId': '3888843', 'name': 'M. Mountjoy'...","[{'authorId': '1869826', 'name': 'C. Besse'}, ...","[{'authorId': '1405423205', 'name': 'T. Arias-...","[{'authorId': '1397392420', 'name': 'Antonio M...","[{'authorId': '2144838481', 'name': 'Yu Chen'}..."


# Single Query - Using the Batch Endpoint

In [None]:
# NOTE(review): the section heading mentions the batch endpoint, but this is the
# same /paper/search URL assigned earlier — confirm which endpoint is intended.
api_url = 'https://api.semanticscholar.org/graph/v1/paper/search'


# The rest of the code is not required, but it provides an example of how to filter the data before saving

In [None]:
# # Fetching and filtering at the same time if required
# # Fetching Data
# length_filtered_df = 0
# length_unfiltered_df = 0

# for i in range(int(amount_of_data/100)):
#     offset = initial_offset + 100 * i
#     retries = 0
    
#     while retries < max_retries:
#         # Define query parameters
#         params = {
#             'query': research_field,
#             'year': year_of_publication,
#             'limit': 100,
#             'offset': offset,
#             'fields': 'title,authors,abstract,citationCount,year',
#         }

#         # Make the GET request
#         response = requests.get(api_url, params=params)

#         # Check if the request was successful
#         if response.status_code == 200:
#             data = json.loads(response.text)

#             # Create a DataFrame for unfiltered results
#             df_unfiltered = pd.DataFrame(data['data'])

#             # Filter the results based on 'citationCount' > 10
#             filtered_results = [paper for paper in data['data'] if paper.get('citationCount', 0) > 10]

#             # Create a DataFrame for filtered results
#             df_filtered = pd.DataFrame(filtered_results)


#             break  # Successful request, exit the retry loop
#         else:
#             print(f"Error (Attempt {retries + 1}):", response.status_code)
#             retries += 1
#             if retries < max_retries:
#                 print("Retrying after sleep...")
#                 time.sleep(sleep_duration)
    
#     # length_filtered_df = len(df_filtered)
#     length_unfiltered_df += len(df_unfiltered)

#     # Save each iteration into separate file - to keep some of the data in case of an error
#     df_unfiltered.to_csv('./data/'+research_field.replace(" ","_").lower() + "_" + str(offset) + '.csv', index=False)
#     df_filtered.to_csv('./data/df_filtered' + str(offset) + '.csv', index=False)

#     # Sleep for a specified duration between requests
#     print("Research field",research_field,"Iteration: ", i)
#     time.sleep(sleep_duration)
# print("Length of the filtered data",length_unfiltered_df)
# print("Length of the unfiltered data",length_filtered_df)

In [None]:
# https://api.semanticscholar.org/graph/v1/paper/search?query=computer+science&year=2015&fields=title,year,authors,citationCount&limit=50