# Semantic Scholar API
100 requests per 5 minutes
The API allows up to 100 requests per 5 minutes. 

To access a higher rate limit, complete the form to request authentication for your project.

The maximum `limit` for each request is 100, so every 5 minutes we can gather up to 100 * 100 = 10,000 results (papers, each with its list of authors)

Issue/drawback of Semantic Scholar: the search endpoint needs a search term, meaning the topic/industry/niche has to be chosen up front

In [41]:
import requests
import json
import pandas as pd
import time
import string
import json


In [49]:
# --- Query configuration -------------------------------------------------
amount_of_data = 100  # Total number of records wanted (not read by the fetch loop in this notebook)
start_date, end_date = 1900, 2023  # First and last publication year to query
journal = ["Science"]  # Venue filter; the fetch loop only reads journal[0]
# research_field = ['Computer Science'] # Research field(s) to query (as a list)
alphabet_letters = [*string.ascii_uppercase]  # 'A'..'Z'; each letter is sent as the search term
initial_offset = 0  # Offset of the first page requested for every query
number_of_publications_per_request = 100  # Page size ('limit') of a full page

# --- Gentle fetching -----------------------------------------------------
max_retries = 3  # Attempts per request before giving up on it
sleep_duration = 5  # Seconds to pause between requests and between retries

# --- API endpoint and running total --------------------------------------
api_url = 'https://api.semanticscholar.org/graph/v1/paper/search'
length_df = 0  # Accumulates the number of rows fetched across all pages

In [52]:
# Fetching Data
# For every year (newest first) and every letter A-Z used as the search term:
#   1. ask the API how many papers match (one small request),
#   2. page through the matches, `number_of_publications_per_request` at a time,
#   3. write each page to its own CSV so partial progress survives an error.
import os  # Local to this cell; only needed to create the output directory

request_timeout = 30  # Seconds; keeps a stalled connection from hanging the whole crawl
output_dir = './data/'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories


def _get_json_with_retries(params):
    """Return the decoded JSON body for a GET on `api_url`, or None on failure.

    Makes up to `max_retries` attempts. A non-200 status, a network error or
    an undecodable body counts as a failed attempt. Between attempts the
    function sleeps `sleep_duration` seconds; after the last failed attempt
    it sleeps five minutes before returning None.

    Args:
        params: Query-string parameters passed to `requests.get`.

    Returns:
        The parsed JSON (a dict for this endpoint), or None if every attempt
        failed. Callers must check for None instead of reusing older data.
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(api_url, params=params, timeout=request_timeout)
            if response.status_code == 200:
                return json.loads(response.text)
            failure = response.status_code
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a 200 response whose body is not valid JSON.
            failure = exc

        print(f"Error (Attempt {attempt}):", failure)
        if attempt < max_retries:
            print("Retrying after sleep...")
            time.sleep(sleep_duration)
        else:
            print("Sleeping for 5min")
            time.sleep(300)
    return None


for year in reversed(range(start_date, end_date + 1)):
    print("Year",year)
    for letter in alphabet_letters:
        # First request to get the number of total papers to limit the loop.
        # Only `total` is read from this response, so request a single cheap
        # field instead of embeddings, citations and references.
        count_params = {
            'query': letter,
            'venue': journal[0],
            'year': year,
            'limit': 1,
            'offset': 0,
            'fields': 'title',
        }
        data = _get_json_with_retries(count_params)
        if data is None:
            # Without a fresh total we must not fall back to the previous
            # letter's count (or crash on an undefined `data`).
            print("Skipping letter", letter, "for year", year, "- could not read the total")
            continue

        # Calculate the batches required to get every publication for that topic
        # NOTE(review): the relevance-search endpoint reportedly serves at most
        # 1,000 results per query; if `total` exceeds that, the later pages are
        # expected to fail and will be skipped - confirm against the API docs.
        num_batches, remainder = divmod(data.get('total', 0), number_of_publications_per_request)
        batches = [number_of_publications_per_request] * num_batches
        if remainder > 0:
            batches.append(remainder)

        print("For the letter: ", letter ," batches: ",batches)

        for i, batch in enumerate(batches):
            offset = initial_offset + number_of_publications_per_request * i  # Current page offset
            page_params = {
                'query': letter,
                'venue': journal[0],
                'year': year,
                'limit': batch,  # Number of results in this page
                'offset': offset,  # Offset for pagination
                'fields': 'title,authors,abstract,citationCount,year',
            }
            data = _get_json_with_retries(page_params)
            if data is None:
                # Do not count or save the previous page's DataFrame under
                # this page's file name; skip the page instead.
                print("Skipping year", year, "letter", letter, "offset", offset, "- request failed")
                continue

            # Create a DataFrame for unfiltered results
            df = pd.DataFrame(data.get('data') or [])
            length_df += len(df)  # Update the total length of unfiltered data

            # Save each iteration into a separate file - to keep some of the data in case of an error
            # The filename format is <year>_<journal>_<letter, lowercase>_<offset>.csv
            csv_name = f"{year}_{journal[0]}_{letter.replace(' ', '_').lower()}_{offset}.csv"
            df.to_csv(output_dir + csv_name, index=False)

            # Sleep for a specified duration between requests
            print("Letter", letter, "Iteration: ", i)
            time.sleep(sleep_duration)

# Print the total length of unfiltered data after all iterations
print("Length of the data", length_df)


Year 2023
For the letter:  A  batches:  [100, 91]
Letter A Iteration:  0
Letter A Iteration:  1
For the letter:  B  batches:  [100, 3]
Letter B Iteration:  0
Letter B Iteration:  1
For the letter:  C  batches:  [100, 77]
Letter C Iteration:  0
Letter C Iteration:  1
For the letter:  D  batches:  [100, 30]
Letter D Iteration:  0
Letter D Iteration:  1
For the letter:  E  batches:  [100, 36]
Letter E Iteration:  0
Letter E Iteration:  1
For the letter:  F  batches:  [76]
Letter F Iteration:  0
For the letter:  G  batches:  [82]
Letter G Iteration:  0
For the letter:  H  batches:  [100, 5]
Letter H Iteration:  0
Letter H Iteration:  1
For the letter:  I  batches:  [76]
Letter I Iteration:  0
For the letter:  J  batches:  [100, 98]
Letter J Iteration:  0
Letter J Iteration:  1
For the letter:  K  batches:  [92]
Letter K Iteration:  0
For the letter:  L  batches:  [100, 34]
Letter L Iteration:  0
Letter L Iteration:  1
For the letter:  M  batches:  [100, 100, 27]
Letter M Iteration:  0
Lett

KeyboardInterrupt: 

In [8]:
# Preview the first five rows of whatever `df` currently holds, transposed so
# that each record becomes a column and each field a row.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
paperId,5db4916130c2001927bbaccd5a3afb3872d8e6ac,765fc7d33627119d652113e6d065d047f8f6f9bf,fc975f2cf0473cdffeef8a8093f61e212f8edf6d,fc5e34c8d420cc464beb769eab57cf6baabb11af,a4d4027465ecdae907ed897d733609e0f49cd980
title,Authors’ 2015 additions to the IOC consensus s...,High Order Exponential Integrators for Nonline...,Classification of 4-dimensional homogeneous we...,Marginal and Irrelevant Disorder in Einstein-M...,Five-parameter class of solutions to the vacuu...
abstract,"In April 2014, the International Olympic Commi...",This article deals with the numerical integrat...,,A. M. G. thanks Hong Liu and Elias Kiritsis fo...,We present a new five-parameter class of Ricci...
year,2015,2015,2015,2015,2015
citationCount,59,31,15,13,11
authors,"[{'authorId': '3888843', 'name': 'M. Mountjoy'...","[{'authorId': '1869826', 'name': 'C. Besse'}, ...","[{'authorId': '1405423205', 'name': 'T. Arias-...","[{'authorId': '1397392420', 'name': 'Antonio M...","[{'authorId': '2144838481', 'name': 'Yu Chen'}..."


# Single Query - Using the Batch Endpoint

In [None]:
# Endpoint used for single queries.
# NOTE(review): the heading above mentions the batch endpoint, but this is the
# same paper-search URL assigned in the parameters cell - confirm which is intended.
api_url = "https://api.semanticscholar.org/graph/v1/paper/search"


# The rest of the code is not required, but it provides an example of how to filter the data before saving

In [None]:
# # Fetching and filtering at the same time if required
# # Fetching Data
# length_filtered_df = 0
# length_unfiltered_df = 0

# for i in range(int(amount_of_data/100)):
#     offset = initial_offset + 100 * i
#     retries = 0
    
#     while retries < max_retries:
#         # Define query parameters
#         params = {
#             'query': research_field,
#             'year': year_of_publication,
#             'limit': 100,
#             'offset': offset,
#             'fields': 'title,authors,abstract,citationCount,year',
#         }

#         # Make the GET request
#         response = requests.get(api_url, params=params)

#         # Check if the request was successful
#         if response.status_code == 200:
#             data = json.loads(response.text)

#             # Create a DataFrame for unfiltered results
#             df_unfiltered = pd.DataFrame(data['data'])

#             # Filter the results based on 'citationCount' > 10
#             filtered_results = [paper for paper in data['data'] if paper.get('citationCount', 0) > 10]

#             # Create a DataFrame for filtered results
#             df_filtered = pd.DataFrame(filtered_results)


#             break  # Successful request, exit the retry loop
#         else:
#             print(f"Error (Attempt {retries + 1}):", response.status_code)
#             retries += 1
#             if retries < max_retries:
#                 print("Retrying after sleep...")
#                 time.sleep(sleep_duration)
    
#     # length_filtered_df = len(df_filtered)
#     length_unfiltered_df += len(df_unfiltered)

#     # Save each iteration into separate file - to keep some of the data in case of an error
#     df_unfiltered.to_csv('./data/'+research_field.replace(" ","_").lower() + "_" + str(offset) + '.csv', index=False)
#     df_filtered.to_csv('./data/df_filtered' + str(offset) + '.csv', index=False)

#     # Sleep for a specified duration between requests
#     print("Research field",research_field,"Iteration: ", i)
#     time.sleep(sleep_duration)
# print("Length of the filtered data",length_filtered_df)
# print("Length of the unfiltered data",length_unfiltered_df)

In [None]:
# https://api.semanticscholar.org/graph/v1/paper/search?query=computer+science&year=2015&fields=title,year,authors,citationCount&limit=50