# Semantic Scholar API
100 requests per 5 minutes
The API allows up to 100 requests per 5 minutes. 

To access a higher rate limit, complete the form to request authentication for your project.

The maximum `limit` for each request is 100, so every 5 minutes we can gather up to 100 × 100 = 10,000 people.

Issue/drawback of Semantic Scholar: it needs a search term, meaning the topic/industry/niche has to be chosen up front.

In [2]:
import requests
import json
import pandas as pd
import time

In [3]:
# --- What to query ---
api_url = 'https://api.semanticscholar.org/graph/v1/paper/search'  # Search endpoint
research_field = ['computer science']  # Search term(s), kept as a list
year_of_publication = 2015  # Publication year to query
amount_of_data = 1000  # Total number of records to retrieve
initial_offset = 0  # Offset at which fetching starts

# --- How gently to fetch ---
sleep_duration = 30  # Seconds to sleep between requests
max_retries = 3  # Maximum number of attempts for a failing request

# --- Bookkeeping ---
length_df = 0  # Running total of rows fetched

In [None]:
# Fetching Data
#
# Pages through the search endpoint `page_size` records at a time and writes
# every successfully fetched page to its own CSV file, so data that was already
# retrieved is kept if a later request fails.

import math
import os

page_size = 100  # Maximum number of results the API returns per request
request_timeout = 60  # Seconds before a stalled request is abandoned and retried
output_dir = './data'
os.makedirs(output_dir, exist_ok=True)  # DataFrame.to_csv does not create missing directories

# Filename prefix: the research field with spaces replaced by underscores, lowercased
field_slug = research_field[0].replace(" ", "_").lower()

length_df = 0  # Reset here so re-running this cell does not double count

# Round up so a remainder smaller than one page is still fetched
for i in range(math.ceil(amount_of_data / page_size)):
    offset = initial_offset + page_size * i  # Calculate the current offset

    # Define query parameters (identical for every attempt at this page)
    params = {
        'query': research_field[0],  # Use the first research field from the list
        'year': year_of_publication,
        # The final page only asks for whatever is still missing
        'limit': int(min(page_size, amount_of_data - page_size * i)),
        'offset': offset,  # Offset for pagination
        'fields': 'title,authors,abstract,citationCount,year',
    }

    # Stays None when every attempt fails, so a stale DataFrame from the
    # previous page can never be counted or saved under this page's offset.
    df = None

    for attempt in range(1, max_retries + 1):
        try:
            # Make the GET request
            response = requests.get(api_url, params=params, timeout=request_timeout)
        except requests.RequestException as exc:
            # Connection problems and timeouts are retried like HTTP errors
            print(f"Error (Attempt {attempt}):", exc)
        else:
            if response.status_code == 200:
                data = response.json()

                # Create a DataFrame for unfiltered results
                df = pd.DataFrame(data['data'])
                break  # Successful request, exit the retry loop

            print(f"Error (Attempt {attempt}):", response.status_code)

        if attempt < max_retries:
            print("Retrying after sleep...")
            time.sleep(sleep_duration)

    if df is None:
        print("Skipping offset", offset, "- all", max_retries, "attempts failed")
    else:
        length_df += len(df)  # Update the total length of unfiltered data

        # Save each iteration into a separate file - to keep some of the data in case of an error
        df.to_csv(os.path.join(output_dir, f"{field_slug}_{offset}.csv"), index=False)

    # Sleep for a specified duration between requests
    print("Research field", research_field[0], "Iteration: ", i)
    time.sleep(sleep_duration)

# Print the total length of unfiltered data after all iterations
print("Length of the unfiltered data", length_df)


# The rest of the code is not required, but it provides an example of how to filter the data before saving

In [None]:
# # Fetching and filtering at the same time if required
# # Fetching Data
# length_filtered_df = 0
# length_unfiltered_df = 0

# for i in range(int(amount_of_data/100)):
#     offset = initial_offset + 100 * i
#     retries = 0
    
#     while retries < max_retries:
#         # Define query parameters
#         params = {
#             'query': research_field,
#             'year': year_of_publication,
#             'limit': 100,
#             'offset': offset,
#             'fields': 'title,authors,abstract,citationCount,year',
#         }

#         # Make the GET request
#         response = requests.get(api_url, params=params)

#         # Check if the request was successful
#         if response.status_code == 200:
#             data = json.loads(response.text)

#             # Create a DataFrame for unfiltered results
#             df_unfiltered = pd.DataFrame(data['data'])

#             # Filter the results based on 'citationCount' > 10
#             filtered_results = [paper for paper in data['data'] if paper.get('citationCount', 0) > 10]

#             # Create a DataFrame for filtered results
#             df_filtered = pd.DataFrame(filtered_results)


#             break  # Successful request, exit the retry loop
#         else:
#             print(f"Error (Attempt {retries + 1}):", response.status_code)
#             retries += 1
#             if retries < max_retries:
#                 print("Retrying after sleep...")
#                 time.sleep(sleep_duration)
    
#     length_filtered_df += len(df_filtered)
#     length_unfiltered_df += len(df_unfiltered)

#     # Save each iteration into separate file - to keep some of the data in case of an error
#     df_unfiltered.to_csv('./data/'+research_field[0].replace(" ","_").lower() + "_" + str(offset) + '.csv', index=False)
#     df_filtered.to_csv('./data/df_filtered' + str(offset) + '.csv', index=False)

#     # Sleep for a specified duration between requests
#     print("Research field",research_field,"Iteration: ", i)
#     time.sleep(sleep_duration)
# print("Length of the filtered data",length_filtered_df)
# print("Length of the unfiltered data",length_unfiltered_df)