# Scraper for Semantic Scholar (SES)
This is a scraping tool that uses the REST API of Semantic Scholar to retrieve literature from the platform.

In [None]:
import json
import time
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

### Search Parameters

In [None]:
# Only literature published between year_range_start and year_range_end
# (both inclusive bounds of the "start-end" filter string) is searched.
year_range_start = 1950
year_range_end = 2024

# Year filter in the "start-end" form that is appended to the search URL.
year = f"{year_range_start}-{year_range_end}"

# Paper fields requested from the API for every search hit.
return_fields = [
    "paperId",
    "externalIds",
    "url",
    "title",
    "abstract",
    "venue",
    "publicationVenue",
    "year",
    "referenceCount",
    "citationCount",
    "influentialCitationCount",
    "isOpenAccess",
    "openAccessPdf",
    "fieldsOfStudy",
    "publicationTypes",
    "publicationDate",
    "journal",
    "citationStyles",
    "authors",
]

# Keywords for the search: one keyword (or phrase) per line of the file.
with open("keywords.txt", "r", encoding="utf-8") as file:  # change name to the keywords_yourname
    # Strip surrounding whitespace and skip blank lines so that an empty
    # query is never sent to the API.
    keywords = [line.strip().lower() for line in file if line.strip()]

# Number of requests per keyword (each request fetches one page of results)
requests_per_keyword = 10

### Changing offset

In [None]:
# Function to create URL from keyword

def create_url_from_keyword(keyword, offset=0, year_range=None, fields=None):
    """Build the Semantic Scholar paper-search URL for one page of results.

    Args:
        keyword: Search phrase. It is percent-encoded (spaces become ``+``),
            so characters such as ``&``, ``#`` or ``+`` are sent literally
            instead of corrupting the query string.
        offset: Index of the first result to return; each page holds 100
            results.
        year_range: Publication-year filter such as ``"1950-2024"``. ``None``
            (the default) uses the module-level ``year``; an empty string
            omits the filter.
        fields: Names of the paper fields to return. ``None`` (the default)
            uses the module-level ``return_fields``; an empty list omits the
            ``fields`` parameter.

    Returns:
        The complete request URL as a string.
    """
    if year_range is None:
        year_range = year
    if fields is None:
        fields = return_fields

    params = [f"query={quote_plus(keyword)}"]
    if year_range:
        params.append(f"year={year_range}")
    params.append(f"offset={offset}")
    params.append("limit=100")
    params.append("sort=relevance")
    if fields:
        params.append("fields=" + ",".join(fields))

    base = "https://api.semanticscholar.org/graph/v1/paper/search?"
    return base + "&".join(params)

# Function to send a single GET request. It does no rate limiting or retrying
# itself; callers are expected to handle throttling responses.
def send_request(url, timeout=30):
    """Send a GET request to ``url`` and return the response object.

    Args:
        url: Fully built request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scraper forever.

    Returns:
        The ``requests.Response`` returned by ``requests.get``.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    return requests.get(url, timeout=timeout)

### Parse Literature

In [None]:
results = []             # collected papers, one dict per unique paper
seen_paper_ids = set()   # paperIds already stored in `results`
sleep_interval = 5       # seconds to wait before retrying a request
retry_status_codes = (429, 504)  # responses that are retried after a pause


def _fetch_page(keyword, offset):
    """Fetch one page of search results for ``keyword``.

    HTTP 429/504 responses, network errors and unparsable response bodies are
    retried every ``sleep_interval`` seconds until the request succeeds.

    Returns:
        The decoded JSON payload on success, or ``None`` when the API answers
        with any other error status (the error is printed).
    """
    url = create_url_from_keyword(keyword, offset)

    while True:
        try:
            response = send_request(url)
            if response.status_code == 200:
                return response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # Only transport and JSON-decoding problems are retried; any other
            # exception is a programming error and must surface instead of
            # looping forever.
            print(f"An error occurred: {e}")
            time.sleep(sleep_interval)
            continue

        if response.status_code in retry_status_codes:
            time.sleep(sleep_interval)
            continue

        print(f"Request for keyword '{keyword}' failed with status code {response.status_code}: {response.text}")
        return None


for keyword in tqdm(keywords, desc="Keywords Progress"):
    offset = 0

    for _ in range(requests_per_keyword):
        payload = _fetch_page(keyword, offset)
        offset += 100

        if payload is None:
            # Non-retryable error: continue with the next request.
            continue

        # A response without a "data" entry is treated as an empty page.
        papers = payload.get("data") or []
        for paper in papers:
            paper_id = paper["paperId"]
            # Only add the paper if it hasn't been seen before
            if paper_id not in seen_paper_ids:
                # Remember which keyword found the paper first
                paper["keyword"] = keyword
                results.append(paper)
                seen_paper_ids.add(paper_id)

        # The API omits "next" on the last page; stop paging for this keyword
        # instead of requesting offsets that cannot hold further results.
        if not papers or "next" not in payload:
            break

### Clean results and prepare output

In [None]:
# Number of papers collected; `results` holds each paperId at most once
len(results)

In [None]:
# Tabulate the collected papers: one row per entry of `results`
df = pd.DataFrame(results)

In [None]:
# creates a comma-separated string out of a list of author dicts
def get_authors(dic):
    """Return the author names joined by ``", "``.

    Args:
        dic: Iterable of author mappings, each expected to carry a ``"name"``
            entry. ``None`` or a float (pandas uses NaN for a missing value)
            is treated as "no authors".

    Returns:
        The names in their original order separated by ``", "``; an empty
        string when there are no authors. Authors without a usable name are
        skipped instead of raising.
    """
    if dic is None or isinstance(dic, float):
        return ""

    names = (author.get("name") for author in dic if author)
    return ", ".join(name for name in names if name)

In [None]:
# Add an "authors_clear" column holding the readable author string of each row.
def _authors_of_row(row):
    """Return the joined author names for one DataFrame row."""
    return get_authors(row["authors"])


df["authors_clear"] = df.apply(_authors_of_row, axis=1)