# Scraper for Semantic Scholar (SES) and DBLP
This is a scraping tool that uses the REST APIs of Semantic Scholar and DBLP to retrieve literature from both platforms.

## Setup
Use the following section to set up your literature search.

### Search Parameters
You can set up your search parameters in the following code segment:

In [None]:
# Platforms to query (True = search the platform, False = skip it)
search_semantic_scholar = True  # Semantic Scholar
search_dblp = True  # DBLP

# Publication years to include: from year_range_start to year_range_end.
# Set one of them to False to leave that side of the range open.
# If you use DBLP, specify both.
year_range_start = 2019
year_range_end = 2022

# Your Semantic Scholar API key
SSkey = "INSERT YOUR KEY HERE"

# Number of results to scrape from Semantic Scholar per keyword
max_ses = 300

# Fields returned for every Semantic Scholar paper.
# Set to False if only the default fields are required.
return_fieldes = [
    "paperId",
    "externalIds",
    "url",
    "title",
    "abstract",
    "venue",
    "publicationVenue",
    "year",
    "referenceCount",
    "citationCount",
    "influentialCitationCount",
    "isOpenAccess",
    "openAccessPdf",
    "fieldsOfStudy",
    "publicationTypes",
    "publicationDate",
    "journal",
    "citationStyles",
    "authors",
]

# Publication type to filter for, or False to not filter by publication type.
# Select from: Review, JournalArticle, CaseReport, ClinicalTrial, Dataset,
# Editorial, LettersAndComments, MetaAnalysis, News, Study, Book, BookSection
publication_type = False

# Fields of study to filter for, or False to not filter by field of study
fields_of_Study = [
    "Computer Science",
    "Medicine",
    "Chemistry",
    "Biology",
    "Materials Science",
    "Physics",
    "Geology",
    "Psychology",
    "Art",
    "History",
    "Geography",
    "Sociology",
    "Business",
    "Political Science",
    "Economics",
    "Philosophy",
    "Mathematics",
    "Engineering",
    "Environmental Science",
    "Agricultural and Food Sciences",
    "Education",
    "Law",
    "Linguistics",
]

# File names of the CSV exports
ses_filename = "semScho_publications_export.csv"
dblp_filename = "dblp_publications_export.csv"

### Preparing Keywords
Provide a keyword file named "keywords_ses.txt" in which each line contains one search query. The same file is used for both SES and DBLP. The scraper will transform each query into the required format.

In [None]:
def _read_keywords(path):
    """Return the search queries stored in the keyword file at *path*.

    Every line is one query. Queries are lower-cased and stripped of
    surrounding whitespace; blank lines are skipped so that they do not turn
    into empty search requests.
    """
    with open(path, "r") as file:
        stripped = (line.strip().lower() for line in file)
        return [keyword for keyword in stripped if keyword]


# load SES keywords
keywords_ses = _read_keywords("keywords_ses.txt")

# Load DBLP keywords
# DBLP reuses the SES keyword file here; pass "keywords_dblp.txt" instead to
# search DBLP with a separate keyword list.
keywords_dblp = _read_keywords("keywords_ses.txt")


## Set Up and Methods

In [None]:
# preparation of SES parameters

# The search URL takes the fields of study as one comma-separated string.
# The isinstance guard keeps this cell safe to re-run: joining the already
# joined string again would split it into single characters.
if fields_of_Study and not isinstance(fields_of_Study, str):
    fields_of_Study = ",".join([a.replace(" ", "+") for a in fields_of_Study])

# Two values are derived from the configured year bounds:
#   year       -> value of the Semantic Scholar "year" URL parameter
#                 ("2019-2022", "2019", "2019-", "-2022"), or False for no filter
#   year_range -> years accepted by the DBLP year filter (tested with `in`)
# year_range is defined in every case so the DBLP search also works with an
# open-ended range; an open side is bounded by 0 / _OPEN_END_YEAR.
_OPEN_END_YEAR = 9999

if year_range_start and year_range_end:
    if year_range_end == year_range_start:
        year = str(year_range_start)
    else:
        year = str(year_range_start)+"-"+str(year_range_end)
    year_range = list(range(year_range_start, year_range_end+1))
elif year_range_start:
    # only a lower bound: everything from that year onwards
    year = str(year_range_start)+"-"
    year_range = range(year_range_start, _OPEN_END_YEAR+1)
elif year_range_end:
    # only an upper bound: everything up to and including that year
    year = "-"+str(year_range_end)
    year_range = range(0, year_range_end+1)
else:
    # no bounds at all: do not filter by year
    year = False
    year_range = range(0, _OPEN_END_YEAR+1)

### Import of modules and definition of methods

In [None]:
import json
import time
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from IPython.display import display, clear_output
from tqdm import tqdm

In [None]:
# generate SES url based on keyword input
def create_url_from_keyw(keyword):
    """Build the Semantic Scholar paper-search URL for one keyword query.

    The keyword is percent-encoded (spaces become "+", a "+" typed by the
    user is kept), so characters such as "&", "#" or "=" inside a query can
    no longer break or truncate the URL.

    Besides the keyword, the URL is assembled from the notebook settings:
    ``SSkey`` is appended as the ``x-api-key`` parameter, and ``year``,
    ``return_fieldes``, ``fields_of_Study`` and ``publication_type`` each add
    a filter parameter unless they are falsy. ``fields_of_Study`` is expected
    to be the already comma-joined string here.

    Args:
        keyword: the search query as written in the keyword file.

    Returns:
        The request URL for the first result page (``offset=0``,
        ``limit=100``).
    """
    pre = "https://api.semanticscholar.org/graph/v1/paper/search?"
    q = "&query="+quote_plus(keyword, safe="+")
    post = "&offset=0&limit=100&x-api-key="+SSkey+"&sort=relevance"

    # every optional filter contributes an empty string when it is disabled
    y = "&year="+year if year else ""
    ret = "&fields="+",".join(return_fieldes) if return_fieldes else ""
    fos = "&fieldsOfStudy="+fields_of_Study if fields_of_Study else ""
    p_type = "&publicationTypes="+publication_type if publication_type else ""

    return pre+q+y+post+ret+fos+p_type

# generate dblp url based on keyword input
def create_url_from_keyw_dblp(keyword):
    """Build the DBLP publication-search URL for one keyword query.

    The keyword is percent-encoded (spaces become "+", a "+" typed by the
    user is kept), so characters such as "&" or "#" inside a query can no
    longer break or truncate the URL.

    Args:
        keyword: the search query as written in the keyword file.

    Returns:
        The request URL, asking for up to 1000 hits in JSON format.
    """
    # NOTE: mirror hosts such as dblp2.uni-trier.de may serve the same API
    # path if dblp.org is unreachable -- verify before switching.
    pre = "https://dblp.org/search/publ/api?"
    q = "q="+quote_plus(keyword, safe="+")
    cap = "&h=1000&format=json"
    return pre+q+cap

# SES scrape method
def create_database_ses(urllist, keyw, results_per_keyw=100):
    """Download Semantic Scholar search results page by page for each keyword.

    Args:
        urllist: search URLs, one per keyword. Each URL is expected to contain
            "offset=0", which is rewritten to request the later pages.
        keyw: keywords belonging to the URLs, in the same order. They are the
            keys of the result and are shown in the progress output.
        results_per_keyw: number of results to request per keyword. Pages are
            requested at offsets 0, 100, 200, ... as long as the offset is
            below this value; the first page is always requested.

    Returns:
        A tuple ``(exp, debug)``. ``exp`` maps each keyword to the list of
        paper records collected for it (every record once). ``debug`` holds
        each parsed response that contained no "data" entry.
    """
    exp = {}
    debug = []
    for i, url in enumerate(urllist):
        print(i+1,"/",len(keyw),":", keyw[i])
        res = []
        offset = 0

        while True:
            page_url = url.replace("offset=0", "offset="+str(offset))
            # parse the response once and reuse the parsed object
            res_json = json.loads(requests.get(page_url).text)

            if isinstance(res_json, dict) and "data" in res_json:
                res.extend(res_json["data"])
            else:
                print("there was a problem retrieving or saving publications.")
                debug.append(res_json)

            label = "entries" if offset == 0 else "publications"
            print(str(len(res))+" "+label+" retrieved", end="")
            print("", end="\r")

            # pause between requests
            time.sleep(1)

            offset += 100
            if offset >= results_per_keyw:
                break

        # Store the collected records once per keyword. Appending the growing
        # list after every page would store the earlier pages repeatedly.
        exp[keyw[i]] = res

        clear_output()

    return exp, debug


# for matching papers to search terms by ID
def check_id(paper_id, data_dic, criter):
    """List the search terms whose results contain the given paper.

    Args:
        paper_id: identifier value to look for.
        data_dic: dict mapping each search term to its list of paper records.
        criter: key of the paper record that holds the identifier.

    Returns:
        The matching search terms, each once, in the order of ``data_dic``.
    """
    matching_terms = []
    for term, papers in data_dic.items():
        hits = [entry for entry in papers if entry[criter] == paper_id]
        if hits:
            matching_terms.append(term)
    return matching_terms

# creates string out of dict of authors
def get_authors(dic):
    """Join the author names of one Semantic Scholar paper into a string.

    Args:
        dic: iterable of author dicts, each expected to carry a "name" key.
            None or a float (the NaN that marks a missing cell) is treated as
            "no authors", in line with get_authors_dblp.

    Returns:
        The names separated by ", ". Authors without a name are skipped; the
        result is "" when there is no named author.
    """
    if dic is None or isinstance(dic, float):
        return ""
    names = (author.get("name") for author in dic)
    return ", ".join(name for name in names if name)

# creates string for dblp return format
def get_authors_dblp(dic):
    """Join the author names of one DBLP entry into a string.

    Args:
        dic: the "authors" value of a DBLP entry: a dict whose "author" item
            is either one author dict or a list of author dicts, each with a
            "text" key holding the name. None or a float (the NaN that marks
            a missing cell) is treated as "no authors".

    Returns:
        The names separated by ", ", or "" when there are no authors.
    """
    if dic is None or isinstance(dic, float):
        return ""
    authors = dic.get("author")
    if not authors:
        return ""
    # a single author arrives as a plain dict instead of a list
    if isinstance(authors, dict):
        authors = [authors]
    return ", ".join(author["text"] for author in authors)

# scrape dblp for given years
# provide years as list of ints
# keyw is list of keywords that was used to generate urls and is only used for output and structuring
def get_dblp_data_years(urls, keyw, years):
    """Download the DBLP hits for every URL and keep those in the given years.

    Args:
        urls: DBLP search URLs, one per keyword.
        keyw: keywords belonging to the URLs, in the same order; used for the
            progress output and, with spaces replaced by "_", as result keys.
        years: container of ints; an entry is kept when its year is in it.

    Returns:
        A dict mapping each keyword (spaces replaced by "_") to the list of
        "info" records of its hits that fall into ``years``. A keyword
        without hits maps to an empty list.
    """
    exp = {}
    print("retrieving literature from year range:", years)
    time.sleep(3)

    for i, url in enumerate(urls):
        print(i+1,"/",len(keyw),":", keyw[i])

        res_json = json.loads(requests.get(url).text)
        hits = res_json["result"]["hits"]
        total = int(hits["@total"])

        # retrieve results
        print(total,"entries found in total for \"",keyw[i],"\"")
        # Start from an empty list for every keyword: a search without hits
        # must not fail or reuse the hits of the previous keyword.
        finds = hits.get("hit", []) if total > 0 else []

        # filter by year range; entries without a year cannot match
        clean_results = []
        for hit in finds:
            info = hit["info"]
            pub_year = info.get("year")
            if pub_year is not None and int(pub_year) in years:
                clean_results.append(info)

        exp[keyw[i].replace(" ", "_")] = clean_results
        print("entries in year range >>",len(clean_results))
        time.sleep(3)
        clear_output()

    return exp

In [None]:
# build the REST API request URLs, one per keyword, for each enabled platform
if search_semantic_scholar:
    urls = list(map(create_url_from_keyw, keywords_ses))
if search_dblp:
    urls_dblp = list(map(create_url_from_keyw_dblp, keywords_dblp))

## Scrape Literature
The following section performs the actual search.

In [None]:
# run the Semantic Scholar search (only when it is enabled in the settings)

if search_semantic_scholar:
    # raw_data: results per keyword, raw_bugreport: responses without data
    raw_data, raw_bugreport = create_database_ses(
        urllist=urls,
        keyw=keywords_ses,
        results_per_keyw=max_ses,
    )

In [None]:
# run the DBLP search (only when it is enabled in the settings)

if search_dblp:
    # raw_data_dblp: results per keyword, restricted to the years in year_range
    raw_data_dblp = get_dblp_data_years(
        urls=urls_dblp,
        keyw=keywords_dblp,
        years=year_range,
    )

## Clean results and prepare output

### Convert to dataframes

In [None]:
# SES
if search_semantic_scholar:
    # one DataFrame per search query, keyed by the query
    dFrames_ses = {
        query: pd.DataFrame(records) for query, records in raw_data.items()
    }

In [None]:
# DBLP
if search_dblp:
    # one DataFrame per search query, keyed by the query
    dFrames_dblp = {
        query: pd.DataFrame(records) for query, records in raw_data_dblp.items()
    }

### Remove duplicates from scrapes for different keywords

In [None]:
# Semantic Scholar
if search_semantic_scholar:
    # stack the per-query tables into a single one
    df_ses = pd.concat(list(dFrames_ses.values()))

    # keep only the first row for every paperId
    df_no_dupes_ses = df_ses.drop_duplicates(subset=["paperId"], keep="first")

In [None]:
# DBLP
if search_dblp:
    # join results from all searches
    df_dblp = pd.concat(dFrames_dblp.values())

    # filter duplicates
    df_no_dupes_dblp = df_dblp.drop_duplicates(subset="url")

    # Table used for the DBLP export. No date filter is needed here because
    # the entries were already restricted by year while scraping.
    # An explicit copy is taken so that the columns added to the export table
    # afterwards go into an independent frame instead of a selection of
    # df_dblp (this avoids pandas' SettingWithCopyWarning).
    data_export_dblp = df_no_dupes_dblp.copy()

### Convert authors to exportable Strings

In [None]:
# SES
if search_semantic_scholar:
    # Store the year as an integer. The nullable "Int64" type is used so that
    # papers without a publication year are kept as missing values instead of
    # making the integer conversion fail.
    data_export_cs = df_no_dupes_ses.astype({'year': 'Int64'})

    # add tags for search queries: every query whose results contain the paper
    data_export_cs["searchQuery"] = data_export_cs["paperId"].apply(
        lambda paper_id: check_id(paper_id, raw_data, "paperId")
    )

    # add clear authors column: author names as one comma-separated string
    data_export_cs["authors_clear"] = data_export_cs["authors"].apply(get_authors)

In [None]:
# DBLP
if search_dblp:
    # tag every entry with the search queries whose results contain its url
    data_export_dblp["searchQuery"] = data_export_dblp.apply(
        lambda entry: check_id(entry["url"], raw_data_dblp, "url"),
        axis="columns",
    )

    # author names as one readable string per entry
    data_export_dblp["authors_clear"] = data_export_dblp.apply(
        lambda entry: get_authors_dblp(entry["authors"]),
        axis="columns",
    )

In [None]:
# Write the prepared tables of the enabled platforms to their CSV files
if search_semantic_scholar:
    data_export_cs.to_csv(path_or_buf=ses_filename)

if search_dblp:
    data_export_dblp.to_csv(path_or_buf=dblp_filename)