In [None]:
import requests
import urllib.request
import glob
import json
import os
import shutil
import sys
import time
import random
from semanticscholar import SemanticScholar
from tqdm import tqdm

#### Functions/Utilities

In [None]:
# Serialise an object to "<file_out_json>.json"
def write_to_json(file_in_any, file_out_json):
    """Dump ``file_in_any`` as JSON into the file named ``file_out_json`` + ".json".

    The target is opened in write mode, so an existing file of that name is
    overwritten.
    """
    target_path = file_out_json + ".json"
    with open(target_path, mode="w") as sink:
        json.dump(file_in_any, fp=sink)
        
# Copy the files matching "*.*" from one directory into another
def copy_file_to_dir(source, destination):
    """Copy every entry of ``source`` that matches the ``*.*`` glob into ``destination``.

    Only the top level of ``source`` is scanned; the glob is not recursive.
    """
    pattern = os.path.join(source, '*.*')
    matches = glob.glob(pattern)
    for path in matches:
        shutil.copy(path, destination)

# Hit the Semantic Scholar API and retrieve up to 10,000 papers via search terms. 
def get_data(query: str, offset: int, limit: int, NumResults: int) -> list:
    '''
    Page through the Semantic Scholar paper-search endpoint and collect the results.

    There is a hard limit of 10,000 papers per query. The sum of 'offset' and 'limit' must be < 10,000. 
    query: the string to be searched on Semantic Scholar. It is placed into the URL as-is, so it
           must already be URL-encoded (e.g. "WNT+signalling").
    offset: What result/paper to start retrieval from the query results
    limit: The length of results to retrieve at one time. Also functions as retrieval increments. Max 100.
    NumResults: the last offset to request; paging stops once 'offset' exceeds it.

    Returns a list holding the "data" entries of every page that was fetched successfully.
    Pages that fail with anything other than HTTP 403 are reported and skipped.
    '''

    data = []

    while offset <= NumResults:
        url = f"https://api.semanticscholar.org/graph/v1/paper/search?query={query}&fields=authors&offset={offset}&limit={limit}"

        try:
            # One request per page: the body of this response is parsed directly
            # instead of downloading the same URL a second time.
            r = requests.get(url, timeout=15)
            r.raise_for_status()
            page = r.json().get("data") or []

        except requests.exceptions.HTTPError as errh:
            print ("Http Error:", errh)
            if errh.response.status_code == 403:
                print("Timed out at:", offset)
                time.sleep(330) # to work around 403 errors
                # Retry the same offset after the back-off.
                # NOTE(review): a permanent 403 would retry forever - consider a cap.
                continue

        except requests.exceptions.ConnectionError as errc:
            print ("Error Connecting:", errc)
            print("failed at:", offset)

        except requests.exceptions.Timeout as errt:
            print ("Timeout Error:", errt)
            print("failed at:", offset)

        except requests.exceptions.RequestException as err:
            print ("Oops: Something Else", err)
            print("failed at:", offset)

        else:
            time.sleep(3.3) # to work around the rate limit
            data.extend(page)

        offset += limit

    return data

#### Global Variables

In [None]:
offset = 0         # first search result to request
limit = 100        # page size of each search request
NumResults = 9900  # last offset requested by get_data
# The Semantic Scholar API key is a credential and must not be committed with
# the notebook, so it is read from the S2_API_KEY environment variable
# (None when the variable is unset - presumably the client then runs
# unauthenticated; confirm). The key that used to be hard-coded here has been
# exposed and should be rotated.
s2_api_key = os.environ.get("S2_API_KEY")


#### Get PaperIDs and AuthorIDs for 13 queries about WNT Signalling

In [None]:
# Many of these queries match hundreds of thousands of papers, but only the first 10,000 results
# are available due to a built-in limit in AWS ElasticSearch. A workaround could probably be built,
# but this is a sufficient sample of paper and author IDs. From it, the network of paper IDs by
# author IDs can be built out at a much faster rate with an API token, because this search step
# is extremely slow.
# NOTE: data_WNT3SignallingGSK holds the results of the plain "WNT3+signalling" query.

# Every query uses the same paging window.
_paging = (offset, limit, NumResults)

data_WNT3SignallingGSK = get_data("WNT3+signalling", *_paging)
data_WNT5aSignalling = get_data("WNT5a+signalling", *_paging)
data_WNTSignallingVangl2 = get_data("WNT+signalling+Vangl2", *_paging)
data_WNTSignallingTCF = get_data("WNT+signalling+TCF", *_paging)
data_WNTSignallingLef = get_data("WNT+signalling+Lef", *_paging)
data_WNTSignallingPCP = get_data("WNT+signalling+PCP", *_paging)
data_WNTSignallingAxin = get_data("WNT+signalling+Axin", *_paging)
data_WNTSignallingCancer = get_data("WNT+signalling+Cancer", *_paging)
data_WNTSignallingStemCells = get_data("WNT+signalling+Stem+Cells", *_paging)
data_WNTSignallingFrizzled = get_data("WNT+signalling+Frizzled", *_paging)
data_WNTSignallingGSK = get_data("WNT+signalling+GSK", *_paging)
data_WNTSignallingBetaCatenin = get_data("WNT+signalling+Beta+Catenin", *_paging)
data_WNTSignalling = get_data("WNT+signalling", *_paging)

#### Inspecting our datasets

In [None]:
# Print how many records each query returned, followed by a running label (1-13).
_datasets_in_print_order = [
    data_WNTSignallingVangl2,
    data_WNT3SignallingGSK,
    data_WNT5aSignalling,
    data_WNTSignallingTCF,
    data_WNTSignallingLef,
    data_WNTSignallingPCP,
    data_WNTSignallingAxin,
    data_WNTSignallingCancer,
    data_WNTSignallingStemCells,
    data_WNTSignallingFrizzled,
    data_WNTSignallingGSK,
    data_WNTSignallingBetaCatenin,
    data_WNTSignalling,
]
for _label, _records in enumerate(_datasets_in_print_order, start=1):
    print(len(_records), _label)

#### Create initial json dataset of paper and author IDs

In [None]:
# Concatenate the per-query result lists (duplicates are kept at this stage),
# report the combined length, and save it as paper_and_authorIDs.json.
data = []
for _batch in (
    data_WNTSignalling,
    data_WNTSignallingBetaCatenin,
    data_WNTSignallingPCP,
    data_WNTSignallingAxin,
    data_WNTSignallingCancer,
    data_WNTSignallingStemCells,
    data_WNTSignallingFrizzled,
    data_WNTSignallingGSK,
    data_WNTSignallingTCF,
    data_WNTSignallingLef,
    data_WNTSignallingVangl2,
    data_WNT5aSignalling,
    data_WNT3SignallingGSK,
):
    data.extend(_batch)
print(len(data))
write_to_json(data, "paper_and_authorIDs")

#### Retrieve unique Paper IDs from Initial Dataset, inspect total unique, and write to json

In [None]:
# Collect the "paperId" of every record in the initial dataset, de-duplicate, and save.
paperIDs = []
with open("paper_and_authorIDs.json", 'r', encoding='utf-8') as papers:
    papers_data = json.load(papers)

for paper in tqdm(papers_data, total=len(papers_data)):
    try:
        paper_id = paper["paperId"]
    except (KeyError, TypeError) as err:
        # Record without a usable "paperId": report the error type and move on.
        print(type(err))
        continue
    if paper_id:  # skip null/empty IDs
        paperIDs.append(paper_id)

pID = set(paperIDs)
# dict.fromkeys de-duplicates while keeping first-seen order, so the saved file
# is the same from run to run (the iteration order of a set of strings is not).
paperIDs_final = list(dict.fromkeys(paperIDs))
print(len(paperIDs_final))

write_to_json(paperIDs_final, "paperIDs")

#### Retrieve unique Author IDs from Initial Dataset, inspect total unique, and write to json

In [None]:
# Collect the "authorId" of every author listed on every record in the initial
# dataset, de-duplicate, and save.
authorIDs = []
with open("paper_and_authorIDs.json", 'r', encoding='utf-8') as papers:
    papers_data = json.load(papers)

for paper in tqdm(papers_data, total=len(papers_data)):
    try:
        for author in paper["authors"]:
            try:
                author_id = author["authorId"]
            except (KeyError, TypeError) as err:
                # Author entry without a usable "authorId": report and move on.
                print(type(err))
                continue
            if author_id:  # skip null/empty IDs
                # Append to authorIDs (this used to append to paperIDs, which
                # left authorIDs empty and authorIDs.json written as []).
                authorIDs.append(author_id)
    except (KeyError, TypeError) as err:
        # Record without an iterable "authors" field.
        print(type(err))

aID = set(authorIDs)
# dict.fromkeys de-duplicates while keeping first-seen order, so the saved file
# (and any seeded sample later drawn from it) is the same from run to run.
authorIDs_final = list(dict.fromkeys(authorIDs))
print(len(authorIDs_final))

write_to_json(authorIDs_final, "authorIDs")

#### Create list variables of unique AuthorIDs and PaperIDs, from previous jsons, used for API requests (instead of re-running all previous cells)

In [None]:
# Reload the de-duplicated ID lists from their JSON files so the API requests
# below can run without re-running the cells above.
with open("authorIDs.json", mode='r', encoding='utf-8') as author_json:
    authorIDs = json.loads(author_json.read())

print("unique authorIDs:", len(authorIDs))

with open("paperIDs.json", mode='r', encoding='utf-8') as paper_json:
    paperIDs = json.loads(paper_json.read())

print("unique paperIDs:", len(paperIDs))

#### Retrieve all information for each PaperID

In [None]:
# this took 12hrs to complete with API key

# Fetch the full record of every paper ID, checkpointing to papers.json whenever
# a request fails, and save the complete list to papers_corpus.json at the end.
sch = SemanticScholar(api_key=s2_api_key)
papers_all_info = []

for index, paper_id in enumerate(paperIDs):
    try:
        papers_all_info.append(sch.paper(paper_id))
    except Exception:
        # Best-effort: save what has been fetched so far, back off, then move on
        # to the next ID (the failed one is not retried). Catching Exception
        # rather than everything keeps Ctrl-C able to stop the run.
        print(sys.exc_info()[0], "writing to json, sleep 60 sec, continue", index)
        write_to_json(papers_all_info, "papers")
        time.sleep(60) # to work around 403 errors
    else:
        if index % 100 == 0:  # progress message every 100 papers
            print(f'paper info retrieved: {index}')

write_to_json(papers_all_info, "papers_corpus")
print("completed")

#### Retrieve all author information by AuthorID

In [None]:
# this took 36hrs to complete with API key

# Fetch the full record of every author ID, checkpointing to authors1.json
# whenever a request fails, and save the complete list at the end.
sch = SemanticScholar(api_key=s2_api_key)
authors_all_info = []

for index, author_id in enumerate(authorIDs):
    try:
        authors_all_info.append(sch.author(author_id))
    except Exception:
        # Best-effort: save what has been fetched so far, back off, then move on
        # to the next ID (the failed one is not retried). Catching Exception
        # rather than everything keeps Ctrl-C able to stop the run.
        print(sys.exc_info()[0], "writing to json, sleep 30 sec, continue", index)
        write_to_json(authors_all_info, "authors1")
        time.sleep(30) # to work around 403 errors
    else:
        if index % 100 == 0:  # progress message every 100 authors
            print(f'author info retrieved: {index}')

# Written as authors_corpus.json, the name the later cells open (this used to
# write author_corpus.json, which none of them read).
write_to_json(authors_all_info, "authors_corpus")
print("completed")

#### Inspect how many unique results there are for full paper and author information (should be the same as, or close to, the authorIDs and paperIDs lists above)

In [None]:
# Load both full corpora and report how many records each one holds.
with open("papers_corpus.json", mode='r', encoding='utf-8') as paper_json:
    papers = json.loads(paper_json.read())

print("unique papers:", len(papers))

with open("authors_corpus.json", mode='r', encoding='utf-8') as authors_json:
    authors = json.loads(authors_json.read())

print("unique authors:", len(authors))

#### the original api pull required batches to get the full author_corpus, below code is for knitting together batches

# authors_list = []
# authors = ["authors1.json", "authors2.json", "authors3.json"]
# for i in authors:
#     with open(i, 'r', encoding='utf-8') as author_json:
#         auth = json.load(author_json)
#         #print(len(auth))
#     authors_list += auth
# print(len(authors_list))

# write_to_json(authors_list, "authors_corpus")

#### Retrieving all unique paperIDs from author_corpus information

In [None]:
# Collect the "paperId" of every paper listed under every author record in the
# author corpus, then count the unique IDs.
paperIDs = []
# The corpus file carries the ".json" suffix that write_to_json appends
# (this used to open "authors_corpus" without it).
with open("authors_corpus.json", 'r', encoding='utf-8') as papers:
    papers_data = json.load(papers)

for author_record in tqdm(papers_data, total=len(papers_data)):
    try:
        for entry in author_record["papers"]:
            try:
                paper_id = entry["paperId"]
            except (KeyError, TypeError) as err:
                # Paper entry without a usable "paperId": report and move on.
                print(type(err))
                continue
            if paper_id:  # skip null/empty IDs
                paperIDs.append(paper_id)
    except (KeyError, TypeError) as err:
        # Author record without an iterable "papers" field.
        print(type(err))

pID = set(paperIDs)
paperIDs_corpus = list(pID)
print("unique paperIDs", len(paperIDs_corpus)) ## 7,239,766 papers for 146,853 authors from 39,249 initial papers 

#### Taking a random sample of paperIDs and authorIDs for testing; excluding authors of the papers

In [None]:
# NOTE: 2022-2-1 is integer arithmetic, so the seed actually used is 2019.
random.seed(2022 - 2 - 1)

with open("papers_corpus.json", mode='r', encoding='utf-8') as paper_json:
    papers_corpus = json.loads(paper_json.read())

# Draw 1,000 distinct papers for testing.
papers_corpus_sample = random.sample(papers_corpus, 1000)

write_to_json(papers_corpus_sample, "papers_corpus_sample")

In [None]:
# Draw 5,000 author IDs for testing, excluding anyone who authored a sampled paper.
random.seed(2022-2-1)  # integer arithmetic: the seed actually used is 2019

with open("papers_corpus_sample.json", 'r', encoding='utf-8') as paper_json:
    papers_corpus_sample = json.load(paper_json)

# IDs of every author of a sampled paper. Kept as a set so the exclusion test
# below is a constant-time lookup instead of a scan of a list per author.
aID = {
    author["authorId"]
    for paper in papers_corpus_sample
    for author in paper["authors"]
}
authorIDs_remove = list(aID)

with open("authorIDs.json", 'r', encoding='utf-8') as authors_json:
    authorIDs = json.load(authors_json)

authorIDs_updated = [author_id for author_id in authorIDs if author_id not in aID]

authorIDs_sample = random.sample(authorIDs_updated,5000)

write_to_json(authorIDs_sample, "authorIDs_sample")

#### Retrieving and Appending PaperIDs to sample of AuthorIDs

In [None]:
# Map every sampled author ID to the list of paper IDs recorded for that author
# in the author corpus, and save the mapping as authorCorpusLookup.json.
with open("authorIDs_sample.json", 'r', encoding='utf-8') as author_json:
    authorIDs = json.load(author_json)

with open("authors_corpus.json", 'r', encoding='utf-8') as author_json2:
    paperIDs_lookup = json.load(author_json2)

badRecords = []
papers = []  # kept so the name is still reset by this cell, as before

# One pass over the corpus instead of rescanning it for every sampled author.
wanted = {str(author_id) for author_id in authorIDs}
papers_by_author = {}
for record in paperIDs_lookup:
    try:
        author_key = record["authorId"]
        if author_key in wanted:
            # A fresh list per record, so a malformed record cannot leave
            # half-collected IDs behind for the next author.
            papers_by_author[author_key] = [entry["paperId"] for entry in record["papers"]]
    except (KeyError, TypeError):
        # Record missing "authorId"/"papers"/"paperId" (or not a dict at all);
        # each such record is listed once.
        badRecords.append(record)

# Built in the order of the sample; authors with no usable corpus record are left out.
authorCorpusLookup = {
    author_id: papers_by_author[str(author_id)]
    for author_id in authorIDs
    if str(author_id) in papers_by_author
}

write_to_json(authorCorpusLookup, "authorCorpusLookup")

#### List comprehension to speed this up, just needs error handling and abstraction

# def search(name, people):
#     return [element for element in people if element['name'] == name]

#### super quick nested list comprehension, but can't seem to get lengths... come back later
#papers = [[i["papers"][j]["paperId"] for i in paperIDs_lookup[1:2]] for j in range(0, len(i["papers"]))]

#print(paperIDs_lookup.keys())

# papers = [i for i in paperIDs_lookup if i["authorId"] == "2064217435"]
# papers = [i["papers"] for i in paperIDs_lookup]
# paperIDs = [[j[l]["paperId"] for j in papers] for l in range(0, len(papers[0]))]

#print([[j for j in i] for i in paperIDs_lookup])
#print(len(paperIDs_lookup))

#### THE KEY DOESN'T EXIST IN SOME RECORDS, NEED TO CLEAN JSON FILE ###
#print([i["papers"] for i in paperIDs_lookup[0:len(paperIDs_lookup)-1]])

# print(type(papers))
# print(len(papers[0]))
# print(papers[0]["papers"])
# print(len(paperIDs))

#### Building the sample authorID corpus

In [None]:
# with open("authorCorpusLookup.json", 'r', encoding='utf-8') as paper_json:
#     authorCorpusLookup = json.load(paper_json)

# For every sampled author, fetch each of their papers and keep the ones that
# have an abstract; checkpoint to authors_corpus_sample.json on every failure.
sch = SemanticScholar(api_key=s2_api_key)
count = 0
failed = []
authors_corpus_sample = []

for count, authorid in enumerate(authorCorpusLookup, start=1):
    # Built fresh for every author, so a failure part-way through one author
    # cannot leak half-collected papers into the next author's record.
    papers = []
    try:
        for paperId in authorCorpusLookup[authorid]:
            paperInfo = sch.paper(paperId)
            #print(authorid, paperId)

            if not paperInfo["abstract"]:
                continue  # papers without an abstract are left out

            papers.append({
                "paperId": paperId,
                "abstract": paperInfo["abstract"],
                "topics": paperInfo["topics"],
                "s2FieldsOfStudy": paperInfo["s2FieldsOfStudy"],
                "fieldsOfStudy": paperInfo["fieldsOfStudy"],
            })

    except Exception:
        # Best-effort: save progress, remember the author, back off, move on.
        # Catching Exception rather than everything keeps Ctrl-C able to stop
        # the run.
        print(sys.exc_info()[0], "writing to json, sleep 30 sec, continue", count)
        write_to_json(authors_corpus_sample, "authors_corpus_sample")
        failed.append(authorid)
        time.sleep(30) # to work around 403 errors

    else:
        # Only authors whose papers were all fetched are added to the corpus.
        authors_corpus_sample.append({"authorId": authorid, "papers": papers})

write_to_json(authors_corpus_sample, "authors_corpus_sample")
print("completed")