Improved SemanticScholar search #96

Open · wants to merge 6 commits into base: master
9 changes: 9 additions & 0 deletions litstudy/__init__.py
@@ -7,6 +7,7 @@
load_ieee_csv,
load_ris_file,
load_scopus_csv,
load_semanticscholar_json,
load_springer_csv,
refine_crossref,
refine_scopus,
@@ -16,6 +17,9 @@
search_dblp,
search_scopus,
search_semanticscholar,
fastsearch_semanticscholar,
generate_reference_list,
mass_fetch_semanticscholar,
)
from .stats import (
compute_year_histogram,
@@ -120,6 +124,8 @@
"load_csv",
"load_ieee_csv",
"load_ris_file",
"load_scopus_csv",
"load_semanticscholar_json",
"load_springer_csv",
"refine_crossref",
"refine_scopus",
@@ -129,6 +135,9 @@
"search_dblp",
"search_scopus",
"search_semanticscholar",
"fastsearch_semanticscholar",
"generate_reference_list",
"mass_fetch_semanticscholar",
"Affiliation",
"Author",
"Document",
6 changes: 5 additions & 1 deletion litstudy/sources/__init__.py
@@ -1,6 +1,6 @@
from .scopus import search_scopus, refine_scopus, fetch_scopus
from .bibtex import load_bibtex
from .semanticscholar import fetch_semanticscholar, search_semanticscholar, refine_semanticscholar
from .semanticscholar import fetch_semanticscholar, search_semanticscholar, fastsearch_semanticscholar, refine_semanticscholar, load_semanticscholar_json, generate_reference_list, mass_fetch_semanticscholar
from .crossref import fetch_crossref, refine_crossref, search_crossref
from .ieee import load_ieee_csv
from .springer import load_springer_csv
@@ -19,6 +19,7 @@
"load_ieee_csv",
"load_ris_file",
"load_scopus_csv",
"load_semanticscholar_json",
"load_springer_csv",
"refine_crossref",
"refine_scopus",
@@ -28,4 +29,7 @@
"search_dblp",
"search_scopus",
"search_semanticscholar",
"fastsearch_semanticscholar",
"generate_reference_list",
"mass_fetch_semanticscholar",
]
4 changes: 2 additions & 2 deletions litstudy/sources/crossref.py
@@ -44,7 +44,7 @@ def __init__(self, entry):

@property
def name(self) -> str:
return self.entry["name"]
return self.entry.get("name")


def _extract_title(entry):
@@ -80,7 +80,7 @@ def publisher(self):

@property
def language(self):
return self.get("language")
return self.entry.get("language")

@property
def publication_date(self):
164 changes: 157 additions & 7 deletions litstudy/sources/semanticscholar.py
@@ -4,6 +4,8 @@
import logging
import requests
import shelve
from ..common import robust_open
import json

from ..common import progress_bar
from ..types import Document, Author, DocumentSet, DocumentIdentifier
@@ -18,6 +20,7 @@ def extract_id(item):
doi=item.get("doi"),
arxivid=item.get("arxivId"),
s2id=item.get("paperId"),
pubmed=item.get("pubmed"),
)


@@ -96,26 +99,34 @@ def load(id):
DEFAULT_TIMEOUT = 3.05 # 100 requests per 5 minutes


def request_query(query, offset, limit, cache, session, timeout=DEFAULT_TIMEOUT):
params = urlencode(dict(query=query, offset=offset, limit=limit))
url = f"{S2_QUERY_URL}?{params}"
def request_query(query, offset, limit, cache, session, timeout=DEFAULT_TIMEOUT, extraParams=None):
# Merge caller-supplied parameters (e.g. a "fields" list) into the query string
params = dict(query=query, offset=offset, limit=limit)
params.update(extraParams or {})
encparams = urlencode(params)
url = f"{S2_QUERY_URL}?{encparams}"

if url in cache:
return cache[url]
sleep(timeout)

reply = session.get(url)
reply = session.get(url, timeout=60 * 10)  # allow up to ten minutes for slow bulk responses
response = reply.json()

if "data" not in response:
msg = response.get("error") or response.get("message") or "unknown"
raise Exception(f"error while fetching {reply.url}: {msg}")
if msg.find("Too Many Requests.")>-1 or msg.find("Endpoint request timed out")>-1:
logging.info(f"request_query: Timeout error while fetching {reply.url}: {msg}")
return "TIMEOUT"
else:
raise Exception(f"error while fetching {reply.url}: {msg}")

cache[url] = response
return response
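The new extraParams argument is folded into the query string; fastsearch_semanticscholar below uses it to request additional fields in a single call. A small sketch of what ends up in the URL (the query and field list are only examples):

from urllib.parse import urlencode

params = dict(query="exascale computing", offset=0, limit=100)
params.update({"fields": "title,abstract,externalIds"})  # caller-supplied extraParams
print(urlencode(params))
# query=exascale+computing&offset=0&limit=100&fields=title%2Cabstract%2CexternalIds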


def request_paper(key, cache, session, timeout=DEFAULT_TIMEOUT):
url = S2_PAPER_URL + quote_plus(key)
def request_paper(key, cache, session, timeout=DEFAULT_TIMEOUT, extraParams=None):
encparams = urlencode(extraParams or {})
# Only append a query string when extra parameters were actually given
url = S2_PAPER_URL + quote_plus(key) + ("?" + encparams if encparams else "")

if url in cache:
return cache[url]
@@ -224,6 +235,7 @@ def search_semanticscholar(

with shelve.open(CACHE_FILE) as cache:
paper_ids = []
retries = 0  # counts consecutive timeouts for the back-off below

while True:
offset = len(paper_ids)
@@ -232,6 +244,15 @@
if not response:
break

# Back off and retry in case of a timeout
if response == "TIMEOUT":
retries += 1
logging.info(f"timeout, sleeping for {DEFAULT_TIMEOUT * 4 * retries:.1f} seconds")
sleep(DEFAULT_TIMEOUT * 4 * retries)
continue
else:
retries = 0

records = response["data"]
total = response["total"]

@@ -256,3 +277,132 @@
logging.warn(f"could not find paper id {paper_id}")

return DocumentSet(docs)

def load_semanticscholar_json(path: str) -> DocumentSet:
"""Import a JSON file exported from the SemanticScholar API and return it as a `DocumentSet`."""
docs = []
with robust_open(path) as f:
result = json.load(f)
data = result["data"]
for doc in data:
# Map the externalIds entry onto the flat keys that ScholarDocument expects
ids = doc.pop("externalIds", {}) or {}
if "DOI" in ids:
doc["doi"] = ids["DOI"].lower()
if "ArXiv" in ids:
doc["arxivId"] = ids["ArXiv"]
if "PubMed" in ids:
doc["pubmed"] = ids["PubMed"]
docs.append(ScholarDocument(doc))
return DocumentSet(docs)
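A short usage sketch for the new loader (the file name is hypothetical; the file is expected to contain a top-level "data" array of paper records, as handled above):

import litstudy

docs = litstudy.load_semanticscholar_json("s2_export.json")
print(len(docs), "documents loaded")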

def fastsearch_semanticscholar(
query: str, *, limit: int = 1000, batch_size: int = 100, session=None
) -> DocumentSet:
"""Submit the given query to SemanticScholar API and return the results
as a `DocumentSet`.

:param query: The search query to submit.
:param limit: The maximum number of results to return. Must be at most 1,000
:param batch_size: The number of results to retrieve per request. Must be at most 100.
:param session: The `requests.Session` to use for HTTP requests.
"""

if not query:
raise Exception("no query specified in `search_semanticscholar`")

if session is None:
session = requests.Session()

docs = []

with shelve.open(CACHE_FILE) as cache:
retries = 0  # counts consecutive timeouts for the back-off below
while True:
offset = len(docs)

response = request_query(query, offset, batch_size, cache, session, extraParams={"fields": "title,authors,year,venue,abstract,citations,references,externalIds"})
if not response:
break

# Back off and retry in case of a timeout
if response == "TIMEOUT":
retries += 1
logging.info(f"timeout, sleeping for {DEFAULT_TIMEOUT * 4 * retries:.1f} seconds")
sleep(DEFAULT_TIMEOUT * 4 * retries)
continue
else:
retries = 0

records = response["data"]
total = response["total"]
print("Gesamt:",total,"Offset:",offset)
for record in records:
ids=record.pop("externalIds")
for i in ids:
if i=="DOI":
record["doi"]=ids.get("DOI").lower()
elif i=="ArXiv":
record["arxivId"]=ids.get("ArXiv")
elif i=="PubMed":
record["pubmed"]=ids.get("PubMed")
docs.append(ScholarDocument(record))


# Check if we reached the total number of papers
if len(docs) >= total:
break

# Check if we exceeded the user-defined limit
if limit is not None and len(docs) >= limit:
docs = docs[:limit]
break

return DocumentSet(docs)
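A usage sketch for the bulk search (query and limit are just examples):

import litstudy

docs = litstudy.fastsearch_semanticscholar("graph neural networks", limit=500)
print(len(docs), "documents found")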

def generate_reference_list(docs: DocumentSet) -> list:
"""Return the identifiers of all documents referenced by `docs`, formatted for a
`fetch_semanticscholar` or `mass_fetch_semanticscholar` request:
DOI: DOI:<id>
s2id: <id>
PubMed: PMID:<id>
ArXiv: ARXIV:<id>
"""
references = []
for doc in docs:
if doc.references is None:
continue
for ref in doc.references:
# Prefer DOI, then the SemanticScholar id, then PubMed, then ArXiv
if ref.doi is not None:
references.append("DOI:" + ref.doi)
elif ref.s2id is not None:
references.append(ref.s2id)
elif ref.pubmed is not None:
references.append("PMID:" + ref.pubmed)
elif ref.arxivid is not None:
references.append("ARXIV:" + ref.arxivid)
return references
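For illustration, the returned list mixes the prefixed formats listed in the docstring (the identifier values below are made up):

refs = generate_reference_list(docs)
# e.g. ["DOI:10.1000/xyz123", "649def34f8be52c8b66281af98ae884c09aef38b", "PMID:19872477", "ARXIV:2106.15928"]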

def mass_fetch_semanticscholar(paper_ids: list, session=None) -> DocumentSet:
"""Fetch the given list of SemanticScholar paper ids (see `generate_reference_list`
for the accepted formats) and return the corresponding documents as a `DocumentSet`.

:param paper_ids: The paper ids to fetch. Duplicates are removed before fetching.
:param session: The `requests.Session` to use for HTTP requests.
"""
if session is None:
session = requests.Session()

# Remove duplicates before issuing one request per id
paper_ids = list(set(paper_ids))

docs = []

with shelve.open(CACHE_FILE) as cache:
for paper_id in progress_bar(paper_ids):
record = request_paper(paper_id, cache, session)
if record:
docs.append(ScholarDocument(record))
else:
logging.warning(f"could not find paper id {paper_id}")
return DocumentSet(docs)
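The two helpers are intended to be used together: collect the references of an existing result set, then fetch them in one pass. A sketch (the query is an example):

import litstudy

seed = litstudy.fastsearch_semanticscholar("exascale computing", limit=200)
ref_ids = litstudy.generate_reference_list(seed)      # prefixed ids of every referenced paper
cited = litstudy.mass_fetch_semanticscholar(ref_ids)  # duplicates are removed internally
print(len(cited), "referenced documents fetched")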
2 changes: 2 additions & 0 deletions litstudy/types.py
@@ -403,6 +403,8 @@ def matches(self, other: "DocumentIdentifier") -> bool:

# Two identifiers match if all keys that they have in common are equal
for key in self._attr:
if self._attr[key] is None:
continue
if key in other._attr:
if self._attr[key] != other._attr[key]:
return False
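The added check skips identifier fields that are present but set to None. This matters because `extract_id` fills every field with `item.get(...)`, so a missing DOI used to be compared against a real DOI and block an otherwise valid match. A minimal sketch, assuming `DocumentIdentifier` takes a title followed by keyword identifiers (as in `extract_id`) and that `matches` accepts when all shared non-None keys agree:

from litstudy.types import DocumentIdentifier

a = DocumentIdentifier("Some Paper", doi=None, s2id="abc123")           # e.g. an entry without a DOI
b = DocumentIdentifier("Some Paper", doi="10.1000/xyz", s2id="abc123")

# Before this change a.matches(b) returned False because None != "10.1000/xyz";
# now the unset doi is skipped and the comparison falls back to the shared s2id.
print(a.matches(b))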