In [1]:
import re

import pandas as pd
import requests

BASE_OPENALEX = "https://api.openalex.org"

def get_papers_from_dois(dois_list):
    """Fetch the OpenAlex record for every DOI in ``dois_list``.

    Args:
        dois_list: Iterable of DOI strings, in any form accepted by
            ``get_paper_from_doi``.

    Returns:
        A list with exactly one entry per input DOI, in input order. Each
        entry is the value returned by ``get_paper_from_doi`` for that DOI.
    """
    # Each lookup yields ONE record (a dict). The record must be added as a
    # single element: extending a list with a dict would splice in the dict's
    # keys and lose the record itself.
    return [get_paper_from_doi(doi) for doi in dois_list]

def get_paper_from_doi(doi: str, timeout: float = 30.0) -> dict:
    """Fetch a single work from OpenAlex by DOI and attach its plain-text abstract.

    Args:
        doi: Either a bare DOI (e.g. ``"10.1007/s11047-013-9380-y"``) or a full
            ``"https://doi.org/..."`` URL. Bare DOIs are prefixed with
            ``"https://doi.org/"`` before the lookup.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response as a dict, with an additional
        ``"abstract"`` key holding the result of ``reconstruct_abstract`` applied
        to the record's ``"abstract_inverted_index"`` (``None`` if that key is
        absent).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    base_doi = "https://doi.org"
    # startswith keeps the check correct even if base_doi is ever edited;
    # a hard-coded slice length would silently go stale.
    if not doi.startswith(base_doi):
        doi = f"{base_doi}/{doi}"
    url = f"{BASE_OPENALEX}/works/{doi}"

    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of surfacing later as a confusing
    # KeyError / JSON decoding error on an error payload.
    response.raise_for_status()

    paper = response.json()
    # .get: a record without an inverted index is handled by
    # reconstruct_abstract's None branch rather than crashing here.
    paper["abstract"] = reconstruct_abstract(paper.get("abstract_inverted_index"))
    return paper

def reconstruct_abstract(index: dict) -> str:
    """Rebuild a plain-text abstract from an inverted index.

    Args:
        index: Mapping of word -> list of integer positions at which that word
            occurs in the abstract, or ``None`` when no abstract is available.

    Returns:
        The words joined by single spaces in ascending position order, with a
        leading lowercase ``"abstract"`` label (plus the whitespace after it)
        removed. Returns the sentinel ``"MISSING_ABSTRACT"`` when ``index`` is
        ``None``, and ``""`` for an empty index.
    """
    # TODO: expand with scraping methods
    # TODO: decide if to return None instead
    if index is None:
        return "MISSING_ABSTRACT"

    # Map position -> word instead of writing into a pre-sized scratch list:
    # no guess about the highest position is needed, so sparse or large
    # positions cannot overflow a buffer. If two words claim the same position
    # the later one in iteration order wins.
    word_at = {}
    for word, positions in index.items():
        for position in positions:
            word_at[position] = word

    abstract_string = ' '.join(word_at[position] for position in sorted(word_at))

    # str.replace matches its argument literally, so a regex passed to it never
    # fires; re.sub is required to actually strip the leading label.
    return re.sub(r'^abstract\s+', '', abstract_string)


In [2]:
# DOIs of the papers to look up.
dois = [
    "https://doi.org/10.48550/arXiv.2411.19865",
    "https://doi.org/10.48550/arXiv.2009.13207",
    "https://doi.org/10.1007/s11047-013-9380-y",
]

# One fetched record per DOI, in the same order as `dois`.
results = [get_paper_from_doi(doi) for doi in dois]


In [3]:
# Tabulate the fetched records: one row per entry in `results`,
# with columns taken from the keys of the record dicts.
df = pd.DataFrame(results)