In [1]:
import os
import sys
from pathlib import Path
import json
import pickle
import re
import requests
from tqdm import tqdm

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the accession-match table from CSV into a DataFrame.
df_accession_match = pd.read_csv("../data/poster_acrl2025/openalex_data_20250205/df_accession_match.csv")

In [4]:
# Keep only the rows whose "repository_europepmc" column equals "doi".
df_accession_match_dois = df_accession_match.loc[
    df_accession_match["repository_europepmc"].eq("doi")
]

In [5]:
# Count the distinct "accession_number" values in df_accession_match_dois.
df_accession_match_dois["accession_number"].nunique()

1387

In [6]:
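# One-off fetch of each unique DOI from the DataCite REST API, kept commented out;
# a later cell loads `results` from a pickle file instead of re-running this loop.
# On HTTP 200 the parsed JSON body is stored per DOI; otherwise the status code is stored.
# base_url = "https://api.datacite.org/dois"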
# params = {
#     "affiliations": True,
#     "publishers": True,
# }
# results = {}
# for doi in tqdm(df_accession_match_dois["accession_number"].unique()):
#     r = requests.get(f"{base_url}/{doi}", params=params)
#     if r.status_code == 200:
#         results[doi] = r.json()
#     else:
#         results[doi] = r.status_code

In [7]:
# Path of the pickle file that holds the `results` mapping.
outfp = Path("../data/poster_acrl2025/dois_datacite_api_responses.pickle")

# Write step, left disabled: it would overwrite the pickle with the current `results`.
# outfp.write_bytes(pickle.dumps(results))

# NOTE(review): unpickling can execute arbitrary code, so only load a file
# that this project produced itself.
results = pickle.loads(outfp.read_bytes())

In [20]:
def get_row_from_crossref_api(doi: str, timeout: float = 30.0) -> dict:
    """Look up one DOI in the Crossref REST API and flatten it into a row dict.

    Args:
        doi: DOI to look up, in "prefix/suffix" form (e.g. "10.1234/abcd").
        timeout: Seconds to wait for Crossref before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        Dict with the keys "doi", "repository", "has_affiliation_data",
        "title", "subjects", "funders", "prefix" and "suffix".

    Raises:
        requests.HTTPError: If Crossref answers with a 4xx/5xx status.
        ValueError: If ``doi`` contains no "/" separator.
    """
    # Crossref asks API clients to identify themselves with a contact email.
    params = {
        "mailto": "jason.portenoy@gmail.com",
    }
    r = requests.get(
        f"https://api.crossref.org/works/{doi}",
        params=params,
        timeout=timeout,
    )
    # Fail with a clear HTTP error rather than a JSON parsing error on a
    # non-JSON error body.
    r.raise_for_status()
    meta = r.json()["message"]
    prefix, suffix = doi.split("/", maxsplit=1)
    has_affiliation_data = any(a.get("affiliation") for a in meta.get("author", []))
    title = meta["title"][0] if meta.get("title") else ""
    subjects = meta.get("subject", [])
    return {
        "doi": doi,
        "repository": meta.get("publisher"),
        "has_affiliation_data": has_affiliation_data,
        "title": title,
        "subjects": subjects,
        # Crossref work records list funding bodies under the "funder" key.
        "funders": meta.get("funder", []),
        "prefix": prefix,
        "suffix": suffix,
    }

# Build one flat row per DOI from the cached `results` mapping.
rows = []
for doi, v in results.items():
    if str(v) == "404":
        # DataCite returned 404 for this DOI; look it up in Crossref instead.
        rows.append(get_row_from_crossref_api(doi))
        continue
    meta = v["data"]["attributes"]
    has_affiliation_data = any(c.get("affiliation") for c in meta.get("creators", []))
    # Guarded lookup: a missing or empty "titles" list yields "" instead of
    # raising IndexError.
    title = meta["titles"][0]["title"] if meta.get("titles") else ""
    subjects = [item["subject"] for item in meta.get("subjects", [])]
    rows.append({
        "doi": doi,
        "repository": meta.get("publisher"),
        "has_affiliation_data": has_affiliation_data,
        "title": title,
        "subjects": subjects,
        # Default to [] so the column has the same type as in the Crossref rows.
        "funders": meta.get("fundingReferences", []),
        "prefix": meta.get("prefix"),
        "suffix": meta.get("suffix"),
    })
df_datacite_api_results = pd.DataFrame(rows)

In [23]:
# Write the metadata rows to CSV, leaving out the DataFrame index.
df_datacite_api_results.to_csv("../data/poster_acrl2025/df_datacite_and_crossref_metadata.csv", index=False)