# Europe PMC data citations for ACRL 2025

Use data citations (annotations) from Europe PMC, and enrich them with data from several other sources: OpenAlex, the Data Citation Corpus, and the DataCite and Crossref APIs.

In [1]:
import sys

In [2]:
# Add the parent directory (relative to the current working directory) to the module search path.
# NOTE(review): presumably this is where clean_doi and openalex_utils live — confirm.
sys.path.append("..")

In [3]:
from clean_doi import clean_doi, NoDoiException

def is_it_a_doi(doi: str) -> bool:
    """Return whether ``clean_doi`` accepts ``doi``.

    Args:
        doi: Candidate identifier to check.

    Returns:
        ``True`` if ``clean_doi(doi)`` completes without raising
        ``NoDoiException``; ``False`` if it raises ``NoDoiException``,
        indicating that the input was not actually a DOI.

    Any other exception raised by ``clean_doi`` is not caught and propagates
    to the caller.
    """
    try:
        # Only success or failure matters here; the cleaned value is discarded.
        clean_doi(doi)
    except NoDoiException:
        return False
    return True

In [4]:
import pandas as pd
import numpy as np

In [5]:
from openalex_utils import get_openalex_dataframe_from_multiple_works_files, get_ror_map_from_institutions_file

### OpenAlex data

OpenAlex was used to collect DOIs for Northwestern University and University of Colorado, Boulder for the years 2013-2024.

In [6]:
# Institution lookup passed to the works loader below (built from the OpenAlex institutions file)
ror_map = get_ror_map_from_institutions_file(
    "../data/openalex_data_20250130/openalex_institutions.gz"
)

# Load every works file in the directory into one DataFrame, then keep a single row per DOI
df_openalex = get_openalex_dataframe_from_multiple_works_files(
    "../data/poster_acrl2025/openalex_data_20250205/",
    ror_map=ror_map,
).drop_duplicates(subset=["doi"])

In [7]:
# OpenAlex institution IDs for the two universities of interest
ucboulder_openalex_id = "I188538660"
northwestern_openalex_id = "I111979921"

# One boolean column per university: True when its ID appears in the work's "lineage" value
df_openalex = df_openalex.assign(
    affil_ucboulder=df_openalex["lineage"].apply(
        lambda institutions: ucboulder_openalex_id in institutions
    ),
    affil_northwestern=df_openalex["lineage"].apply(
        lambda institutions: northwestern_openalex_id in institutions
    ),
)

### Europe PMC citations

Combine the OpenAlex data with the Europe PMC data, which links citing DOIs to the accession numbers they cite.

In [8]:
# Load the Europe PMC table
df_doi_accession_europepmc = pd.read_parquet('../data/europepmc/df_doi_accession_europepmc.parquet')
# Rows without a DOI cannot be joined to OpenAlex, so drop them up front
df_doi_accession_europepmc = df_doi_accession_europepmc.dropna(subset=["DOI"])

In [9]:
# OpenAlex columns to carry over onto the Europe PMC rows
oax_fields_to_merge = [
    "publication_date",
    "is_oa",
    "type",
    "type_crossref",
    "institutions_ror",
    "primary_topic",
    "topic_subfield",
    "topic_field",
    "topic_domain",
    "funders",
    "datasets",
    "affil_ucboulder",
    "affil_northwestern",
]

# Use the same DOI column name as the Europe PMC table so the two can be joined on it
to_merge = df_openalex.rename(columns={"doi": "DOI"})
to_merge = to_merge[["DOI", *oax_fields_to_merge]]
# reset_index() turns the index into a regular column so it survives the merge
# NOTE(review): presumably the index holds the OpenAlex ID ("openalex_id" is renamed later) — confirm
to_merge = to_merge.reset_index()

# Inner join: keep only Europe PMC rows whose DOI is present in the OpenAlex data
df_accession_match = pd.merge(df_doi_accession_europepmc, to_merge, how="inner", on="DOI")

### Data Citation Corpus

Bring in data from the Data Citation Corpus (v2.0, 2024-08-23 release) and match it against the Europe PMC citations.

In [10]:
# Load the Data Citation Corpus (the file name indicates v2.0, dated 2024-08-23)
df_corpus = pd.read_parquet('../data/df_2024-08-23-data-citation-corpus-v2.0.parquet')

In [11]:
# identify matches in Corpus
# Both sides get an upper-cased copy of the cited identifier so the match ignores case
df_accession_match["target_upper"] = df_accession_match["accession_number"].str.upper()

# Corpus side of the join; the Corpus row index is kept as "corpus_id"
to_merge = df_corpus.reset_index(names="corpus_id")
to_merge = to_merge.assign(
    # True when the Corpus record lists at least one affiliation
    corpus_has_affils=to_merge["affiliations"].apply(lambda affils: len(affils) > 0),
    target_upper=to_merge["dataset"].str.upper(),
)
# "funders" already exists on df_accession_match (from OpenAlex), so the Corpus
# columns are given "cited_" names before merging
to_merge = to_merge.rename(
    columns={
        "publication": "DOI",
        "funders": "cited_funders",
        "subjects": "cited_subjects",
    }
)
to_merge = to_merge[
    [
        "DOI",
        "target_upper",
        "corpus_id",
        "corpus_has_affils",
        "cited_funders",
        "cited_subjects",
    ]
]

# Left join: every Europe PMC row is kept; Corpus columns are empty where there is no match
df_accession_match = pd.merge(
    df_accession_match, to_merge, how="left", on=["DOI", "target_upper"]
)

In [12]:
# Rename columns so each name says which side of the citation it describes
cols_rename = {
    # the cited item
    "accession_number": "target",
    "repository_europepmc": "cited_repository",
    # the citing publication
    "DOI": "citing_doi",
    "openalex_id": "citing_openalex_id",
    "is_oa": "citing_is_oa",
    "institutions_ror": "citing_institutions_ror",
    # OpenAlex-specific fields get an explicit "openalex" marker
    "type": "citing_openalex_type",
    "funders": "citing_openalex_funders",
    "datasets": "citing_openalex_datasets",
    # OpenAlex topic hierarchy
    "primary_topic": "citing_primary_topic",
    "topic_subfield": "citing_topic_subfield",
    "topic_field": "citing_topic_field",
    "topic_domain": "citing_topic_domain",
}
# Columns not listed in the mapping keep their current names
df_accession_match = df_accession_match.rename(columns=cols_rename)

In [13]:
# Parse the publication date and derive the calendar year from the parsed value
df_accession_match = df_accession_match.assign(
    publication_date=pd.to_datetime(df_accession_match["publication_date"]),
    # evaluated after publication_date above has been replaced by its datetime version
    publication_year=lambda frame: frame["publication_date"].dt.year,
)

In [14]:
# Flag each cited target that is a DOI (is_it_a_doi is True when clean_doi accepts the value)
df_accession_match["cited_is_doi"] = df_accession_match["target"].apply(is_it_a_doi)

### DOI Metadata

Data for target DOIs was pulled from the DataCite and Crossref APIs

In [15]:
# Load the metadata collected for the target DOIs.
# Later cells index this table by its "doi" column and read the "title",
# "has_affiliation_data", "repository", "subjects" and "funders" columns.
df_datacite_and_crossref_metadata = pd.read_csv("../data/poster_acrl2025/df_datacite_and_crossref_metadata.csv")

In [16]:
# new column for cited title from DataCite or Crossref API
# DOI -> title lookup; verify_integrity=True raises if a DOI appears more than once
title_map = (
    df_datacite_and_crossref_metadata[["doi", "title"]]
    .set_index("doi", verify_integrity=True)
    .loc[:, "title"]
)
# Targets that are not in the lookup get a missing value
df_accession_match = df_accession_match.assign(
    cited_title=df_accession_match["target"].map(title_map)
)

In [17]:
# new column for whether the DataCite or Crossref API has any affiliation info
# DOI -> has_affiliation_data lookup; verify_integrity=True raises if a DOI appears more than once
has_affil_map = (
    df_datacite_and_crossref_metadata[["doi", "has_affiliation_data"]]
    .set_index("doi", verify_integrity=True)
    .loc[:, "has_affiliation_data"]
)
# Targets that are not in the lookup get a missing value
df_accession_match = df_accession_match.assign(
    doi_api_has_affils=df_accession_match["target"].map(has_affil_map)
)

In [18]:
# update repository information for cited DOIs (leave accession numbers alone)
# Cast to plain Python objects rather than str: astype(str) would turn a missing
# repository into the literal string "nan", which combine_first below would then
# treat as a real value and never fill in.
df_accession_match["cited_repository"] = df_accession_match["cited_repository"].astype(object)
# Blank out the generic "doi" label so it can be replaced by the repository
# reported in the DataCite/Crossref metadata.
df_accession_match["cited_repository"] = df_accession_match["cited_repository"].mask(
    df_accession_match["cited_repository"] == "doi"
)
# DOI -> repository lookup; verify_integrity=True raises if a DOI appears more than once
repo_map = df_datacite_and_crossref_metadata.set_index("doi", verify_integrity=True)["repository"]
update_repo = df_accession_match["target"].map(repo_map)
# Existing repository values win; only missing ones are taken from the lookup
df_accession_match["cited_repository"] = df_accession_match["cited_repository"].combine_first(update_repo)

In [19]:
# update missing subjects with info from DataCite or Crossref API
# DOI -> subjects lookup; verify_integrity=True raises if a DOI appears more than once
subj_map = (
    df_datacite_and_crossref_metadata[["doi", "subjects"]]
    .set_index("doi", verify_integrity=True)
    .loc[:, "subjects"]
)
update_subjs = df_accession_match["target"].map(subj_map)
df_accession_match["cited_subjects"] = (
    df_accession_match["cited_subjects"]
    # values that print as '[]' (empty collections) count as missing
    .apply(lambda subjects: np.nan if str(subjects) == '[]' else subjects)
    # existing subjects win; only missing ones are taken from the lookup
    .combine_first(update_subjs)
)

In [20]:
# update missing funders with info from DataCite or Crossref API
# DOI -> funders lookup; verify_integrity=True raises if a DOI appears more than once
fund_map = (
    df_datacite_and_crossref_metadata[["doi", "funders"]]
    .set_index("doi", verify_integrity=True)
    .loc[:, "funders"]
)
update_funders = df_accession_match["target"].map(fund_map)
df_accession_match["cited_funders"] = (
    df_accession_match["cited_funders"]
    # values that print as '[]' (empty collections) count as missing
    .apply(lambda funders: np.nan if str(funders) == '[]' else funders)
    # existing funders win; only missing ones are taken from the lookup
    .combine_first(update_funders)
)

In [21]:
# rearrange columns
# Selecting with this list also drops every column that is not named here
# (for example publication_date, which is replaced by publication_year).
columns = [
    # cited item
    "target",
    "PMCID",
    "EXTID",
    "SOURCE",
    "cited_repository",
    "cited_is_doi",
    "citing_doi",
    "cited_subjects",
    "cited_funders",
    "cited_title",
    # citing publication (OpenAlex)
    "citing_openalex_id",
    "citing_is_oa",
    "citing_openalex_type",
    "citing_institutions_ror",
    "citing_primary_topic",
    "citing_topic_subfield",
    "citing_topic_field",
    "citing_topic_domain",
    "citing_openalex_funders",
    "citing_openalex_datasets",
    "affil_ucboulder",
    "affil_northwestern",
    # Corpus / DOI API match information
    "target_upper",
    "corpus_id",
    "corpus_has_affils",
    "doi_api_has_affils",
    "publication_year",
]
df_accession_match = df_accession_match[columns]

### Bring in manual mapping of repository -> subjects

In [22]:
# bring in repository-based subject (discipline) info
# from https://docs.google.com/spreadsheets/d/1TddyO5We5mJidWBC_FTiZY0kpvJXutHJMqKv1sLK82c/edit?gid=2047674143#gid=2047674143
df_repo_subj = (
    pd.read_csv(
        "../data/poster_acrl2025/assertion-count-by-repo - accession-numbers-only.csv",
        usecols=["title", "abbreviation", "FOS mapping by DataCite staff"],
    )
    .rename(columns={"FOS mapping by DataCite staff": "subjects"})
    # keep only repositories that have a title, an abbreviation and a subject mapping
    .dropna()
    # remove stray whitespace around the abbreviations
    .assign(abbreviation=lambda frame: frame["abbreviation"].str.strip())
    # rows whose subject cell contains "omit from v2" are excluded
    .loc[lambda frame: ~(frame["subjects"].str.contains("omit from v2"))]
)

In [23]:
# Some repository abbreviations differ between the Europe PMC data and DataCite.
# Map DataCite abbreviations (keys) to Europe PMC abbreviations (values).
# Abbreviations that are not listed here are used unchanged by the lookup that follows.
abbrevs_map = {
    "dbsnp": "refsnp",
    "ena.embl": "gen",
    "GO": "go",
    "insdc.gca": "gca",
    "pride": "pxd",
    "ega.dataset": "ega",
    "biomodels.db": "biomodels",
}

In [24]:
# Translate each abbreviation through abbrevs_map, falling back to the abbreviation itself
repos_mapped = df_repo_subj["abbreviation"].map(
    lambda abbrev: abbrevs_map.get(abbrev, abbrev)
)
# Abbreviation -> list of subjects; verify_integrity=True raises if two rows end up
# with the same abbreviation. Subjects are stored in one cell separated by "; ".
repo_subj_map = (
    df_repo_subj.set_index(repos_mapped, verify_integrity=True)["subjects"]
    .apply(lambda joined: joined.split("; "))
)


In [25]:
# Subjects implied by the cited repository, used only where no subjects are known yet
update_subjs = df_accession_match["cited_repository"].map(repo_subj_map)
df_accession_match["cited_subjects"] = (
    df_accession_match["cited_subjects"]
    # values that print as '[]' count as missing so they can be filled
    .apply(lambda subjects: np.nan if str(subjects) == '[]' else subjects)
    # existing subjects win; only missing ones are taken from the repository mapping
    .combine_first(update_subjs)
)

In [33]:
# Write the final table to CSV. The DataFrame index is written as the first,
# unnamed column because index=False is not passed.
df_accession_match.to_csv("../data/poster_acrl2025/europepmc_data_citations_northwestern_and_ucboulder.csv")