In [1]:
import sys

In [2]:
# Put the parent directory on the import path so modules that live there can
# be imported. Guarded so that re-running this cell does not keep appending
# duplicate ".." entries.
if ".." not in sys.path:
    sys.path.append("..")

In [3]:
from clean_doi import clean_doi, NoDoiException

def is_it_a_doi(doi: str) -> bool:
    """Report whether ``doi`` is accepted as a DOI by ``clean_doi``.

    Args:
        doi: Candidate identifier. Non-string values (for example the float
            NaN that pandas uses for empty CSV cells) are treated as
            "not a DOI".

    Returns:
        True if ``clean_doi(doi)`` completes without raising
        ``NoDoiException``; False if it raises that exception or if ``doi``
        is not a string.
    """
    # Guard first: how clean_doi reacts to non-string input is not visible
    # from here, and this predicate is applied element-wise to a DataFrame
    # column that may contain missing values.
    if not isinstance(doi, str):
        return False
    try:
        # Only success/failure matters; the cleaned value itself is not used.
        clean_doi(doi)
    except NoDoiException:
        return False
    return True

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load df_accession_match.csv into a DataFrame; after renaming, it becomes the
# left side of the merge further down.
df_accession_match = pd.read_csv("../data/poster_acrl2025/openalex_data_20250205/df_accession_match.csv")

In [6]:
# Load df_datacite_and_crossref_metadata.csv into a DataFrame; after renaming,
# it becomes the right side of the merge, and its "repository" column is used
# to fill in cited_repository.
df_datacite_and_crossref_metadata = pd.read_csv("../data/poster_acrl2025/df_datacite_and_crossref_metadata.csv")

In [9]:
# Column renames applied to df_accession_match (the left side of the merge),
# written as (original name, new name) pairs.
left_rename = dict(
    [
        # becomes the join key and the repository column
        ("accession_number", "target"),
        ("repository_europepmc", "cited_repository"),
        # everything below is renamed to a "citing_*" column
        ("DOI", "citing_doi"),
        ("openalex_id", "citing_openalex_id"),
        ("is_oa", "citing_is_oa"),
        ("type", "citing_openalex_type"),
        ("institutions_ror", "citing_institutions_ror"),
        ("primary_topic", "citing_primary_topic"),
        ("topic_subfield", "citing_topic_subfield"),
        ("topic_field", "citing_topic_field"),
        ("topic_domain", "citing_topic_domain"),
        ("funders", "citing_openalex_funders"),
        ("datasets", "citing_openalex_datasets"),
    ]
)

# Column renames applied to df_datacite_and_crossref_metadata (the right side
# of the merge), written as (original name, new name) pairs. "doi" is renamed
# to "target" so that it matches the join key on the left side.
right_rename = dict(
    [
        ("doi", "target"),
        ("has_affiliation_data", "doi_api_has_affils"),
        ("title", "cited_title"),
        ("subjects", "cited_subjects"),
        ("funders", "cited_funders"),
    ]
)

# Column names of the final table, in output order. The same list is used as
# a whitelist when trimming each side of the merge: only columns named here
# are kept.
columns = [
    "target",
    "PMCID",
    "EXTID",
    "SOURCE",
    "cited_repository",
    "cited_is_doi",
    "citing_doi",
    "cited_subjects",
    "cited_funders",
    "cited_title",
    "citing_openalex_id",
    "citing_is_oa",
    "citing_openalex_type",
    "citing_institutions_ror",
    "citing_primary_topic",
    "citing_topic_subfield",
    "citing_topic_field",
    "citing_topic_domain",
    "citing_openalex_funders",
    "citing_openalex_datasets",
    "affil_ucboulder",
    "affil_northwestern",
    "target_upper",
    "corpus_id",
    "corpus_has_affils",
    "doi_api_has_affils",
    "publication_year",
]

In [10]:
# Left side of the merge: rename the accession-match columns, then keep only
# the columns that appear in `columns` (in that order).
left = df_accession_match.rename(columns=left_rename)
# Take an explicit copy of the column subset: assigning a new column onto the
# result of a subset selection can otherwise raise SettingWithCopyWarning.
left = left[[col for col in columns if col in left.columns]].copy()
# Flag the rows whose target is accepted as a DOI by is_it_a_doi.
left["cited_is_doi"] = left["target"].apply(is_it_a_doi)

In [11]:
# Right side of the merge: rename the metadata columns, then keep only the
# columns that appear in `columns` (in that order). The callable passed to
# [] receives the renamed frame, so the membership test sees the new names.
right = (
    df_datacite_and_crossref_metadata
    .rename(columns=right_rename)
    [lambda frame: [name for name in columns if name in frame.columns]]
)

In [12]:
# Left join on "target": every row of `left` is kept, and the columns of
# `right` are attached wherever the target values match.
merged = left.merge(right, on="target", how="left")

In [13]:
# fill in repository information for cited DOIs (leave accession numbers alone)
#
# The repository has to be looked up by DOI. Passing the raw metadata column
# to fillna would align it on row index, and the row labels of `merged` are
# unrelated to the row labels of the metadata frame, so the values would end
# up on the wrong rows.
repository_by_doi = (
    df_datacite_and_crossref_metadata
    .dropna(subset=["doi"])          # a missing DOI cannot be a lookup key
    .drop_duplicates(subset="doi")   # Series.map needs a unique index
    .set_index("doi")["repository"]
)
merged["cited_repository"] = (
    merged["cited_repository"]
    .replace("doi", np.nan)          # "doi" is a placeholder, not a repository
    .fillna(merged["target"].map(repository_by_doi))
)

In [14]:
# rearrange columns
# Select the columns in the order given by `columns`. Every name in `columns`
# must exist in `merged` at this point; list-based selection raises KeyError
# for a missing column.
merged = merged[columns]