# NMNH data in GBIF

This workbook collects data from SRO, GBIF, Crossref, OpenAlex, and Altmetric as
the basis for a Tableau dashboard about NMNH data in GBIF. All API calls are
confined to this workbook; the resulting data is further processed in
2-combine-data.ipynb.

In [None]:
import json
import re
import time
from pathlib import Path

import pandas as pd
import requests
import requests_cache

# Optional GBIF download key. When this is not None, the download-metadata cell
# skips every download key that does not equal it.
DEBUG = None
# Contact email embedded in the User-Agent header sent with API requests. The setup
# cell raises a ValueError while this is empty.
EMAIL = ""
ORG_UUID = "bc092ff0-02e4-11dc-991f-b8a03c50a862"  # NMNH organization UUID

In [None]:
def get(*args, **kwargs):
    """Issues a GET through the shared cached session

    All arguments are forwarded unchanged to session.get(). Responses served
    from the cache are returned immediately. Fresh responses are logged (status
    code and URL) and followed by a short pause before being returned.
    """
    response = session.get(*args, **kwargs)
    if response.from_cache:
        return response

    # Only requests that actually hit the network are logged and throttled
    print(response.status_code, response.url)
    time.sleep(0.1)
    return response


def get_gbif(*args, **kwargs):
    """Gets results from all pages of a GBIF request

    Positional and keyword arguments are passed through to get(). Pages are
    requested by advancing the "offset" query parameter by each page's "limit"
    until a response reports "endOfRecords". A response that lacks that key is
    treated as the last page.

    The params mapping supplied by the caller is copied rather than modified, so
    the same dict can be reused for other requests without carrying an offset.

    Returns a dict shaped like a single GBIF page ("offset", "limit",
    "endOfRecords", "count", "results") whose "results" list combines the
    results from every page.
    """
    # Work on a private copy so paging never leaks an offset into the caller's dict
    params = dict(kwargs.get("params") or {})
    kwargs["params"] = params

    results = []
    while True:
        rec = get(*args, **kwargs).json()
        results.extend(rec["results"])
        if rec.get("endOfRecords", True):
            break
        # Pause between pages in addition to the delay get() applies to fresh
        # requests
        time.sleep(0.1)
        params["offset"] = params.get("offset", 0) + rec["limit"]
    return {
        "offset": 0,
        "limit": rec["count"],
        "endOfRecords": True,
        "count": rec["count"],
        "results": results,
    }


def filter_fn(resp):
    """Decides whether a response should be cached

    Returns True for every 200 response. A 404 response is accepted only when
    its body, stripped and lower-cased, exactly matches one of the known
    "not found" messages below. Every other response is rejected.
    """
    if resp.status_code == 200:
        return True
    if resp.status_code != 404:
        return False

    # Bodies are compared after strip() and lower(), so entries are lower case
    known_not_found = {
        "doi not found",
        "not found",
        "<!doctype html>\n<html lang=en>\n<title>404 not found</title>\n<h1>not found</h1>\n<p>the requested url was not found on the server. if you entered the url manually please check your spelling and try again.</p>",
    }
    body = resp.text.strip().lower()
    return body in known_not_found


def is_paleo(row):
    """Simplistically check if paleo data

    Returns True if "fossil" or "paleo" appears (case-insensitively) anywhere in
    the row. For a pandas Series the index labels and the raw values are
    searched. Any other object is searched via its str() representation.
    """
    if isinstance(row, pd.Series):
        # str() of a Series is a display representation that abbreviates long
        # values and long series, which can hide keywords in fields like a title
        # or abstract. Search the untruncated labels and values instead.
        parts = [str(label) for label in row.index]
        parts.extend(str(value) for value in row.tolist())
        val = " ".join(parts).lower()
    else:
        val = str(row).lower()
    return "fossil" in val or "paleo" in val

In [None]:
# Responses with a 200 or 404 status are candidates for caching; filter_fn then
# decides for each response whether it is actually stored
session = requests_cache.CachedSession(
    filter_fn=filter_fn,
    allowable_codes=(200, 404),
)

# Every request identifies the caller through the User-Agent header
if not EMAIL:
    raise ValueError("You must specify an email using the EMAIL constant")
user_agent = f"python-requests/{requests.__version__}/{EMAIL}"
headers = {"User-Agent": user_agent}

# Directories that receive the CSV files written by this workbook
data_dir = Path("data")
lit_dir = data_dir.joinpath("literature")
lit_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Get SRO data. Note that you must be on the domain to run this one.
resp = get(
    "https://staff.research.si.edu/export/srb_search_export_action.cfm",
    params={
        "search_term": "",
        "submit": "Export data",
        "format": "JSON",
        "Unit": "330000",
        "count": 100000,
    },
    headers=headers,
)

# The JSON provided by SRO includes improperly escaped double quotes. Clear those
# via a regex, first clearing pairs of quotes, then single quotes.
# NOTE(review): "." does not match newlines, so this presumably relies on each
# key-value pair sitting on its own line in the export - confirm.
text = re.sub(r'(: ".*?)"(.*?)"(.*?")', r"\1\2\3", resp.text)
text = re.sub(r'(: ".*?)"(.*?")', r"\1\2", text)

# Each exported item wraps its record in a "reference" mapping; take the first
# value of that mapping and keep only the DOI column
sro = pd.DataFrame([list(i["reference"].values())[0] for i in json.loads(text)])[
    ["doi"]
]
# Keep everything after the third slash (presumably the DOIs are stored as URLs
# such as https://doi.org/10.xxxx/yyyy - verify against the export)
sro["doi"] = sro["doi"].str.split("/", n=3, expand=True)[3]
sro = sro[~pd.isna(sro["doi"])]
# Write alongside the other outputs using the shared data directory
sro.to_csv(data_dir / "sro.csv", index=False)
sro_dois = set(sro["doi"])

In [None]:
# Fetch the GBIF datasets registered to the NMNH publishing organization. The
# request is cached for one week.
one_week = 7 * 24 * 60 * 60
rec = get_gbif(
    "https://api.gbif.org/v1/dataset/search",
    headers=headers,
    params={"publishingOrg": ORG_UUID},
    expire_after=one_week,
)

# Keep only the datasets whose title mentions NMNH and save them
datasets = pd.DataFrame(rec["results"])
title_has_nmnh = datasets["title"].str.contains("NMNH")
datasets = datasets[title_has_nmnh]
datasets.to_csv(data_dir / "datasets.csv", index=False)

# Organization title as GBIF reports it on the first remaining dataset
org_title = datasets["publishingOrganizationTitle"].iloc[0]
datasets

In [None]:
# Get stats for all downloads
rows = []
for key in datasets["key"]:
    # Request cached for one week. Sends the same identifying headers as the
    # other API calls in this workbook.
    rec = get_gbif(
        "https://api.gbif.org/v1/occurrence/download/statistics",
        params={"datasetKey": key},
        headers=headers,
        expire_after=7 * 24 * 60 * 60,
    )
    rows.extend(rec["results"])

# Total downloads and records per year. The two numeric columns are selected
# before summing so that unrelated columns are never aggregated.
stats = (
    pd.DataFrame(rows)
    .groupby("year")[["numberDownloads", "totalRecords"]]
    .sum()
    .reset_index()
)
stats.to_csv(data_dir / "downloads_all.csv", index=False)

In [None]:
# Get literature citing each dataset
for key in datasets["key"]:
    # Request cached for one week. Routed through get() so that fresh requests
    # are logged and throttled like the other API calls in this workbook.
    resp = get(
        "https://api.gbif.org/v1/literature/export",
        params={"format": "CSV", "gbifDatasetKey": key},
        headers=headers,
        expire_after=7 * 24 * 60 * 60,
    )
    # Fail loudly instead of saving an error body as if it were a CSV
    resp.raise_for_status()
    (lit_dir / (key + ".csv")).write_bytes(resp.content)

# Filter non-paleo papers from the paleo dataset. A boolean mask keeps the
# original columns and dtypes even when no row matches, and index=False avoids
# adding an unnamed index column to the file.
path = lit_dir / "c8681cc2-9d0a-4c5f-b620-5c753abfe2bc.csv"
lit = pd.read_csv(path)
keep = pd.Series(
    [is_paleo(r) for _, r in lit.iterrows()], index=lit.index, dtype=bool
)
lit[keep].to_csv(path, index=False)

In [None]:
# Get citations for each publication with a DOI
for path in lit_dir.glob("*.csv"):

    start_time = time.time()

    df = pd.read_csv(path).sort_values("gbif_download_key")

    # If a dataset has not been cited, the literature CSV will be empty. Remove
    # these files.
    if not len(df):
        path.unlink()
        continue

    rows = []
    for _, row in df.iterrows():

        # Identifiers are pipe-delimited. For records that specify a DOI (which is
        # most of them), use the first DOI to grab a citation, a citation count,
        # and an Altmetric score.
        doi = None
        if not pd.isna(row["identifiers"]):
            doi = next(
                (v for v in row["identifiers"].split("|") if v.startswith("10.")),
                None,
            )

        if doi is not None:

            # Check if DOI appears in SRO
            row["sro"] = doi in sro_dois

            # Get citation from Crossref
            resp = get(
                "https://citation.doi.org/format",
                params={
                    "doi": doi,
                    "style": "apa",
                    "lang": "en-US",
                },
                headers=headers,
            )
            if resp.text != "DOI not found":
                row["citation"] = resp.text.strip()

            # Get citation count from OpenAlex
            resp = get(
                f"https://api.openalex.org/works/doi:{doi}",
                params={"select": "cited_by_count,citation_normalized_percentile"},
                headers=headers,
            )
            # A body that starts with "<" is markup rather than JSON, so there
            # is no count to read
            if not resp.text.startswith("<"):
                row["cited_by_count"] = resp.json()["cited_by_count"]

            # Get Altmetric score. Altmetric only allow 1200 calls per day
            # to its counts-only endpoint without an API key, so this may
            # take some time.
            resp = get(
                f"https://api.altmetric.com/v1/doi/{doi}",
                headers=headers,
            )
            if resp.status_code not in (200, 404):
                # The API is supposed to throw a 429 error if you exceed the
                # daily limit. I've never seen it, but this should kill the
                # script if it shows up.
                raise ValueError(f"Invalid response: {resp.headers}")
            if resp.text != "Not Found":
                # Parse the body once and read both fields from the result
                data = resp.json()
                row["altmetric_id"] = data["details_url"].split("=")[-1]
                row["altmetric_score"] = data["score"]

        rows.append(row)

        # Report progress roughly every five seconds
        if time.time() - start_time >= 5:
            print(f"{len(rows):,} works processed")
            start_time = time.time()

    print(f"{len(rows):,} works processed")
    pd.DataFrame(rows).to_csv(path, index=False)

In [None]:
# Get metadata for GBIF downloads
keys = {}
downloads = []
start_time = time.time()
no_nmnh_records = {}
for path in lit_dir.glob("*.csv"):
    print(f"Processing {path.name}...")
    df = pd.read_csv(path).sort_values("gbif_download_key")
    for _, row in df.iterrows():

        # Filter non-paleo works from the paleo dataset
        if path.stem == "c8681cc2-9d0a-4c5f-b620-5c753abfe2bc" and not is_paleo(row):
            continue

        # Get additional info about each linked download
        has_nmnh = []
        if not pd.isna(row["gbif_download_key"]):

            for key in row["gbif_download_key"].split("|"):

                # When DEBUG is set, process only the matching download key
                if DEBUG is not None and key != DEBUG:
                    continue

                # The same download may be associated with multiple datasets, so
                # only check each key once. It does not seem to be possible to
                # determine which dataset records come from if multiple datasets are
                # associated with a download.
                if key in keys:
                    has_nmnh.append(key)
                    continue
                keys[key] = None

                download = {}

                # Get number of NMNH specimens in the dataset
                data = get_gbif(
                    f"https://api.gbif.org/v1/occurrence/download/{key}/organizations",
                    params={"organizationTitle": org_title},
                    headers=headers,
                )
                for result in data["results"]:
                    if result["organizationKey"] == ORG_UUID:
                        download["nmnhRecords"] = result["numberRecords"]
                        break
                else:
                    # Omit datasets that do not include NMNH records
                    continue

                has_nmnh.append(key)

                # Get general information about the dataset
                resp = get(
                    f"https://api.gbif.org/v1/occurrence/download/{key}",
                    headers=headers,
                )
                # Booleans and nested structures are stored as JSON strings so
                # each value fits in a single CSV cell
                download.update(
                    {
                        k: json.dumps(v) if isinstance(v, (bool, dict, list)) else v
                        for k, v in resp.json().items()
                    }
                )

                # Counts for SPECIES_LIST downloads are for the number of taxa, not
                # the number of occurrences. We can use the organizations endpoint to
                # get the number of occurrences. Not positive this is the best
                # approach, but it is consistent with the other download types.
                # The check searches the string form of the whole record, so it
                # matches wherever that text appears.
                if "SPECIES_LIST" in str(download):
                    data = get_gbif(
                        f"https://api.gbif.org/v1/occurrence/download/{key}/organizations",
                        headers=headers,
                    )
                    download["totalRecords"] = sum(
                        r["numberRecords"] for r in data["results"]
                    )

                downloads.append(download)

                # Report progress roughly every five seconds
                if time.time() - start_time >= 5:
                    print(f"{len(keys):,} downloads processed")
                    start_time = time.time()

                # Extra pause after a fresh metadata request, on top of the delay
                # applied inside get()
                if not resp.from_cache:
                    time.sleep(0.1)

        # Note papers that do not seem to link to any NMNH records for further review
        # NOTE: Discussed these with GBIF. Do not need to flag these records.
        # if not has_nmnh:
        #    key = row["identifiers"]
        #    if pd.isna(key):
        #        key = row["title"]
        #    try:
        #        no_nmnh_records[key]
        #    except KeyError:
        #        no_nmnh_records[key] = True
        #        print(f"{len(no_nmnh_records)}. {key}")

print(f"{len(keys):,} downloads processed")

downloads = pd.DataFrame(downloads).drop_duplicates()
downloads.to_csv(data_dir / "downloads_published.csv", index=False)