In [None]:
import json
import re
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
import yaml

In [None]:
def build_citation(row):
    """Build a fallback citation from a literature row's own metadata.

    The result has the form ``<authors> (<year>). <title>``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "authors", "published" and "title". Authors are
        pipe-delimited. Any of the three values may be missing (NaN/None).

    Returns
    -------
    str
        The assembled citation. A missing publication date is rendered as
        "n.d."; missing authors or title are simply left out.
    """

    def text(val):
        # Missing values become empty strings so they never show up in the
        # citation as "nan" and never break the year slice below
        return "" if pd.isna(val) else str(val)

    # Drop a leading literal "null " and turn the pipe delimiter into commas
    authors = re.sub("^null ", "", text(row["authors"])).replace("|", ", ")
    # The year is taken from the first four characters of the published value
    year = text(row["published"])[:4] or "n.d."
    # strip() removes the dangling space left when authors or title is empty
    return f"{authors} ({year}). {text(row['title'])}".strip()

def get_categories(pred, queried=None):
    """Get the sorted, de-duplicated categories of the fields in a predicate

    Each field returned by get_fields() is looked up in the module-level
    `cats` mapping. A field that is not in `cats` raises KeyError.
    """
    categories = set()
    for field in get_fields(pred, queried=queried):
        categories.add(cats[field])
    return sorted(categories)

def get_doi(val):
    """Return the first entry starting with "10." from a pipe-delimited string

    Returns NaN when no entry qualifies or when val is not a string
    (for example None or NaN).
    """
    try:
        candidates = val.split("|")
        # next() with a default stops at the first match, like the early return
        return next((item for item in candidates if item.startswith("10.")), np.nan)
    except (AttributeError, IndexError):
        return np.nan

def get_fields(pred, queried=None):
    """Get the filters applied to a GBIF download

    Walks a predicate tree and collects every "key" it finds together with
    the associated "value"/"values".

    Parameters
    ----------
    pred : str | dict | None
        A download request or a predicate, either as a JSON string or as an
        already-parsed object. A dict with a "predicate" entry is unwrapped
        to that entry before being inspected. Anything that is not a dict
        after unwrapping (for example a null predicate or a NaN cell)
        contributes no fields.
    queried : dict, optional
        Accumulator mapping each key to the list of values seen so far. It is
        updated in place, which is how the recursion shares its results.

    Returns
    -------
    list
        The keys collected in `queried`, in first-seen order.
    """
    if isinstance(pred, str):
        pred = json.loads(pred)
    if queried is None:
        queried = {}

    # Unwrap objects that nest their predicate under a "predicate" entry
    if isinstance(pred, dict):
        pred = pred.get("predicate", pred)
    # A null/missing predicate means there is nothing to collect
    if not isinstance(pred, dict):
        return list(queried)

    try:
        key = pred["key"]
        # A predicate carries either a single "value" or a list of "values"
        vals = [pred["value"]] if "value" in pred else pred["values"]
    except KeyError:
        # Not a key/value predicate; it may still have children
        pass
    else:
        queried.setdefault(key, []).extend(vals)

    # `or []` also covers an explicit null "predicates" entry
    for child in pred.get("predicates") or []:
        get_fields(child, queried)
    return list(queried)

def get_multivalue(df):
    """List the columns of a dataframe that hold multiple values per cell

    A column qualifies if any cell is a list or, failing that, if any string
    cell contains a pipe. Columns without string methods are skipped.
    """

    def holds_many(column):
        if any(isinstance(cell, list) for cell in column):
            return True
        try:
            return column.str.contains("|", regex=False).any()
        except AttributeError:
            # The .str accessor is unavailable for this column
            return False

    return [name for name in df.columns if holds_many(df[name])]

def format_keywords(val):
    """Replace underscores with spaces and titlecase the result"""
    words = val.split("_")
    return " ".join(words).title()

def format_list(vals):
    """Pipe-join a list of values, uppercase it and replace spaces with underscores"""
    joined = "|".join(vals)
    shouted = joined.upper()
    return shouted.replace(" ", "_")

def split_multivalue(df, key, field, out_dir="output"):
    """Split a multivalue field in a dataframe into multiple rows

    Writes a link table with one row per (key, value) pair to
    ``<out_dir>/sub_<field>.csv`` and removes the field from `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the `key` and `field` columns. The `field` column is
        deleted from this frame in place.
    key : str
        Name of the column identifying each row in the link table.
    field : str
        Name of the multivalue column. Cells may be lists, pipe-delimited
        strings, other iterables or single scalars.
    out_dir : str or path-like, optional
        Directory the link table is written to. Created if it does not exist.
        Defaults to "output".

    Returns
    -------
    pandas.DataFrame
        The same `df` object, without the `field` column.
    """

    def as_list(val):
        if isinstance(val, list):
            return val
        if isinstance(val, str):
            # An empty string holds no values
            return val.split("|") if val else []
        if pd.api.types.is_scalar(val):
            # Only scalars are tested for missingness: pd.isna() on an
            # array-like returns an array, which has no single truth value
            return val if pd.isna(val) else [val]
        return list(val)

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Rows missing either the key or the field cannot be linked
    links = df[[key, field]].dropna().copy()
    links[field] = links[field].apply(as_list)
    links = links.explode(field).drop_duplicates()
    links.to_csv(out_dir / f"sub_{field}.csv", index=False)

    # The column is removed from the passed-in frame itself, not from a copy
    del df[field]
    return df

In [None]:
# Directory the input files are read from
data_dir = Path("data")
# NOTE(review): KEEP is not referenced in the cells visible here; confirm
# whether it is still needed
KEEP = {"gbif_download_key", "dat_id"}

# Results are written to this directory
Path("output").mkdir(exist_ok=True)

# Mapping used by get_categories() to look up a category for each queried
# field. Read explicitly as UTF-8 so parsing does not depend on the platform's
# default encoding.
with open("categories.yml", encoding="utf-8") as f:
    cats = yaml.safe_load(f)

In [None]:
# Copy all downloads. The source is resolved through data_dir so this cell
# follows the same input location as the rest of the notebook.
shutil.copy2(data_dir / "downloads_all.csv", "output/downloads_all.csv")

In [None]:
# Combine literature for all NMNH datasets. Files are read in sorted order so
# the combined table does not depend on the order the filesystem lists them in.
dfs = []
for path in sorted((data_dir / "literature").glob("*.csv")):
    df = pd.read_csv(path)
    # The file name (without extension) is recorded as the dataset id
    df["dataset_id"] = path.stem
    dfs.append(df)
# ignore_index gives every row a unique label instead of repeating each
# file's own 0..n-1 index
lit = pd.concat(dfs, ignore_index=True)

# Get the DOI for all publications
lit["doi"] = lit["identifiers"].apply(get_doi)

# Flag citations that contain the literal text "[Dataset]"
lit["dataset"] = lit["citation"].str.contains("[Dataset]", regex=False)

# Truncate the author list
def truncate_authors(val):
    """Shorten a long author list at the start of a citation to "et al."

    The citation is split at the first parenthesized four-digit year. If the
    text before it contains more than two commas, only the text up to the
    first comma is kept, followed by " et al.".

    Parameters
    ----------
    val : str or missing
        Citation text. Missing values (NaN/None) are returned unchanged.

    Returns
    -------
    str or missing
        The stripped citation, shortened when the author list is long.
        Citations without a parenthesized year are returned stripped but
        otherwise unchanged.
    """
    if pd.isna(val):
        return val
    val = val.strip()
    # The capture group keeps the matched " (YYYY)" in the result, so a
    # successful split yields exactly three parts. maxsplit is passed by
    # keyword because passing it positionally is deprecated.
    parts = re.split(r"( *\(\d{4}\))", val, maxsplit=1)
    if len(parts) != 3:
        return val
    authors, year, rest = parts
    if authors.count(",") <= 2:
        return val
    return authors.split(",")[0] + " et al." + year + rest

lit["citation"] = lit["citation"].apply(truncate_authors)

# Build a citation from the row's own fields wherever none was supplied. The
# guard skips the row-wise apply when nothing is missing, because applying
# over an empty selection may return an empty frame rather than a Series.
cond = pd.isna(lit["citation"])
if cond.any():
    lit.loc[cond, "citation"] = lit.loc[cond].apply(build_citation, axis=1)

# Treat a missing sro value as False
lit["sro"] = lit["sro"].fillna(False)

# Remove extraneous fields
lit = lit[
    [
        "id",
        "doi",
        "citation",
        "gbif_download_key",
        "published",
        "dataset",
        "open_access",
        "peer_review",
        "topics",
        "sro",
        "cited_by_count",
        "altmetric_score",
    ]
]

# Create separate tables for multivalue fields. Each one is written to its own
# sub_<field>.csv keyed on the literature id and removed from the main table.
for key in get_multivalue(lit):
    lit = split_multivalue(lit, "id", key)

# Drop duplicate rows and save
lit = lit.drop_duplicates()
lit.to_csv("output/literature.csv", index=False)
lit

In [None]:
# Read occurrence metadata. These are custom downloads. Fields use file prefix.
downloads = pd.read_csv(data_dir / "downloads_published.csv")
downloads = downloads.rename(
    columns={"key": "gbif_download_key", "doi": "gbif_download_doi"}
)

# Parse each request once and reuse the parsed object for every derived
# column instead of decoding the same JSON string three times per row
parsed_requests = downloads["request"].apply(json.loads)

downloads["format"] = parsed_requests.apply(lambda req: req["format"])

# Empty lists are replaced with NaN so downloads without filters read as missing
downloads["fieldsQueried"] = parsed_requests.apply(get_fields)
downloads.loc[~downloads["fieldsQueried"].astype(bool), "fieldsQueried"] = np.nan

downloads["topicsQueried"] = parsed_requests.apply(get_categories)
downloads.loc[~downloads["topicsQueried"].astype(bool), "topicsQueried"] = np.nan

downloads = downloads[
    [
        "gbif_download_key",
        "gbif_download_doi",
        "created",
        "format",
        "fieldsQueried",
        "topicsQueried",
        "numberDatasets",
        "nmnhRecords",
        "totalRecords",
    ]
]

# Split multivalue fields into separate tables. Each one is written to its own
# sub_<field>.csv keyed on the download key and removed from the main table.
for key in get_multivalue(downloads):
    downloads = split_multivalue(downloads, "gbif_download_key", key)

# Drop duplicate rows and save
downloads = downloads.drop_duplicates()
downloads.to_csv("output/downloads_published.csv", index=False)
downloads

In [None]:
# Deprecated based on an email from the GBIF help desk but kept for posterity.
# The short reason why publications are linked to NMNH without including any
# downloads with NMNH records is that GBIF uses other data besides download keys
# to link records to providers. For example, a paper may use data from a previous
# paper that was itself based on GBIF data.

# The link table includes downloads that do not have any NMNH records. Remove those here.

#vals = set(downloads["gbif_download_key"])
#for path in Path("output").glob("sub_*.csv"):
#    links = pd.read_csv(path)
#    if "gbif_download_key" in links.columns:
#        links = links[links["gbif_download_key"].isin(vals)]
#        links.to_csv(path, index=False)

#vals = set(lit["id"])
#for path in Path("output").glob("sub_*.csv"):
#    links = pd.read_csv(path)
#    if "id" in links.columns:
#        links = links[links["id"].isin(vals)]
#        links.to_csv(path, index=False)

# The GBIF literature exports for NMNH datasets include several hundred papers that
# do not appear to cite datasets featuring NMNH specimens (although this conclusion
# is based on the results of another GBIF API). Moreover, there are incongruities
# between the information from the export API and the literature API, for example
# literature records that do not specify a GBIF download. For now, remove all papers
# that cannot be associated with a download.
# links = pd.read_csv("output/sub_gbif_download_key.csv")
# lit[lit["id"].isin(links["id"])].to_csv("output/literature.csv", index=False)

# These are the records that do not appear to be associated with datasets with
# NMNH specimens.
# lit[~lit["id"].isin(links["id"])]