In [None]:
#%%appyter init
# appyter's notebook integration; presumably what enables the `%%appyter` cell
# magics used throughout this notebook -- TODO confirm against appyter docs.
from appyter import magic
# Pass appyter a zero-argument callable that evaluates `globals()` on demand
# (the builtin is bound as the lambda's default argument and invoked inside it).
magic.init(lambda _=globals: _())

In [None]:
%%appyter hide_code

{% do SectionField(
    name='primary',
    title='C2M2 FAIR Assessment',
    subtitle='Assessing c2m2 datapackages for FAIRness',
    img='insignia.png',
) %}

{% set file = FileField(
    name='file',
    label='A zipped [C2M2 Datapackage](https://docs.nih-cfde.org/en/latest/c2m2/draft-C2M2_specification/)',
    help='Provide your zipped c2m2 datapackage',
    examples={'example.zip': url_for('static', path='example.zip')},
    default='example.zip',
    section='primary',
) %}


{% do SectionField(
    name='advanced',
    title='Advanced Configuration',
    subtitle='For tweaking the report',
) %}

{% set n_bins = IntField(
    name='n_bins',
    label='Number of bins for discretization of answers',
    help='When turning the continuous valued answer into a discrete bucket, how many bins to use',
    default=3,
    min=2,
    max=10,
    section='advanced',
) %}

{% set n_comments = IntField(
    name='n_comments',
    label='Number of top/bottom comments',
    help='When showing comments of unsatisfied answers, how many should be shown?',
    default=10,
    min=0,
    max=100,
    section='advanced',
) %}


# C2M2 Assessment

We perform a file-centric FAIR Assessment on all files defined in a [C2M2 datapackage](https://docs.nih-cfde.org/en/latest/c2m2/draft-C2M2_specification/) according to the [C2M2 Rubric](https://fairshake.cloud/rubric/36); descriptions of each metric and how we assess them are provided below, along with the actual code to perform the assessment.

In [None]:
import os
import re
import glob
import requests
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from textwrap import dedent
from IPython.display import display, Markdown

In [None]:
def mean(V):
    ''' Arithmetic mean of the values in {V}.

    {V} must support both `sum` and `len`; an empty {V} raises ZeroDivisionError.
    '''
    total = sum(V)
    count = len(V)
    return total / count

def one_and_only(it):
  ''' Select one and only item from an iterable, otherwise throw an exception.

  An empty iterable and an iterable with more than one item both raise a plain
  `Exception`. Previously the empty case leaked `StopIteration`, which turns into
  an opaque `RuntimeError` when it escapes inside a generator (as the metric
  functions are).
  '''
  it = iter(it)
  # a private sentinel distinguishes "exhausted" from a legitimate None item
  missing = object()
  ret = next(it, missing)
  if ret is missing:
    raise Exception('Expected one, got none')
  if next(it, missing) is not missing:
    raise Exception('Expected one')
  return ret

def deep_find(root, file):
  ''' Collect every path under {root}, at any depth, whose final component matches {file}.

  Returns the matches as a set of path strings.
  '''
  pattern = os.path.join(root, '**', file)
  matches = glob.glob(pattern, recursive=True)
  return set(matches)

def fetch_cache(url, filename, cachedir='.cached'):
  ''' Download a {file} from a {url} if it hasn't already been downloaded, storing it in {cachedir}.

  The download is written to a sibling `.part` file and only renamed to its final
  name once it completed, so an interrupted transfer can never be mistaken for a
  valid cache entry on the next call.

  Returns the path of the cached file. Download errors propagate to the caller.
  '''
  import os, urllib.request
  os.makedirs(cachedir, exist_ok=True)
  target = os.path.join(cachedir, filename)
  if not os.path.exists(target):
    partial = target + '.part'
    try:
      urllib.request.urlretrieve(url, filename=partial)
      # atomic within one filesystem: the cache entry appears fully written or not at all
      os.replace(partial, target)
    finally:
      # only still present when the download or the rename failed
      if os.path.exists(partial):
        os.remove(partial)
  return target

def url_join(*args):
  ''' Concatenate url fragments with single slashes.

  Trailing slashes are stripped from every fragment except the last one, which is
  appended untouched; leading slashes are left alone.
  '''
  pieces = [fragment.rstrip('/') for fragment in args[:-1]]
  pieces.append(args[-1])
  return '/'.join(pieces)

def filter_empty(val):
  ''' Drop values that are null in spirit: `None` itself, and strings that only
  spell out a null placeholder (compared after stripping whitespace and lowercasing).

  Returns a list of the values that were kept, in their original order.
  '''
  null_spellings = {
    '-',
    '-666',
    '',
    'empty',
    'n/a',
    'na',
    'nan',
    'nil',
    'none',
    'not defined',
    'null',
    'undef',
    'undefined',
  }
  kept = []
  for item in val:
    if item is None:
      continue
    # only exact `str` instances are inspected; everything else is kept as-is
    if type(item) == str and item.strip().lower() in null_spellings:
      continue
    kept.append(item)
  return kept

# results of already-evaluated lazy callbacks, keyed by the callback itself
_lazy = {}
def lazy(cb):
    ''' Wrap a zero-argument callback so it runs at most once.

    The first call of the returned function evaluates {cb} and stores the result
    in the module-level `_lazy` dict; later calls return the stored result.
    '''
    import functools
    @functools.wraps(cb)
    def wrapper():
        if cb in _lazy:
            return _lazy[cb]
        _lazy[cb] = cb()
        return _lazy[cb]
    return wrapper

## Step 1. Load DERIVA compatible client from URL or datapackage

Given a datapackage, access it through a DERIVA-compatible client. This client package <https://github.com/nih-cfde/deriva-datapackage> permits accessing offline datapackages in the same way that the online DERIVA client operates, thus the assessment can be performed online or offline.

In [None]:
%%appyter code_exec
import zipfile
import tempfile

# Value of the `file` field declared in the appyter form at the top of the notebook.
file = {{ file }}
basename, ext = os.path.splitext(file)
# The extension comparison is case-sensitive: only a lowercase `.zip` passes.
assert ext == '.zip', 'Expected .zip file'
# Fresh scratch directory; later cells also use it as the download cache and
# the final cell removes it.
directory = tempfile.mkdtemp()

# Unpack the whole archive into the scratch directory.
with zipfile.ZipFile(file, 'r') as z:
    z.extractall(directory)

In [None]:
from deriva_datapackage import create_offline_client

# sometimes zip files zip the leading directory, which may be named anything,
#  deep_find lets us locate the datapackage wherever it is.
# Every match for either accepted filename is passed as a positional argument
#  (set union, so a path is passed once even if both searches were to return it).
CFDE = create_offline_client(
    *(
        deep_find(directory, 'C2M2_datapackage.json')
        | deep_find(directory, 'datapackage.json')
    ),
    cachedir=directory,
)

## Step 2. Load External Ontologies for Validation

We download the most up to date ontologies from their public releases and load them with our `ontology_parsing.py` module which parses the ontology format and gathers the list of identifiers and synonyms so that we can validate terms.

In [None]:
from ontology_parsing import OBOOntology, CellosaurusOntology
# Each ontology is wrapped in `lazy`, so nothing is downloaded or parsed until the
# first metric calls e.g. `OBI()`; the result is then reused for every later call.
# Downloads are cached inside the temporary extraction directory.
OBI = lazy(lambda: OBOOntology.parse(fetch_cache('https://raw.githubusercontent.com/obi-ontology/obi/master/views/obi.obo', 'OBI.obo', cachedir=directory)))
# NOTE(review): the download is `uberon.obo` but it is cached as `uberon.owl`;
# looks like a leftover name -- confirm the parser does not care about the extension.
UBERON = lazy(lambda: OBOOntology.parse(fetch_cache('http://purl.obolibrary.org/obo/uberon.obo', 'uberon.owl', cachedir=directory)))
DOID = lazy(lambda: OBOOntology.parse(fetch_cache('https://github.com/DiseaseOntology/HumanDiseaseOntology/raw/main/src/ontology/releases/doid.obo', 'doid.obo', cachedir=directory)))
EDAM = lazy(lambda: OBOOntology.parse(fetch_cache('http://edamontology.org/EDAM.obo', 'EDAM.obo', cachedir=directory)))
# Disabled: the Taxonomy metric validates through `ncbi_taxon_client` instead of an OBO file.
# NCBITaxon = lazy(lambda: OBOOntology.parse(fetch_cache('http://purl.obolibrary.org/obo/ncbitaxon.obo', 'ncbitaxon.obo', cachedir=directory)))
Cellosaurus = lazy(lambda: CellosaurusOntology.parse(fetch_cache('ftp://ftp.expasy.org/databases/cellosaurus/cellosaurus.xml', 'cellosaurus.xml', cachedir=directory)))

## Step 3. Prepare C2M2 Rubric

We use a python decorator for registering each metric into the rubric. This lets us define each metric in its own cell with its description and code to assert it. All metric functions receive as parameters the file being assessed and the CFDE client for querying other information about that file and [*generate*](https://wiki.python.org/moin/Generators) compatible answers.

This paradigm can be used for any rubric allowing assessment code to remain the same even with changing metrics, furthermore this is compatible with [FAIRshake](https://fairshake.cloud/) assessments, adopting FAIRshake metric identifiers allowing the results to be easily registered with FAIRshake.

In [None]:
# The rubric being assessed. `@id` is the FAIRshake rubric id linked in the
# introduction; `metrics` starts empty and is filled by `_register_metric`,
# keyed by each metric's `@id`.
rubric = {
  '@id': 36,
  'name': 'NIH CFDE Interoperability',
  'description': 'This rubric identifies aspects of the metadata models which promote interoperable dataset querying and filtering',
  'metrics': {},
}

def _register_metric(schema):
  ''' Decorator factory registering a metric function into `rubric['metrics']`.

  {schema} is the metric description; it must carry `@id`, `name`, `description`
  and `detail`. Calling this function immediately renders the metric's
  documentation as Markdown in the cell output. The returned decorator stores
  the schema plus the decorated function (under the `func` key) in
  `rubric['metrics'][schema['@id']]`.

  The decorator hands the function back unchanged, so the decorated name is not
  silently rebound to None.
  '''
  def wrapper(func):
    rubric['metrics'][schema['@id']] = dict(schema, func=func)
    return func
  setattr(wrapper, '__name__', schema['name'])
  display(Markdown(dedent(f'''
    ### Metric ([{schema['@id']}](https://fairshake.cloud/metric/{schema['@id']})): {schema['name']}
    **{schema['description']}**

    {schema['detail']}
  ''')))
  return wrapper

In [None]:
@_register_metric({
  # standardized metadata format (107), machine readable metadata (106)
  # metadata license (117) (c2m2 ?)
  '@id': 106,
  'name': 'Metadata conformance',
  'description': 'The metadata properly conforms with the CFDE perscribed metadata model specification',
  'detail': '''Starting from a file, traverse all associated tables and calculate a ratio of missing fields vs complete of fields. 0.25 * (file_complete + biosample_complete + subject_complete + project_complete) where x_complete is n_fields_with_values / n_fields for field in all_records.''',
  'principle': 'Findable',
})
def _(file, CFDE=None, **kwargs):
  ''' Score a file by how completely it and its related records are filled in.

  For every record, completeness is the number of values surviving `filter_empty`
  divided by the number of fields. The yielded value is the plain mean of four
  components: the file itself, its biosamples, its subjects and its chain of
  projects. A component with no records contributes 0.
  '''
  # a fresh query for this exact file each time it is needed
  file_query = lambda: CFDE.tables['file'].filter((
    CFDE.tables['file'].id_namespace == file['id_namespace']
  ) & (
    CFDE.tables['file'].local_id == file['local_id']
  ))
  # 25% file completeness
  file_completeness = [len(list(filter_empty(file.values()))) / len(file.keys())]
  # 25% biosample completeness
  biosample_completeness = []
  # file -> file_describes_biosample -> biosample
  biosamples = file_query().link(
    CFDE.tables['file_describes_biosample'], on=((
      CFDE.tables['file'].id_namespace == CFDE.tables['file_describes_biosample'].file_id_namespace
    ) & (
      CFDE.tables['file'].local_id == CFDE.tables['file_describes_biosample'].file_local_id
    ))
  ).link(
    CFDE.tables['biosample'], on=((
      CFDE.tables['file_describes_biosample'].biosample_id_namespace == CFDE.tables['biosample'].id_namespace
    ) & (
      CFDE.tables['file_describes_biosample'].biosample_local_id == CFDE.tables['biosample'].local_id
    ))
  ).entities()
  for biosample in biosamples:
    biosample_completeness.append(
      len(list(filter_empty(biosample.values()))) / len(biosample.keys())
    )
  # 25% subject completeness
  subject_completeness = []
  # file -> file_describes_subject -> subject
  subjects = file_query().link(
    CFDE.tables['file_describes_subject'], on=((
       CFDE.tables['file'].id_namespace == CFDE.tables['file_describes_subject'].file_id_namespace
    ) & (
       CFDE.tables['file'].local_id == CFDE.tables['file_describes_subject'].file_local_id
    ))
  ).link(
    CFDE.tables['subject'], on=((
       CFDE.tables['file_describes_subject'].subject_id_namespace == CFDE.tables['subject'].id_namespace
    ) & (
       CFDE.tables['file_describes_subject'].subject_local_id == CFDE.tables['subject'].local_id
    ))
  ).entities()
  for subject in subjects:
    subject_completeness.append(
      len(list(filter_empty(subject.values()))) / len(subject.keys())
    )
  # 25% project completeness
  # keyed by the project's joined namespace/local id, so each project is counted once
  project_completeness = {}
  # start at the project the file points to
  projects = CFDE.tables['project'].filter((
    CFDE.tables['project'].id_namespace == file['project_id_namespace']
  ) & (
    CFDE.tables['project'].local_id == file['project_local_id']
  ))
  project_entities = list(projects.entities())
  # climb project_in_project from child to parent until no parent is returned;
  # one_and_only raises if any step returns more than one project
  while project_entities:
    project = one_and_only(project_entities)
    project_completeness[url_join(project['id_namespace'], project['local_id'])] = len(list(filter_empty(project.values()))) / len(project.keys())
    # p1 = current project, pip = membership row, p2 = parent project
    p1, pip, p2 = CFDE.tables['project'].alias('p1'), CFDE.tables['project_in_project'].alias('pip'), CFDE.tables['project'].alias('p2')
    path = p1.path.filter(((p1.id_namespace == project['id_namespace']) & (p1.local_id == project['local_id'])))
    path = path.link(pip, on=((path.p1.id_namespace == pip.child_project_id_namespace) & (path.p1.local_id == pip.child_project_local_id)))
    path = path.link(p2, on=((path.pip.parent_project_id_namespace == p2.id_namespace) & (path.pip.parent_project_local_id == p2.local_id)))
    projects = path
    project_entities = list(projects.entities())
  # collapse each component to its mean; the names are rebound from collections to floats
  file_completeness = (sum(file_completeness) / len(file_completeness)) if file_completeness else 0.
  biosample_completeness = sum(biosample_completeness) / len(biosample_completeness) if biosample_completeness else 0.
  subject_completeness = sum(subject_completeness) / len(subject_completeness) if subject_completeness else 0.
  project_completeness = sum(project_completeness.values()) / len(project_completeness) if project_completeness else 0.
  complete_completeness = mean([
    file_completeness,
    biosample_completeness,
    subject_completeness,
    project_completeness,
  ])
  # a single answer per file
  yield {
    'value': complete_completeness,
    'comment': 'Computed based on completeness of file ({:.2f}) and associated biosample ({:.2f}), subject ({:.2f}), and projects ({:.2f})'.format(
      file_completeness,
      biosample_completeness,
      subject_completeness,
      project_completeness,
    )
  }


In [None]:
@_register_metric({
  '@id': 136,
  'name': 'Program name',
  'description': 'Program name is available for querying',
  'detail': '''From a given file, find the root project and ensure it corresponds to a valid DCC.''',
  'principle': 'Findable',
})
def _(file, CFDE=None, **kwargs):
  ''' Walk from the file's project up to the top-most project and score it.

  Yields 1 when the root project's abbreviation is a known program, 0.75 when a
  root exists but is not recognised, and 0.0 when no project could be found.
  '''
  known_programs = {'4DN', 'GTEx', 'HMP', 'KidsFirst', 'LINCS', 'Metabolomics', 'MoTrPAC'}
  # the program name is the root project
  root = None
  start = CFDE.tables['project'].filter((
    CFDE.tables['project'].id_namespace == file['project_id_namespace']
  ) & (
    CFDE.tables['project'].local_id == file['project_local_id']
  ))
  level = list(start.entities())
  # each pass moves one step up the project_in_project hierarchy
  while level:
    root = one_and_only(level)
    child = CFDE.tables['project'].alias('p1')
    membership = CFDE.tables['project_in_project'].alias('pip')
    parent = CFDE.tables['project'].alias('p2')
    climb = child.path.filter(((child.id_namespace == root['id_namespace']) & (child.local_id == root['local_id'])))
    climb = climb.link(membership, on=((climb.p1.id_namespace == membership.child_project_id_namespace) & (climb.p1.local_id == membership.child_project_local_id)))
    climb = climb.link(parent, on=((climb.pip.parent_project_id_namespace == parent.id_namespace) & (climb.pip.parent_project_local_id == parent.local_id)))
    level = list(climb.entities())
  # at this point 'root' contains the top level project (or None)
  if root is None:
    answer = {
      'value': 0.0,
      'comment': 'Could not identify top level project',
    }
  elif root['abbreviation'] in known_programs:
    answer = {
      'value': 1,
      'comment': 'Identified known program {}'.format(root['name'])
    }
  else:
    answer = {
      'value': 0.75,
      'comment': 'Identified unknown top level project {}'.format(root['name'])
    }
  yield answer


In [None]:
@_register_metric({
  '@id': 137,
  'name': 'Project name',
  'description': 'Project name is available for querying',
  'detail': '''Ensure the direct parent project for a given file is available.''',
  'principle': 'Findable',
})
def _(file, CFDE=None, **kwargs):
  ''' Score whether the file's direct parent project exists and carries a name.

  Yields 1 for a named project, 0.5 for a project without a name and 0 when the
  referenced project is absent from the project table. The absent case used to
  abort the whole assessment instead of producing an answer. More than one
  matching project still raises through `one_and_only`.
  '''
  # the project name is the direct parent project
  matches = list(
    CFDE.tables['project'].filter((
      CFDE.tables['project'].id_namespace == file['project_id_namespace']
    ) & (
      CFDE.tables['project'].local_id == file['project_local_id']
    )).entities()
  )
  if not matches:
    yield {
      'value': 0,
      'comment': 'Project referenced by the file was not found'
    }
    return
  project = one_and_only(matches)
  if project.get('name'):
    yield {
      'value': 1,
      'comment': 'Identified project: {}'.format(project['name'])
    }
  else:
    yield {
      'value': 0.5,
      'comment': 'Project identified, but it had no name'
    }


In [None]:
@_register_metric({
  '@id': 27,
  'name': 'PI Contact',
  'description': 'PI Contact is available for dataset',
  'detail': '''Ensure primary_dcc_contact is present for the file and it's not empty.''',
  'principle': 'Reusable',
})
def _(file, CFDE=None, **kwargs):
  ''' Score the availability of contact information for the file's DCC.

  Yields 0.75 when a contact email exists, 0.5 when only a DCC url exists and 0
  when neither is present or no single contact record could be looked up.
  '''
  # Only the lookup is guarded, and only against ordinary errors: a bare `except:`
  # around the yields would also swallow GeneratorExit / KeyboardInterrupt.
  try:
    # NOTE(review): this matches the contact's project_id_namespace against the
    # file's own id_namespace rather than file['project_id_namespace'] -- confirm
    # that is intended.
    contact = one_and_only(
      CFDE.tables['primary_dcc_contact'].filter(
        CFDE.tables['primary_dcc_contact'].project_id_namespace == file['id_namespace']
      ).entities()
    )
    has_email = bool(contact.get('contact_email'))
    has_url = bool(contact.get('dcc_url'))
  except Exception:
    has_email = has_url = False
  if has_email:
    yield {
      'value': 0.75,
      'comment': 'Contact email found, possibly PI'
    }
  elif has_url:
    yield {
      'value': 0.5,
      'comment': 'DCC website available, contact information might be discoverable'
    }
  else:
    yield {
      'value': 0,
      'comment': 'No contact information was located for this file'
    }


In [None]:
@_register_metric({
  '@id': 138,
  'name': 'Responsible institution',
  'description': 'The institution that created this dataset is available',
  'detail': '''This is not available in the current iteration of the C2M2.''',
  'principle': 'Findable',
})
def _(file, CFDE=None, **kwargs):
  ''' Constant answer: every file scores 0 for this metric; the arguments are not inspected.
  '''
  answer = dict(
    value=0,
    comment='No information about the contributing institution is available in the C2M2 Level 1',
  )
  yield answer


In [None]:
@_register_metric({
  # Access protocol (110)
  '@id': 110,
  'name': 'Access protocol',
  'description': 'The protocol for accessing the data is available and described with a URI',
  'detail': '''This is not available in the current iteration of the C2M2.''',
  'principle': 'Accessible',
})
def _(file, CFDE=None, **kwargs):
  ''' Constant answer: every file scores 0 for this metric; the arguments are not inspected.
  '''
  answer = dict(
    value=0,
    comment='The C2M2 Level 1 does not provide a means of capturing information about file access',
  )
  yield answer


In [None]:
@_register_metric({
  '@id': 139,
  'name': 'Assay',
  'description': 'Assay is present and a proper CFDE-specified ontological term is found in the CFDE-specified ontologies.',
  'detail': '''Ensure the assay_type is in the latest version of OBI.''',
  'principle': 'Interoperable',
})
def _(file, CFDE=None, **kwargs):
  ''' Score the file's `assay_type`: 0.0 when missing, 1 when OBI knows the term,
  0.5 when a term is present but OBI does not.
  '''
  # TODO: check names
  assay_type = file.get('assay_type')
  if not assay_type:
    yield {
      'value': 0.0,
      'comment': 'No assay_type found associated with the file',
    }
    return
  # the ontology is only loaded once a term actually needs checking
  if OBI().get(assay_type) is not None:
    score, remark = 1, 'Ontological IRI for Assay found in OBI.'
  else:
    score, remark = 0.5, 'Assay found but not verified in OBI.'
  yield {
    'value': score,
    'comment': remark,
    'url_comment': assay_type,
  }


In [None]:
@_register_metric({
  '@id': 140,
  'name': 'Anatomical Part',
  'description': 'An anatomical part is present and the CFDE-specified ontological term is found in the CFDE-specified ontologies',
  'detail': '''For each file, ensure we can find at least one anatomy term and any anatomy that is found can be associated with a file (through biosample), ensure it's present in the latest version of UBERON.''',
  'principle': 'Interoperable',
})
def _(file, CFDE=None, **kwargs):
  ''' Yield one answer per biosample described by the file, scoring its `anatomy`
  term against UBERON; a single 0.0 answer is yielded when the file describes no
  biosample at all.
  '''
  # TODO: check names
  described = CFDE.tables['file_describes_biosample'].filter((
    CFDE.tables['file_describes_biosample'].file_id_namespace == file['id_namespace']
  ) & (
    CFDE.tables['file_describes_biosample'].file_local_id == file['local_id']
  )).link(
    CFDE.tables['biosample'], on=((
      CFDE.tables['file_describes_biosample'].biosample_id_namespace == CFDE.tables['biosample'].id_namespace
    ) & (
      CFDE.tables['file_describes_biosample'].biosample_local_id == CFDE.tables['biosample'].local_id
    ))
  )
  biosamples = list(described.entities())
  if not biosamples:
    yield {
      'value': 0.0,
      'comment': 'No biosamples found described by the file',
    }
    return
  for biosample in biosamples:
    anatomy = biosample.get('anatomy')
    if not anatomy:
      yield {
        'value': 0.0,
        'comment': 'No anatomy found on the biosample',
      }
      continue
    if UBERON().get(anatomy) is not None:
      score, remark = 1, 'Ontological IRI for Anatomy found in UBERON.'
    else:
      score, remark = 0.5, 'Anatomy found but not verified in UBERON.'
    yield {
      'value': score,
      'comment': remark,
      'url_comment': anatomy,
    }


In [None]:
@_register_metric({
  '@id': 141,
  'name': 'Disease',
  # NOTE(review): the detail text below says the metric is unavailable, yet the
  # function does assess diseases when a `subject_disease` table exists.
  'description': 'A disease is present and the CFDE-specified ontological term is found in the CFDE-specified ontologies',
  'detail': '''This is not available in the current iteration of the C2M2.''',
  'principle': 'Interoperable',
})
def _(file, CFDE=None, **kwargs):
  ''' Yield one answer per disease association reachable from the file, through its
  subjects and through its biosamples, scored against the Disease Ontology.

  When the datapackage has no `subject_disease` table a single 0.0 answer is
  yielded. When the table exists but no association rows are found, nothing is
  yielded.
  '''
  # NOTE(review): only `subject_disease` is checked here, but `biosample_disease`
  # is accessed below as well -- presumably both always ship together; confirm.
  if 'subject_disease' in CFDE.tables:
    # TODO: check names
    path = CFDE.tables['file'].filter((CFDE.tables['file'].id_namespace == file['id_namespace']) & (CFDE.tables['file'].local_id == file['local_id']))
    # file -> file_describes_subject -> subject -> subject_disease
    subject_path = path.link(
      CFDE.tables['file_describes_subject'], on=((
        CFDE.tables['file'].id_namespace == CFDE.tables['file_describes_subject'].file_id_namespace
      ) & (
        CFDE.tables['file'].local_id == CFDE.tables['file_describes_subject'].file_local_id
      ))
    )
    subject_path = subject_path.link(
      CFDE.tables['subject'], on=((
        CFDE.tables['file_describes_subject'].subject_id_namespace == CFDE.tables['subject'].id_namespace
      ) & (
        CFDE.tables['file_describes_subject'].subject_local_id == CFDE.tables['subject'].local_id
      ))
    )
    subject_path = subject_path.link(
      CFDE.tables['subject_disease'], on=((
        CFDE.tables['subject'].id_namespace == CFDE.tables['subject_disease'].subject_id_namespace
      ) & (
        CFDE.tables['subject'].local_id == CFDE.tables['subject_disease'].subject_local_id
      ))
    )
    # file -> file_describes_biosample -> biosample -> biosample_disease
    # NOTE(review): both chains start from the same `path` object; this assumes
    # `link` returns a new path rather than extending `path` in place -- confirm.
    biosample_path = path.link(
      CFDE.tables['file_describes_biosample'], on=((
        CFDE.tables['file'].id_namespace == CFDE.tables['file_describes_biosample'].file_id_namespace
      ) & (
        CFDE.tables['file'].local_id == CFDE.tables['file_describes_biosample'].file_local_id
      ))
    )
    biosample_path = biosample_path.link(
      CFDE.tables['biosample'], on=((
        CFDE.tables['file_describes_biosample'].biosample_id_namespace == CFDE.tables['biosample'].id_namespace
      ) & (
        CFDE.tables['file_describes_biosample'].biosample_local_id == CFDE.tables['biosample'].local_id
      ))
    )
    biosample_path = biosample_path.link(
      CFDE.tables['biosample_disease'], on=((
        CFDE.tables['biosample'].id_namespace == CFDE.tables['biosample_disease'].biosample_id_namespace
      ) & (
        CFDE.tables['biosample'].local_id == CFDE.tables['biosample_disease'].biosample_local_id
      ))
    )
    # the loop variable rebinds `path`; the file-level path is not used past this point
    for label, path in [('biosample', biosample_path), ('subject', subject_path)]:
      for entity in path.entities():
        disease = entity.get('disease')
        if not disease:
          yield {
            'value': 0.0,
            'comment': f'No disease found attached to the {label}',
          }
        elif DOID().get(disease) is not None:
          yield {
            'value': 1,
            'comment': f"Ontological IRI for disease associated with {label} found in Disease Ontology.",
            'url_comment': disease,
          }
        else:
          yield {
            'value': 0.5,
            'comment': f"Disease found in {label} but not verified in Disease Ontology.",
            'url_comment': disease,
          }
  else:
    yield {
      'value': 0.0,
      'comment': 'Disease information not supported by this version of C2M2',
    }

In [None]:
@_register_metric({
  '@id': 142,
  'name': 'File type',
  'description': 'A file type is present and the CFDE-specified ontological term is found in the CFDE-specified ontologies',
  'detail': '''Ensure the file_format & data_type is in the latest version of EDAM.''',
  'principle': 'Interoperable',
})
def _(file, CFDE=None, **kwargs):
  ''' Yield two answers per file, one for `file_format` and one for `data_type`:
  0.0 when the field is empty, 1 when `EDAM_<term>` is known to EDAM, 0.5 otherwise.

  Both fields are read from the file record, so the "missing" message now says
  "on the file" (it used to say "on the biosample").
  '''
  # TODO: check names
  for term_type, term in [('file format', file.get('file_format')), ('data type', file.get('data_type'))]:
    if not term:
      yield {
        'value': 0.0,
        'comment': 'No {} found on the file'.format(term_type),
      }
    elif EDAM().get("EDAM_{term}".format(term=term)) is not None:
      yield {
        'value': 1,
        'comment': 'Ontological IRI for {} found in EDAM.'.format(term_type),
        'url_comment': term,
      }
    else:
      yield {
        'value': 0.5,
        'comment': '{} found but not verified in EDAM.'.format(term_type.capitalize()),
        'url_comment': term,
      }


In [None]:
@_register_metric({
  '@id': 143,
  'name': 'Taxonomy',
  'description': 'A taxonomy is present and the CFDE-specified ontological term is found in the CFDE-specified ontologies',
  'detail': '''For each file, ensure we can find at least one taxonomy term and any taxonomy that is found can be associated with a file (through subject & subject_role_taxonomy), ensure it's present in the latest version of NCBI.''',
  'principle': 'Interoperable',
})
def _(file, CFDE=None, ncbi_taxon_client=None, **kwargs):
  ''' Yield one answer per subject_role_taxonomy row reachable from the file.

  0 when the row has no taxonomy id, 1 when the id carries the `NCBI:txid` prefix
  and `ncbi_taxon_client.fetch` returns something for it, 0.5 in every other case.
  Nothing is yielded when no rows are found.
  '''
  # TODO: check names
  prefix = 'NCBI:txid'
  # file -> file_describes_subject -> subject -> subject_role_taxonomy
  taxonomy_rows = CFDE.tables['file'].filter(
    (CFDE.tables['file'].id_namespace == file['id_namespace']) & (CFDE.tables['file'].local_id == file['local_id'])
  ).link(
    CFDE.tables['file_describes_subject'], on=((
      CFDE.tables['file'].id_namespace == CFDE.tables['file_describes_subject'].file_id_namespace
    ) & (
      CFDE.tables['file'].local_id == CFDE.tables['file_describes_subject'].file_local_id
    ))
  ).link(
    CFDE.tables['subject'], on=((
      CFDE.tables['file_describes_subject'].subject_id_namespace == CFDE.tables['subject'].id_namespace
    ) & (
      CFDE.tables['file_describes_subject'].subject_local_id == CFDE.tables['subject'].local_id
    ))
  ).link(
    CFDE.tables['subject_role_taxonomy'], on=((
      CFDE.tables['subject'].id_namespace == CFDE.tables['subject_role_taxonomy'].subject_id_namespace
    ) & (
      CFDE.tables['subject'].local_id == CFDE.tables['subject_role_taxonomy'].subject_local_id
    ))
  )
  for entity in taxonomy_rows.entities():
    if entity.get('taxonomy_id') is None:
      yield {
        'value': 0,
        'comment': 'Taxonomy is not present in subject_role_taxonomy',
      }
      continue
    # the client is only consulted for ids that carry the NCBI prefix
    validated = (
      entity['taxonomy_id'].startswith(prefix)
      and ncbi_taxon_client.fetch(entity['taxonomy_id'][len(prefix):]) is not None
    )
    if validated:
      yield {
        'value': 1,
        'comment': 'Taxonomy is present and validated in ncbi',
        'url_comment': entity['taxonomy_id'],
      }
    else:
      yield {
        'value': 0.5,
        'comment': 'Taxonomy is present but not NCBI',
        'url_comment': entity['taxonomy_id'],
      }


In [None]:
@_register_metric({
  '@id': 144,
  'name': 'Cell Line',
  'description': 'A cell line is present and the CFDE-specified ontological term is found in the CFDE-specified ontologies',
  'detail': '''For each file, ensure we can find at least one subject corresponding to a cell line and that cell line's name is present in Cellosaurus.''',
  'principle': 'Interoperable',
})
def _(file, CFDE=None, **kwargs):
  ''' Yield one answer per cell-line subject described by the file, scored by how
  well its `persistent_id` / `name` can be found in Cellosaurus. Nothing is
  yielded when the file describes no cell-line subject.
  '''
  # file -> file_describes_subject -> subject, restricted to cell lines
  # https://github.com/nih-cfde/specifications-and-documentation/blob/master/draft-C2M2_internal_CFDE_CV_tables/subject_granularity.tsv#L2
  cell_line_subjects = CFDE.tables['file'].filter(
    (CFDE.tables['file'].id_namespace == file['id_namespace']) & (CFDE.tables['file'].local_id == file['local_id'])
  ).link(
    CFDE.tables['file_describes_subject'], on=((
      CFDE.tables['file'].id_namespace == CFDE.tables['file_describes_subject'].file_id_namespace
    ) & (
      CFDE.tables['file'].local_id == CFDE.tables['file_describes_subject'].file_local_id
    ))
  ).link(
    CFDE.tables['subject'], on=((
      CFDE.tables['file_describes_subject'].subject_id_namespace == CFDE.tables['subject'].id_namespace
    ) & (
      CFDE.tables['file_describes_subject'].subject_local_id == CFDE.tables['subject'].local_id
    ))
  ).filter(
    CFDE.tables['subject'].granularity == 'cfde_subject_granularity:4'
  ).subject
  for cell_line in cell_line_subjects.entities():
    # lookup by persistent id, only when the record has that key
    if 'persistent_id' in cell_line:
      by_id = Cellosaurus().get(cell_line['persistent_id'])
    else:
      by_id = None
    if by_id and cell_line.get('name') == by_id.get('name') and cell_line.get('name') is not None:
      answer = {
        'value': 1,
        'comment': 'Ontological IRI for cell line and term match what is found in Cellosaurus.',
        'url_comment': cell_line['persistent_id']
      }
    elif by_id is not None:
      answer = {
        'value': 0.75,
        'comment': 'Ontological IRI for cell line was found in Cellosaurus.',
        'url_comment': cell_line['persistent_id']
      }
    elif 'name' in cell_line and Cellosaurus().get(cell_line['name']):
      # the identifier was stored in the name field instead
      answer = {
        'value': 0.75,
        'comment': 'Ontological IRI found in Cellosaurus was in the cell_line name field.',
        'url_comment': cell_line['name'],
      }
    elif 'name' in cell_line:
      answer = {
        'value': 0.5,
        'comment': 'Cell line found but not in Cellosaurus',
        'url_comment': cell_line.get('name', ''),
      }
    else:
      answer = {
        'value': 0,
        'comment': 'Cell line found but missing any information',
      }
    yield answer


In [None]:
@_register_metric({
  # License (116)
  '@id': 116,
  'name': 'Data Usage License',
  'description': 'A Data usage license is described',
  'detail': '''This is not available in the current iteration of the C2M2.''',
  'principle': 'Reusable',
})
def _(file, CFDE=None, **kwargs):
  ''' Constant answer: every file scores 0 for this metric; the arguments are not inspected.
  '''
  answer = dict(
    value=0,
    comment='No information about data usage licenses are described in the C2M2 Level 1',
  )
  yield answer


In [None]:
@_register_metric({
  # Persistent identifier (105)
  # NOTE(review): the comment above says 105 while the registered id is 104 -- confirm which is right.
  '@id': 104,
  'name': 'Persistent identifier',
  'description': 'Globally unique, persistent, and valid identifiers (preferrably DOIs) are present for the dataset',
  'detail': '''We check that the persistent id is present and whether or not it is a DOI.''',
  'principle': 'Findable',
})
def _(file, CFDE=None, **kwargs):
  ''' Score the file's `persistent_id`: 1 for a doi.org URL, 0.5 for any other
  non-empty identifier, 0 when it is missing.

  Both the bare resolver host (`https://doi.org/...`) and its subdomains
  (`https://dx.doi.org/...`) count as DOIs; the previous pattern demanded a
  subdomain and therefore rejected the canonical `https://doi.org/` form.
  '''
  persistent_id = file.get('persistent_id')
  if persistent_id:
    # optional "<subdomain>." in front of doi.org, then a non-empty path
    if re.match(r'^https?://([^/]+\.)?doi\.org/.+$', persistent_id):
      yield {
        'value': 1,
        'comment': 'A DOI was identified in the persistent_id',
        'url_comment': persistent_id,
      }
    else:
      yield {
        'value': 0.5,
        'comment': 'A persistent_id was identified but it is not a doi',
        'url_comment': persistent_id,
      }
  else:
    yield {
      'value': 0,
      'comment': 'No persistent_id defined'
    }


In [None]:
@_register_metric({
  # Resource identifier (108)
  '@id': 108,
  'name': 'Resource identifier',
  'description': 'An identifier for the resource is present',
  'detail': '''Likely guaranteed by the c2m2 model, checks for presence of local_id.''',
  'principle': 'Findable',
})
def _(file, CFDE=None, **kwargs):
  ''' Score 1 when the file record has both a `local_id` and an `id_namespace`
  key, 0 otherwise. Only the presence of the keys is checked, not their values.
  '''
  required_keys = {'local_id', 'id_namespace'}
  if not (file.keys() >= required_keys):
    yield {
      'value': 0,
      'comment': 'An id and namespace were not present for the resource',
    }
    return
  yield {
    'value': 1,
    'comment': 'An id and namespace were provided for the resource',
    'url_comment': '{} {}'.format(file['local_id'], file['id_namespace']),
  }


In [None]:
@_register_metric({
  '@id': 145,
  'name': 'Landing Page',
  'description': 'A landing page exists and is accessible for the identifiers',
  'detail': '''Checks to make sure the persistent_id is resolvable with a HEAD request. if it is not http/https it is assumed to be an identifiers.org-resolvable CURIE. note that this is still error prone, some identifier websites do not follow HTTP standards and may not report 404s with ids that aren't found.''',
  'principle': 'Findable',
})
def _(file, CFDE=None, **kwargs):
  ''' Issue a HEAD request against the file's `persistent_id` and score the outcome.

  Yields exactly one answer per file: 0 without a persistent_id, 1 for a 2xx
  response, 0.5 for any 3xx response, 0.25 for every other status and for
  request errors (including timeouts).

  Changes from the previous version: the request carries a timeout so a silent
  server cannot stall the assessment; the 3xx range is the full `300 <= code < 400`
  (399 used to be excluded); statuses outside 2xx/3xx always produce an answer
  (1xx and 399 used to produce none).
  '''
  persistent_id = file.get('persistent_id')
  if not persistent_id:
    yield {
      'value': 0,
      'comment': 'A persistent_id was not provided for the resource',
    }
    return
  # anything that is not already an http(s) url is treated as a CURIE
  if not re.match(r'^https?://', persistent_id):
    persistent_id = 'https://identifiers.org/{}'.format(persistent_id)
  #
  try:
    # seconds; requests waits indefinitely when no timeout is given
    status_code = requests.head(persistent_id, headers={'User-Agent': None}, timeout=10).status_code
  except Exception as e:
    yield {
      'value': 0.25,
      'comment': 'received error: {}'.format(e),
      'url_comment': persistent_id,
    }
    return
  if 200 <= status_code < 300:
    yield {
      'value': 1,
      'comment': 'valid and HEAD reports {}'.format(status_code),
      'url_comment': persistent_id,
    }
  elif 300 <= status_code < 400:
    yield {
      'value': 0.5,
      'comment': 'valid url but HEAD reported {}, status cannot be determined'.format(status_code),
      'url_comment': persistent_id,
    }
  else:
    yield {
      'value': 0.25,
      'comment': 'valid url but HEAD reported {}'.format(status_code),
      'url_comment': persistent_id,
    }


## Step 4. Perform assessment using rubric

With the C2M2 rubric initialized in `rubric`, we can now execute an automated assessment, dispatching each file to all the metrics and collecting all the answers.

In [None]:
from tqdm import tqdm
from ncbi_taxon import create_ncbi_taxon_client
# The taxon client is used as a context manager and stays open for the whole
# assessment; it caches into the temporary extraction directory.
with create_ncbi_taxon_client(cachedir=directory) as ncbi_taxon_client:
  # keyword arguments handed to every metric function
  ctx = dict(CFDE=CFDE, ncbi_taxon_client=ncbi_taxon_client)
  n_files = CFDE.tables['file'].count()
  # One flat list of answer dicts: for every file, every registered metric is
  # run and each answer it yields is tagged with the metric id and the file's
  # joined namespace/local id as `target`.
  answers = [
    dict(
      **answer,
      metric=metric['@id'],
      target=url_join(file['id_namespace'], file['local_id']),
    )
    for file in tqdm(
        CFDE.tables['file'].entities(),
        total=n_files,
        # refresh the progress bar roughly once per percent of files
        miniters=n_files//100,
    )
    for metric in rubric['metrics'].values()
    for answer in metric['func'](file, **ctx)
  ]

## Step 5. Review results

With the assessment complete, we're ready to review the results.

### Table 1. A simple look at the structure of the answers dataframe (joined with metrics for readability of metrics)

- `target` is the URI for the digital object being assessed, in this case it's the file global id formed by `id_namespace` + `local_id`
- `metric` this is the id of the metric being assessed
- `name` this is the human readable name of the metric
- `principle` this is the F.A.I.R category of the metric
- `value` represents the quantitative value assigned to the given answer. It ranges between 0.0 and 1.0, 0.0 representing complete lack of *compliance* with a metric, and 1.0 representing complete satisfaction of a metric. 
- `comment` is a human-readable description of why the `value` is what it is.
- `url_comment` is available when a url/uri is available as evidence for metric satisfaction

In [None]:
# one row per registered metric, indexed by metric id
df_metrics = pd.DataFrame(rubric['metrics']).T
# attach each answer's metric name and principle by matching `metric` to that index
df_answers = pd.DataFrame(answers).merge(
    df_metrics[['name', 'principle']],
    left_on='metric',
    right_index=True,
)
df_answers

In [None]:
%%appyter markdown
### Figure 1. Heatmap of answers

We discretize the values into {{ n_bins }} bins to get a sense of how many metrics are being satisfied and by how well.
We show the percentage of answers for that metric which fall into that bucket alongside the number of answers.

In [None]:
%%appyter code_exec

# Count answers per (value bucket, metric name), then reshape so rows are
# metrics and columns are buckets.
# NOTE(review): with an integer `bins`, pd.cut derives the bucket edges from the
# observed min/max of `value`, not from a fixed 0..1 range -- confirm that is intended.
d = df_answers.groupby([
    pd.cut(
        df_answers['value'],
        bins={{ n_bins }},
{% if n_bins.raw_value == 2 %}
        labels=('poor', 'good'),
{% elif n_bins.raw_value == 3 %}
        labels=('poor', 'okay', 'good'),
{% elif n_bins.raw_value == 4 %}
        labels=('poor', 'okay', 'good', 'great'),
{% else %}
        labels=np.arange({{ n_bins }})+1,
{% endif %}
    ),
    'name',
])['value'].count().unstack().T

# row-wise percentages: share of each metric's answers that fall in each bucket
d_pct = d.divide(d.sum(axis=1), axis=0)*100

# left panel: percentage heatmap; right panel: answers per metric
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 12))
#
sns.heatmap(
    d_pct,
    annot=True, fmt='.1f',
    square=True,
    ax=ax1,
)
# keep the metric names horizontal
for t in ax1.get_yticklabels():
  t.set_rotation(0)
ax1.set_xlabel('')
ax1.set_ylabel('')
#
sns.barplot(
    data=d.sum(axis=1).to_frame('Number of Answers').reset_index(),
    x='Number of Answers', y='name',
    # same row order as the heatmap so the two panels line up
    order=d.index,
    orient='h',
    ax=ax2,
)
# the heatmap already labels the rows
ax2.set_ylabel('')
ax2.set_yticks([])
plt.show()

In [None]:
%%appyter markdown
{% if n_comments.raw_value > 0 %}
### Figure 2. The top and bottom {{ n_comments }} most frequent comments occurring on unsatisfied metrics.
{% endif %}

In [None]:
%%appyter code_exec
{% if n_comments.raw_value > 0 %}
# frequency of each comment among answers scoring below 1.0
comment_vc = df_answers[df_answers['value'] < 1.0]['comment'].value_counts()
# drop comments that occurred only once
comment_vc = comment_vc[comment_vc > 1]
# most frequent first, then least frequent
display(comment_vc.head({{ n_comments }}))
display(comment_vc.tail({{ n_comments }}))
{% endif %}

## Cleanup

No need to run this locally, but useful for appyter.

In [None]:
import shutil
# Remove the temporary extraction/cache directory. ignore_errors keeps the cell
# re-runnable: a second run (directory already gone) no longer raises.
shutil.rmtree(directory, ignore_errors=True)