In [1]:
# import argparse
# import os, sys
# sys.path.append(os.path.abspath('..'))
# import test_metric

import json
import requests

import rdflib
import pyshacl

import extruct
from extruct.jsonld import JsonLdExtractor

In [2]:
# Flat list of the resource URIs to be tested.
# See also: http://lod.openaire.eu/documentation

input_URIs = [
    # data.inra.fr / DOI / DataCite entries
    "https://data.inra.fr/dataset.xhtml?persistentId=doi:10.15454/TKMGCQ",
    "https://doi.org/10.5281/zenodo.1147435",
    "https://data.inra.fr/dataset.xhtml?persistentId=doi:10.15454/5K9HCS",
    "https://search.datacite.org/works/10.7892/boris.108387",
    "https://doi.org/10.6084/m9.figshare.c.3607916_d7.v1",
    "https://search.datacite.org/works/10.6084/m9.figshare.c.3607916_d7.v1",
    "https://api.datacite.org/dois/application/ld+json/10.6084/m9.figshare.c.3607916_d7.v1",
    # registries, tools, training material and ontology portals
    "https://bio.tools/rsat_peak-motifs",
    "https://workflowhub.eu/workflows/18",
    "https://tess.elixir-europe.org/materials/train-the-trainer",
    "https://fairsharing.org/FAIRsharing.ZPRtfG",
    "https://www.ebi.ac.uk/ols/ontologies/ncit/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FNCIT_C2985",
    "https://biit.cs.ut.ee/gprofiler/page/apis",
    "https://bioportal.bioontology.org/ontologies/OCRE",
]

In [None]:
# Input URIs grouped by the type of resource they describe.
# Each list holds one URI per entry; the trailing comment says what the URI points to.

input_data = [
    "https://data.inra.fr/dataset.xhtml?persistentId=doi:10.15454/TKMGCQ",  # dataset INRA Dataverse
    "https://doi.pangaea.de/10.1594/PANGAEA.914331",  # dataset in PANGAEA
]

input_software = [
    "https://zenodo.org/record/3349821#.Xp7m9SNR2Uk",  # VM image in zenodo
    "https://explore.openaire.eu/search/software?softwareId=r37b0ad08687::275ecd99e516ed1b863e2a7586063a64",  # same VM image in OpenAIRE
    "https://data.inra.fr/dataset.xhtml?persistentId=doi:10.15454/5K9HCS",  # code in INRA Dataverse
    "https://bio.tools/rsat_peak-motifs",  # Tool in biotools
    "https://workflowhub.eu/workflows/18",  # Workflow in WorkflowHub
    "http://tara-oceans.mio.osupytheas.fr/ocean-gene-atlas/",  # OGA Main page of webtool
]

input_database = [
    "https://fairsharing.org/FAIRsharing.ZPRtfG",  # knowledge base in FAIRsharing (AgroLD)
    "http://remap.univ-amu.fr",  # Database of transcriptional regulators
]

# Every entry ends with a comma: without it, Python concatenates adjacent
# string literals and the two ontology URLs silently become one invalid URL.
input_ontology = [
    "https://bioportal.bioontology.org/ontologies/OCRE",  # Ontology in bioportal
    "https://www.ebi.ac.uk/ols/ontologies/ncit/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FNCIT_C2985",  # OLS entry
]

input_publication = [
    "https://search.datacite.org/works/10.7892/boris.108387",  # Publication in Datacite
    "https://doi.org/10.6084/m9.figshare.c.3607916_d7.v1",  # Publication figure in FigShare
    "https://search.datacite.org/works/10.6084/m9.figshare.c.3607916_d7.v1",  # Publication figure in Datacite (same as previous)
    "https://api.datacite.org/dois/application/ld+json/10.6084/m9.figshare.c.3607916_d7.v1",  # Publication figure with Datacite API
]

input_training = [
    "https://tess.elixir-europe.org/materials/train-the-trainer",  # Training material in TeSS
]

In [3]:
# Retrieve the structured metadata (microdata, RDFa, JSON-LD) embedded in the web pages

for URI in input_URIs:
    # Without a timeout an unresponsive server would block the cell forever.
    page = requests.get(URI, timeout=30)
    # Fail loudly on an HTTP error instead of extracting metadata from an error page.
    page.raise_for_status()
    html = page.content
    # Extraction errors are ignored (errors='ignore'), so the result may be partial.
    data = extruct.extract(html, syntaxes=['microdata', 'rdfa', 'json-ld'], errors='ignore')
    print(json.dumps(data, indent=2))
    # Only the first URI is processed; remove this break to scan the whole list.
    break

{
  "microdata": [],
  "json-ld": [
    {
      "@context": "http://schema.org",
      "@type": "Dataset",
      "@id": "https://doi.org/10.15454/TKMGCQ",
      "identifier": "https://doi.org/10.15454/TKMGCQ",
      "name": "Robert et al. Combining crop growth modelling with trait-assisted prediction improved the prediction of genotype by environment interactions",
      "creator": [
        {
          "name": "Rincent, Renaud",
          "affiliation": "INRA - Institut National de la Recherche Agronomique",
          "@id": "https://orcid.org/0000-0003-0885-0969",
          "identifier": "https://orcid.org/0000-0003-0885-0969"
        }
      ],
      "author": [
        {
          "name": "Rincent, Renaud",
          "affiliation": "INRA - Institut National de la Recherche Agronomique",
          "@id": "https://orcid.org/0000-0003-0885-0969",
          "identifier": "https://orcid.org/0000-0003-0885-0969"
        }
      ],
      "datePublished": "2020-04-20",
      "dateModified"

In [4]:
# Example constraint graph (SHACL shapes, Turtle syntax).
# The single node shape targets schema:DataCatalog and declares three property
# shapes: schema:name (datatype xsd:string), schema:description (at most one
# value) and schema:url (no further constraint).

str_ttl_constraint = """@prefix dash: <http://datashapes.org/dash#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <http://schema.org/> .
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

schema:DataCatalogShape
    a sh:NodeShape ;
    sh:targetClass schema:DataCatalog ;
    sh:property [
        sh:path schema:name ;
        sh:datatype xsd:string ;
        sh:name "given name" ;
    ] ;
    sh:property [
        sh:path schema:description ;
        sh:maxCount 1 ;
    ] ;
    sh:property [
        sh:path schema:url ;
    ] .
"""

# Echo the shapes document first, then load it into an rdflib graph.
print(str_ttl_constraint)

shacl_graph = rdflib.ConjunctiveGraph()
result = shacl_graph.parse(format="turtle", data=str_ttl_constraint)

# Size of the parsed shapes graph
print(len(shacl_graph))

@prefix dash: <http://datashapes.org/dash#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <http://schema.org/> .
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

schema:DataCatalogShape
    a sh:NodeShape ;
    sh:targetClass schema:DataCatalog ;
    sh:property [
        sh:path schema:name ;
        sh:datatype xsd:string ;
        sh:name "given name" ;
    ] ;
    sh:property [
        sh:path schema:description ;
        sh:maxCount 1 ;
    ] ;
    sh:property [
        sh:path schema:url ;
    ] .

11


In [5]:
# Example data graph: a JSON-LD description of a schema.org DataCatalog
# (FAIRsharing.org) with nested datasets, providers, a license and a publication.
# NOTE(review): "@context" is the remote URL "http://schema.org", so parsing
# presumably needs network access to resolve it -- confirm.
data_graph_str = """
    {
        "@context": "http://schema.org",
        "@type": "DataCatalog",
        "identifier":"https://identifiers.org/MIR:00000364",
        "name": "FAIRsharing.org",
        "description": "A manually curated, informative and educational resource on data and metadata standards, inter-related to databases/data repositories and funder and journal publisher data policies from across disciplines. FAIRsharing is an ELIXIR-UK node resource and has an active role in the RDA and Force11 data initiatives.",
        "url": "https://fairsharing.org/",
        "dataset": [
            {
                "@type" : "Dataset",
                "name" : "Metadata Standard",
                "description" : "A manually curated registry of standards, split into three types - Terminology Artifacts (ontologies, e.g. Gene Ontology), Models and Formats (conceptual schema, formats, data models, e.g. FASTA), and Reporting Guidelines (e.g. the ARRIVE guidelines for in vivo animal testing). These are linked to the databases that implement them and the funder and journal publisher data policies that recommend or endorse their use.",
                "identifier": "https://www.fairsharing.org/bsg-sXXXXXX",
                "url" : "https://fairsharing.org/standards",
                "keywords" : "Standards, Metadata, Formats, Ontologies, Terminology Artifacts, Reporting Guidelines",
                "includedInDataCatalog" : "https://www.fairsharing.org"
            },
            {
                "@type" : "Dataset",
                "name" : "Database",
                "description": "A manually curated registry of databases/data repositories, conforming to the BioDBcore standard (from the Life Sciences). These are linked to the standards that they use and the funder and journal publisher data policies that recommend or endorse their use.",
                "identifier": "https://www.fairsharing.org/bsg-dXXXXXX",
                "url": "https://fairsharing.org/databases",
                "keywords": "Database, Data repository",
                "includedInDataCatalog" : "https://www.fairsharing.org"
            },
            {
                "@type" : "Dataset",
                "name" : "Data Policy",
                "description": "A manually curated registry of data policies from research funders, journal publishers, societies, and other organisations. These are linked to the databases and standards that they recommend for use",
                "identifier": "https://www.fairsharing.org/bsg-pXXXXXX",
                "url": "https://fairsharing.org/policies",
                "keywords": "Data policy, journal, funder, society"
            }
            ],
        "keywords": "registry, life science, natural science, social science, ",
        "provider":
            [
            {
            "@type":"Person",
            "name": "FAIRsharing.org support",
            "email": "contact@fairsharing.org"
            },
            {
            "@type":"Organization",
            "name": "Oxford e-Research Centre, University of Oxford",
            "url": "http://www.oerc.ox.ac.uk/"
            },
            {
            "@type":"Organization",
            "name": "ELIXIR-UK",
            "url": "http://www.elixir-uk.org"
            },
            {
            "alternateName": ["FAIRsharing.org Registry", "BioSharing.org"]
            }
            ],
        "license": {
            "@type": "CreativeWork",
            "name": "Creative Commons CC BY-SA 4.0 Attribution",
            "url": "https://creativecommons.org/licenses/by-sa/4.0/"
            },
        "publication":[
            {
                "@type":"PublicationEvent",
                "name": "BioSharing: curated and crowd-sourced metadata standards, databases and data policies in the life sciences.",
                "url": "https://academic.oup.com/database/article-lookup/doi/10.1093/database/baw075"
            }
        ]

}
"""

# Load the JSON-LD document into an rdflib graph and print the size of that graph.
data_graph = rdflib.ConjunctiveGraph()
result = data_graph.parse(data=data_graph_str, format='json-ld')
print(len(data_graph))

52


In [6]:
# Example SHACL test: validate the data graph against the shapes graph,
# then list the focus nodes and paths reported in the validation results.

r = pyshacl.validate(
    data_graph=data_graph,
    data_graph_format='json-ld',
    shacl_graph=shacl_graph,
    shacl_graph_format='turtle',
    ont_graph=None,
    inference='rdfs',
    abort_on_error=False,
    meta_shacl=False,
    debug=True,
)

# validate() returns a 3-tuple: conformance flag, report graph, report text.
conforms, results_graph, results_text = r

print(conforms)
print(results_graph)
print(results_text)

# The prefixes are declared inside the query so that it does not depend on
# which namespace prefixes happen to be bound on the report graph.
report_query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX sh: <http://www.w3.org/ns/shacl#>

    SELECT ?node ?path WHERE {
        ?v rdf:type sh:ValidationReport ;
           sh:result ?r .
        ?r sh:focusNode ?node ;
           sh:sourceShape ?s .
        ?s sh:path ?path .
    }
"""

results = results_graph.query(report_query)

# Use a dedicated loop variable so the validation result held in `r` is not overwritten.
for row in results:
    print(row)

True
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'IOMemory']].
Validation Report
Conforms: True



In [7]:
# FAIRMetrics API execution: POST a subject identifier to the
# "gen2_unique_identifier" test and pretty-print the JSON response.

data = b'{"subject": "10.5281/zenodo.1147435"}'
# Without a timeout the cell would hang forever if the service does not answer.
r = requests.post(
    'https://w3id.org/FAIR_Tests/tests/gen2_unique_identifier',
    data=data,
    timeout=60,
)
# Surface HTTP errors directly instead of a confusing JSON decoding failure below.
r.raise_for_status()

print(json.dumps(r.json(), indent=2))

[
  {
    "@id": "https://w3id.org//FAIR_Tests/tests/gen2_unique_identifier#10.5281/zenodo.1147435/result-2020-04-20T15:33:51+00:00",
    "http://purl.obolibrary.org/obo/date": [
      {
        "@value": "2020-04-20T15:33:51+00:00",
        "@type": "http://www.w3.org/2001/XMLSchema#date"
      }
    ],
    "http://schema.org/comment": [
      {
        "@value": "INFO: TEST VERSION 'Hvst-1.1.1:Tst-0.2.2'\n\nSUCCESS: Found an identifier of type 'doi'",
        "@language": "en"
      }
    ],
    "@type": [
      "http://fairmetrics.org/resources/metric_evaluation_result"
    ],
    "http://semanticscience.org/resource/SIO_000300": [
      {
        "@value": "1",
        "@type": "http://www.w3.org/2001/XMLSchema#int"
      }
    ],
    "http://semanticscience.org/resource/SIO_000332": [
      {
        "@value": "10.5281/zenodo.1147435",
        "@type": "http://www.w3.org/2001/XMLSchema#float"
      }
    ],
    "http://schema.org/softwareVersion": [
      {
        "@value": "Hvst