In [25]:
# import argparse
# import os, sys
# sys.path.append(os.path.abspath('..'))
# import test_metric

import json
import requests

import rdflib
from rdflib import ConjunctiveGraph
from rdflib.compare import to_isomorphic, graph_diff
import pyshacl

import extruct
from extruct.jsonld import JsonLdExtractor

import re

# DOI regex: the literal prefix "10.", a 4-9 digit registrant code, "/", then
# the suffix. The dot after "10" is escaped so it only matches a literal "."
# (unescaped it matched any character, e.g. "10x1234/abc").
# The suffix class lists upper-case letters only; callers pass re.IGNORECASE.
regex = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"

In [2]:
# List of URIs to be tested
# http://lod.openaire.eu/documentation 

# Flat list of every page the notebook can be pointed at, one URL per line.
input_URIs = [
    "https://data.inra.fr/dataset.xhtml?persistentId=doi:10.15454/TKMGCQ",
    "https://doi.org/10.5281/zenodo.1147435",
    "https://data.inra.fr/dataset.xhtml?persistentId=doi:10.15454/5K9HCS",
    "https://search.datacite.org/works/10.7892/boris.108387",
    "https://doi.org/10.6084/m9.figshare.c.3607916_d7.v1",
    "https://search.datacite.org/works/10.6084/m9.figshare.c.3607916_d7.v1",
    "https://api.datacite.org/dois/application/ld+json/10.6084/m9.figshare.c.3607916_d7.v1",
    "https://bio.tools/rsat_peak-motifs",
    "https://workflowhub.eu/workflows/18",
    "https://tess.elixir-europe.org/materials/train-the-trainer",
    "https://fairsharing.org/FAIRsharing.ZPRtfG",
    "https://www.ebi.ac.uk/ols/ontologies/ncit/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FNCIT_C2985",
    "https://biit.cs.ut.ee/gprofiler/page/apis",
    "https://bioportal.bioontology.org/ontologies/OCRE",
]

In [3]:
# input of URIs by type

# Datasets
input_data = [
    # dataset in INRA Dataverse
    "https://data.inra.fr/dataset.xhtml?persistentId=doi:10.15454/TKMGCQ",
    # dataset in PANGAEA
    "https://doi.pangaea.de/10.1594/PANGAEA.914331",
]

# Software, tools and workflows
input_software = [
    # VM image in Zenodo
    "https://zenodo.org/record/3349821#.Xp7m9SNR2Uk",
    # same VM image in OpenAIRE
    "https://explore.openaire.eu/search/software?softwareId=r37b0ad08687::275ecd99e516ed1b863e2a7586063a64",
    # code in INRA Dataverse
    "https://data.inra.fr/dataset.xhtml?persistentId=doi:10.15454/5K9HCS",
    # tool in bio.tools
    "https://bio.tools/rsat_peak-motifs",
    # workflow in WorkflowHub
    "https://workflowhub.eu/workflows/18",
    # OGA: main page of the web tool
    "http://tara-oceans.mio.osupytheas.fr/ocean-gene-atlas/",
]

# Databases / knowledge bases
input_database = [
    # knowledge base in FAIRsharing (AgroLD)
    "https://fairsharing.org/FAIRsharing.ZPRtfG",
    # database of transcriptional regulators
    "http://remap.univ-amu.fr",
]

# Ontologies
input_ontology = [
    # ontology in BioPortal
    "https://bioportal.bioontology.org/ontologies/OCRE",
    # OLS entry
    "https://www.ebi.ac.uk/ols/ontologies/ncit/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FNCIT_C2985",
]

# Publications
input_publication = [
    # paper from lod.openaire
    "https://doi.org/10.1145/1614320.1614332",
    # publication in DataCite
    "https://search.datacite.org/works/10.7892/boris.108387",
    # publication figure in FigShare
    "https://doi.org/10.6084/m9.figshare.c.3607916_d7.v1",
    # publication figure in DataCite (same as previous)
    "https://search.datacite.org/works/10.6084/m9.figshare.c.3607916_d7.v1",
    # publication figure through the DataCite API
    "https://api.datacite.org/dois/application/ld+json/10.6084/m9.figshare.c.3607916_d7.v1",
]

# Training material
input_training = [
    # training material in TeSS
    "https://tess.elixir-europe.org/materials/train-the-trainer",
]

In [4]:
# Retrieving of RDF linked-data in web pages

# for URI in input_URIs:
# for URI in input_training:
# for URI in input_publication:
# for URI in input_ontology:
# for URI in input_database:
def getKG(URI):
    """Download a web page and load its embedded semantic markup into one graph.

    The page is fetched with `requests`, its JSON-LD, RDFa and microdata
    annotations are extracted with `extruct`, and each extracted block is
    parsed as JSON-LD into a single rdflib ConjunctiveGraph. The Turtle
    serialization and the triple count are printed.

    Args:
        URI: address of the web page to fetch.

    Returns:
        The ConjunctiveGraph holding every triple found in the page
        (empty when the page carries no annotation).

    Raises:
        requests.exceptions.RequestException: on network errors or timeout.
    """
    print("Retrieve KG for URI: " + URI)
    # A timeout keeps one stalled server from hanging the whole notebook run.
    page = requests.get(URI, timeout=30)
    html = page.content
    # uniform=True asks extruct to emit microdata in the same JSON-LD-like
    # shape as the 'json-ld' syntax; its native microdata layout
    # ("type"/"properties" keys) yields no triples when parsed as JSON-LD.
    # base_url lets relative IRIs in the markup resolve against the final URL.
    d = extruct.extract(
        html,
        base_url=page.url,
        syntaxes=['microdata', 'rdfa', 'json-ld'],
        errors='ignore',
        uniform=True,
    )
    kg = ConjunctiveGraph()
    for syntax in ('json-ld', 'rdfa', 'microdata'):
        kg.parse(data=json.dumps(d[syntax]), format="json-ld")
    # Graph.serialize() returns bytes on rdflib 5 and str on rdflib >= 6.
    turtle = kg.serialize(format="turtle")
    if isinstance(turtle, bytes):
        turtle = turtle.decode()
    print(turtle)
    print(len(kg))
    return kg


# Augmenting embedded semantic annotation from multiple sources
1. For each resource, find all typed or identified entities.
2. For each entity, retrieve its semantic description from other SPARQL endpoints using `DESCRIBE` queries:
   - datacite sparql endpoint
   - sameAs.cc sparql endpoint
   - BioPortal sparql endpoint
   - OpenAire sparql endpoint
   - PubMed sparql endpoint (http://linkedlifedata.com/sparql)
   - DBPedia
   - WikiData

In [5]:
# retrieve entities of specific types

from SPARQLWrapper import SPARQLWrapper, N3, JSON, RDF, TURTLE, JSONLD
from rdflib import Graph, Namespace
from rdflib.namespace import RDF

# Namespace used to build the schema.org predicate IRIs looked up in retrieve_entities().
# NOTE(review): pages declaring the "https://schema.org" context would presumably
# yield https:// IRIs that this http:// namespace does not match — confirm.
schema = Namespace("http://schema.org/")

#sparql_endpoints = ['http://linkedlifedata.com/sparql', 'http://lod.openaire.eu/sparql']
#sparql_endpoints = ['http://lod.openaire.eu/sparql', 'http://sparql.bioontology.org/ontologies/sparql']
# SPARQL endpoints queried by describe(); commented-out entries are kept as
# disabled candidates.
sparql_endpoints = [
    "http://sparql.wikipathways.org/sparql",
    # "https://opencitations.net/sparql",
    "http://lod.openaire.eu/sparql",
    "https://dbpedia.org/sparql",
    # "https://bio2rdf.org/sparql",
    # "https://sparql.uniprot.org/sparql",
    "https://www.ebi.ac.uk/rdf/services/sparql",
    # "https://query.wikidata.org/sparql",
    "http://www.scholarlydata.org/sparql",
    "https://io.datascience-paris-saclay.fr/sparql",
    "https://opendata1.opendata.u-psud.fr/sparql",
    "http://data.bnf.fr/sparql",
    # "http://sparql.wikipathways.org/",
    "https://sparql.orthodb.org/sparql",
    "https://data.idref.fr/sparql",
    # "https://go3.opendata.u-psud.fr/sparql",
    # "http://data.persee.fr/explorer/sparql-endpoint/",
]

def retrieve_entities(G):
    """Collect the identifiers of the entities worth describing further.

    Two kinds of nodes are gathered from graph `G`:
      * every subject that has an rdf:type (the type IRI is printed),
      * every object of schema:identifier, schema:sameAs, schema:url,
        schema:license, schema:distribution, schema:citation and
        schema:provider.

    Blank nodes are skipped. Each entity is reported once, in order of first
    appearance, so callers do not issue the same remote query several times.

    Args:
        G: graph exposing `triples((s, p, o))` pattern matching.

    Returns:
        list[str]: string form of the retained nodes, without duplicates.
    """
    entities = []
    seen = set()

    def _add(node):
        # Keep a node unless it is a blank node or was already collected.
        if isinstance(node, rdflib.term.BNode):
            return
        value = str(node)
        if value not in seen:
            seen.add(value)
            entities.append(value)

    for s, _, o in G.triples((None, RDF.type, None)):
        print(str(o))
        _add(s)

    linking_predicates = (
        schema.identifier,
        schema.sameAs,
        schema.url,
        schema.license,
        schema.distribution,
        schema.citation,
        schema.provider,
    )
    for predicate in linking_predicates:
        for _, _, o in G.triples((None, predicate, None)):
            _add(o)

    return entities



In [6]:
# Describe lod.openaire
def describe_loa(uri, g):
    """Add to `g` the lod.openaire.eu description of a persistent identifier.

    Runs a DESCRIBE query on http://lod.openaire.eu/sparql for every resource
    whose `resPersistentID` equals `uri`, prints the number of triples
    obtained and merges them into `g`.

    Args:
        uri: persistent identifier to look up, as a plain string.
        g: rdflib graph that receives the triples (modified in place).

    Returns:
        The same graph `g`.
    """
    print(f'SPARQL for [ {uri} ] with enpoint [ LOA ]')
    sparql = SPARQLWrapper("http://lod.openaire.eu/sparql")
    # The identifier comes from scraped web pages: serialise it as a properly
    # quoted/escaped literal instead of splicing it between raw quotes, so a
    # quote or backslash in it cannot break or alter the query.
    pid_literal = rdflib.Literal(uri).n3()
    sparql.setQuery("""
            DESCRIBE ?x WHERE {   
            ?x <http://lod.openaire.eu/vocab/resPersistentID> """ + pid_literal + """ 
            }
    """)

    sparql.setReturnFormat(N3)
    results = sparql.query().convert()

    # Parse the response once into a scratch graph: it provides the per-query
    # count and is then merged into the accumulator.
    described = Graph()
    described.parse(data=results, format="n3")
    print("Results: " + str(len(described)))
    g += described

    return g
    
#describe_loa("10.1145/1614320.1614332")

In [7]:
# Describe OpenCitations
def describe_opencitation(uri, g):
    """Add to `g` the OpenCitations description of an identifier.

    Runs a DESCRIBE query on https://opencitations.net/sparql for every
    resource reachable through `datacite:hasIdentifier/literal:hasLiteralValue`
    with the value `uri`, prints the number of triples obtained and merges
    them into `g`.

    Args:
        uri: identifier value to look up (e.g. a DOI), as a plain string.
        g: rdflib graph that receives the triples (modified in place).

    Returns:
        The same graph `g`.
    """
    print(f'SPARQL for [ {uri} ] with enpoint [ Opencitation ]')
    sparql = SPARQLWrapper("https://opencitations.net/sparql")
    # Serialise the identifier as a quoted/escaped literal so that characters
    # such as quotes cannot break or alter the query.
    id_literal = rdflib.Literal(uri).n3()
    sparql.setQuery("""
            PREFIX cito: <http://purl.org/spar/cito/>
            PREFIX dcterms: <http://purl.org/dc/terms/>
            PREFIX datacite: <http://purl.org/spar/datacite/>
            PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
            PREFIX biro: <http://purl.org/spar/biro/>
            PREFIX frbr: <http://purl.org/vocab/frbr/core#>
            PREFIX c4o: <http://purl.org/spar/c4o/>
            
            DESCRIBE ?x WHERE {
                ?x datacite:hasIdentifier/literal:hasLiteralValue """ + id_literal + """ 
            }
    """)

    sparql.setReturnFormat(TURTLE)
    results = sparql.query().convert()

    # convert() picks its return type from the response content type: an
    # rdflib graph for RDF/XML, raw bytes for Turtle/N3. Accept both instead
    # of assuming a graph.
    if isinstance(results, Graph):
        described = results
    else:
        described = Graph()
        described.parse(data=results, format="turtle")
    print("Results: " + str(len(described)))

    g += described

    return g

#describe_opencitation("10.1186/1471-2105-11-244")

In [8]:
# Describe Wikidata

def describe_wikidata(uri, g):
    """Add to `g` the Wikidata description of the item(s) carrying a DOI.

    Runs a DESCRIBE query on https://query.wikidata.org/sparql for every item
    whose `wdt:P356` value equals `uri`, prints the number of triples obtained
    and merges them into `g`.

    Args:
        uri: DOI to look up, as a plain string.
        g: rdflib graph that receives the triples (modified in place).

    Returns:
        The same graph `g`.
    """
    print(f'SPARQL for [ {uri} ] with enpoint [ Wikidata ]')
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    # Look up the DOI that was passed in (a sample DOI used to be hard-coded
    # here, so the argument was ignored). The sample was upper-case;
    # NOTE(review): Wikidata appears to store P356 values upper-case — confirm.
    # The value is serialised as a quoted/escaped literal so it cannot break
    # or alter the query.
    doi_literal = rdflib.Literal(uri.upper()).n3()
    sparql.setQuery("""
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX wikibase: <http://wikiba.se/ontology#>
            PREFIX p: <http://www.wikidata.org/prop/>
            PREFIX ps: <http://www.wikidata.org/prop/statement/>
            PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX bd: <http://www.bigdata.com/rdf#>
            
            DESCRIBE ?x WHERE {   
                ?x wdt:P356 """ + doi_literal + """ 
            }
    """)

    sparql.setReturnFormat(N3)
    results = sparql.query().convert()

    # convert() picks its return type from the response content type: an
    # rdflib graph for RDF/XML, raw bytes for Turtle/N3. Accept both instead
    # of assuming a graph.
    if isinstance(results, Graph):
        described = results
    else:
        described = Graph()
        described.parse(data=results, format="n3")
    print("Results: " + str(len(described)))

    g += described

    return g

#describe_wikidata("10.6084/M9.FIGSHARE.4539889")

In [None]:
# Interrogate the SPARQL endpoints for supplementary data
def describe(uri):
    """Gather the DESCRIBE results for `uri` from every configured endpoint.

    Each endpoint listed in `sparql_endpoints` is sent `DESCRIBE <uri>` and
    the returned triples are accumulated in a fresh graph. An endpoint that
    fails (unreachable, timeout, unparsable answer) is reported and skipped
    so the remaining endpoints are still queried.

    Args:
        uri: IRI of the entity to describe.

    Returns:
        A new rdflib Graph with the triples obtained; empty when `uri` cannot
        be written as an IRI or no endpoint answered.
    """
    g = Graph()

    # Characters that may not appear inside <...> in SPARQL: such a value
    # would produce an invalid (or altered) query on every endpoint.
    if re.search(r'[\x00-\x20<>"{}|^`\\]', uri):
        print(f'Skipping [ {uri} ]: not a valid IRI')
        return g

    for endpoint in sparql_endpoints:
        print(f'SPARQL for [ {uri} ] with enpoint [ {endpoint} ]')
        sparql = SPARQLWrapper(endpoint)
        sparql.setQuery("""
                DESCRIBE <"""+uri+""">
            """)

        sparql.setReturnFormat(N3)
        try:
            results = sparql.query().convert()
            # convert() returns an rdflib graph for RDF/XML answers and raw
            # bytes for Turtle/N3 answers.
            if isinstance(results, Graph):
                g += results
            else:
                g.parse(data=results, format="n3")
        except Exception as err:
            # Best-effort federation: one broken endpoint must not discard
            # what the other endpoints returned.
            print(f'Skipping endpoint [ {endpoint} ]: {err}')

    return g

def isDOI(uri):
    """Return True when `uri` contains a substring matching the DOI `regex`.

    The search is case-insensitive and may match anywhere in the string.
    """
    found = re.search(regex, uri, re.MULTILINE | re.IGNORECASE)
    return found is not None

def getDOI(uri):
    """Extract the first DOI found in `uri`.

    The search uses the module-level `regex`, case-insensitively, and may
    match anywhere in the string.

    Args:
        uri: string that may embed a DOI (URL, "doi:" identifier, ...).

    Returns:
        The matched DOI substring, or None when `uri` contains no DOI
        (instead of failing with AttributeError on the missing match).
    """
    match = re.search(regex, uri, re.MULTILINE | re.IGNORECASE)
    if match is None:
        return None
    return match.group(0)

def compareGraph(g1, g2):
    """Print the triples shared by two graphs and return their full diff.

    Both graphs are canonicalised with `to_isomorphic` so that blank nodes
    are compared by structure, then split with `graph_diff`. The shared part
    is printed as Turtle.

    Args:
        g1: first rdflib graph.
        g2: second rdflib graph.

    Returns:
        The tuple `(in_both, in_first, in_second)` produced by `graph_diff`.
    """
    iso1 = to_isomorphic(g1)
    iso2 = to_isomorphic(g2)

    in_both, in_first, in_second = graph_diff(iso1, iso2)

    # Graph.serialize() returns bytes on rdflib 5 and str on rdflib >= 6.
    shared_ttl = in_both.serialize(format='turtle')
    if isinstance(shared_ttl, bytes):
        shared_ttl = shared_ttl.decode()
    print(shared_ttl)

    return in_both, in_first, in_second

# For every publication page: extract the embedded graph, list its entities,
# query the remote endpoints for each of them and compare the result with the
# embedded graph.
for URI in input_publication:
    kg = getKG(URI)
    entities = retrieve_entities(kg)
    augmented_kg = Graph()
    for uri in entities:
        # When the entity embeds a DOI, query with the bare DOI.
        uri_is_doi = isDOI(uri)
        print("DOI: " + str(uri_is_doi))
        if uri_is_doi:
            uri = getDOI(uri)
        # NOTE(review): the three lookups below also run for entities that are
        # not DOIs — confirm whether they were meant to sit under the `if`.
        augmented_kg = describe_loa(uri, augmented_kg)
        augmented_kg = describe_opencitation(uri, augmented_kg)
        augmented_kg = describe_wikidata(uri, augmented_kg)

        print(f'Augmented KG len: {len(augmented_kg)}')
        # Graph.serialize() returns bytes on rdflib 5 and str on rdflib >= 6.
        augmented_ttl = augmented_kg.serialize(format='turtle')
        if isinstance(augmented_ttl, bytes):
            augmented_ttl = augmented_ttl.decode()
        print(augmented_ttl)

    # Compare the embedded graph with the augmented one.
    compareGraph(kg, augmented_kg)

Retrieve KG for URI: https://doi.org/10.1145/1614320.1614332


0


Retrieve KG for URI: https://search.datacite.org/works/10.7892/boris.108387
@prefix ns1: <http://schema.org/> .
@prefix ns2: <http://www.w3.org/1999/xhtml/vocab#> .

<https://doi.org/10.7892/boris.108387> a ns1:ScholarlyArticle ;
    ns1:author [ a ns1:Person ;
            ns1:familyName "Klink" ;
            ns1:givenName "Thorsten" ;
            ns1:name "Thorsten Klink" ],
        [ a ns1:Person ;
            ns1:familyName "Bonel" ;
            ns1:givenName "Harald Marcel" ;
            ns1:name "Harald Marcel Bonel" ],
        [ a ns1:Person ;
            ns1:familyName "Sprecher" ;
            ns1:givenName "Marco" ;
            ns1:name "Marco Sprecher" ],
        [ a ns1:Person ;
            ns1:familyName "Adler" ;
            ns1:givenName "Sabine" ;
            ns1:name "Sabine Adler" ],
        [ a ns1:Person ;
            ns1:familyName "Wermelinger" ;
            ns1:givenName "Felix" ;
            ns1:na

In [24]:
# Test Diff
from rdflib.compare import to_isomorphic, graph_diff

# Two small graphs sharing <.../same> and the blank node labelled "Same",
# and differing on <.../a> + "A" versus <.../b> + "B".
g1 = Graph().parse(format='n3', data='''
    @prefix : <http://example.org/ns#> .
    <http://example.org> :rel
        <http://example.org/same>,
        [ :label "Same" ],
        <http://example.org/a>,
        [ :label "A" ] .
''')

g2 = Graph().parse(format='n3', data='''
    @prefix : <http://example.org/ns#> .
    <http://example.org> :rel
        <http://example.org/same>,
        [ :label "Same" ],
        <http://example.org/b>,
        [ :label "B" ] .
''')

iso1 = to_isomorphic(g1)
iso2 = to_isomorphic(g2)

# The comparison result used to be computed and thrown away; state the
# expectation explicitly instead: the graphs differ, so they are not isomorphic.
assert iso1 != iso2, "g1 and g2 are expected to differ"

in_both, in_first, in_second = graph_diff(iso1, iso2)

# Graph.serialize() returns bytes on rdflib 5 and str on rdflib >= 6.
in_both_ttl = in_both.serialize(format='turtle')
if isinstance(in_both_ttl, bytes):
    in_both_ttl = in_both_ttl.decode()
print(in_both_ttl)

@prefix ns1: <http://example.org/ns#> .

<http://example.org> ns1:rel [ ns1:label "Same" ],
        <http://example.org/same> .




In [None]:
# Example constraint graph: a SHACL node shape, written in Turtle, that
# targets schema:DataCatalog nodes and declares property shapes for
# schema:name (datatype xsd:string), schema:description (at most one value)
# and schema:url (no further constraint).

str_ttl_constraint = """@prefix dash: <http://datashapes.org/dash#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <http://schema.org/> .
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

schema:DataCatalogShape
    a sh:NodeShape ;
    sh:targetClass schema:DataCatalog ;
    sh:property [
        sh:path schema:name ;
        sh:datatype xsd:string ;
        sh:name "given name" ;
    ] ;
    sh:property [
        sh:path schema:description ;
        sh:maxCount 1 ;
    ] ;
    sh:property [
        sh:path schema:url ;
    ] .
"""

# Echo the shape text, parse it into an rdflib graph and print the number of
# triples it produced.
shacl_graph = rdflib.ConjunctiveGraph()
print(str_ttl_constraint)
result = shacl_graph.parse(data=str_ttl_constraint, format='turtle')
print(len(shacl_graph))

In [None]:
# Example data graph: a schema.org DataCatalog description (FAIRsharing.org)
# written as JSON-LD, with nested datasets, providers, a license and a
# publication.
data_graph_str = """
    {
        "@context": "http://schema.org",
        "@type": "DataCatalog",
        "identifier":"https://identifiers.org/MIR:00000364",
        "name": "FAIRsharing.org",
        "description": "A manually curated, informative and educational resource on data and metadata standards, inter-related to databases/data repositories and funder and journal publisher data policies from across disciplines. FAIRsharing is an ELIXIR-UK node resource and has an active role in the RDA and Force11 data initiatives.",
        "url": "https://fairsharing.org/",
        "dataset": [
            {
                "@type" : "Dataset",
                "name" : "Metadata Standard",
                "description" : "A manually curated registry of standards, split into three types - Terminology Artifacts (ontologies, e.g. Gene Ontology), Models and Formats (conceptual schema, formats, data models, e.g. FASTA), and Reporting Guidelines (e.g. the ARRIVE guidelines for in vivo animal testing). These are linked to the databases that implement them and the funder and journal publisher data policies that recommend or endorse their use.",
                "identifier": "https://www.fairsharing.org/bsg-sXXXXXX",
                "url" : "https://fairsharing.org/standards",
                "keywords" : "Standards, Metadata, Formats, Ontologies, Terminology Artifacts, Reporting Guidelines",
                "includedInDataCatalog" : "https://www.fairsharing.org"
            },
            {
                "@type" : "Dataset",
                "name" : "Database",
                "description": "A manually curated registry of databases/data repositories, conforming to the BioDBcore standard (from the Life Sciences). These are linked to the standards that they use and the funder and journal publisher data policies that recommend or endorse their use.",
                "identifier": "https://www.fairsharing.org/bsg-dXXXXXX",
                "url": "https://fairsharing.org/databases",
                "keywords": "Database, Data repository",
                "includedInDataCatalog" : "https://www.fairsharing.org"
            },
            {
                "@type" : "Dataset",
                "name" : "Data Policy",
                "description": "A manually curated registry of data policies from research funders, journal publishers, societies, and other organisations. These are linked to the databases and standards that they recommend for use",
                "identifier": "https://www.fairsharing.org/bsg-pXXXXXX",
                "url": "https://fairsharing.org/policies",
                "keywords": "Data policy, journal, funder, society"
            }
            ],
        "keywords": "registry, life science, natural science, social science, ",
        "provider":
            [
            {
            "@type":"Person",
            "name": "FAIRsharing.org support",
            "email": "contact@fairsharing.org"
            },
            {
            "@type":"Organization",
            "name": "Oxford e-Research Centre, University of Oxford",
            "url": "http://www.oerc.ox.ac.uk/"
            },
            {
            "@type":"Organization",
            "name": "ELIXIR-UK",
            "url": "http://www.elixir-uk.org"
            },
            {
            "alternateName": ["FAIRsharing.org Registry", "BioSharing.org"]
            }
            ],
        "license": {
            "@type": "CreativeWork",
            "name": "Creative Commons CC BY-SA 4.0 Attribution",
            "url": "https://creativecommons.org/licenses/by-sa/4.0/"
            },
        "publication":[
            {
                "@type":"PublicationEvent",
                "name": "BioSharing: curated and crowd-sourced metadata standards, databases and data policies in the life sciences.",
                "url": "https://academic.oup.com/database/article-lookup/doi/10.1093/database/baw075"
            }
        ]

}
"""

# Parse the JSON-LD text into an rdflib graph and print the number of triples
# it produced.
data_graph = rdflib.ConjunctiveGraph()
result = data_graph.parse(data=data_graph_str, format='json-ld')
print(len(data_graph))

In [None]:
# Example SHACL test: validate `data_graph` against `shacl_graph` with pyshacl,
# print the outcome, then list which node failed on which property path.

r = pyshacl.validate(data_graph = data_graph, 
             data_graph_format='json-ld', 
             shacl_graph = shacl_graph, 
             #shacl_graph = my_shacl_constraint,
             shacl_graph_format = 'turtle', 
             ont_graph = None, 
             inference = 'rdfs', 
             abort_on_error = False, 
             meta_shacl = False, 
             debug = True)

conforms, results_graph, results_text = r

print(conforms)
print(results_graph)
print(results_text)

# The prefixes are declared in the query itself so that it does not depend on
# which namespaces happen to be bound on the report graph.
report_query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX sh: <http://www.w3.org/ns/shacl#>

    SELECT ?node ?path WHERE {
        ?v rdf:type sh:ValidationReport ;
           sh:result ?r .
        ?r sh:focusNode ?node ;
           sh:sourceShape ?s . 
        ?s sh:path ?path . 
    }
"""

results = results_graph.query(report_query)

# A dedicated loop variable keeps `r` bound to the validation result instead
# of being overwritten by the last result row.
for row in results :
    print(row)

In [None]:
# FAIRMetrics API execution: submit one identifier to the
# "gen2_unique_identifier" test and pretty-print the JSON answer.

data = b'{"subject": "10.5281/zenodo.1147435"}'
r = requests.request(
    method="POST",
    url='https://w3id.org/FAIR_Tests/tests/gen2_unique_identifier',
    data=data,
    # The body is JSON: say so explicitly, and ask for JSON back, since the
    # answer is decoded with r.json() below.
    # NOTE(review): presumably the service expects this content type — confirm.
    headers={"Content-Type": "application/json", "Accept": "application/json"},
    # Do not let an unresponsive service hang the notebook.
    timeout=60,
)
# Fail with the HTTP status rather than with an obscure JSON decoding error
# when the service answers with an error page.
r.raise_for_status()

print(json.dumps(r.json(), indent=2))