In [186]:
import sys
sys.path.append("../")

from rdflib import ConjunctiveGraph, URIRef
import requests
from requests.exceptions import InvalidURL
import random
import json
from tqdm.notebook import tqdm
import pandas as pd
from metrics.WebResource import WebResource

In [181]:
def get_live_deploys_urls(
    live_deploys_remote_file="https://raw.githubusercontent.com/BioSchemas/bioschemas.github.io/master/_data/live_deployments.json",
    timeout=30,
):
    """Return the unique example URLs listed in the Bioschemas live-deployments file.

    The JSON document is expected to hold a ``"resources"`` list whose items
    each carry a ``"url"`` and a ``"profiles"`` list; every profile that has an
    ``"exampleURL"`` contributes that URL, and profiles without one are
    reported on stdout.

    Args:
        live_deploys_remote_file: URL of the JSON registry to download.
            Defaults to the file in the BioSchemas GitHub repository.
        timeout: seconds to wait for the registry server before giving up.

    Returns:
        list[str]: example URLs, de-duplicated, in the order in which they
        first appear in the registry (so repeated runs give the same order).

    Raises:
        requests.HTTPError: if the registry answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeout.
    """
    res = requests.get(live_deploys_remote_file, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON decoding error
    # when the registry URL is wrong or temporarily unavailable.
    res.raise_for_status()
    live_deploys = res.json()

    results = []
    for r in live_deploys["resources"]:
        for p in r["profiles"]:
            if "exampleURL" in p:
                results.append(p["exampleURL"])
            else:
                print(f"no example URL for profile {p['profileName']} and url {r['url']}")

    # dict.fromkeys() removes duplicates while keeping first-seen order;
    # list(set(...)) would shuffle the URLs differently on every kernel start.
    unique_urls = list(dict.fromkeys(results))
    print(f"found {len(unique_urls)} live deploys")
    return unique_urls



def test_ld_bioschemas_annot():
        res = get_live_deploys_urls()
        errors = []
        results = []
        
        i = 0
        for r in tqdm(res):
        #for r in reversed(res):
            row = {}
            status_code = requests.head(r).status_code
            print(r)
            row['url'] = r
            print("Status code: " + str(status_code))
            row['status code'] = str(status_code)
            try :
                kg = WebResource(r).get_rdf()
                print("Triples: " + str(len(kg)))
                row['Retrieved triples'] = len(kg)
                if not len(kg) > 0:
                    print(f"# Error with {r}")
                    row['Error'] = True
                    errors.append(r)
            except UnicodeDecodeError as err:
                print(err)
            results.append(row)

        print(f"{len(res)} tested URLS")
        print(f"{len(errors)} failing URLS")
        print(errors)
        df = pd.read_json(json.dumps(results))
        return df
    
def retrieve_bioschemas(url, timeout=120):
    """Load the RDF metadata that the FAIR-Checker API returns for ``url``.

    The ``get_rdf_metadata`` endpoint is queried with ``url`` as a parameter
    and its response body is parsed as JSON-LD into a fresh graph. The
    function is best-effort: if the request fails or the body cannot be
    parsed, the problem is printed and the graph collected so far (normally
    empty) is returned, so a loop over many URLs is not interrupted.

    Args:
        url: address of the web page whose embedded metadata is wanted.
        timeout: seconds to wait for the FAIR-Checker service. Without a
            timeout ``requests`` waits forever on an unresponsive server.
            NOTE(review): 120 s is a guess at a generous upper bound; tune it.

    Returns:
        rdflib.ConjunctiveGraph: the parsed triples (possibly none).
    """
    FC_get_md = "https://fair-checker.france-bioinformatique.fr/api/inspect/get_rdf_metadata"
    kg = ConjunctiveGraph()
    try:
        res = requests.get(url=FC_get_md, params={"url": url}, timeout=timeout)
    except requests.RequestException as e:
        # Service unreachable or too slow: report it like a parse failure.
        print(e)
    else:
        try:
            kg.parse(data=res.text, format="json-ld")
        except Exception as e:
            # Kept broad on purpose: a non-JSON body and invalid JSON-LD
            # surface as different exception types.
            print(e)
    print(f"Loaded {len(kg)} RDF triples from {url}")
    return kg

def check_bioschemas(url):
    """Ask the FAIR-Checker API to validate the Bioschemas markup found at ``url``.

    The ``bioschemas_validation_by_conformsto`` endpoint is used (the plain
    ``bioschemas_validation`` endpoint was tried here previously).

    Returns the decoded JSON body of the response. As consumed elsewhere in
    this notebook, that is a dict keyed by evaluated entity. Whatever
    ``Response.json()`` raises on a non-JSON body propagates to the caller.
    """
    endpoint = "https://fair-checker.france-bioinformatique.fr/api/inspect/bioschemas_validation_by_conformsto"
    return requests.get(url=endpoint, params={"url": url}).json()

def has_conforms_to(kg):
    """Tell whether ``kg`` holds at least one ``dcterms:conformsTo`` statement.

    The membership test uses a (subject, predicate, object) pattern whose
    subject and object are ``None``; rdflib graphs treat ``None`` as a
    wildcard in such tests.
    """
    predicate = URIRef("http://purl.org/dc/terms/conformsTo")
    any_conforms_to = (None, predicate, None)
    return any_conforms_to in kg

In [182]:
# Fetch the de-duplicated example URLs once; the harvesting and validation
# cells further down both iterate over `lds`.
lds = get_live_deploys_urls()
# Uncomment to work on a random sample of 20 URLs instead of the full list.
#lds = random.sample(lds,20)
print(f"retrieved {len(lds)} live deploys")

found 136 live deploys
retrieved 136 live deploys


In [188]:
# Harvest each live deployment: HTTP status of the page itself, number of RDF
# triples FAIR-Checker extracts from it, and whether any dcterms:conformsTo
# statement is present. One row per URL; rows for URLs that fail part-way
# simply miss the columns that could not be filled in.
rows = []
for ld in tqdm(lds):
    row = {"URL": ld}
    try:
        # requests.head() does not follow redirects by default, so a 301/302
        # is recorded as such rather than as the status of the final page.
        status_code = requests.head(ld, timeout=30).status_code
        row["HTTP_status"] = status_code
        kg = retrieve_bioschemas(ld)
        row["nb_triples"] = len(kg)
        row["has_conforms_to"] = has_conforms_to(kg)
    except InvalidURL:
        print(f"Invalid url: {ld}")
    except requests.RequestException as e:
        # Unreachable host, TLS failure, timeout, ...: one dead site must not
        # abort the whole harvesting run.
        print(f"Request failed for {ld}: {e}")
    rows.append(row)

df1 = pd.DataFrame.from_records(rows)
df1.to_csv("bioschemas_harvesting.csv")
df1

  0%|          | 0/136 [00:00<?, ?it/s]

Loaded 14 RDF triples from https://bridgedb.github.io/data/gene_database/
Loaded 95 RDF triples from http://itsonedb.cloud.ba.infn.it/
Loaded 124 RDF triples from https://www.sib.swiss/training/course/20230426_DOCK
Loaded 0 RDF triples from https://proteinensemble.org/PED00014
Loaded 25 RDF triples from https://bioregistry.io/registry/hgnc
Loaded 373 RDF triples from https://humanmine.org/
Loaded 17 RDF triples from https://modelarchive.org/
Loaded 13 RDF triples from https://www.orpha.net/consor/cgi-bin/OC_Exp.php?Expert=141189&lng=en
Loaded 0 RDF triples from https://deb-central.org/
Loaded 34 RDF triples from https://bio.tools/blast
Loaded 10 RDF triples from https://scholia.toolforge.org/chemical/Q18216
Loaded 36 RDF triples from https://bmrb.io/data_library/summary/index.php?bmrbId=30309
Loaded 26 RDF triples from https://bioschemas.org/tutorials/what_why_bioschemas
Loaded 0 RDF triples from https://proteinensemble.org/
Loaded 28 RDF triples from http://edgar.biocomp.unibo.it/
Loa

Loaded 20 RDF triples from https://www.elixir-europe.org/events/biohackathon-2018-paris
Expecting value: line 1 column 1 (char 0)
Loaded 0 RDF triples from https://pippa.psb.ugent.be/pippa_experiments/consult_experiment_basic_info/55
Loaded 10 RDF triples from https://psnpbind.org/protein/1owh
Loaded 44 RDF triples from http://www.cathdb.info
Loaded 0 RDF triples from https://wikipathways.org/
Loaded 35 RDF triples from https://bioschemas.org/meetings/2022-01_SWAT4HCLS_leiden
Loaded 19 RDF triples from https://www.omicsdi.org/#/dataset/arrayexpress-repository/E-MTAB-6848
Loaded 0 RDF triples from https://www.gbif.org/species/5220113
Loaded 48 RDF triples from http://phenpath.biocomp.unibo.it/phenpath/
Loaded 0 RDF triples from http://159.149.160.88/pscan_chip_dev/
Loaded 27 RDF triples from https://identifiers.org/
Loaded 31 RDF triples from https://www.ensembl.org/
Loaded 49 RDF triples from https://zbmed-semtec.github.io/dome-galaxy-training/docs/DOME.html
Loaded 46 RDF triples from 

Unnamed: 0,URL,HTTP_status,nb_triples,has_conforms_to
0,https://bridgedb.github.io/data/gene_database/,301.0,14.0,True
1,http://itsonedb.cloud.ba.infn.it/,200.0,95.0,False
2,https://www.sib.swiss/training/course/20230426...,200.0,124.0,True
3,https://proteinensemble.org/PED00014,200.0,0.0,False
4,https://bioregistry.io/registry/hgnc,200.0,25.0,True
...,...,...,...,...
131,https://www.guidetopharmacology.org/,200.0,44.0,False
132,https://string-db.org/,200.0,182.0,True
133,https://ega-archive.org/datasets/EGAD00000000001,200.0,25.0,False
134,https://tess.elixir-europe.org/events/1st-inte...,200.0,35.0,True


In [187]:
# Validate every live deployment against its Bioschemas profile and flatten
# the per-entity reports into one table row per (URL, evaluated entity).
rows = []
for ld in tqdm(lds):
    try:
        bs_valid = check_bioschemas(ld)
        for entity, report in bs_valid.items():
            rows.append({
                "Bioschemas Live Deploy URL": ld,
                "Evaluated entity": entity,
                "Reference profile": report["ref_profile"],
                "Is valid": report["conforms"],
                "Nb errors": len(report["errors"]),
                "Nb warnings": len(report["warnings"]),
                "Is the latest profile": report["latest_profile"],
                "Is deprecated profile": report["deprecated"],
            })
    except Exception as e:
        # Kept broad (network error, non-JSON answer, unexpected report
        # layout), but the cause is now printed so failures can be diagnosed.
        print(f"Error while validating {ld}: {e!r}")
df2 = pd.DataFrame.from_records(rows)
df2.to_csv("bioschemas_validation.csv")
df2

['https://bridgedb.github.io/data/gene_database/', 'http://itsonedb.cloud.ba.infn.it/', 'https://www.sib.swiss/training/course/20230426_DOCK', 'https://proteinensemble.org/PED00014', 'https://bioregistry.io/registry/hgnc', 'https://humanmine.org/', 'https://modelarchive.org/', 'https://www.orpha.net/consor/cgi-bin/OC_Exp.php?Expert=141189&lng=en', 'https://deb-central.org/', 'https://bio.tools/blast', 'https://scholia.toolforge.org/chemical/Q18216', 'https://bmrb.io/data_library/summary/index.php?bmrbId=30309', 'https://bioschemas.org/tutorials/what_why_bioschemas', 'https://proteinensemble.org/', 'http://edgar.biocomp.unibo.it/', 'https://www.alliancegenome.org/gene/MGI:2442292', 'https://datacatalog.elixir-luxembourg.org/e/dataset/924dfe7a-71e8-11eb-bafe-3e22fbb3883f', 'https://www.metanetx.org/chem_info/MNXM680', 'https://nanocommons.github.io/tutorials/enteringData', 'https://zbmed.github.io/damalos/', 'https://swissmodel.expasy.org/', 'https://bridgedb.github.io/', 'https://www.bi

  0%|          | 0/136 [00:00<?, ?it/s]

Error while validating https://disprot.org/DP00086r026
Error while validating https://disprot.org/DP00086
Error while validating https://scholia.toolforge.org/taxon/Q15978631
Error while validating http://
Error while validating https://www.ensembl.org/id/ENSG00000139618
Error while validating https://corkoakdb.org/gene/13014
Error while validating https://pippa.psb.ugent.be/pippa_experiments/consult_experiment_basic_info/55
Error while validating https://www.gbif.org/species/5220113
Error while validating https://mobidb.org/P04637
Error while validating https://inpn.mnhn.fr/espece/cd_nom/60878/


Unnamed: 0,Bioschemas Live Deploy URL,Evaluated entity,Reference profile,Is valid,Nb errors,Nb warnings,Is the latest profile,Is deprecated profile
0,https://bridgedb.github.io/data/gene_database/,https://bridgedb.github.io/data/gene_database/...,https://bioschemas.org/profiles/Dataset/1.0-RE...,True,0,9,True,False
1,https://www.sib.swiss/training/course/20230426...,https://www.sib.swiss/training/course/20231013...,https://bioschemas.org/profiles/CourseInstance...,True,0,0,True,False
2,https://www.sib.swiss/training/course/20230426...,https://www.sib.swiss/training/course/20230426...,https://bioschemas.org/profiles/CourseInstance...,True,0,0,True,False
3,https://www.sib.swiss/training/course/20230426...,https://sib-swiss.github.io/containers-introdu...,https://bioschemas.org/profiles/TrainingMateri...,True,0,8,True,False
4,https://www.sib.swiss/training/course/20230426...,Nc1d43fc4da0a48e7aabdec02e3633157,https://bioschemas.org/profiles/Course/1.0-REL...,True,0,8,True,False
...,...,...,...,...,...,...,...,...
73,https://www.ensembl.org/,http://www.ensembl.org/#project,https://bioschemas.org/profiles/DataCatalog/0....,False,1,6,True,False
74,https://zbmed-semtec.github.io/dome-galaxy-tra...,N425edc506725425e8e3e4459ad8aa342,https://bioschemas.org/profiles/TrainingMateri...,True,0,6,True,False
75,https://zbmed-semtec.github.io/BioMedSem-IR-KD...,N82cb704e1bf94a75af22d75f6de3e3b3,https://bioschemas.org/profiles/TrainingMateri...,True,0,6,True,False
76,https://scholia.toolforge.org/gene/Q18030793,N6cdc8012ac1b49ce9388e72b2db6ef88,https://bioschemas.org/profiles/Gene/0.7-RELEASE,True,0,3,https://bioschemas.org/profiles/Gene/1.0-RELEASE,False


In [None]:
# Rendering check: push a raw HTML <i> element (CSS classes "fa fa-camera")
# through IPython's Markdown display.
# NOTE(review): presumably meant to show a Font Awesome icon, which would
# depend on the notebook front-end loading that stylesheet — confirm.
from IPython.display import display, Markdown, Latex
my_md = '<i class="fa fa-camera"></i>'
display(Markdown(my_md))

In [None]:
# Sample metadata payloads kept as raw strings. None of them is referenced by
# the other cells visible in this notebook.

# md1: a JSON object with a blank-node "@id" and an XHTML-vocabulary "role" property.
md1 = """
{"@id": "_:N0b2cabb67c8a461a8c1c84367ef89fb1", "http://www.w3.org/1999/xhtml/vocab#role": [{"@id": "http://www.w3.org/1999/xhtml/vocab#img"}]}
"""

# md2: a "type"/"properties" description of a schema.org Organization.
md2 = """
{"type": "http://schema.org/Organization", "properties": {"logo": "http://bioschemas.org/assets/img/square_logo2.png", "url": "http://bioschemas.org", "email": "enquiries@bioschemas.org", "name": "Bioschemas Community"}}
"""

# md3: placeholder that currently contains only newlines.
md3 = """

"""

In [152]:
# Disabled debug helper: download the live-deployments registry and pretty-print its raw JSON.
#live_deploys_remote_file = "https://raw.githubusercontent.com/BioSchemas/bioschemas.github.io/master/_data/live_deployments.json"
#res = requests.get(live_deploys_remote_file)
#live_deploys = res.json()
#print(json.dumps(live_deploys, indent=True))

In [153]:
# Spot-check: validate the MobiDB home page and pretty-print the per-entity
# report (one space of indentation per nesting level).
mobidb_valid = check_bioschemas("https://mobidb.org")
print(json.dumps(mobidb_valid, indent=1))

{
 "https://biocomputingup.it/#Organization": {
  "method": "by_conformsto",
  "type": "http://schema.org/Organization",
  "ref_profile": "https://bioschemas.org/profiles/Organization/0.2-DRAFT-2019_07_19",
  "conforms": false,
   "https://schema.org/alternateName",
   "https://schema.org/contactPoint",
   "https://discovery.biothings.io/view/bioschemas/fundingModel",
   "https://schema.org/keywords",
   "https://schema.org/location",
   "https://schema.org/logo",
   "https://schema.org/member",
   "https://schema.org/memberOf",
   "https://discovery.biothings.io/view/bioschemas/membershipCategory",
   "https://discovery.biothings.io/view/bioschemas/status",
   "https://schema.org/url"
  ],
  "errors": [
   "https://discovery.biothings.io/view/bioschemas/topic"
  ],
  "deprecated": false,
  "latest_profile": "https://bioschemas.org/profiles/Organization/0.3-DRAFT"
 },
 "https://mobidb.org/#DataCatalog": {
  "method": "by_conformsto",
  "type": "http://schema.org/DataCatalog",
  "ref_pr

In [179]:
# Try to load the Bioschemas home page straight into an rdflib graph.
# NOTE(review): the recorded output of this cell is a BadSyntax error whose
# excerpt starts with "<!DOCTYPE html>", i.e. the N3 parser is being fed an
# HTML page. Presumably kept to show that the page cannot be parsed directly —
# confirm, or wrap the call in try/except so "Run All" does not stop here.
g = ConjunctiveGraph()
g.parse("http://bioschemas.org/", format="n3")
#print(g.serialize(format="turtle"))

BadSyntax: at line 4 of <>:
Bad syntax (expected '.' or '}' or ']' at end of statement) at ^ in:
"b'<!DOCTYPE html>\n<html lang="en">\n<html>\n  '^b'<head>\n  <meta charset="utf-8">\n  <meta http-equiv="X-UA-Com'..."