In [106]:
import json
import io

from rdflib import ConjunctiveGraph, RDFS, Namespace, URIRef, BNode, Literal
from rdflib.namespace import RDF

#import networkx as nx
#from nxpd import draw

# Vocabulary namespaces kept as notebook-level names.
nb_core = Namespace("http://bise-eu.info/core-ontology#")
pplan = Namespace("http://purl.org/net/p-plan#")
bise = Namespace("http://biii.eu/node/")

# Single graph shared by every cell of the notebook.
g = ConjunctiveGraph()

# Prefix -> namespace URI bindings registered on the graph.
# NOTE(review): 'http://dcterms/' does not look like the usual Dublin Core terms
# namespace (http://purl.org/dc/terms/) -- confirm before relying on the 'dc' prefix.
for prefix, namespace_uri in (
    ('prov', 'http://www.w3.org/ns/prov#'),
    ('p-plan', 'http://purl.org/net/p-plan#'),
    ('dc', 'http://dcterms/'),
    ('nb-core', 'http://bise-eu.info/core-ontology#'),
):
    g.bind(prefix, Namespace(namespace_uri))

# 1. Load the EDAM Bioimaging ontology

In [107]:
def load_rdf(filename):
    """Parse a local Turtle file into the notebook-wide graph ``g``.

    Args:
        filename: path of the Turtle file; it is opened as UTF-8 text.
    """
    with io.open(filename, 'r', encoding='utf8') as f:
        # Graph.load() was only a deprecated alias of Graph.parse() and is no
        # longer available in rdflib >= 6; parse() works on every version.
        g.parse(f, format='turtle')
        
def load_remote_rdf(url):
    """Parse a Turtle document located at ``url`` into the notebook-wide graph ``g``.

    Args:
        url: location of the Turtle document, handed to rdflib as the source.
    """
    # Graph.load() was only a deprecated alias of Graph.parse() and is no
    # longer available in rdflib >= 6; parse() works on every version.
    g.parse(url, format='turtle')
    
def load_remote_owl(url):
    """Parse an RDF/XML (OWL) document located at ``url`` into the notebook-wide graph ``g``.

    Args:
        url: location of the RDF/XML document, handed to rdflib as the source.
    """
    # Graph.load() was only a deprecated alias of Graph.parse() and is no
    # longer available in rdflib >= 6; parse() works on every version.
    g.parse(url, format='xml')


def load_owl(filename):
    """Parse a local RDF/XML (OWL) file into the notebook-wide graph ``g``.

    Args:
        filename: path of the RDF/XML file; it is opened as UTF-8 text.
    """
    with io.open(filename, 'r', encoding='utf8') as f:
        # Graph.load() was only a deprecated alias of Graph.parse() and is no
        # longer available in rdflib >= 6; parse() works on every version.
        g.parse(f, format='xml')

#with open('test.json', 'r') as data_file:
#    data_str = data_file.read()
#try:
#    data = json.loads(data_str)
#except json.decoder.JSONDecodeError as e:
#    print("parsing ERROR "+e.msg)

#import requests
#r = requests.get('http://test.biii.eu/wfsteps?_format=json') 
#print(json.dumps(r.json(), indent=4))
#data = r.json()

#load_remote_rdf('https://raw.githubusercontent.com/NeuBIAS/bise-core-ontology/master/data-dumps/latest/neubias-latest.ttl')
#load_owl('./bise-linked-data-webapp/static/data/EDAM-bioimaging_alpha03.owl')
# Release of the EDAM Bioimaging ontology that the rest of the notebook indexes.
EDAM_BIOIMAGING_OWL_URL = 'https://github.com/edamontology/edam-bioimaging/raw/master/releases/EDAM-bioimaging_alpha06.owl'

# Fetch the ontology over the network and add its triples to the shared graph.
load_remote_owl(EDAM_BIOIMAGING_OWL_URL)

# 2. Indexing all synonyms and labels associated to EDAM concepts

In [108]:
import pandas as pd

# Lists every labelled resource of the graph together with its optional synonyms.
# Because the four OPTIONAL patterns are independent, a concept owning several
# synonyms comes back as several rows (one per combination of synonyms), so the
# loop below must ACCUMULATE values across rows rather than start over on each row.
# NOTE(review): the query declares no PREFIX; it presumably relies on the `rdfs`
# and `oboInOwl` prefixes being bound on `g` when the ontology was parsed -- confirm.
synonyms_query = """
    SELECT ?edam_class ?label ?related_syn ?narrow_syn ?exact_syn ?broad_syn WHERE {
        ?edam_class rdfs:label ?label .
        OPTIONAL {?edam_class oboInOwl:hasRelatedSynonym ?related_syn .}
        OPTIONAL {?edam_class oboInOwl:hasNarrowSynonym ?narrow_syn .}
        OPTIONAL {?edam_class oboInOwl:hasExactSynonym ?exact_syn .}
        OPTIONAL {?edam_class oboInOwl:hasBroadSynonym ?broad_syn .}
    } 
"""

# Flat indexes: label / synonym text -> URI of the concept carrying it.
labels_index = {}
exact_syn_index = {}
narrow_syn_index = {}
broad_syn_index = {}
related_syn_index = {}

# Nested index: label -> {'uri': ..., '<kind>_syn': [synonym, ...]}.
edam_syn_index = {}

# Flat index fed by each optional synonym variable of the query, in the order
# in which the synonym kinds are processed for every row.
_flat_syn_indexes = {
    'exact_syn': exact_syn_index,
    'narrow_syn': narrow_syn_index,
    'broad_syn': broad_syn_index,
    'related_syn': related_syn_index,
}

results = g.query(synonyms_query)

for r in results:
    uri = str(r['edam_class'])

    # Create the nested entry once per concept. Re-creating it on every row (as
    # was done before) discarded the synonyms gathered from the previous rows,
    # leaving at most one synonym of each kind per concept.
    entry = None
    if r['label']:
        label = str(r['label'])
        labels_index[label] = uri
        entry = edam_syn_index.get(label)
        if entry is None or entry['uri'] != uri:
            # First row for this label, or another concept sharing the same
            # label: as before, the latest concept takes over the label.
            entry = edam_syn_index[label] = {'uri': uri}

    for kind, flat_index in _flat_syn_indexes.items():
        if not r[kind]:
            # Synonym variable left unbound (or empty) for this row.
            continue
        synonym = str(r[kind])
        flat_index[synonym] = uri

        if entry is None:
            # Row without a usable label: nothing to attach the synonym to.
            continue
        known_synonyms = entry.setdefault(kind, [])
        # The same synonym is repeated on every row of the combination set.
        if synonym not in known_synonyms:
            known_synonyms.append(synonym)

print(json.dumps(edam_syn_index, indent=4))

{
    "Geometric distortion correction": {
        "uri": "http://edamontology.org/operation__R3BGvVm77d5vFMF6f28ZiW",
        "narrow_syn": [
            "Lens distortion correction"
        ]
    },
    "Electron cryotomography": {
        "uri": "http://edamontology.org/topic__RDCjx86atJP685sX1zrE2ia",
        "exact_syn": [
            "CryoET"
        ],
        "narrow_syn": [
            "CET"
        ],
        "broad_syn": [
            "CryoTEM"
        ]
    },
    "Binary format": {
        "uri": "http://edamontology.org/format_Binary_format"
    },
    "Smoothing": {
        "uri": "http://edamontology.org/operation_Image_smoothing",
        "narrow_syn": [
            "Gaussian filtering"
        ],
        "related_syn": [
            "Blurring"
        ]
    },
    "Serial block-face scanning electron microscopy": {
        "uri": "http://edamontology.org/topic__RByvmgDpAo4FOrMdvTm1gwR",
        "exact_syn": [
            "SBF SEM"
        ],
        "narrow_syn": [
  

# 3. Compute syntactic distance between a set of tags and synonyms
The distance is a floating-point number between 0 and 1; it is computed with the Jaro-Winkler measure, so a higher value means a closer match. The weighted distance, which is not very meaningful, is a weighted mean of the per-category distances, using this order of priority: label >> exact synonym >> narrow synonym >> broad synonym >> related synonym.

In [119]:
import jellyfish


#tags = ['Filament track']
tags = ['Fourier transformation']

# `jellyfish.jaro_winkler` is deprecated in favour of `jaro_winkler_similarity`
# and is missing from recent jellyfish releases; use whichever name is available.
_jaro_winkler = getattr(jellyfish, 'jaro_winkler_similarity', None) or jellyfish.jaro_winkler

# (heading, key of the score used for the ranking), in display order.
_RANKINGS = (
    ("top-5 closest concept (based on labels)", 'label_distance'),
    ("top-5 closest concept (based on exact synonyms)", 'exact_distance'),
    ("top-5 closest concept (based on narrow synonyms)", 'narrow_distance'),
    ("top-5 closest concept (based on broad synonyms)", 'broad_distance'),
    ("top-5 closest concept (based on related synonyms)", 'related_distance'),
    ("top-5 closest concept (based on weighted synonyms)", 'weighted_distance'),
)


def _best_score(tag, synonyms):
    """Return the highest Jaro-Winkler score between `tag` and `synonyms` (0 if there are none)."""
    return max((_jaro_winkler(tag, synonym) for synonym in synonyms), default=0)


def _score_concept(tag, label, entry):
    """Score `tag` against one concept.

    Args:
        tag: free-text tag to match.
        label: label of the concept (key of `edam_syn_index`).
        entry: value of `edam_syn_index` for that label: a dict holding 'uri' and,
            optionally, the 'exact_syn' / 'narrow_syn' / 'broad_syn' / 'related_syn' lists.

    Returns:
        A dict with the tag, the concept label and URI, and one score per category
        plus the weighted score.
    """
    d_label = _jaro_winkler(tag, label)

    # for each kind of synonym we pick only the closest one
    d_es = _best_score(tag, entry.get('exact_syn', []))
    d_ns = _best_score(tag, entry.get('narrow_syn', []))
    d_bs = _best_score(tag, entry.get('broad_syn', []))
    d_rs = _best_score(tag, entry.get('related_syn', []))

    # weighted mean over all kinds of synonyms; a kind without any synonym
    # contributes 0 while its weight stays in the denominator
    weighted_d = ((10 * d_label) + (8 * d_es) + (5 * d_ns) + (3 * d_bs) + (1 * d_rs)) / (10 + 8 + 5 + 3 + 1)

    return {'tag': tag, 'edam_concept': label, 'edam_concept_uri': entry['uri'],
            'weighted_distance': weighted_d,
            'label_distance': d_label,
            'exact_distance': d_es,
            'narrow_distance': d_ns,
            'broad_distance': d_bs,
            'related_distance': d_rs}


def _print_top(scores, key, heading, n=5):
    """Print `heading`, then the `n` entries of `scores` with the highest `key` value."""
    print(heading)
    for c in sorted(scores, key=lambda x: x[key], reverse=True)[0:n]:
        print(f"\t{c['edam_concept']}: {c['edam_concept_uri']} (d={c[key]})")
    print()


#iterate over tags
for t in tags:
    # score the tag against every EDAM concept and its synonyms
    distances = [_score_concept(t, k, v) for k, v in edam_syn_index.items()]

    # after having screened all EDAM concepts, we print the top-5 suggestions
    print(f'****** {t} ******')
    for heading, score_key in _RANKINGS:
        _print_top(distances, score_key, heading)
    print()

****** Fourier transformation ******
top-5 closest concept (based on labels)
	Fourier-transform infrared spectroscopy: http://edamontology.org/topic______Fourier-transform_infrared_spectroscopy (d=0.8491841491841492)
	Linear transformation: http://edamontology.org/operation_Linear_transformation (d=0.7991822991822991)
	Affine transformation: http://edamontology.org/operation__R8ANfLkuHu47r739AkhxrId (d=0.7823189882013412)
	Frequency-domain transformation: http://edamontology.org/operation_____Frequency_domain (d=0.7467322999581063)
	Geometrical transformation: http://edamontology.org/operation_Geometrical_transform (d=0.7244509876088824)

top-5 closest concept (based on exact synonyms)
	Image processing: http://edamontology.org/operation_Image_processing (d=0.7799242424242424)
	Affine registration: http://edamontology.org/operation__RLtOI0poPsYRHzgXbAcr2E (d=0.6849242424242424)
	Frequency-domain transformation: http://edamontology.org/operation_____Frequency_domain (d=0.656787169945064