In [1]:
from rdflib import Graph, ConjunctiveGraph, Literal, BNode, Namespace, RDF, URIRef

In [61]:
# load the keyword instances graph and bnode graph because this is the data we want to link (decided to focus ont these two for now and then later attempt with mesh)
kw = Graph()
ne = Graph()
kw.parse("./ParsingSpaqrlIntoTTLFile/Output-Graphs/HypothesisKeywordInstances.ttl", format="turtle")
ne.parse("./ParsingSpaqrlIntoTTLFile/Output-Graphs/bnode_graph_new.ttl", format="turtle")

<Graph identifier=Nfe3b98f93ce64510a000bfba38d5a2ae (<class 'rdflib.graph.Graph'>)>

In [3]:
# get the namespaces for running the sparql queries in python
def create_namespace(graph, namespace, prefix):

    ns = Namespace(namespace)
    graph.namespace_manager.bind(prefix, namespace)
    
    return ns


hyp_namespace = create_namespace(ne, "http://example.org/hypothesis_ontology/", 'hyp')
oa_namespace = create_namespace(ne, "http://www.w3.org/ns/oa#", 'oa')

hyp_namespace = create_namespace(kw, "http://example.org/hypothesis_ontology/", 'hyp')
oa_namespace = create_namespace(kw, "http://www.w3.org/ns/oa#", 'oa')

# covidpr_namespace = create_namespace(bnode_g, "http://ns.inria.fr/covid19/property/", 'covidpr')
# dct_namespace = create_namespace(bnode_g, "http://purl.org/dc/terms/", 'dct')
# covid_namespace = create_namespace(bnode_g, "http://ns.inria.fr/covid19/", 'covid')
# prov_namespace = create_namespace(bnode_g, 'http://www.w3.org/TR/prov-o/', 'prov')

Notes: From this point, I was trying to get the data from the graph and retrieve it in a way that makes it easier to compare the literals of the instances. First I though going through the triples is a possible way, but now I think performing sparql queries and organzing the retrieved data might be more promising... I decided to try storing the info in dictionaries for now and then my idea was to loop through them both and perform some comparing methods. Through this, I would store the  keyword uri and the named entity uri that shared similar literals in a tuple. Finally, I would create triples of these with rdflib through g.add((keywordURI, skos.related, namedentityURI)) and save these to a turtle file. 

In [5]:
def get_abstracts(graph):
    abstracts = []
    for s, p, o in graph.triples( (None, oa_namespace.hasSource, None) ):
        keyword = graph.value(s,p)
        abstracts.append(keyword)
    return abstracts

get_abstracts(kw)

[rdflib.term.URIRef('http://ns.inria.fr/covid19/ce3238bca808f3cd61da0eabc5547f1cc4d7b97b#abstract'),
 rdflib.term.URIRef('http://ns.inria.fr/covid19/7c0168a4b858a8680f305fc4d8cc3c787ff4e110#abstract'),
 rdflib.term.URIRef('http://ns.inria.fr/covid19/078b6935fa30cd20a00da6c6182e85b41ab4d9c5#abstract'),
 rdflib.term.URIRef('http://ns.inria.fr/covid19/cdc339a7fc1848e5e3fad95ee783a931aac0e770#abstract'),
 rdflib.term.URIRef('http://ns.inria.fr/covid19/491de5948b2b58ded0ed7b6f1c1a0adf761997ef#abstract'),
 rdflib.term.URIRef('http://ns.inria.fr/covid19/879b0904d411e88c1be90f0660e7bb4377ae29d8#abstract'),
 rdflib.term.URIRef('http://ns.inria.fr/covid19/9cfada440e426dd4dabdb499d47ab2e7f9bad08c#abstract'),
 rdflib.term.URIRef('http://ns.inria.fr/covid19/e930e83f886d66fa4be64fb0e90d9022b6123220#abstract'),
 rdflib.term.URIRef('http://ns.inria.fr/covid19/e3a94077b5bd542508ac66b664fce9222b5423ff#abstract'),
 rdflib.term.URIRef('http://ns.inria.fr/covid19/e4c82fce8e83365c3de134fc96250509c39479d6#ab

In [7]:
len(set(get_abstracts(kw)))

840

In [62]:
ne_q = ne.query(
"""
prefix dct: <http://purl.org/dc/terms/> 
prefix hyp: <http://example.org/hypothesis_ontology/> 
prefix oa: <http://www.w3.org/ns/oa#>
SELECT ?ne ?literal ?abstract
WHERE {
  ?ne oa:hasSource ?abstract; oa:exact ?literal.
}
"""
)

ne_data = []
for i in ne_q:
    ne_dict = dict()
    ne_dict['ne_id'] = i.ne
    ne_dict['literal'] = i.literal
    ne_dict['abstract'] = i.abstract
    ne_data.append(ne_dict)

In [63]:
kw_q = kw.query(
"""
prefix dct: <http://purl.org/dc/terms/> 
prefix hyp: <http://example.org/hypothesis_ontology/> 
prefix oa: <http://www.w3.org/ns/oa#>
SELECT ?kw ?literal ?abstract
WHERE {
  ?hyp oa:hasSource ?abstract; hyp:contains ?kw.
  ?kw oa:hasTarget ?literal.
}
"""
)

kw_data = []
for i in kw_q:
    kw_dict = dict()
    kw_dict['kw_id'] = i.kw
    kw_dict['literal'] = i.literal
    kw_dict['abstract'] = i.abstract
    kw_data.append(kw_dict)

In [64]:
ne_data[10]['literal']

rdflib.term.Literal('biological activity')

In [65]:
print(len(ne_data))

9996


In [66]:
print(len(kw_data))

5710


In [67]:
for ne in ne_data:
    for ne2 in ne_data:
        if ne == ne2:
            continue
        if ne['abstract'] != ne2['abstract']:
            continue
        if str(ne['literal']) == str(ne2['literal']):
            ne_data.remove(ne2)

In [68]:
print(len(ne_data))

8058


In [69]:
for kw in kw_data:
    for kw2 in kw_data:
        if kw == kw2:
            continue
        if kw['abstract'] != kw2['abstract']:
            continue
        if str(kw['literal']) == str(kw2['literal']):
            kw_data.remove(kw2)

In [70]:
print(len(kw_data))

5593


In [77]:
g = Graph()
skos_namespace = create_namespace(g, "http://www.w3.org/2004/02/skos/core#", 'skos')
for kw in kw_data:
    if len(str(kw['literal'])) < 3:
        continue
    for ne in ne_data:
        if len(str(ne['literal'])) < 3:
            continue
        if kw['abstract'] != ne['abstract']:
            continue
        if str(kw['literal']) == str(ne['literal']):
            g.add((kw['kw_id'],skos_namespace.related, ne['ne_id']))
            
g.serialize('./Output-Graphs/skos_same_abstract_equal.ttl', format="turtle")
print(len(g))

433


In [78]:
g = Graph()
skos_namespace = create_namespace(g, "http://www.w3.org/2004/02/skos/core#", 'skos')
for kw in kw_data:
    if len(str(kw['literal'])) < 3:
        continue
    for ne in ne_data:
        if len(str(ne['literal'])) < 3:
            continue
        if str(kw['literal']) == str(ne['literal']):
            g.add((kw['kw_id'],skos_namespace.related, ne['ne_id']))
            
g.serialize('./Output-Graphs/skos_equal.ttl', format="turtle")
print(len(g))

6987


In [79]:
g = Graph()
skos_namespace = create_namespace(g, "http://www.w3.org/2004/02/skos/core#", 'skos')
for kw in kw_data:
    if len(str(kw['literal'])) < 3:
        continue
    for ne in ne_data:
        if len(str(ne['literal'])) < 3:
            continue
        if kw['abstract'] != ne['abstract']:
            continue
        if str(kw['literal']) in str(ne['literal']):
            g.add((kw['kw_id'],skos_namespace.related, ne['ne_id']))
        elif str(ne['literal']) in str(kw['literal']):
            g.add((kw['kw_id'],skos_namespace.related, ne['ne_id']))
            
g.serialize('./Output-Graphs/skos_same_abstract_contain.ttl', format="turtle")
print(len(g))

1790


In [80]:
g = Graph()
skos_namespace = create_namespace(g, "http://www.w3.org/2004/02/skos/core#", 'skos')
for kw in kw_data:
    if len(str(kw['literal'])) < 3:
        continue
    for ne in ne_data:
        if len(str(ne['literal'])) < 3:
            continue
        if str(kw['literal']) in str(ne['literal']):
            g.add((kw['kw_id'],skos_namespace.related, ne['ne_id']))
        elif str(ne['literal']) in str(kw['literal']):
            g.add((kw['kw_id'],skos_namespace.related, ne['ne_id']))
            
g.serialize('./Output-Graphs/skos_contain.ttl', format="turtle")
print(len(g))

92880
