In [1]:
import pickle
import re
import os

from SPARQLWrapper import SPARQLWrapper
from rdflib import Graph, URIRef, Literal, RDF, RDFS
from tqdm import tqdm

from hkg.graph_utils import save_graph, add_namespaces, subgraph
from hkg.labels import URI2labels
from hkg.namespaces import DBP, DBO, DBR, HKG

  from .autonotebook import tqdm as notebook_tqdm


Create the folders where the graphs will be stored in different formats

In [2]:
os.makedirs("./data/graphs/pickle", exist_ok=True)
os.makedirs("./data/graphs/csv", exist_ok=True)
os.makedirs("./data/graphs/ttl", exist_ok=True)

In [3]:
save_pickle = True
save_turtle = True
save_csv = True
serialize_graph = lambda graph, graph_name, folder: save_graph(graph, graph_name, folder, save_pickle=save_pickle, save_turtle=save_turtle, save_csv=save_csv)

Initialize SPARQLWrapper. Running on "http://localhost:8890/sparql" there should be a local instance of Virtuoso with the DBpedia KG.

In [4]:
local_sparql = SPARQLWrapper("http://localhost:8890/sparql")

A few functions for filtering terms

In [5]:
# regex to exlude some chemical formulas.
exclude = re.compile(r"[α-ωΑ-Ω]|\d+-\D+|skeletal formula")

# skip common useless terms
skip = {"other", "others", "many others",
        "many", "none", "see article",
        "unknown", "unclear", "uncertain", "often not identified",
        "n", "", "etc.",
        "ball-and-stick model", "structural formula", "space-filling model"
       } # "medication", "medications", "surgical", "conservative"

def valid_literal(l):
    return l.strip() not in skip and len(l) > 2 and not l.strip().isnumeric() and not exclude.search(l)

def valid_iri(iri):
    if iri.removeprefix(DBR).startswith('ATC_code'):
        return False
    if iri.removeprefix(DBR).startswith('ATCvet_code'):
        return False
    return True

A function for splitting a label containing alternatives into multiple labels.

In [6]:
ALTERNATIVE = re.compile(r"(?P<left>\S+)/(?P<right>\S+)")
def split_alternatives(string, min_length=3):
    alternatives = set()
    match = ALTERNATIVE.match(string)
    if not match:
        return alternatives
    left = match.group("left")
    right = match.group("right")
    if len(left) < min_length or len(right) < min_length:
        return alternatives
    if not valid_literal(left) or not valid_literal(right):
        return alternatives
    alternatives.add(string.replace(match.group(), left))
    alternatives.add(string.replace(match.group(), right))
    return alternatives

A function to run a query on DBpedia:

In [7]:
def run_query(query, split=True):
    split_pattern = ",|;|\*"
    graph = Graph()
    local_sparql.setQuery(query)
    local_sparql.setReturnFormat(format="xml")
    results = local_sparql.queryAndConvert()
    for s, p, o in results:
        if not valid_iri(s):
            continue
        if isinstance(o, URIRef) and not valid_iri(o):
            continue
        if isinstance(o, Literal) and p == RDFS.label:
            labels = URI2labels(s)
            for label in labels:
                graph.add((s, p, Literal(label, "en")))
            if o.value.lower() in labels and ", " in o.value:
                # Take care of IRIs with a comma in the value: e.g.,
                # "<http://dbpedia.org/resource/Aminoaciduria,_renal>"
                # -> "aminoaciduria" and "aminuaciduria renal"
                before_comma = o.value.lower().split(", ")[0]
                without_comma = o.value.lower().replace(", ", " ")
                graph.add((s, p, Literal(before_comma, o.language, o.datatype)))
                graph.add((s, p, Literal(without_comma, o.language, o.datatype)))
            if o.value.lower() not in labels and split:
                for l in re.split(split_pattern, o.value.lower()):
                    l = l.strip()
                    if valid_literal(l):
                        graph.add((s, p, Literal(l, o.language, o.datatype)))
                        if l.count("/") == 1:
                            # Split alternatives e.g., dbr:Disease hkg:relation "a/b c" ->
                            # Split alternatives e.g., dbr:Disease hkg:relation "a c" and
                            # Split alternatives e.g., dbr:Disease hkg:relation "b c"
                            for alt in split_alternatives(l):
                                graph.add((s, p, Literal(alt.strip(), o.language, o.datatype)))
        elif isinstance(o, Literal) and split and o.value and isinstance(o.value, str):
            for l in re.split(split_pattern, o.value.lower()):
                l = l.strip()
                if valid_literal(l):
                    graph.add((s, p, Literal(l, o.language, o.datatype)))
                    if l.count("/") == 1:
                        for alt in split_alternatives(l):
                            trip = (s, p, Literal(alt.strip(), o.language, o.datatype))
                            graph.add(trip)
        elif o:
            graph.add((s, p, o))
    return graph

## All diseases, their types, labels, sameAs

Get all DBpedia diseases, and if available also their English labels, their equivalent entities and their labels.

In [8]:
disease_types, disease_labels, disease_sameas = Graph(), Graph(), Graph()
query = """construct { ?disease a dbo:Disease ; owl:sameAs ?same_ent ; rdfs:label ?label . ?same_ent rdfs:label ?same_ent_label }
           where
           { ?disease a dbo:Disease .
             OPTIONAL { ?disease rdfs:label ?label . FILTER(lang(?label) = "en")  }
             OPTIONAL { ?disease dbo:wikiPageRedirects|^dbo:wikiPageRedirects ?same_ent .
                     OPTIONAL { ?same_ent rdfs:label ?same_ent_label
                                FILTER (lang(?same_ent_label) = "en")
                     } }
           }"""
graph = run_query(query, split=True)

In [9]:
disease_types += subgraph(graph, "rdf:type")
disease_labels += subgraph(graph, "rdfs:label")
disease_sameas += subgraph(graph, "owl:sameAs")

In [10]:
len(disease_types), len(disease_labels), len(disease_sameas)

(18333, 108149, 73718)

In [11]:
disease_properties_comments = ["dbo:abstract", "rdfs:comment"]
disease_properties_labels= ["rdfs:label", "dbp:synonyms", "dbo:alias", "dbo:geneReviewsName", "dbo:synonym", "dbp:alt", "foaf:name"]

Get also the abstract and comments for diseases.

In [12]:
query = """construct {{ ?disease a dbo:Disease ; {out_prop} ?o ; owl:sameAs ?same_ent . ?same_ent {out_prop} ?same_ent_label }}
           where
           {{ ?disease a dbo:Disease ; {in_prop} ?o . FILTER (lang(?o) = "en")
           OPTIONAL {{ ?disease dbo:wikiPageRedirects|^dbo:wikiPageRedirects ?same_ent . ?same_ent {in_prop} ?same_ent_label
                       FILTER (lang(?same_ent_label) = "en")
           }}
           }}"""

for prop in disease_properties_comments:
    q = query.format(in_prop=prop, out_prop=prop)
    graph = run_query(q, split=False)
    add_namespaces(graph)
    
    before = len(disease_labels)
    disease_labels += subgraph(graph, prop)
    after = len(disease_labels)
    print(f"Added {after-before} triples for {prop}")
    
    before = len(disease_sameas)
    disease_sameas += subgraph(graph, "owl:sameAs")
    after = len(disease_sameas)
    print(f"Added {after-before} triples for owl:sameAs")

Added 16644 triples for dbo:abstract
Added 0 triples for owl:sameAs
Added 16644 triples for rdfs:comment
Added 0 triples for owl:sameAs


Get also synonyms and other aliases.

In [13]:
for prop in disease_properties_labels:
    q = query.format(in_prop=prop, out_prop="rdfs:label")
    graph = run_query(q, split=True)
    add_namespaces(graph)
    
    before = len(disease_labels)
    disease_labels += subgraph(graph, "rdfs:label")
    after = len(disease_labels)
    print(f"Added {after-before} triples for {prop}")
    
    before = len(disease_sameas)
    disease_sameas += subgraph(graph, "owl:sameAs")
    after = len(disease_sameas)
    print(f"Added {after-before} triples for owl:sameAs")

Added 0 triples for rdfs:label
Added 0 triples for owl:sameAs
Added 7963 triples for dbp:synonyms
Added 0 triples for owl:sameAs
Added 62 triples for dbo:alias
Added 0 triples for owl:sameAs
Added 126 triples for dbo:geneReviewsName
Added 0 triples for owl:sameAs
Added 0 triples for dbo:synonym
Added 0 triples for owl:sameAs
Added 430 triples for dbp:alt
Added 0 triples for owl:sameAs
Added 1035 triples for foaf:name
Added 0 triples for owl:sameAs


In [14]:
len(disease_types), len(disease_labels), len(disease_sameas)

(18333, 151053, 73718)

In [15]:
all_diseases = disease_types+disease_labels+disease_sameas

In [16]:
%%time
serialize_graph(all_diseases, "all_diseases", "./data/graphs/")

Saving ./data/graphs/pickle/all_diseases.pickle
Saving ./data/graphs/csv/all_diseases.csv
Saving ./data/graphs/ttl/all_diseases.ttl
CPU times: user 2min 17s, sys: 344 ms, total: 2min 17s
Wall time: 2min 17s


## All drugs, their types, labels, sameAs

Get all DBpedia drugs, and if available also their English labels, their equivalent entities and their labels.

In [17]:
drug_types, drug_labels, drug_sameas = Graph(), Graph(), Graph()
query = """construct { ?drug a dbo:Drug ; owl:sameAs ?same_ent ; rdfs:label ?label . ?same_ent rdfs:label ?same_ent_label }
           where
           { ?drug a dbo:Drug .
             OPTIONAL { ?drug rdfs:label ?label . FILTER(lang(?label) = "en")  }
             OPTIONAL { ?drug dbo:wikiPageRedirects|^dbo:wikiPageRedirects ?same_ent .
                     OPTIONAL { ?same_ent rdfs:label ?same_ent_label
                                FILTER (lang(?same_ent_label) = "en")
                     } }
           }"""
graph = run_query(query, split=True)

In [18]:
drug_types += subgraph(graph, "rdf:type")
drug_labels += subgraph(graph, "rdfs:label")
drug_sameas += subgraph(graph, "owl:sameAs")

In [19]:
len(drug_types), len(drug_labels), len(drug_sameas)

(11234, 84537, 55885)

In [20]:
drug_properties_comments = ["dbo:abstract", "rdfs:comment"]
drug_properties_labels = ["rdfs:label", "dbo:alternativeName", "dbp:alt", "dbp:dailymedid", "foaf:name", "dbp:drugName", "dbp:synonyms", "dbp:tradename"]

Get also the abstract and comments for drugs.

In [21]:
query = """construct {{ ?drug a dbo:Drug ; {out_prop} ?o ; owl:sameAs ?same_ent . ?same_ent {out_prop} ?same_ent_label }}
           where
           {{ ?drug a dbo:Drug ; {in_prop} ?o . FILTER (lang(?o) = "en")
           OPTIONAL {{ ?drug dbo:wikiPageRedirects|^dbo:wikiPageRedirects ?same_ent . ?same_ent {in_prop} ?same_ent_label
                       FILTER (lang(?same_ent_label) = "en")
           }}
           }}"""

for prop in drug_properties_comments:
    q = query.format(in_prop=prop, out_prop=prop)
    graph = run_query(q, split=False)
    add_namespaces(graph)
    
    before = len(drug_labels)
    drug_labels += subgraph(graph, prop)
    after = len(drug_labels)
    print(f"Added {after-before} triples for {prop}")
    
    before = len(drug_sameas)
    drug_sameas += subgraph(graph, "owl:sameAs")
    after = len(drug_sameas)
    print(f"Added {after-before} triples for owl:sameAs")

Added 10743 triples for dbo:abstract
Added 0 triples for owl:sameAs
Added 10743 triples for rdfs:comment
Added 0 triples for owl:sameAs


Get also synonyms and other aliases.

In [22]:
for prop in drug_properties_labels:
    q = query.format(in_prop=prop, out_prop="rdfs:label")
    graph = run_query(q, split=True)
    add_namespaces(graph)
    
    before = len(drug_labels)
    drug_labels += subgraph(graph, "rdfs:label")
    after = len(drug_labels)
    print(f"Added {after-before} triples for {prop}")
    
    before = len(drug_sameas)
    drug_sameas += subgraph(graph, "owl:sameAs")
    after = len(drug_sameas)
    print(f"Added {after-before} triples for owl:sameAs")

Added 0 triples for rdfs:label
Added 0 triples for owl:sameAs
Added 6539 triples for dbo:alternativeName
Added 0 triples for owl:sameAs
Added 498 triples for dbp:alt
Added 0 triples for owl:sameAs
Added 315 triples for dbp:dailymedid
Added 0 triples for owl:sameAs
Added 306 triples for foaf:name
Added 0 triples for owl:sameAs
Added 34 triples for dbp:drugName
Added 0 triples for owl:sameAs
Added 4739 triples for dbp:synonyms
Added 0 triples for owl:sameAs
Added 156 triples for dbp:tradename
Added 0 triples for owl:sameAs


In [23]:
len(drug_types), len(drug_labels), len(drug_sameas)

(11234, 118610, 55885)

(11234, 97065, 55885)


In [24]:
all_drugs = drug_types+drug_labels+drug_sameas

In [25]:
%%time
serialize_graph(all_drugs, "all_drugs", "./data/graphs/")

Saving ./data/graphs/pickle/all_drugs.pickle
Saving ./data/graphs/csv/all_drugs.csv
Saving ./data/graphs/ttl/all_drugs.ttl
CPU times: user 1min 15s, sys: 168 ms, total: 1min 15s
Wall time: 1min 15s


# Main graph

Get triples linked by `dbo:treatment`, `dbo:medication` and other health-related predicates.

In [26]:
hkg = dict()
hkg[DBP.treatment.n3()] = HKG.treatment.n3()
hkg[DBO.treatment.n3()] = HKG.treatment.n3()
hkg[DBP.medication.n3()] = HKG.medication.n3()
hkg[DBO.medication.n3()] = HKG.medication.n3()
hkg[DBP.prevention.n3()] = HKG.prevention.n3()
hkg[DBP.complications.n3()] = HKG.complication.n3()
hkg[DBO.complications.n3()]  = HKG.complication.n3()
hkg[DBO.medicalDiagnosis.n3()]  = HKG.diagnosis.n3()
hkg[DBP.diagnosis.n3()]  = HKG.diagnosis.n3()
hkg[DBO.medicalCause.n3()]  = HKG.cause.n3()
hkg[DBP.causes.n3()] = DBP.causes.n3()
hkg[DBO.symptom.n3()] = HKG.symptom.n3()
hkg[DBP.symptoms.n3()] = HKG.symptom.n3()
hkg[DBP.risks.n3()] = HKG.risk.n3()

In [27]:
pattern = """construct {{
                ?x {out_pred} ?y ; rdf:type ?xt ; rdfs:label ?lx ; owl:sameAs ?rx . ?rx rdfs:label ?lrx .
                              ?y   rdf:type ?yt ; rdfs:label ?ly ; owl:sameAs ?ry . ?ry rdfs:label ?lry .
            }}
            where
            {{
                ?x {in_pred} ?y .
                OPTIONAL {{ ?x rdf:type ?xt . filter ((?xt = dbo:Disease) || (?xt = dbo:Drug)) }}
                OPTIONAL {{ ?y rdf:type ?yt . filter ((?yt = dbo:Disease) || (?yt = dbo:Drug)) }}
                OPTIONAL {{ ?x rdfs:label ?lx}}
                OPTIONAL {{ ?y rdfs:label ?ly}}
                OPTIONAL {{ ?x dbo:wikiPageRedirects|^dbo:wikiPageRedirects ?rx . OPTIONAL {{ ?rx rdfs:label ?lrx }} }}
                OPTIONAL {{ ?y dbo:wikiPageRedirects|^dbo:wikiPageRedirects ?ry . OPTIONAL {{ ?ry rdfs:label ?lry }} }}
            }}"""

In [28]:
main, labels, types, sameas = Graph(), Graph(), Graph(), Graph()
for in_pred, out_pred in hkg.items():
    # out_pred=in_pred
    query = pattern.format(in_pred=in_pred, out_pred=out_pred)
    # print(query)
    graph = run_query(query, split=True)
    print(f"Number of results for {in_pred}: {len(graph)}")
    main += (s := subgraph(graph, out_pred))
    print(f"Added {len(s)} triples to main, for a total of {len(main)}")
    labels += (s := subgraph(graph, "rdfs:label"))
    print(f"Added {len(s)} triples to labels, for a total of {len(labels)}")
    sameas += (s := subgraph(graph, "owl:sameAs"))
    print(f"Added {len(s)} triples to sameas, for a total of {len(sameas)}")
    types += (s := subgraph(graph, "rdf:type"))
    print(f"Added {len(s)} triples to types, for a total of {len(types)}\n")

Number of results for <http://dbpedia.org/property/treatment>: 46603
Added 2888 triples to main, for a total of 2888
Added 23630 triples to labels, for a total of 23630
Added 18696 triples to sameas, for a total of 18696
Added 1389 triples to types, for a total of 1389

Number of results for <http://dbpedia.org/ontology/treatment>: 45140
Added 1641 triples to main, for a total of 3987
Added 23436 triples to labels, for a total of 29902
Added 19055 triples to sameas, for a total of 23798
Added 1008 triples to types, for a total of 1537

Number of results for <http://dbpedia.org/property/medication>: 28208
Added 925 triples to main, for a total of 4912
Added 14642 triples to labels, for a total of 36532
Added 12019 triples to sameas, for a total of 29258
Added 622 triples to types, for a total of 1808

Number of results for <http://dbpedia.org/ontology/medication>: 30801
Added 803 triples to main, for a total of 5254
Added 16116 triples to labels, for a total of 38662
Added 13269 triples

In [29]:
len(main), len(types), len(labels), len(sameas)

(30673, 3441, 76964, 59591)

### Fix 'dbp:causes' in the main graph

Exclude subjects and objects of `dbp:causes` that are not drugs or diseases, since `dbp:causes` covers entities of many kinds besides drugs or diseases.

In [30]:
add_namespaces(main)
manager = main.namespace_manager

In [31]:
causes_triples = set(main.triples((None, DBP.causes, None)))
len(causes_triples)

5919

Examples of triples with `dbp:causes`:

In [32]:
for i, (s, p, o) in enumerate(main.triples((None, DBP.causes, None))):
    print(s.n3(manager), p.n3(manager), o.n3(manager))
    if i > 4:
        break

dbr:1910_Columbus_streetcar_strike dbp:causes "long hours"@en
<http://dbpedia.org/resource/VCUarts_adjunct_workers'_protests> dbp:causes "long hours"@en
dbr:2021_Frito-Lay_strike dbp:causes "long hours"@en
dbr:Myositis dbp:causes dbr:Idiopathic
dbr:Gerstmann_syndrome dbp:causes dbr:Idiopathic
dbr:Precocious_puberty dbp:causes dbr:Idiopathic


Some examples of triples that are not needed for our purposes:

In [33]:
c = 0
for s, p, o in causes_triples:
    if "protest" in s:
        print(s.n3(manager), p.n3(manager), o.n3(manager))
        c += 1
        if c > 4:
            break

dbr:2014_Jadavpur_University_protests dbp:causes "police brutality on unarmed students taking part in a non-violent sit in demonstration"@en
dbr:George_Floyd_protests_in_Arizona dbp:causes "racial and social inequality"@en
<http://dbpedia.org/resource/Montenegrin_nationalist_protests_(2020–present)> dbp:causes "denial of srebrenica genocide"@en
dbr:June_2013_Egyptian_protests dbp:causes "policies of president mohamed morsi"@en
<http://dbpedia.org/resource/2011–2013_Russian_protests> dbp:causes "rigged elections"@en


In [34]:
main.remove((None, DBP.causes, None))
len(main)

24754

Add triples with `dbp:causes` to the main graph only if the entities involved are drugs or diseases:

In [35]:
drugs_and_diseases = set(types.subjects(predicate=RDF.type))
for s, p, o in causes_triples:
    if s in drugs_and_diseases or o in drugs_and_diseases:
        main.add((s, HKG.cause, o))

This is still not guaranteed to work 100%, because of triples like `"<http://dbpedia.org/resource/Assassination_of_Martin_Luther_King,_Jr.>",rdf:type,dbo:Disease`.

In [36]:
len(main)

27081

In [37]:
for i, (s, p, o) in enumerate(main.triples((None, HKG.cause, None))):
    print(s.n3(manager), p.n3(manager), o.n3(manager))
    if i > 4:
        break

dbr:Levator_ani_syndrome hkg:cause dbr:Levator_ani_muscle
dbr:Dysentery hkg:cause dbr:Entamoeba_histolytica
dbr:Amoebiasis hkg:cause dbr:Entamoeba_histolytica
dbr:Fungal_infection hkg:cause dbr:Mold
dbr:Dry_eye_syndrome hkg:cause dbr:Meibomian_gland_dysfunction
dbr:Early_pregnancy_bleeding hkg:cause dbr:Threatened_miscarriage


### Add labels to main kg

In [38]:
full = main+labels+types+sameas
len(full)

167077

In [39]:
main_entities = set(full.all_nodes())
for node in tqdm(main_entities):
    for trip in all_drugs.triples((node, None, None)):
        full.add(trip)
    for trip in all_diseases.triples((node, None, None)):
        full.add(trip)
print(len(full))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 142143/142143 [00:02<00:00, 61230.29it/s]

182291





### Save main graph, with its labels, types and equivalent entities

In [40]:
%%time
serialize_graph(full, "main_graph", "./data/graphs/")

Saving ./data/graphs/pickle/main_graph.pickle
Saving ./data/graphs/csv/main_graph.csv
Saving ./data/graphs/ttl/main_graph.ttl
CPU times: user 1min 13s, sys: 132 ms, total: 1min 13s
Wall time: 1min 13s
