In [1]:
from rdflib import Graph

In [2]:
radlex_owl_path = 'RadLex/data/RadLex.owl'

### Loading of the Ontology and transformation into a Directed Graph

In [3]:
g = Graph()
g.parse(radlex_owl_path, format="xml")  # OWL is typically RDF/XML

print(f"Number of triples: {len(g)}")
for subj, pred, obj in g[:10]:  # Print first 10 triples
    print(subj, pred, obj)

Number of triples: 802105


In [4]:
import rdflib
from rdflib import RDF, RDFS, OWL, Namespace
from rdflib.term import BNode
from itertools import islice

RADLEX = Namespace("http://www.radlex.org/RID/")
g.bind("radlex", RADLEX)
g.bind("rdf", RDF)
g.bind("rdfs", RDFS)
g.bind("owl", OWL)

def format_node(node):
    if isinstance(node, BNode):
        return f"[BlankNode:{str(node)[:8]}]"
    elif isinstance(node, rdflib.URIRef):
        # Try to use a qname if bound (e.g., radlex:RID47279)
        try:
            return g.qname(node)
        except Exception:
            return str(node)
    else:
        return str(node)

for s, p, o in islice(g, 10):
    print(f"{format_node(s)} -- {format_node(p)} --> {format_node(o)}")

[BlankNode:N0c310d2] -- owl:onProperty --> radlex:Has_Regional_Part
[BlankNode:N2025a12] -- owl:someValuesFrom --> radlex:RID27377
radlex:RID939 -- RID:Preferred_name_German --> Arteria mesenterica inferior
radlex:RID18861 -- RID:Preferred_name --> root of right thoracic nerve
[BlankNode:N90da7fb] -- rdf:rest --> [BlankNode:N8fea7b8]
[BlankNode:N8ee88a6] -- owl:onProperty --> radlex:Constitutional_Part_Of
[BlankNode:N257aa06] -- owl:annotatedSource --> radlex:RID22704
radlex:RID32153 -- rdf:type --> owl:Class
radlex:RID855 -- rdfs:subClassOf --> [BlankNode:N3a4f742]
radlex:RID44050 -- RID:Has_Regional_Part --> radlex:RID45178


In [5]:
from rdflib import URIRef

RID_PREFIX = "http://www.radlex.org/RID/"

rid_codes = set()
rid_pairs = set()
relations_between_rids = set()

for s, p, o in g:
    s_is_rid = isinstance(s, URIRef) and str(s).startswith(RID_PREFIX) and "RID" in str(s)
    o_is_rid = isinstance(o, URIRef) and str(o).startswith(RID_PREFIX) and "RID" in str(o)
    
    # Collect individual RID codes
    if s_is_rid:
        rid_codes.add(s)
    if o_is_rid:
        rid_codes.add(o)
    
    # Collect pairs and their relations
    if s_is_rid and o_is_rid:
        rid_pairs.add((s, o))
        relations_between_rids.add(p)

# Print the results
print(f"Number of unique RID codes: {len(rid_codes)}")
print(f"Number of RID-to-RID pairs: {len(rid_pairs)}")
print(f"Number of different relations between RID codes: {len(relations_between_rids)}")
print("\nRelations used between RID codes:")
for rel in relations_between_rids:
    print(f" - {g.qname(rel)}")


Number of unique RID codes: 46922
Number of RID-to-RID pairs: 134192
Number of different relations between RID codes: 57

Relations used between RID codes:
 - RID:Constitutional_Part_Of
 - RID:Bounds
 - RID:Has_Blood_Supply
 - RID:Tributary_Of
 - RID:Has_Part
 - RID:Receives_Input_From
 - rdfs:domain
 - RID:Projects_To
 - rdf:type
 - RID:Projects_From
 - RID:Replaced_by
 - rdfs:subClassOf
 - RID:Sends_Output_To
 - RID:Segment_Of
 - rdfs:subPropertyOf
 - RID:Has_Member
 - RID:Has_Branch
 - RID:Lymphatic_Drainage
 - RID:May_Cause
 - RID:Has_origin
 - RID:May_Be_Caused_By
 - RID:Contained_In
 - RID:Drains_Into
 - RID:Receives_Drainage_From
 - RID:Receives_Projection_From
 - RID:External_to
 - RID:Has_Entrapment_Site
 - rdfs:range
 - RID:Origin_of
 - RID:Part_Of
 - RID:Has_Constitutional_Part
 - RID:Lymphatic_Drainage_Of
 - RID:Attaches_to
 - RID:Distal_to
 - RID:Surrounded_by
 - RID:Blood_Supply_of
 - RID:Posterior_to
 - RID:Anterior_to
 - RID:Inferior_to
 - RID:Bounded_by
 - RID:Anatomic

In [6]:
import networkx as nx
from rdflib import URIRef

RID_PREFIX = "http://www.radlex.org/RID/"

# Create a directed multigraph
G = nx.MultiDiGraph()

for s, p, o in g:
    s_is_rid = isinstance(s, URIRef) and str(s).startswith(RID_PREFIX) and "RID" in str(s)
    o_is_rid = isinstance(o, URIRef) and str(o).startswith(RID_PREFIX) and "RID" in str(o)

    if s_is_rid and o_is_rid:
        # Add edge with predicate as label
        G.add_edge(str(s), str(o), label=g.qname(p))

# Summary info
print(f"Number of RID nodes: {G.number_of_nodes()}")
print(f"Number of RID-to-RID edges: {G.number_of_edges()}")

# Optional: show first 5 edges
for u, v, d in list(G.edges(data=True))[:5]:
    print(f"{u} --[{d['label']}]--> {v}")

Number of RID nodes: 46899
Number of RID-to-RID edges: 154615
http://www.radlex.org/RID/RID44050 --[RID:Has_Regional_Part]--> http://www.radlex.org/RID/RID45178
http://www.radlex.org/RID/RID44050 --[RID:Has_Regional_Part]--> http://www.radlex.org/RID/RID21846
http://www.radlex.org/RID/RID44050 --[RID:Has_Regional_Part]--> http://www.radlex.org/RID/RID44573
http://www.radlex.org/RID/RID44050 --[RID:Has_Regional_Part]--> http://www.radlex.org/RID/RID22880
http://www.radlex.org/RID/RID44050 --[RID:Has_Regional_Part]--> http://www.radlex.org/RID/RID22978


In [7]:
G = nx.relabel_nodes(G, lambda x: x.split("/")[-1])

In [8]:
print(len(G.nodes), len(G.edges))

46899 154615


In [30]:
import networkx as nx
import pandas as pd

# Load your CSV file
df = pd.read_csv("data/d_radlex_entities.csv")  # Adjust path if necessary

# Create a dictionary: code → preferred description
desc_map = dict(zip(df['radlex_code'], df['preferred_description']))

# Add the description as an attribute to each node in the graph
for node in G.nodes():
    if node in desc_map:
        G.nodes[node]['description'] = desc_map[node]
    else:
        G.nodes[node]['description'] = "(no description)"

### Check that the node 'RID1' is indeed a parent node in the ontology

In [9]:
if 'RID1' in G:
    print("✅ Node 'RID1' exists in the graph.")
else:
    print("❌ Node 'RID1' does NOT exist in the graph.")

✅ Node 'RID1' exists in the graph.


In [10]:
# Get the set of all nodes reachable from RID1 (including itself)
reachable_from_rid1 = nx.descendants(G, 'RID1')
reachable_from_rid1.add('RID1')  # Include RID1 itself

# Compare with the full set of nodes in G
all_nodes = set(G.nodes())

# Check if RID1 reaches all nodes
if reachable_from_rid1 == all_nodes:
    print("✅ RID1 is a parent node — all nodes are reachable from it.")
else:
    missing = all_nodes - reachable_from_rid1
    print(f"❌ RID1 is NOT a parent node. {len(missing)} node(s) are unreachable from RID1.")
    # Optionally print a few missing ones
    print("Examples of unreachable nodes:", list(missing)[:10])

❌ RID1 is NOT a parent node. 46897 node(s) are unreachable from RID1.
Examples of unreachable nodes: ['RID49319', 'RID41268', 'RID41235', 'RID12786', 'RID47277', 'RID16418', 'RID45952', 'RID34875', 'RID34836', 'RID10778']


In [11]:
reachable_from_rid1

{'RID0', 'RID1'}

In [12]:
if 'RID1' not in G:
    print("❌ Node 'RID1' does not exist in the graph.")
else:
    print(f"✅ Edges OUTGOING from RID1 ({G.out_degree('RID1')} edges):")
    for _, target, data in G.out_edges('RID1', data=True):
        print(f"RID1 --[{data.get('label', '⟶')}]--> {target}")

    print(f"\n✅ Edges INCOMING to RID1 ({G.in_degree('RID1')} edges):")
    for source, _, data in G.in_edges('RID1', data=True):
        print(f"{source} --[{data.get('label', '⟶')}]--> RID1")

✅ Edges OUTGOING from RID1 (1 edges):
RID1 --[rdfs:subClassOf]--> RID0

✅ Edges INCOMING to RID1 (18 edges):
RID6 --[rdfs:subClassOf]--> RID1
RID1559 --[rdfs:subClassOf]--> RID1
RID49573 --[rdfs:subClassOf]--> RID1
RID5 --[rdfs:subClassOf]--> RID1
RID34785 --[rdfs:subClassOf]--> RID1
RID3 --[rdfs:subClassOf]--> RID1
RID39128 --[rdfs:subClassOf]--> RID1
RID35977 --[rdfs:subClassOf]--> RID1
RID7479 --[rdfs:subClassOf]--> RID1
RID34861 --[rdfs:subClassOf]--> RID1
RID7467 --[rdfs:subClassOf]--> RID1
Receives_Input_From --[rdfs:range]--> RID1
RID13158 --[rdfs:subClassOf]--> RID1
RID28487 --[rdfs:subClassOf]--> RID1
RID28639 --[rdfs:subClassOf]--> RID1
RID50606 --[rdfs:subClassOf]--> RID1
Sends_Output_To --[rdfs:range]--> RID1
Member_Of --[rdfs:range]--> RID1


In [13]:
# Create a reversed hierarchy graph
H_hierarchy = nx.DiGraph()

for u, v, d in G.edges(data=True):
    if d.get('label') == 'rdfs:subClassOf':
        H_hierarchy.add_edge(v, u)  # Reverse the edge: parent --> child

In [14]:
descendants = nx.descendants(H_hierarchy, 'RID1')
print(f"RID1 has {len(descendants)} subclass descendants.")

RID1 has 45898 subclass descendants.


### Try to extract subgraph corresponding to codes from a sample report

In [15]:
codes = ['RID908', 'RID5619', 'RID5622', 'RID10404', 'RID10326', 'RID29991', 'RID5611', 'RID5576', 'RID1543']

In [16]:
# Create a set to store all nodes involved in paths between any two of the codes
nodes_in_paths = set()

# For each pair of codes, find all simple paths (can be adjusted to shortest if needed)
for i in range(len(codes)):
    for j in range(i + 1, len(codes)):
        source = codes[i]
        target = codes[j]
        try:
            # Get the shortest path (you can change to `all_simple_paths` if needed)
            path = nx.shortest_path(G, source=source, target=target)
            nodes_in_paths.update(path)
        except nx.NetworkXNoPath:
            pass  # No path found between this pair

# Induce subgraph
H = G.subgraph(nodes_in_paths).copy()

print(f"Subgraph has {H.number_of_nodes()} nodes and {H.number_of_edges()} edges.")

# Optional: Save or visualize
# nx.write_gexf(H, "rid_subgraph.gexf")

Subgraph has 35 nodes and 98 edges.


In [17]:
from pyvis.network import Network

# Create a PyVis network
net = Network(height="700px", width="100%", directed=True, notebook=True)

# Add nodes with color: blue for target codes, gray otherwise
for node in H.nodes():
    if node in codes:
        net.add_node(node, label=node, color='skyblue', shape='dot', size=25)
    else:
        net.add_node(node, label=node, color='lightgray', shape='dot', size=10)

# Add edges with labels
for u, v, d in H.edges(data=True):
    label = d.get('label', '')
    net.add_edge(u, v, label=label, arrows="to")

# Improve layout
net.repulsion(node_distance=150, spring_length=200)

# Show the network (in notebook or open as HTML)
# net.show("data/output/rid_subgraph.html")



### Simplify graph for display

In [18]:
def simplify_graph_for_display(H):
    simplified = nx.DiGraph()

    seen_pairs = set()
    
    for u, v, d in H.edges(data=True):
        if u == v:
            continue  # Exclude self-loops

        # Create an unordered key to detect bidirectional pairs
        pair = tuple(sorted((u, v)))
        
        if pair in seen_pairs:
            continue  # We've already added one direction
        seen_pairs.add(pair)

        # Add the edge in the current direction
        simplified.add_edge(u, v, label=d.get('label'))

    return simplified

In [19]:
# Usage:
cleaned_G = simplify_graph_for_display(H)

In [20]:
from pyvis.network import Network

net = Network(height="700px", width="100%", directed=True, notebook=True)

for node in cleaned_G.nodes():
    net.add_node(node, label=node)

for u, v, d in cleaned_G.edges(data=True):
    net.add_edge(u, v, label=d.get('label', ''))

net.repulsion(node_distance=150, spring_length=200)
# net.show("data/output/cleaned_rid_graph.html")



### Add these modifications in the cleaning of the graph for the subgraph with relevant codes

In [31]:
import networkx as nx
from pyvis.network import Network
from itertools import combinations

codes = ['RID908', 'RID5619', 'RID5622', 'RID10404', 'RID10326', 'RID29991', 'RID5611', 'RID5576', 'RID1543']

# --- Step 1: Extract subgraph containing all paths between any pair of codes ---

nodes_to_include = set(codes)

# For every pair of codes, include all nodes on shortest paths between them (if paths exist)
for u, v in combinations(codes, 2):
    if nx.has_path(G, u, v):
        path_nodes = nx.shortest_path(G, u, v)
        nodes_to_include.update(path_nodes)
    if nx.has_path(G, v, u):
        path_nodes = nx.shortest_path(G, v, u)
        nodes_to_include.update(path_nodes)

H_sub = G.subgraph(nodes_to_include).copy()

# --- Step 2: Create hierarchy graph with reversed rdfs:subClassOf edges ---

H_hierarchy = nx.DiGraph()

for u, v, d in H_sub.edges(data=True):
    if d.get('label') == 'rdfs:subClassOf':
        H_hierarchy.add_edge(v, u, label='rdfs:has_subClass')  # reversed edge
    else:
        H_hierarchy.add_edge(u, v, label=d.get('label'))

# --- Step 3: Clean graph by removing self-loops and reciprocal edges ---

def simplify_graph_for_display(G):
    simplified = nx.DiGraph()
    seen_pairs = set()
    for u, v, d in G.edges(data=True):
        if u == v:
            continue  # remove self-loops
        pair = tuple(sorted((u, v)))
        if pair in seen_pairs:
            continue  # remove reciprocal edge (keep first encountered)
        seen_pairs.add(pair)
        simplified.add_edge(u, v, label=d.get('label'))
    return simplified

cleaned_H = simplify_graph_for_display(H_hierarchy)

# --- Step 4: Visualize with pyvis ---

net = Network(height="700px", width="100%", directed=True, notebook=True)

for node in cleaned_H.nodes():
    color = 'skyblue' if node in codes else 'lightgray'
    size = 25 if node in codes else 10
    label = node
    title = G.nodes[node].get('description', '')  # Tooltip on hover
    net.add_node(node, label=label, title=title, color=color, size=size)

for u, v, d in cleaned_H.edges(data=True):
    net.add_edge(u, v, label=d.get('label', ''))

net.repulsion(node_distance=150, spring_length=200)
net.show("data/output/cleaned_subgraph.html")


data/output/cleaned_subgraph.html


In [22]:
# Step 2a: Reverse subClassOf edges in a new graph for path computation
G_reversed = nx.DiGraph()

for u, v, d in G.edges(data=True):
    if d.get('label') == 'rdfs:subClassOf':
        G_reversed.add_edge(v, u, **d)  # reverse the edge
    else:
        G_reversed.add_edge(u, v, **d)

In [23]:
root = 'RID1'

for code in codes:
    print(code, nx.has_path(G_reversed, root, code))
    print(nx.shortest_path(G_reversed, root, code))

RID908 True
['RID1', 'RID5', 'RID29023', 'RID35563', 'RID936', 'RID13183', 'RID908']
RID5619 True
['RID1', 'RID34861', 'RID5425', 'RID5618', 'RID5619']
RID5622 True
['RID1', 'RID34861', 'RID5425', 'RID5618', 'RID5622']
RID10404 True
['RID1', 'RID1559', 'RID8', 'RID10404']
RID10326 True
['RID1', 'RID50606', 'RID10311', 'RID10326']
RID29991 True
['RID1', 'RID5', 'RID29023', 'RID35313', 'RID2484', 'RID29991']
RID5611 True
['RID1', 'RID34861', 'RID29038', 'RID29033', 'RID5599', 'RID5610', 'RID5611']
RID5576 True
['RID1', 'RID34861', 'RID29038', 'RID29033', 'RID5429', 'RID5554', 'RID5576']
RID1543 True
['RID1', 'RID3', 'RID13389', 'RID29109', 'RID1540', 'RID38626', 'RID1543']


In [24]:
len(G)

46899

In [25]:
len(G.edges())

154615

### Alternative plot: DAG/Hierarchical format starting from parent node 'RID1'

In [26]:
# Create a reversed hierarchy graph
H_hierarchy = nx.DiGraph()

for u, v, d in G.edges(data=True):
    if d.get('label') == 'rdfs:subClassOf':
        H_hierarchy.add_edge(v, u)  # Reverse the edge: parent --> child

In [33]:
import networkx as nx
from pyvis.network import Network
from itertools import combinations

codes = ['RID908', 'RID5619', 'RID5622', 'RID10404', 'RID10326', 'RID29991', 'RID5611', 'RID5576', 'RID1543']
root = 'RID1'

# --- Step 1: Extract subgraph containing all paths between any pair of codes ---

nodes_to_include = set(codes)

# For every pair of codes, include all nodes on shortest paths between them (if paths exist)
for u, v in combinations(codes, 2):
    if nx.has_path(G, u, v):
        path_nodes = nx.shortest_path(G, u, v)
        nodes_to_include.update(path_nodes)
    if nx.has_path(G, v, u):
        path_nodes = nx.shortest_path(G, v, u)
        nodes_to_include.update(path_nodes)

H_sub = G.subgraph(nodes_to_include).copy()

# --- Step 2: Create hierarchy graph with reversed rdfs:subClassOf edges ---

H_hierarchy = nx.DiGraph()

for u, v, d in H_sub.edges(data=True):
    if d.get('label') == 'rdfs:subClassOf':
        H_hierarchy.add_edge(v, u, label='rdfs:subClassOf')  # reversed edge
    else:
        H_hierarchy.add_edge(u, v, label=d.get('label'))

# --- Step 3: Clean graph by removing self-loops and reciprocal edges ---

def simplify_graph_for_display(G):
    simplified = nx.DiGraph()
    seen_pairs = set()
    for u, v, d in G.edges(data=True):
        if u == v:
            continue  # remove self-loops
        pair = tuple(sorted((u, v)))
        if pair in seen_pairs:
            continue  # remove reciprocal edge (keep first encountered)
        seen_pairs.add(pair)
        simplified.add_edge(u, v, label=d.get('label'))
    return simplified

cleaned_H = simplify_graph_for_display(H_hierarchy)

# --- Step 4: Add shortest paths from RID1 to each code (from full ontology G) ---

# Build reverse subclass-only graph for proper path traversal
G_rev_sub = nx.DiGraph()
for u, v, d in G.edges(data=True):
    if d.get('label') == 'rdfs:subClassOf':
        G_rev_sub.add_edge(v, u, label='rdfs:subClassOf')  # reversed

# Track edges in shortest paths for special styling
highlight_edges = set()

for code in codes:
    if nx.has_path(G_rev_sub, root, code):
        path = nx.shortest_path(G_rev_sub, root, code)
        for i in range(len(path) - 1):
            u, v = path[i], path[i+1]
            cleaned_H.add_edge(u, v, label='rdfs:subClassOf')
            highlight_edges.add((u, v))
        for node in path:
            cleaned_H.add_node(node)  # ensure all path nodes are included

# --- Step 5: Visualize with pyvis ---

net = Network(height="700px", width="100%", directed=True, notebook=True)

for node in cleaned_H.nodes():
    if node == root:
        color = 'orange'
        size = 30
    elif node in codes:
        color = 'skyblue'
        size = 25
    else:
        color = 'lightgray'
        size = 10
    label = node
    title = G.nodes[node].get('description', '')  # Tooltip on hover
    net.add_node(node, label=label, title=title, color=color, size=size)

for u, v, d in cleaned_H.edges(data=True):
    color = 'black' if (u, v) in highlight_edges else '#cccccc'
    net.add_edge(u, v, label=d.get('label', ''), color=color)

net.repulsion(node_distance=150, spring_length=200)
net.show("data/output/final_graph.html")

data/output/final_graph.html


In [34]:
import networkx as nx
from pyvis.network import Network
from itertools import combinations

codes = ['RID908', 'RID5619', 'RID5622', 'RID10404', 'RID10326', 'RID29991', 'RID5611', 'RID5576', 'RID1543']
root = 'RID1'

# --- Step 1: Extract subgraph containing all paths between any pair of codes ---

nodes_to_include = set(codes)

# For every pair of codes, include all nodes on shortest paths between them (if paths exist)
for u, v in combinations(codes, 2):
    if nx.has_path(G, u, v):
        path_nodes = nx.shortest_path(G, u, v)
        nodes_to_include.update(path_nodes)
    if nx.has_path(G, v, u):
        path_nodes = nx.shortest_path(G, v, u)
        nodes_to_include.update(path_nodes)

H_sub = G.subgraph(nodes_to_include).copy()

# --- Step 2: Create hierarchy graph with reversed rdfs:subClassOf edges ---

H_hierarchy = nx.DiGraph()

for u, v, d in H_sub.edges(data=True):
    if d.get('label') == 'rdfs:subClassOf':
        H_hierarchy.add_edge(v, u, label='rdfs:subClassOf')  # reversed edge
    else:
        H_hierarchy.add_edge(u, v, label=d.get('label'))

# --- Step 3: Clean graph by removing self-loops and reciprocal edges ---

def simplify_graph_for_display(G):
    simplified = nx.DiGraph()
    seen_pairs = set()
    for u, v, d in G.edges(data=True):
        if u == v:
            continue  # remove self-loops
        pair = tuple(sorted((u, v)))
        if pair in seen_pairs:
            continue  # remove reciprocal edge (keep first encountered)
        seen_pairs.add(pair)
        simplified.add_edge(u, v, label=d.get('label'))
    return simplified

cleaned_H = simplify_graph_for_display(H_hierarchy)

# --- Step 4: Add shortest paths from RID1 to each code (from full ontology G) ---

# Build reverse subclass-only graph for proper path traversal
G_rev_sub = nx.DiGraph()
for u, v, d in G.edges(data=True):
    if d.get('label') == 'rdfs:subClassOf':
        G_rev_sub.add_edge(v, u, label='rdfs:subClassOf')  # reversed

# Track edges in shortest paths for special styling
highlight_edges = set()

for code in codes:
    if nx.has_path(G_rev_sub, root, code):
        path = nx.shortest_path(G_rev_sub, root, code)
        for i in range(len(path) - 1):
            u, v = path[i], path[i+1]
            cleaned_H.add_edge(u, v, label='rdfs:subClassOf')
            highlight_edges.add((u, v))
        for node in path:
            cleaned_H.add_node(node)  # ensure all path nodes are included

# --- Step 5: Visualize with pyvis ---

net = Network(height="700px", width="100%", directed=True, notebook=True)

for node in cleaned_H.nodes():
    if node == root:
        color = 'orange'
        size = 30
    elif node in codes:
        color = 'skyblue'
        size = 25
    else:
        color = 'lightgray'
        size = 10
    net.add_node(node, label=node, color=color, size=size)

for u, v, d in cleaned_H.edges(data=True):
    color = 'black' if (u, v) in highlight_edges else '#cccccc'
    net.add_edge(u, v, label=d.get('label', ''), color=color)

net.repulsion(node_distance=150, spring_length=200)
net.show("data/output/with_paths_to_RID1.html")

data/output/with_paths_to_RID1.html
