# ECS8052 Assessment Data Loader

This file contains the code necessary to download the data needed for the module assessment. You may use this code freely as part of your assessment.

In [6]:
import os
datadir = 'primekg'
if not os.path.exists(datadir):
    os.mkdir(datadir)

In [8]:
filelist = [
    ('kg.csv', 'https://dataverse.harvard.edu/api/access/datafile/6180620', 'aac8191d4fbc5bf09cdf8c3c78b4e75f'),
]

In [10]:
import requests
import hashlib

for f in filelist:
    print(f'INITIATING: download of {f[0]} from {f[1]}')
    response = requests.get(f[1])
    file_Path = datadir + '/' + f[0]
    if response.status_code != 200:
        print('Failed to download file {f[0]} from {f[1]}')
    else:
        m = hashlib.md5()
        m.update(response.content)
        if m.hexdigest() == f[2]:
            print(f'SUCCESS: File {f[0]} downloaded from {f[1]} with correct checksum {f[2]}')
            with open(file_Path, 'wb') as file:
                file.write(response.content)
        else:
            print(f'ERROR: File {f[0]} downloaded from {f[1]} with incorrect checksum {m.hexdigest()} (should be {f[2]})')            


INITIATING: download of kg.csv from https://dataverse.harvard.edu/api/access/datafile/6180620
SUCCESS: File kg.csv downloaded from https://dataverse.harvard.edu/api/access/datafile/6180620 with correct checksum aac8191d4fbc5bf09cdf8c3c78b4e75f


In [12]:
import pandas as pd

In [22]:
datakg=pd.read_csv("primekg/kg.csv",low_memory=False)

In [24]:
datakg.head()

Unnamed: 0,relation,display_relation,x_index,x_id,x_type,x_name,x_source,y_index,y_id,y_type,y_name,y_source
0,protein_protein,ppi,0,9796,gene/protein,PHYHIP,NCBI,8889,56992,gene/protein,KIF15,NCBI
1,protein_protein,ppi,1,7918,gene/protein,GPANK1,NCBI,2798,9240,gene/protein,PNMA1,NCBI
2,protein_protein,ppi,2,8233,gene/protein,ZRSR2,NCBI,5646,23548,gene/protein,TTC33,NCBI
3,protein_protein,ppi,3,4899,gene/protein,NRF1,NCBI,11592,11253,gene/protein,MAN1B1,NCBI
4,protein_protein,ppi,4,5297,gene/protein,PI4KA,NCBI,2122,8601,gene/protein,RGS20,NCBI


In [28]:

node_types = pd.concat([datakg['x_type'], datakg['y_type']]).unique()

print("Unique Node Types (Ontology):")
for node_type in node_types:
    print(node_type)

Unique Node Types (Ontology):
gene/protein
drug
effect/phenotype
disease
biological_process
molecular_function
cellular_component
exposure
pathway
anatomy


In [30]:
# Extract unique relations
relations = datakg['relation'].unique()

print("Unique Relations (Ontology):")
for relation in relations:
    print(relation)


Unique Relations (Ontology):
protein_protein
drug_protein
contraindication
indication
off-label use
drug_drug
phenotype_protein
phenotype_phenotype
disease_phenotype_negative
disease_phenotype_positive
disease_protein
disease_disease
drug_effect
bioprocess_bioprocess
molfunc_molfunc
cellcomp_cellcomp
molfunc_protein
cellcomp_protein
bioprocess_protein
exposure_protein
exposure_disease
exposure_exposure
exposure_bioprocess
exposure_molfunc
exposure_cellcomp
pathway_pathway
pathway_protein
anatomy_anatomy
anatomy_protein_present
anatomy_protein_absent


In [None]:
pip install owlready2

In [39]:
from owlready2 import *
import types

# Step 1: Initialize ontology
onto = get_ontology("http://www.example.org/kg.owl")

# Step 2: Extract unique entity types from your DataFrame
EntityTypes = pd.concat([datakg['x_type'], datakg['y_type']]).unique()

# Step 3: Define ontology classes dynamically
with onto:
    # Dictionary to store the created classes
    EntityClass = dict.fromkeys(EntityTypes, None)
    
    # Create a new class for each entity type
    for t in EntityTypes:
        EntityClass[t] = types.new_class(t.replace("/", "_"), (Thing,))  # Replace "/" with "_" to avoid syntax issues
        print(f"Created ontology class: {EntityClass[t]}")
    
    print("All ontology classes created successfully!")

Created ontology class: kg.gene_protein
Created ontology class: kg.drug
Created ontology class: kg.effect_phenotype
Created ontology class: kg.disease
Created ontology class: kg.biological_process
Created ontology class: kg.molecular_function
Created ontology class: kg.cellular_component
Created ontology class: kg.exposure
Created ontology class: kg.pathway
Created ontology class: kg.anatomy
All ontology classes created successfully!


In [55]:
onto = get_ontology("http://www.example.com/kg.owl")

with onto:
    # Step 2: Create entity types as classes
    EntityClass = dict.fromkeys(set(datakg['x_type'].unique()).union(set(datakg['y_type'].unique())), None)
    for t in EntityClass.keys():
        # Replace "/" with "_" for compatibility
        EntityClass[t] = types.new_class(t, (Thing,))

    # Step 3: Create relations and assign domain, range, source, and target
    RelationClass = {}
    RelationDomainRange = dict.fromkeys(relation_groups['relation'].unique(), None)
    
    for relation in relation_groups['relation'].unique():
        # Create relation as an ObjectProperty
        RelationClass[relation] = types.new_class(relation, (ObjectProperty,))
        
        # Initialize sets for Domain, Range, Source, and Target
        RelationDomainRange[relation] = {
            'Domain': set(),
            'Range': set(),
            'Source': set(),
            'Target': set()
        }
        
        # Find all domains, ranges, sources, and targets for this relation
        domains = relation_groups.loc[relation_groups['relation'] == relation, 'x_type'].unique()
        ranges = relation_groups.loc[relation_groups['relation'] == relation, 'y_type'].unique()
        
        # Update domain and range for the relation
        RelationClass[relation].domain = [EntityClass[d] for d in domains]
        RelationClass[relation].range = [EntityClass[r] for r in ranges]
        
        # Collect sources and targets for the relation
        sources = relation_groups.loc[relation_groups['relation'] == relation, 'x_type'].unique()
        targets = relation_groups.loc[relation_groups['relation'] == relation, 'y_type'].unique()
        
        # Add source and target types to RelationDomainRange
        RelationDomainRange[relation]['Source'].update([EntityClass[s] for s in sources])
        RelationDomainRange[relation]['Target'].update([EntityClass[t] for t in targets])

        print(f"Created relation: {relation}")
        print(f"Domain: {RelationDomainRange[relation]['Domain']}")
        print(f"Range: {RelationDomainRange[relation]['Range']}")
        print(f"Source: {RelationDomainRange[relation]['Source']}")
        print(f"Target: {RelationDomainRange[relation]['Target']}")

# Save the ontology
onto.save(file="kg_ontology_grouped_with_sources_and_targets.owl", format="rdfxml")
print("Ontology saved as kg_ontology_grouped_with_sources_and_targets.owl.")

Created relation: anatomy_anatomy
Domain: set()
Range: set()
Source: {kg.anatomy}
Target: {kg.anatomy}
Created relation: anatomy_protein_absent
Domain: set()
Range: set()
Source: {kg.gene/protein, kg.anatomy}
Target: {kg.gene/protein, kg.anatomy}
Created relation: anatomy_protein_present
Domain: set()
Range: set()
Source: {kg.gene/protein, kg.anatomy}
Target: {kg.gene/protein, kg.anatomy}
Created relation: bioprocess_bioprocess
Domain: set()
Range: set()
Source: {kg.biological_process}
Target: {kg.biological_process}
Created relation: bioprocess_protein
Domain: set()
Range: set()
Source: {kg.gene/protein, kg.biological_process}
Target: {kg.biological_process, kg.gene/protein}
Created relation: cellcomp_cellcomp
Domain: set()
Range: set()
Source: {kg.cellular_component}
Target: {kg.cellular_component}
Created relation: cellcomp_protein
Domain: set()
Range: set()
Source: {kg.cellular_component, kg.gene/protein}
Target: {kg.cellular_component, kg.gene/protein}
Created relation: contraindi

In [47]:
EntityClass

{'drug': kg.drug,
 'exposure': kg.exposure,
 'gene/protein': kg.gene_protein,
 'biological_process': kg.biological_process,
 'cellular_component': kg.cellular_component,
 'effect/phenotype': kg.effect_phenotype,
 'molecular_function': kg.molecular_function,
 'anatomy': kg.anatomy,
 'disease': kg.disease,
 'pathway': kg.pathway}

In [57]:
RelationClass

{'anatomy_anatomy': kg.anatomy_anatomy,
 'anatomy_protein_absent': kg.anatomy_protein_absent,
 'anatomy_protein_present': kg.anatomy_protein_present,
 'bioprocess_bioprocess': kg.bioprocess_bioprocess,
 'bioprocess_protein': kg.bioprocess_protein,
 'cellcomp_cellcomp': kg.cellcomp_cellcomp,
 'cellcomp_protein': kg.cellcomp_protein,
 'contraindication': kg.contraindication,
 'disease_disease': kg.disease_disease,
 'disease_phenotype_negative': kg.disease_phenotype_negative,
 'disease_phenotype_positive': kg.disease_phenotype_positive,
 'disease_protein': kg.disease_protein,
 'drug_drug': kg.drug_drug,
 'drug_effect': kg.drug_effect,
 'drug_protein': kg.drug_protein,
 'exposure_bioprocess': kg.exposure_bioprocess,
 'exposure_cellcomp': kg.exposure_cellcomp,
 'exposure_disease': kg.exposure_disease,
 'exposure_exposure': kg.exposure_exposure,
 'exposure_molfunc': kg.exposure_molfunc,
 'exposure_protein': kg.exposure_protein,
 'indication': kg.indication,
 'molfunc_molfunc': kg.molfunc_mol

In [59]:
RelationDomainRange

{'anatomy_anatomy': {'Domain': set(),
  'Range': set(),
  'Source': {kg.anatomy},
  'Target': {kg.anatomy}},
 'anatomy_protein_absent': {'Domain': set(),
  'Range': set(),
  'Source': {kg.anatomy, kg.gene/protein},
  'Target': {kg.anatomy, kg.gene/protein}},
 'anatomy_protein_present': {'Domain': set(),
  'Range': set(),
  'Source': {kg.anatomy, kg.gene/protein},
  'Target': {kg.anatomy, kg.gene/protein}},
 'bioprocess_bioprocess': {'Domain': set(),
  'Range': set(),
  'Source': {kg.biological_process},
  'Target': {kg.biological_process}},
 'bioprocess_protein': {'Domain': set(),
  'Range': set(),
  'Source': {kg.biological_process, kg.gene/protein},
  'Target': {kg.biological_process, kg.gene/protein}},
 'cellcomp_cellcomp': {'Domain': set(),
  'Range': set(),
  'Source': {kg.cellular_component},
  'Target': {kg.cellular_component}},
 'cellcomp_protein': {'Domain': set(),
  'Range': set(),
  'Source': {kg.cellular_component, kg.gene/protein},
  'Target': {kg.cellular_component, kg.ge

In [61]:
from owlready2 import *
import pandas as pd
import types

# Step 1: Group data by relation, x_type (source), and y_type (target)
relation_groups = datakg.groupby(['relation', 'x_type', 'y_type']).size().reset_index(name='count')

# Step 2: Initialize ontology
onto = get_ontology("http://www.example.com/kg.owl")

# Step 3: Create entity types as classes
with onto:
    EntityClass = dict.fromkeys(set(datakg['x_type'].unique()).union(set(datakg['y_type'].unique())), None)
    
    # Create a new class for each unique entity type
    for t in EntityClass.keys():
        # Replace "/" with "_" to avoid syntax issues with class names
        EntityClass[t] = types.new_class(t, (Thing,))
        print(f"Created ontology class: {EntityClass[t]}")

# Step 4: Create relations and assign domain, range, source, and target
RelationClass = {}
RelationDomainRange = dict.fromkeys(relation_groups['relation'].unique(), None)

with onto:
    for relation in relation_groups['relation'].unique():
        # Create a new relation as an ObjectProperty (for example, protein_protein as an object property)
        RelationClass[relation] = types.new_class(relation, (ObjectProperty,))

        # Initialize the Domain, Range, Source, and Target sets for the relation
        RelationDomainRange[relation] = {
            'Domain': set(),
            'Range': set(),
            'Source': set(),
            'Target': set()
        }

        # Find all domains and ranges for this relation
        domains = relation_groups.loc[relation_groups['relation'] == relation, 'x_type'].unique()
        ranges = relation_groups.loc[relation_groups['relation'] == relation, 'y_type'].unique()

        # Set the domain and range for the relation in the ontology
        RelationClass[relation].domain = [EntityClass[d] for d in domains]
        RelationClass[relation].range = [EntityClass[r] for r in ranges]

        # Collect sources and targets for the relation and update the sets
        sources = relation_groups.loc[relation_groups['relation'] == relation, 'x_type'].unique()
        targets = relation_groups.loc[relation_groups['relation'] == relation, 'y_type'].unique()

        RelationDomainRange[relation]['Source'].update([EntityClass[s] for s in sources])
        RelationDomainRange[relation]['Target'].update([EntityClass[t] for t in targets])

        print(f"Created relation: {relation}")
        print(f"Domain: {RelationDomainRange[relation]['Domain']}")
        print(f"Range: {RelationDomainRange[relation]['Range']}")
        print(f"Source: {RelationDomainRange[relation]['Source']}")
        print(f"Target: {RelationDomainRange[relation]['Target']}")
    
# Save the ontology
onto.save(file="kg_ontology_with_sources_and_targets.owl", format="rdfxml")
print("Ontology saved as kg_ontology_with_sources_and_targets.owl.")


Created ontology class: kg.drug
Created ontology class: kg.exposure
Created ontology class: kg.gene/protein
Created ontology class: kg.biological_process
Created ontology class: kg.cellular_component
Created ontology class: kg.effect/phenotype
Created ontology class: kg.molecular_function
Created ontology class: kg.anatomy
Created ontology class: kg.disease
Created ontology class: kg.pathway
Created relation: anatomy_anatomy
Domain: set()
Range: set()
Source: {kg.anatomy}
Target: {kg.anatomy}
Created relation: anatomy_protein_absent
Domain: set()
Range: set()
Source: {kg.gene/protein, kg.anatomy}
Target: {kg.gene/protein, kg.anatomy}
Created relation: anatomy_protein_present
Domain: set()
Range: set()
Source: {kg.gene/protein, kg.anatomy}
Target: {kg.gene/protein, kg.anatomy}
Created relation: bioprocess_bioprocess
Domain: set()
Range: set()
Source: {kg.biological_process}
Target: {kg.biological_process}
Created relation: bioprocess_protein
Domain: set()
Range: set()
Source: {kg.gene/p

In [65]:
RelationDomainRange

{'anatomy_anatomy': {'Domain': set(),
  'Range': set(),
  'Source': {kg.anatomy},
  'Target': {kg.anatomy}},
 'anatomy_protein_absent': {'Domain': set(),
  'Range': set(),
  'Source': {kg.anatomy, kg.gene/protein},
  'Target': {kg.anatomy, kg.gene/protein}},
 'anatomy_protein_present': {'Domain': set(),
  'Range': set(),
  'Source': {kg.anatomy, kg.gene/protein},
  'Target': {kg.anatomy, kg.gene/protein}},
 'bioprocess_bioprocess': {'Domain': set(),
  'Range': set(),
  'Source': {kg.biological_process},
  'Target': {kg.biological_process}},
 'bioprocess_protein': {'Domain': set(),
  'Range': set(),
  'Source': {kg.biological_process, kg.gene/protein},
  'Target': {kg.biological_process, kg.gene/protein}},
 'cellcomp_cellcomp': {'Domain': set(),
  'Range': set(),
  'Source': {kg.cellular_component},
  'Target': {kg.cellular_component}},
 'cellcomp_protein': {'Domain': set(),
  'Range': set(),
  'Source': {kg.cellular_component, kg.gene/protein},
  'Target': {kg.cellular_component, kg.ge

In [69]:
from owlready2 import *
import pandas as pd
import types

# Step 1: Group the data by relation, x_type (source), and y_type (target)
relation_groups = datakg.groupby(['relation', 'x_type', 'y_type']).size().reset_index(name='count')

# Step 2: Initialize ontology
onto = get_ontology("http://www.example.org/kg.owl")

# Step 3: Create entity types as classes
with onto:
    # Create a class for each unique entity type in x_type and y_type
    EntityClass = dict.fromkeys(set(datakg['x_type'].unique()).union(set(datakg['y_type'].unique())), None)
    
    # Create a new class for each unique entity type
    for t in EntityClass.keys():
        EntityClass[t] = types.new_class(t, (Thing,))  # No need to replace "/"
        print(f"Created ontology class: {EntityClass[t]}")

# Step 4: Create relations and assign domain, range, source, and target
RelationClass = {}
RelationDomainRange = {}

# With the grouped data, we now create relations and assign domain/range.
with onto:
    for relation in relation_groups['relation'].unique():
        # Create relation as an ObjectProperty (for example, protein_protein as an object property)
        RelationClass[relation] = types.new_class(relation, (ObjectProperty,))
        
        # Initialize the RelationDomainRange for this relation
        RelationDomainRange[relation] = {
            'Domain': set(),
            'Range': set(),
            'Source': set(),
            'Target': set()
        }
        
        # Find all domains (x_type) and ranges (y_type) for this relation
        relation_data = relation_groups[relation_groups['relation'] == relation]
        domains = relation_data['x_type'].unique()
        ranges = relation_data['y_type'].unique()
        
        # Set the domain and range for the relation
        for domain in domains:
            RelationClass[relation].domain.append(EntityClass[domain])  # Link domain to the corresponding entity class
        
        for range in ranges:
            RelationClass[relation].range.append(EntityClass[range])  # Link range to the corresponding entity class
        
        # Collect source and target types for the relation and update the sets
        sources = relation_data['x_type'].unique()
        targets = relation_data['y_type'].unique()

        RelationDomainRange[relation]['Source'].update([EntityClass[s] for s in sources])
        RelationDomainRange[relation]['Target'].update([EntityClass[t] for t in targets])

        # Print the domain, range, source, and target for the relation
        print(f"Created relation: {relation}")
        print(f"Domain: {RelationDomainRange[relation]['Domain']}")
        print(f"Range: {RelationDomainRange[relation]['Range']}")
        print(f"Source: {RelationDomainRange[relation]['Source']}")
        print(f"Target: {RelationDomainRange[relation]['Target']}")

# Save the ontology
onto.save(file="kg_ontology_with_sources_and_targets.owl", format="rdfxml")
print("Ontology saved as kg_ontology_with_sources_and_targets.owl.")


Created ontology class: kg.drug
Created ontology class: kg.exposure
Created ontology class: kg.gene/protein
Created ontology class: kg.biological_process
Created ontology class: kg.cellular_component
Created ontology class: kg.effect/phenotype
Created ontology class: kg.molecular_function
Created ontology class: kg.anatomy
Created ontology class: kg.disease
Created ontology class: kg.pathway
Created relation: anatomy_anatomy
Domain: set()
Range: set()
Source: {kg.anatomy}
Target: {kg.anatomy}
Created relation: anatomy_protein_absent
Domain: set()
Range: set()
Source: {kg.gene/protein, kg.anatomy}
Target: {kg.gene/protein, kg.anatomy}
Created relation: anatomy_protein_present
Domain: set()
Range: set()
Source: {kg.gene/protein, kg.anatomy}
Target: {kg.gene/protein, kg.anatomy}
Created relation: bioprocess_bioprocess
Domain: set()
Range: set()
Source: {kg.biological_process}
Target: {kg.biological_process}
Created relation: bioprocess_protein
Domain: set()
Range: set()
Source: {kg.gene/p

In [71]:
RelationDomainRange

{'anatomy_anatomy': {'Domain': set(),
  'Range': set(),
  'Source': {kg.anatomy},
  'Target': {kg.anatomy}},
 'anatomy_protein_absent': {'Domain': set(),
  'Range': set(),
  'Source': {kg.anatomy, kg.gene/protein},
  'Target': {kg.anatomy, kg.gene/protein}},
 'anatomy_protein_present': {'Domain': set(),
  'Range': set(),
  'Source': {kg.anatomy, kg.gene/protein},
  'Target': {kg.anatomy, kg.gene/protein}},
 'bioprocess_bioprocess': {'Domain': set(),
  'Range': set(),
  'Source': {kg.biological_process},
  'Target': {kg.biological_process}},
 'bioprocess_protein': {'Domain': set(),
  'Range': set(),
  'Source': {kg.biological_process, kg.gene/protein},
  'Target': {kg.biological_process, kg.gene/protein}},
 'cellcomp_cellcomp': {'Domain': set(),
  'Range': set(),
  'Source': {kg.cellular_component},
  'Target': {kg.cellular_component}},
 'cellcomp_protein': {'Domain': set(),
  'Range': set(),
  'Source': {kg.cellular_component, kg.gene/protein},
  'Target': {kg.cellular_component, kg.ge

In [88]:
print(RelationClass['anatomy_anatomy'].domain)
print(RelationClass['disease_phenotype_negative'].range)


[kg.anatomy, kg.anatomy]
[kg.effect/phenotype, kg.disease, kg.effect/phenotype, kg.disease]


In [75]:
RelationClass

{'anatomy_anatomy': kg.anatomy_anatomy,
 'anatomy_protein_absent': kg.anatomy_protein_absent,
 'anatomy_protein_present': kg.anatomy_protein_present,
 'bioprocess_bioprocess': kg.bioprocess_bioprocess,
 'bioprocess_protein': kg.bioprocess_protein,
 'cellcomp_cellcomp': kg.cellcomp_cellcomp,
 'cellcomp_protein': kg.cellcomp_protein,
 'contraindication': kg.contraindication,
 'disease_disease': kg.disease_disease,
 'disease_phenotype_negative': kg.disease_phenotype_negative,
 'disease_phenotype_positive': kg.disease_phenotype_positive,
 'disease_protein': kg.disease_protein,
 'drug_drug': kg.drug_drug,
 'drug_effect': kg.drug_effect,
 'drug_protein': kg.drug_protein,
 'exposure_bioprocess': kg.exposure_bioprocess,
 'exposure_cellcomp': kg.exposure_cellcomp,
 'exposure_disease': kg.exposure_disease,
 'exposure_exposure': kg.exposure_exposure,
 'exposure_molfunc': kg.exposure_molfunc,
 'exposure_protein': kg.exposure_protein,
 'indication': kg.indication,
 'molfunc_molfunc': kg.molfunc_mol

In [85]:
# Step 1: Create TheGraph with nodes and their respective entity types (classes)
TheGraph = dict.fromkeys(datakg['x_name'].unique(), None)  # Assuming 'x_name' is a column for unique entity identifiers

# Create a node for each unique entity type in the DataFrame
with onto:
    for n in TheGraph.keys():
        # Get the entity type from the 'x_type' or 'y_type' (we assume 'x_type' for this example)
        nodetype = datakg.loc[datakg['x_name'] == n, 'x_type'].iloc[0]  # Using 'x_type' for node type
        TheGraph[n] = EntityClass[nodetype](name=n)  # Create the node using its corresponding entity class
        TheGraph[n].rel = []  # Initialize an empty list for relations (edges)

# Step 2: Define the relations (edges) between nodes
for k in datakg.index:
    source = datakg.loc[k, 'x_name']  # Assuming 'x_name' is the source entity
    target = datakg.loc[k, 'y_name']  # Assuming 'y_name' is the target entity
    
    # Add the target node to the source node's relations (this establishes the relationship)
    TheGraph[source].rel.append(TheGraph[target])

# Step 3: Access the relations for a specific node
# Example: Get the relations for node 'MLH1'
print(TheGraph['MLH1'].rel)  # This will give a list of nodes that 'MLH1' is connected to


KeyboardInterrupt: 