In [1]:
import pandas as pd
from load_graph import BioKnowledgeGraph

In [2]:
# Build the knowledge graph from the EveryCure node and edge CSV files.
# The bare name on the last line displays the object's repr as the cell output.
graph = BioKnowledgeGraph(
    "everycure_data/Nodes.csv",
    "everycure_data/Edges.csv",
)
graph

Loaded graph with 170009 nodes and 183760 edges


<load_graph.BioKnowledgeGraph at 0x1116644f0>

In [3]:
# Anatomical-entity edges for a single gene.
# Subject NCBIGene:8483 = CILP, cartilage intermediate layer protein
# (https://www.ncbi.nlm.nih.gov/gene/8483).
gene_anatomy = graph.find_edges(
    object_category="biolink:AnatomicalEntity",
    subject="NCBIGene:8483",
)

# Show every edge as "subject --[predicate]--> object".
for edge in gene_anatomy:
    source, target, data = edge
    print(f"{source} --[{data['predicate']}]--> {target}")

# FMA:70022 = extracellular space
# (https://evsexplore.semantics.cancer.gov/evsexplore/concept/npo/NPO_1580):
# that part of a multicellular organism outside the cells proper, usually
# taken to be outside the plasma membranes, and occupied by fluid.

NCBIGene:8483 --[biolink:related_to]--> FMA:70022


In [47]:
# Find every `biolink:treats` edge that starts at the drug.
# The result gets its own name so that `gene_anatomy` (the gene -> anatomy
# edges queried for NCBIGene:8483) is not overwritten with unrelated drug edges.
keytruda_treats = graph.find_edges(
    subject="CHEMBL.COMPOUND:CHEMBL3137343", # humanized antibody Keytruda
    predicate="biolink:treats"
)

# Print the relationships as "subject --[predicate]--> object"
for source, target, data in keytruda_treats:
    print(f"{source} --[{data['predicate']}]--> {target}")

# MONDO:0021040 = pancreatic neoplasm: A benign or malignant neoplasm involving the pancreas.

CHEMBL.COMPOUND:CHEMBL3137343 --[biolink:treats]--> MONDO:0021040


In [53]:
# Disease-typed `biolink:treats` edges for CHEMBL.COMPOUND:CHEMBL3137343.
treatments = graph.find_edges(
    subject="CHEMBL.COMPOUND:CHEMBL3137343",
    predicate="biolink:treats",
    object_category="biolink:Disease"
)

# Print the relationships with disease names
for source, target, data in treatments:
    # NOTE(review): get_node_info presumably returns a falsy value for IDs it
    # does not know (the node-info cell in this notebook guards with
    # `if node_info:`). Fall back to an empty mapping and an 'Unknown' label
    # instead of failing on a missing node or a missing 'name' field.
    target_info = graph.get_node_info(target) or {}
    print(f"{source} --[{data['predicate']}]--> {target} ({target_info.get('name', 'Unknown')})")

CHEMBL.COMPOUND:CHEMBL3137343 --[biolink:treats]--> MONDO:0021040 (pancreatic neoplasm)


In [54]:
# Search for drug -> disease paths, capped at max_length=3.
paths = graph.find_paths(
    start="CHEMBL.COMPOUND:CHEMBL3137343",  # example drug ID
    end="MONDO:0021040",                    # example disease ID
    max_length=3,
)

# One output line per path, node IDs joined by arrows.
for path in paths:
    rendered = " -> ".join(path)
    print(rendered)

CHEMBL.COMPOUND:CHEMBL3137343 -> MONDO:0021040


In [55]:
# Look up the stored attributes of one node and show its key fields.
node_info = graph.get_node_info("NCBIGene:8483")
if node_info:  # nothing is printed when the lookup result is falsy
    print(f"Name: {node_info['name']}")
    print(f"Category: {node_info['category']}")
    print(f"Description: {node_info['description']}")

Name: Genetic locus associated with CILP
Category: biolink:Gene
Description: A protein coding gene CILP in human. // COMMENTS: Category=external.; UMLS Semantic Type: UMLSSC:T028


In [56]:
# Ask the graph for the nodes in the biolink:Protein category and report how many came back.
proteins = graph.find_nodes_by_category("biolink:Protein")
print(f"Found {len(proteins)} proteins")

Found 17338 proteins


In [5]:
# Finding all genes expressed in cartilage that are related to arthritis
def find_cartilage_genes_in_arthritis():
    """
    Find nodes linked to the anatomy node FMA:70022 that have a short path to arthritis.

    Steps:
      1. Collect every ``biolink:related_to`` edge whose object is FMA:70022
         and take the subject of each edge as a candidate.
      2. Keep the candidates for which ``graph.has_path`` reports a path to
         MONDO:0005321 (labelled "Arthritis" in this notebook) with
         ``max_length=2``.
      3. Attach the name and category of each remaining candidate.

    Uses the notebook-level ``graph`` object; it takes no arguments.

    NOTE(review): another cell in this notebook annotates FMA:70022 as
    "extracellular space", not cartilage tissue -- confirm that this is the
    intended anatomy ID. Nothing here checks that the edge subjects are genes.

    Returns:
    --------
    List[Dict[str, str]]
        One dict per matching node, in first-seen edge order, with the keys
        ``'id'``, ``'name'`` and ``'category'``. ``'name'`` and ``'category'``
        fall back to ``'Unknown'`` when the graph has no info for the node.
    """
    # Edges that point at the anatomy node (see the NOTE in the docstring
    # about what FMA:70022 actually denotes).
    cartilage_edges = graph.find_edges(
        object="FMA:70022",
        predicate="biolink:related_to"
    )

    # Subjects of those edges. dict.fromkeys drops repeats (e.g. parallel
    # edges from the same subject) while keeping first-seen order, so each
    # candidate is path-checked and reported only once.
    cartilage_genes = list(dict.fromkeys(source for source, _, _ in cartilage_edges))

    # Keep only the candidates that reach the arthritis node within the limit.
    arthritis_related = [
        gene for gene in cartilage_genes
        if graph.has_path(
            start=gene,
            end="MONDO:0005321",  # Arthritis
            max_length=2
        )
    ]

    # Attach readable details. A missing node-info record (falsy lookup
    # result) is treated as an empty mapping so the 'Unknown' defaults apply
    # instead of raising AttributeError on None.
    result = []
    for gene_id in arthritis_related:
        gene_info = graph.get_node_info(gene_id) or {}
        result.append({
            'id': gene_id,
            'name': gene_info.get('name', 'Unknown'),
            'category': gene_info.get('category', 'Unknown')
        })

    return result

# Usage: run the query, then list each hit as "name (id)".
arthritis_genes = find_cartilage_genes_in_arthritis()
for gene in arthritis_genes:
    print(f"Found gene: {gene['name']} ({gene['id']})")

In [19]:
# Load the raw node, edge and ground-truth tables as DataFrames.
# Paths are relative to the notebook's working directory, the same convention
# the graph-loading cell uses ("everycure_data/..."), so the notebook is not
# tied to one user's home directory.
nodes_df = pd.read_csv("everycure_data/Nodes.csv", index_col="Unnamed: 0")
edges_df = pd.read_csv("everycure_data/Edges.csv", index_col="Unnamed: 0")
gt_df = pd.read_csv("everycure_data/Ground_Truth.csv", index_col="Unnamed: 0")
gt_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28368 entries, 0 to 28367
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  28368 non-null  object
 1   target  28368 non-null  object
 2   y       28368 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 886.5+ KB


In [21]:
# Preview the first five node records.
nodes_df.head(n=5)

Unnamed: 0,id,name,category,all_names,all_categories,iri,description,equivalent_curies,publications,label
0,UniProtKB:P53805,RCAN1,biolink:Protein,RCAN1 GeneǂGenetic locus associated with RCAN1...,biolink:NucleicAcidEntityǂbiolink:Geneǂbiolink...,https://identifiers.org/uniprot:P53805,"Calcipressin-1 (252 aa, ~28 kDa) is encoded by...",HGNC:3040ǂRGD:631338ǂNCIT:C82947ǂENSEMBL:ENSG0...,DOI:10.1042/bj20030267ǂPMID:12809556ǂDOI:10.10...,biolink:NucleicAcidEntityǂbiolink:MolecularEnt...
1,UniProtKB:Q4KWH8,PLCH1,biolink:Protein,"1-phosphatidylinositol 4,5-bisphosphate phosph...",biolink:NucleicAcidEntityǂbiolink:Geneǂbiolink...,https://identifiers.org/uniprot:Q4KWH8,A protein that is a translation product of the...,NCBIGene:23007ǂMGI:2683547ǂPR:000012838ǂUMLS:C...,DOI:10.1042/bj20041677ǂDOI:10.1186/1471-2164-8...,biolink:NucleicAcidEntityǂbiolink:MolecularEnt...
2,UniProtKB:Q9UPU7,TBC1D2B,biolink:Protein,TBC1D2B (human)ǂGenetic locus associated with ...,biolink:NucleicAcidEntityǂbiolink:Geneǂbiolink...,https://identifiers.org/uniprot:Q9UPU7,A protein that is a translation product of the...,PR:Q9UPU7ǂENSEMBL:ENSG00000167202ǂMGI:1914266ǂ...,DOI:10.1126/scisignal.2000475ǂPMID:19690332ǂDO...,biolink:NucleicAcidEntityǂbiolink:MolecularEnt...
3,UniProtKB:P35462,DRD3,biolink:Protein,D(3) dopamine receptor (human)ǂDrd3 (rat)ǂD(3)...,biolink:NucleicAcidEntityǂbiolink:Geneǂbiolink...,https://identifiers.org/uniprot:P35462,A D(2)-like dopamine receptor that is a transl...,OMIM:126451ǂRGD:2521ǂMGI:94925ǂNCBIGene:1814ǂU...,PMID:16809426ǂPMID:16386234ǂPMID:7961889ǂ2-nǂ2...,biolink:NucleicAcidEntityǂbiolink:MolecularEnt...
4,UniProtKB:Q9UJY1,HSPB8,biolink:Protein,HSPB8ǂheat shock protein beta-8 (human)ǂHeat S...,biolink:NucleicAcidEntityǂbiolink:Geneǂbiolink...,https://identifiers.org/uniprot:Q9UJY1,"Heat shock protein beta-8 (196 aa, ~22 kDa) is...",NCBIGene:26353ǂNCIT:C95433ǂUniProtKB:Q9UJY1ǂPR...,PMID:15122253ǂDOI:10.1016/j.bbamcr.2009.05.005...,biolink:NucleicAcidEntityǂbiolink:MolecularEnt...


In [23]:
# Show the first node record vertically, one field per line.
nodes_df.iloc[0]

id                                                    UniProtKB:P53805
name                                                             RCAN1
category                                               biolink:Protein
all_names            RCAN1 GeneǂGenetic locus associated with RCAN1...
all_categories       biolink:NucleicAcidEntityǂbiolink:Geneǂbiolink...
iri                             https://identifiers.org/uniprot:P53805
description          Calcipressin-1 (252 aa, ~28 kDa) is encoded by...
equivalent_curies    HGNC:3040ǂRGD:631338ǂNCIT:C82947ǂENSEMBL:ENSG0...
publications         DOI:10.1042/bj20030267ǂPMID:12809556ǂDOI:10.10...
label                biolink:NucleicAcidEntityǂbiolink:MolecularEnt...
Name: 0, dtype: object

In [20]:
# Preview the first five edge records.
edges_df.head(n=5)

Unnamed: 0,subject,object,predicate,knowledge_source,publications,publications_info,type,start_id,end_id
0,NCBIGene:8483,FMA:70022,biolink:related_to,infores:ensembl-gene,,{},biolink:related_to,NCBIGene:8483,FMA:70022
1,NCBIGene:390650,CHEMBL.TARGET:CHEMBL372,biolink:in_taxon,infores:ncbi-geneǂinfores:ensembl-gene,,{},biolink:in_taxon,NCBIGene:390650,CHEMBL.TARGET:CHEMBL372
2,UMLS:C1158823,ENSEMBL:ENST00000485267,biolink:has_participant,infores:ensembl-gene,,{},biolink:has_participant,UMLS:C1158823,ENSEMBL:ENST00000485267
3,ENSEMBL:ENST00000464141,CHEMBL.TARGET:CHEMBL372,biolink:in_taxon,infores:ensembl-gene,,{},biolink:in_taxon,ENSEMBL:ENST00000464141,CHEMBL.TARGET:CHEMBL372
4,ENSEMBL:ENST00000556723,UniProtKB:Q9H0R1,biolink:transcribed_from,infores:ensembl-gene,,{},biolink:transcribed_from,ENSEMBL:ENST00000556723,UniProtKB:Q9H0R1


In [7]:
# Preview the first five ground-truth (source, target, y) rows.
gt_df.head(n=5)

Unnamed: 0,source,target,y
0,CHEMBL.COMPOUND:CHEMBL30,MONDO:0007186,1
1,CHEMBL.COMPOUND:CHEMBL3707202,MONDO:0007186,1
2,CHEMBL.COMPOUND:CHEMBL1502,MONDO:0007186,1
3,CHEMBL.COMPOUND:CHEMBL480,MONDO:0007186,1
4,CHEMBL.COMPOUND:CHEMBL3833310,MONDO:0007186,1


In [8]:
# Summary statistics for the numeric columns of the ground-truth table;
# per gt_df.info() only `y` (int64) is numeric, so it is the only column reported.
gt_df.describe()

Unnamed: 0,y
count,28368.0
mean,0.407396
std,0.491358
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [13]:
# Inspect the `target` column of the ground-truth table.
gt_df.loc[:, "target"]

0        MONDO:0007186
1        MONDO:0007186
2        MONDO:0007186
3        MONDO:0007186
4        MONDO:0007186
             ...      
28363    MONDO:0004981
28364    MONDO:0004981
28365    MONDO:0004981
28366    MONDO:0004981
28367    MONDO:0004981
Name: target, Length: 28368, dtype: object

In [16]:
# Load the sampled KG / ground-truth dataset.
# The path is relative to the notebook's working directory, the same
# convention the graph-loading cell uses ("everycure_data/..."), rather than
# an absolute path under one user's home directory.
sampled_df = pd.read_csv("everycure_data/sampled_kg_gt_dataset.csv", index_col="Unnamed: 0")
sampled_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28368 entries, 0 to 28367
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   target                  28368 non-null  object 
 1   target_embedding        28368 non-null  object 
 2   source                  28368 non-null  object 
 3   source_embedding        28368 non-null  object 
 4   y                       28368 non-null  int64  
 5   degree                  28368 non-null  int64  
 6   clustering_coefficient  28368 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 1.7+ MB


In [17]:
# Preview the first five rows of the sampled dataset.
sampled_df.head(n=5)

Unnamed: 0,target,target_embedding,source,source_embedding,y,degree,clustering_coefficient
0,MONDO:0001830,[-1.51357800e-01 6.52519986e-02 -1.80281922e-...,MESH:D018712,[-0.29937154 0.10315063 -0.19614923 -0.075108...,1,1,0.0
1,MONDO:0001830,[-1.51357800e-01 6.52519986e-02 -1.80281922e-...,MESH:D004338,[-3.15890819e-01 1.15171695e+00 -4.02470157e-...,1,665,0.0
2,MONDO:0001830,[-1.51357800e-01 6.52519986e-02 -1.80281922e-...,CHEMBL.COMPOUND:CHEMBL2106755,[-0.21377265 0.21636915 0.17833856 0.152273...,1,1,0.0
3,MONDO:0001830,[-1.51357800e-01 6.52519986e-02 -1.80281922e-...,MESH:Q000819,[-0.27162647 0.14502069 0.05017115 -0.106104...,1,4958,8.137754e-08
4,MONDO:0001830,[-1.51357800e-01 6.52519986e-02 -1.80281922e-...,CHEMBL.COMPOUND:CHEMBL2146126,[-0.33723655 0.26299965 0.05125289 0.025367...,1,28,0.0
