In [78]:
%reload_kedro

In [2]:
# -*- coding: utf-8 -*-
# from __future__ import absolute_import, division, print_function, unicode_literals
import random
import logging
import itertools
from rich import print
from IPython.display import display

import pandas as pd
import numpy as np
from modspy_data.helpers import KnowledgeGraphScores

import matplotlib.pyplot as plt
import seaborn as sns

import dask
import dask.dataframe as dd
import dask.array as da
from dask.distributed import Client, progress, performance_report
from dask_jobqueue import SLURMCluster


import torch
import pronto
import networkx as nx
# from utils import visualize
from nxontology import NXOntology
from nxontology.imports import (from_file, multidigraph_to_digraph,
                                pronto_to_multidigraph)
from networkx.drawing.nx_agraph import graphviz_layout
from nxontology.viz import create_similarity_graphviz
from torch_geometric.data import Data, InMemoryDataset, download_url
import torch_geometric.transforms as T
from torch_geometric.utils.convert import from_networkx
from torch.nn import Linear, ModuleList
from torch_geometric.nn import GATConv, GraphConv
# from torch_geometric.loaders import DataLoader, Dataset
from torch_geometric.data import HeteroData, DataLoader
from torch_geometric.utils import convert


In [51]:
edges_df = catalog.load('monarch_edges').sample(frac=0.001).compute()
print(edges_df.shape)

`id` column is better *identifier* for node identity that other columns such as `symbol` . `symbol` could be duplicated across different taxon, NCBI reference.

In [50]:
edges_df[edges['object']=='MONDO:0011488'].shape

(0, 19)

In [52]:
nodes = np.unique(edges_df[['subject', 'object']])
print(len(nodes))

In [53]:
nodes_ddf = catalog.load('monarch_nodes')
nodes_df = nodes_ddf[nodes_ddf['id'].isin(nodes)].compute()

In [54]:
display(nodes_df.head())
print(nodes_df.shape)

Unnamed: 0,id,category,name,xref,provided_by,synonym,full_name,in_taxon,in_taxon_label,symbol,description,deprecated,iri,same_as
41,PomBase:SPAC105.02c,biolink:Gene,ank1,UniProtKB:Q9P7I0,pombase_gene_nodes,,ank1,NCBITaxon:4896,Schizosaccharomyces pombe,ank1,,,,
81,PomBase:SPAC110.02,biolink:Gene,pds5,UniProtKB:Q9HFF5,pombase_gene_nodes,,pds5,NCBITaxon:4896,Schizosaccharomyces pombe,pds5,,,,
137,PomBase:SPAC11H11.04,biolink:Gene,mam2,UniProtKB:Q00619,pombase_gene_nodes,,mam2,NCBITaxon:4896,Schizosaccharomyces pombe,mam2,,,,
146,PomBase:SPAC1296.01c,biolink:Gene,,UniProtKB:Q09770,pombase_gene_nodes,SPAC22F3.01,,NCBITaxon:4896,Schizosaccharomyces pombe,,,,,
178,PomBase:SPAC12G12.11c,biolink:Gene,miy1,UniProtKB:Q09874,pombase_gene_nodes,,miy1,NCBITaxon:4896,Schizosaccharomyces pombe,miy1,,,,


In [9]:
m_edge_ddf = catalog.load('monarch_edges')
filt_edge_ddf = m_edge_ddf[(m_edge_ddf['subject'].isin(nodes)) | (m_edge_ddf['object'].isin(nodes))]
edges_df = filt_edge_ddf.compute()

In [58]:
from sklearn.preprocessing import LabelEncoder

# Assuming `df_nodes` is your dataframe and 'category' is the string column
label_encoder = LabelEncoder()
nodes_df['id_encoded'] = label_encoder.fit_transform(nodes_df['id'])
id_tensor = torch.tensor(nodes_df['id_tensor'].values, dtype=torch.long)
edges_df['subject_encoded'] = label_encoder.transform(edges_df['subject'].values)
edges_df['object_encoded'] = label_encoder.transform(edges_df['object'].values)

display(nodes_df[['id','id_encoded']])
display(edges_df[['subject','object','subject_encoded','object_encoded']])

Unnamed: 0,id,id_encoded
41,PomBase:SPAC105.02c,12342
81,PomBase:SPAC110.02,12343
137,PomBase:SPAC11H11.04,12344
146,PomBase:SPAC1296.01c,12345
178,PomBase:SPAC12G12.11c,12346
...,...,...
223530,ZP:0141180,17885
223631,ZP:0141281,17886
223751,ZP:0141401,17887
224387,ZP:0142037,17888


Unnamed: 0,subject,object,subject_encoded,object_encoded
188455,MONDO:0008104,HP:0000915,9800,6988
35792,PomBase:SPCC338.08,FYPO:0000268,12556,1953
57423,PomBase:SPBC13G1.10c,FYPO:0000088,12466,1941
36495,PomBase:SPBC2G2.13c,FYPO:0000087,12501,1940
85058,PomBase:SPBC4.04c,FYPO:0000455,12514,1957
...,...,...,...,...
17343,HGNC:12640,HGNC:8974,4008,6659
54232,HGNC:7795,HGNC:1974,6502,4753
104879,HGNC:30528,HGNC:21648,5692,4942
44699,HGNC:12562,HGNC:17270,3995,4516


In [None]:

# Assuming `df_nodes` is your dataframe and 'category' is the string column
category_encoder = LabelEncoder()
nodes_df['category_encoded'] = category_encoder.fit_transform(nodes_df['category'])
node_category_tensor = torch.tensor(nodes_df['category_tensor'].values, dtype=torch.long)
edges_df['subject_encoded'] = label_encoder.transform(edges_df['subject'].values)
edges_df['object_encoded'] = label_encoder.transform(edges_df['object'].values)

display(nodes_df[['id','id_encoded']])
display(edges_df[['subject','object','subject_encoded','object_encoded']])

In [66]:
print(nodes_df['category'].unique())
print(edges_df['category'].unique())


In [67]:
nodes_df.to_csv('./data/02_intermediate/monarch/sample_nodes.tsv', index=False, sep='\t')
edges_df.to_csv('./data/02_intermediate/monarch/sample_edges.tsv', index=False, sep='\t')

In [60]:
# Assuming `df_nodes` and `df_edges` are your dataframes
# Convert the dataframes to PyTorch tensors
node_attributes = torch.tensor(nodes_df['id_encoded'], dtype=torch.float)
edge_index = torch.tensor(edges_df[['subject_encoded','object_encoded']].values, dtype=torch.long).t().contiguous()

# Create a HeteroData object
data = HeteroData()

# Add node and edge data
for node_type in nodes_df['category'].unique():
    data[node_type].x = nodes_df[nodes_df['id']]
for edge_type in edges_df['category'].unique():
    data[edge_type].edge_index = 

# If you have edge attributes, add them like this:
# edge_attributes = torch.tensor(df_edges['attribute'].values, dtype=torch.float)
# data['edge_type'].edge_attr = edge_attributes

In [68]:
import torch
from torch_geometric.data import HeteroData
from collections import defaultdict

# Prepare node mapping and node types
node_mapping = {node_id: i for i, node_id in enumerate(nodes_df['id'].unique())}
node_types = nodes_df['category'].unique()

# Initialize HeteroData for the heterogeneous graph
data = HeteroData()

# Add nodes to the graph
for node_type in node_types:
    mask = nodes_df['category'] == node_type
    type_nodes = nodes_df[mask]
    data[node_type].x = torch.tensor(type_nodes.index.to_list(), dtype=torch.long)

# Prepare edge types and mappings
edge_types = edges_df['category'].unique()
edge_type_mappings = defaultdict(list)

for _, row in edges_df.iterrows():
    subject_type = nodes_df.loc[nodes_df['id'] == row['subject'], 'category'].values[0]
    object_type = nodes_df.loc[nodes_df['id'] == row['object'], 'category'].values[0]
    edge_key = (subject_type, row['predicate'], object_type)
    edge_type_mappings[edge_key].append((node_mapping[row['subject']], node_mapping[row['object']]))

# Add edges to the graph
for edge_key, edge_indices in edge_type_mappings.items():
    edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
    data[edge_key].edge_index = edge_index

print(data)


In [75]:
torch.save(data, './data/02_intermediate/monarch/graph.pt')

In [76]:
test = torch.load('./data/02_intermediate/monarch/graph.pt')

In [77]:
display(test)

HeteroData(
  [1mbiolink:Gene[0m={ x=[13224] },
  [1mbiolink:Pathway[0m={ x=[299] },
  [1mbiolink:NamedThing[0m={ x=[77] },
  [1mbiolink:MolecularEntity[0m={ x=[54] },
  [1mbiolink:SmallMolecule[0m={ x=[1] },
  [1mbiolink:ChemicalEntity[0m={ x=[2] },
  [1mbiolink:Protein[0m={ x=[8] },
  [1mbiolink:Cell[0m={ x=[223] },
  [1mbiolink:GrossAnatomicalStructure[0m={ x=[680] },
  [1mbiolink:AnatomicalEntity[0m={ x=[210] },
  [1mbiolink:PhenotypicFeature[0m={ x=[1434] },
  [1mbiolink:BiologicalProcessOrActivity[0m={ x=[1208] },
  [1mbiolink:MacromolecularComplex[0m={ x=[95] },
  [1mbiolink:CellularComponent[0m={ x=[176] },
  [1mbiolink:LifeStage[0m={ x=[2] },
  [1mbiolink:Disease[0m={ x=[325] },
  [1mbiolink:PhenotypicQuality[0m={ x=[6] },
  [1mbiolink:PathologicalProcess[0m={ x=[1] },
  [1mbiolink:Vertebrate[0m={ x=[4] },
  [1mbiolink:Virus[0m={ x=[2] },
  [1mbiolink:CellularOrganism[0m={ x=[2] },
  [1m(biolink:Disease, biolink:has_phenotype, biolink

In [80]:
pyg_sample = catalog.load('monarch_pyg_sample')

In [81]:
display(pyg_sample)

HeteroData(
  [1mbiolink:Gene[0m={ x=[13224] },
  [1mbiolink:Pathway[0m={ x=[299] },
  [1mbiolink:NamedThing[0m={ x=[77] },
  [1mbiolink:MolecularEntity[0m={ x=[54] },
  [1mbiolink:SmallMolecule[0m={ x=[1] },
  [1mbiolink:ChemicalEntity[0m={ x=[2] },
  [1mbiolink:Protein[0m={ x=[8] },
  [1mbiolink:Cell[0m={ x=[223] },
  [1mbiolink:GrossAnatomicalStructure[0m={ x=[680] },
  [1mbiolink:AnatomicalEntity[0m={ x=[210] },
  [1mbiolink:PhenotypicFeature[0m={ x=[1434] },
  [1mbiolink:BiologicalProcessOrActivity[0m={ x=[1208] },
  [1mbiolink:MacromolecularComplex[0m={ x=[95] },
  [1mbiolink:CellularComponent[0m={ x=[176] },
  [1mbiolink:LifeStage[0m={ x=[2] },
  [1mbiolink:Disease[0m={ x=[325] },
  [1mbiolink:PhenotypicQuality[0m={ x=[6] },
  [1mbiolink:PathologicalProcess[0m={ x=[1] },
  [1mbiolink:Vertebrate[0m={ x=[4] },
  [1mbiolink:Virus[0m={ x=[2] },
  [1mbiolink:CellularOrganism[0m={ x=[2] },
  [1m(biolink:Disease, biolink:has_phenotype, biolink

In [74]:
data['biolink:Disease', 'biolink:has_mode_of_inheritance', 'biolink:PhenotypicFeature']

{'edge_index': tensor([[16473, 16447, 16465, 16675, 16536, 16526],
        [15864, 15864, 15864, 15864, 15864, 16169]])}

In [None]:
# Reaches around 91.8% Micro-F1 after 5 epochs.

import os.path as osp

import torch

from torch_geometric.datasets import AMiner
from torch_geometric.nn import MetaPath2Vec

path = osp.join(osp.dirname(osp.realpath(__file__)), '../../data/AMiner')
dataset = AMiner(path)
data = dataset[0]

metapath = [
    ('author', 'writes', 'paper'),
    ('paper', 'published_in', 'venue'),
    ('venue', 'publishes', 'paper'),
    ('paper', 'written_by', 'author'),
]

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MetaPath2Vec(data.edge_index_dict, embedding_dim=128,
                     metapath=metapath, walk_length=50, context_size=7,
                     walks_per_node=5, num_negative_samples=5,
                     sparse=True).to(device)

loader = model.loader(batch_size=128, shuffle=True, num_workers=6)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)


def train(epoch, log_steps=100, eval_steps=2000):
    model.train()

    total_loss = 0
    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if (i + 1) % log_steps == 0:
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                   f'Loss: {total_loss / log_steps:.4f}'))
            total_loss = 0

        if (i + 1) % eval_steps == 0:
            acc = test()
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                   f'Acc: {acc:.4f}'))


@torch.no_grad()
def test(train_ratio=0.1):
    model.eval()

    z = model('author', batch=data['author'].y_index.to(device))
    y = data['author'].y

    perm = torch.randperm(z.size(0))
    train_perm = perm[:int(z.size(0) * train_ratio)]
    test_perm = perm[int(z.size(0) * train_ratio):]

    return model.test(z[train_perm], y[train_perm], z[test_perm], y[test_perm],
                      max_iter=150)


for epoch in range(1, 6):
    train(epoch)
    acc = test()
    print(f'Epoch: {epoch}, Accuracy: {acc:.4f}')

In [73]:
edges_df[edges_df['category']=='biolink:has_phenotype']

Unnamed: 0,id,original_subject,predicate,original_object,category,aggregator_knowledge_source,primary_knowledge_source,provided_by,publications,qualifiers,...,has_evidence,negated,onset_qualifier,sex_qualifier,stage_qualifier,relation,subject,object,subject_encoded,object_encoded


In [13]:
display(m_node_df.shape)
display(m_node_df.head(5))

(8622, 14)

Unnamed: 0,id,category,name,xref,provided_by,synonym,full_name,in_taxon,in_taxon_label,symbol,description,deprecated,iri,same_as
165991,RGD:11443353,biolink:Gene,LOC108350053,NCBIGene:108350053|RGD:11443353,alliance_gene_nodes,uncharacterized protein LOC108350053,uncharacterized LOC108350053,NCBITaxon:10116,Rattus norvegicus,LOC108350053,,,,
30890,ZFIN:ZDB-GENE-060929-860,biolink:Gene,cenpe,UniProtKB:A0A8M6YXU4|PANTHER:PTHR24115|ENSEMBL...,alliance_gene_nodes,zgc:152925|wu:fc06a10,centromere protein E,NCBITaxon:7955,Danio rerio,cenpe,,,,
94623,WB:WBGene00016749,biolink:Gene,C48E7.1,ENSEMBL:WBGene00016749|NCBIGene:172322|UniProt...,alliance_gene_nodes,CELE_C48E7.1,C48E7.1,NCBITaxon:6239,Caenorhabditis elegans,C48E7.1,,,,
127040,WB:WBGene00201689,biolink:Gene,C04D8.5,ENSEMBL:WBGene00201689|NCBIGene:13190501|RNAce...,alliance_gene_nodes,CELE_C04D8.5,C04D8.5,NCBITaxon:6239,Caenorhabditis elegans,C04D8.5,,,,
201669,MGI:1202298,biolink:Gene,Nmt2,ENSEMBL:ENSMUSG00000026643|NCBIGene:18108|PANT...,alliance_gene_nodes,A930001K02Rik|AI605445|AU044698|expressed sequ...,N-myristoyltransferase 2,NCBITaxon:10090,Mus musculus,Nmt2,,,,
