In [1]:
%reload_kedro

In [2]:
import itertools
import logging
from collections import defaultdict

from tqdm import tqdm
from rich import print
from IPython.display import display, HTML

import dask
import dask.dataframe as dd
import dask.array as da
from dask.distributed import Client, progress, performance_report
from dask_jobqueue import SLURMCluster

import torch
from torch_geometric.data import HeteroData

In [84]:
cluster = SLURMCluster(
    cores=32,  # Number of cores per job
    processes=1,
    memory='200GB',  # Memory allocated to each worker
    walltime='02:30:00',  # Walltime limit for each job
    # Specify any additional SLURM or Dask configurations as needed
)

In [85]:
client = Client(cluster)  # start distributed scheduler locally.

In [86]:
display(cluster)
display(client)

Tab(children=(HTML(value='<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-outpu…

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://10.29.85.222:8787/status,

0,1
Dashboard: http://10.29.85.222:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.29.85.222:44122,Workers: 0
Dashboard: http://10.29.85.222:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [87]:
# Scale the cluster to the desired number of workers
cluster.scale(jobs=4)  # Request 10 jobs, adjust based on your needs

In [57]:
!pwd

/scratch/rahit/modspy-data/notebooks/monarch_gnn


In [60]:
nodes_df = dd.read_csv('../../data/01_raw/monarch/monarch-kg_nodes.tsv', sep='\t', usecols=['id', 'category', 'name', 'in_taxon', 'in_taxon_label', 'symbol'], dtype={'id':'object', 
                        'category':'object', 'name':'object', 'in_taxon':'object', 'in_taxon_label':'object', 'symbol':'object'})
edges_df = dd.read_parquet('../../data/02_intermediate/monarch/edges_with_node_cat')

In [61]:
# Group by 'node_type' and assign type-wise indices
nodes_df['type_index'] = nodes_df.groupby('category').cumcount()

In [62]:
nodes_df.head()

Unnamed: 0,id,category,name,in_taxon,in_taxon_label,symbol,type_index
0,PomBase:SPAC1002.01,biolink:Gene,mrx11,NCBITaxon:4896,Schizosaccharomyces pombe,mrx11,0
1,PomBase:SPAC1002.02,biolink:Gene,pom34,NCBITaxon:4896,Schizosaccharomyces pombe,pom34,1
2,PomBase:SPAC1002.03c,biolink:Gene,gls2,NCBITaxon:4896,Schizosaccharomyces pombe,gls2,2
3,PomBase:SPAC1002.04c,biolink:Gene,taf11,NCBITaxon:4896,Schizosaccharomyces pombe,taf11,3
4,PomBase:SPAC1002.05c,biolink:Gene,jmj2,NCBITaxon:4896,Schizosaccharomyces pombe,jmj2,4


In [63]:
# nodes_df = nodes_df.assign(enum_idx=1)
# nodes_df['enum_idx'] = nodes_df['enum_idx'].cumsum() - 1
nodes_df = nodes_df.set_index('id')
nodes_df.columns

Index(['category', 'name', 'in_taxon', 'in_taxon_label', 'symbol',
       'type_index'],
      dtype='object')

In [64]:
len(nodes_df.index.unique())

862115

In [65]:
len(nodes_df['type_index'].unique())

559272

In [66]:
############# CHANGE ME #######################
edges_df = edges_df.rename(columns={'subject_category': 'e_category', 'edge_category': 'subject_category'}).rename(columns={'e_category': 'edge_category'})
###############################################
_edf = edges_df.merge(nodes_df, left_on='subject', right_index=True, suffixes=('_ndf', '_edf'))
# print(f"Columns after merging on subject category: {_edf.columns}")
_edf = _edf.rename(columns={'type_index': 'subject_id'})
print(f"Columns after merging on subject category: {_edf.columns}")
display(_edf.head())

_edf = _edf.merge(nodes_df, left_on='object', right_index=True, suffixes=('_ndf', '_edf'))
# print(f"Columns after merging on object category: {_edf.columns}")
_edf = _edf.rename(columns={'type_index': 'object_id'})
print(f"Columns after merging on object category: {_edf.columns}")
display(_edf.head())

Unnamed: 0,id,subject,edge_category,predicate,subject_category,object_category,object,category,name,in_taxon,in_taxon_label,symbol,subject_id
0,uuid:6ef055fb-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,biolink:PairwiseGeneToGeneInteraction,biolink:interacts_with,biolink:Gene,biolink:Gene,HGNC:989,biolink:Gene,CARD9,NCBITaxon:9606,Homo sapiens,CARD9,518846
1,uuid:6ef055fc-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,biolink:PairwiseGeneToGeneInteraction,biolink:interacts_with,biolink:Gene,biolink:Gene,HGNC:989,biolink:Gene,CARD9,NCBITaxon:9606,Homo sapiens,CARD9,518846
2,uuid:a77a4a50-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,biolink:PairwiseGeneToGeneInteraction,biolink:interacts_with,biolink:Gene,biolink:Gene,HGNC:989,biolink:Gene,CARD9,NCBITaxon:9606,Homo sapiens,CARD9,518846
3,uuid:c3ee0928-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,biolink:PairwiseGeneToGeneInteraction,biolink:interacts_with,biolink:Gene,biolink:Gene,HGNC:989,biolink:Gene,CARD9,NCBITaxon:9606,Homo sapiens,CARD9,518846
4,uuid:c9de5a17-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,biolink:PairwiseGeneToGeneInteraction,biolink:interacts_with,biolink:Gene,biolink:Gene,HGNC:989,biolink:Gene,CARD9,NCBITaxon:9606,Homo sapiens,CARD9,518846


Unnamed: 0,id,subject,edge_category,predicate,subject_category,object_category,object,category_ndf,name_ndf,in_taxon_ndf,in_taxon_label_ndf,symbol_ndf,subject_id,category_edf,name_edf,in_taxon_edf,in_taxon_label_edf,symbol_edf,object_id
0,uuid:6ef055fb-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,biolink:PairwiseGeneToGeneInteraction,biolink:interacts_with,biolink:Gene,biolink:Gene,HGNC:989,biolink:Gene,CARD9,NCBITaxon:9606,Homo sapiens,CARD9,518846,biolink:Gene,BCL10,NCBITaxon:9606,Homo sapiens,BCL10,517786
1,uuid:6ef055fc-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,biolink:PairwiseGeneToGeneInteraction,biolink:interacts_with,biolink:Gene,biolink:Gene,HGNC:989,biolink:Gene,CARD9,NCBITaxon:9606,Homo sapiens,CARD9,518846,biolink:Gene,BCL10,NCBITaxon:9606,Homo sapiens,BCL10,517786
2,uuid:a77a4a50-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,biolink:PairwiseGeneToGeneInteraction,biolink:interacts_with,biolink:Gene,biolink:Gene,HGNC:989,biolink:Gene,CARD9,NCBITaxon:9606,Homo sapiens,CARD9,518846,biolink:Gene,BCL10,NCBITaxon:9606,Homo sapiens,BCL10,517786
3,uuid:c3ee0928-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,biolink:PairwiseGeneToGeneInteraction,biolink:interacts_with,biolink:Gene,biolink:Gene,HGNC:989,biolink:Gene,CARD9,NCBITaxon:9606,Homo sapiens,CARD9,518846,biolink:Gene,BCL10,NCBITaxon:9606,Homo sapiens,BCL10,517786
4,uuid:c9de5a17-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,biolink:PairwiseGeneToGeneInteraction,biolink:interacts_with,biolink:Gene,biolink:Gene,HGNC:989,biolink:Gene,CARD9,NCBITaxon:9606,Homo sapiens,CARD9,518846,biolink:Gene,BCL10,NCBITaxon:9606,Homo sapiens,BCL10,517786


In [21]:
len(_edf['subject'].unique().compute())

492073

In [22]:
len(edges_df['subject'].unique().compute())

492073

In [31]:
len(_edf['subject_id'].unique().compute())

492073

In [19]:
_edf.shape[0].compute()

11412471

In [20]:
edges_df.shape[0].compute()

11412471

In [43]:
nodes_df['category'].unique().values.compute()

array(['biolink:PhenotypicFeature', 'biolink:NamedThing',
       'biolink:BiologicalProcessOrActivity', 'biolink:PhenotypicQuality',
       'biolink:AnatomicalEntity', 'biolink:OrganismalEntity',
       'biolink:MolecularEntity', 'biolink:ChemicalEntity',
       'biolink:SmallMolecule', 'biolink:Protein', 'biolink:Drug',
       'biolink:RNAProduct', 'biolink:ConfidenceLevel', 'biolink:Cell',
       'biolink:CellularComponent', 'biolink:MacromolecularComplex',
       'biolink:CellLine', 'biolink:OrganismTaxon', 'biolink:Dataset',
       'biolink:Disease', 'biolink:Gene', 'biolink:GeographicExposure',
       'biolink:Procedure', 'biolink:Transcript', 'biolink:EvidenceType',
       'biolink:InformationContentEntity', 'biolink:DrugExposure',
       'biolink:ChemicalExposure', 'biolink:GrossAnatomicalStructure',
       'biolink:EnvironmentalFeature', 'biolink:EnvironmentalProcess',
       'biolink:Invertebrate', 'biolink:Fungus', 'biolink:Mammal',
       'biolink:Zygosity', 'biolink:Genetic

In [67]:
edges = _edf[['id','subject', 'subject_id', 'subject_category', 'predicate', 'edge_category', 'object_category', 'object_id', 'object']]
print(f"Columns after renaming and triming: {edges.columns}")

In [28]:
def custome(df):
    
    pass

_edf = edges.groupby(['subject_category','predicate','object_category'])['subject_id','object_id'] #.apply(custom)
_edf.get_group(2) #.compute()
# edges.head(n=3)

## grouped_size = _edf.size.compute()

In [83]:
grouped_edge = _edf['subject_category','predicate','object_category'].unique().compute()
for key, edge_info in grouped_edge:
    print(key)

In [63]:
# Function to apply to each partition
def create_edge_tuple(df):
    df['edge_key'] = list(zip(df['subject_category'], df['predicate'], df['object_category']))
    return df

# Apply the function to each partition
# __e = __e.map_partitions(create_edge_tuple)
__edf = __e.map_partitions(create_edge_tuple).compute()
display(__edf.head())


# edges_df['edge_key'] = dd.zip(edges_df['subject_category'], edges_df['predicate'], edges_df['object_category'])

Unnamed: 0,id,subject,subject_id,subject_category,predicate,edge_category,object_category,object_id,object,edge_key
750522,uuid:a08bb3ea-9bab-11ee-b780-6b2918cfaf31,HGNC:21911,562720,biolink:Gene,biolink:interacts_with,biolink:PairwiseGeneToGeneInteraction,biolink:Gene,541256,HGNC:17012,"(biolink:Gene, biolink:interacts_with, biolink..."
739442,uuid:c18f21af-9bab-11ee-b780-6b2918cfaf31,HGNC:30595,563475,biolink:Gene,biolink:interacts_with,biolink:PairwiseGeneToGeneInteraction,biolink:Gene,555052,HGNC:6903,"(biolink:Gene, biolink:interacts_with, biolink..."
3339584,uuid:ab1eab9a-9ba8-11ee-b780-6b2918cfaf31,ZFIN:ZDB-GENE-040718-356,19418,biolink:Gene,biolink:expressed_in,biolink:GeneToExpressionSiteAssociation,biolink:GrossAnatomicalStructure,822506,ZFA:0001094,"(biolink:Gene, biolink:expressed_in, biolink:G..."
3016944,uuid:1a75c6ed-9bb5-11ee-b780-6b2918cfaf31,NCBIGene:396145,306308,biolink:Gene,biolink:interacts_with,biolink:PairwiseGeneToGeneInteraction,biolink:Gene,305657,NCBIGene:395432,"(biolink:Gene, biolink:interacts_with, biolink..."
215609,uuid:f29aebd7-9bad-11ee-b780-6b2918cfaf31,NCBIGene:282322,434384,biolink:Gene,biolink:enables,biolink:MacromolecularMachineToMolecularActivi...,biolink:BiologicalProcessOrActivity,645268,GO:0019904,"(biolink:Gene, biolink:enables, biolink:Biolog..."


In [67]:
# Function to apply to each partition
def create_edge_tuple(df):
    df['edge_key'] = list(zip(df['subject_category'], df['predicate'], df['object_category']))
    return df

# Apply the function to each partition
_temp_e = __e.map_partitions(create_edge_tuple)[['edge_key', 'subject_id','object_id']]
_temp_bag = _temp_e.to_bag(format='dict')
_temp_bag.compute()

[{'edge_key': ('biolink:Gene', 'biolink:interacts_with', 'biolink:Gene'),
  'subject_id': 562720,
  'object_id': 541256},
 {'edge_key': ('biolink:Gene', 'biolink:interacts_with', 'biolink:Gene'),
  'subject_id': 563475,
  'object_id': 555052},
 {'edge_key': ('biolink:Gene',
   'biolink:expressed_in',
   'biolink:GrossAnatomicalStructure'),
  'subject_id': 19418,
  'object_id': 822506},
 {'edge_key': ('biolink:Gene', 'biolink:interacts_with', 'biolink:Gene'),
  'subject_id': 306308,
  'object_id': 305657},
 {'edge_key': ('biolink:Gene',
   'biolink:enables',
   'biolink:BiologicalProcessOrActivity'),
  'subject_id': 434384,
  'object_id': 645268},
 {'edge_key': ('biolink:Gene', 'biolink:interacts_with', 'biolink:Gene'),
  'subject_id': 570287,
  'object_id': 571213},
 {'edge_key': ('biolink:Gene',
   'biolink:enables',
   'biolink:BiologicalProcessOrActivity'),
  'subject_id': 565123,
  'object_id': 632006},
 {'edge_key': ('biolink:Gene', 'biolink:interacts_with', 'biolink:Gene'),
  'su

In [69]:

# Function to apply to each partition
def create_edge_tuple(df):
    df['edge_key'] = list(zip(df['subject_category'], df['predicate'], df['object_category']))
    return df

# Create the key of the edge_index
def key_function(edge):
    return edge['edge_key']

# accumulate the edges for a specific key
def binop(accumulator, edge):
    # accumulator is a list of (subject_id, object_id) tuples
    # edge is the current item being processed
    accumulator.append([edge['subject_id'], edge['object_id']])
    return accumulator

# combine multiple list that are accumulated
def combine(accumulator1, accumulator2):
    # Combine the lists from two accumulators
    return accumulator1 + accumulator2


# Apply the function to each partition
_temp_e = __e.map_partitions(create_edge_tuple)[['edge_key', 'subject_id','object_id']]
_temp_bag = _temp_e.to_bag(format='dict')
# Using foldby to group by 'edge_key' and consolidate (subject_id, object_id) pairs
_r = _temp_bag.foldby(key=key_function, binop=binop, combine=combine, initial=[])




# Convert to dictionary to get the final structure
_fr = dict(_r.compute())


In [70]:
_fr

{('biolink:Gene', 'biolink:interacts_with', 'biolink:Gene'): [(562720, 541256),
  (563475, 555052),
  (19418, 822506),
  (306308, 305657),
  (434384, 645268),
  (570287, 571213),
  (565123, 632006),
  (156507, 186689),
  (136035, 636090),
  (26875, 6936),
  (560117, 569358),
  (265323, 265124),
  (290860, 289440),
  (159477, 159476),
  (204661, 645174),
  (19540, 822506),
  (550388, 682580),
  (544262, 439730),
  (82593, 82586),
  (7542, 643338),
  (550335, 545291),
  (194292, 285516),
  (542763, 537931),
  (213222, 738457),
  (564191, 579538),
  (95332, 96114),
  (24145, 822506),
  (416377, 758628),
  (148439, 147814),
  (2331, 623387),
  (151577, 517351),
  (581430, 511526),
  (802425, 802233),
  (577911, 549204),
  (564798, 545308),
  (6565, 822506),
  (197136, 642243),
  (296832, 634187),
  (229721, 636485),
  (238176, 635546),
  (214868, 205819),
  (252075, 238326),
  (433680, 434189),
  (177165, 632711),
  (52361, 47347),
  (555253, 555221),
  (135507, 657549),
  (540555, 580255)

In [None]:
_temp_e = __e.map_partitions(create_edge_tuple)[['edge_key', 'subject_id','object_id']]

In [None]:
_temp_e

In [40]:
edges['subject_category'].unique().compute()

In [84]:

# Initialize HeteroData for the heterogeneous graph
data = HeteroData()

In [85]:
# Prepare node mapping and node types
# node_mapping = {node_id: i for i, node_id in enumerate(nodes_df['id'].unique())}
node_types = nodes_df['category'].unique()

# Add nodes to the graph
for node_type in tqdm(node_types):
    # get nodes of type `node_type`
    mask = nodes_df['category'] == node_type
    type_nodes = nodes_df[mask].compute()
    
    # node_features = torch.ones((type_nodes.shape[0], 1))
    data[node_type].num_nodes = type_nodes.shape[0] # node_features # torch.tensor(type_nodes.index.to_list(), dtype=torch.long).view(len(type_nodes.index.to_list()),1)

    # Add dummy features (e.g., a simple constant feature)
    data[node_type].x = torch.ones((data[node_type].num_nodes, 1))  # Each user has a feature vector of [1]



100%|██████████| 88/88 [04:22<00:00,  2.98s/it]


In [98]:
__e = edges.sample(frac=0.0001)
__e.shape[0].compute()

1141

In [18]:
__e.compute()

Unnamed: 0,id,subject,subject_id,subject_category,predicate,edge_category,object_category,object_id,object
6158261,uuid:a9ca20a5-9bb4-11ee-b780-6b2918cfaf31,FB:FBgn0026373,283734,biolink:Gene,biolink:interacts_with,biolink:PairwiseGeneToGeneInteraction,biolink:Gene,304300,FB:FBgn0283478
5868283,uuid:c6ebb543-9ba8-11ee-b780-6b2918cfaf31,MGI:95841,237728,biolink:Gene,biolink:expressed_in,biolink:GeneToExpressionSiteAssociation,biolink:GrossAnatomicalStructure,32,EMAPA:16099
3818660,uuid:c83c398b-9bad-11ee-b780-6b2918cfaf31,RGD:1305649,194095,biolink:Gene,biolink:actively_involved_in,biolink:MacromolecularMachineToBiologicalProce...,biolink:BiologicalProcessOrActivity,17838,GO:0045663
1680312,uuid:73c15680-9bb0-11ee-b780-6b2918cfaf31,HGNC:11582,553852,biolink:Gene,biolink:has_phenotype,biolink:GeneToPhenotypicFeatureAssociation,biolink:PhenotypicFeature,9217,HP:0002079
2671903,uuid:417dfc0e-9bb6-11ee-b780-6b2918cfaf31,HGNC:14275,536376,biolink:Gene,biolink:interacts_with,biolink:PairwiseGeneToGeneInteraction,biolink:Gene,548847,HGNC:10348
...,...,...,...,...,...,...,...,...,...
2507403,uuid:abb9701d-9ba8-11ee-b780-6b2918cfaf31,ZFIN:ZDB-GENE-010919-2,5591,biolink:Gene,biolink:expressed_in,biolink:GeneToExpressionSiteAssociation,biolink:GrossAnatomicalStructure,28306,ZFA:0007022
981752,uuid:02e2b4a0-9bb6-11ee-b780-6b2918cfaf31,RGD:1305513,185237,biolink:Gene,biolink:interacts_with,biolink:PairwiseGeneToGeneInteraction,biolink:Gene,188860,RGD:620754
1229786,uuid:fa507149-9bb4-11ee-b780-6b2918cfaf31,Xenbase:XB-GENE-1007374,52513,biolink:Gene,biolink:interacts_with,biolink:PairwiseGeneToGeneInteraction,biolink:Gene,48704,Xenbase:XB-GENE-970920
554587,uuid:bf217726-9bb4-11ee-b780-6b2918cfaf31,ZFIN:ZDB-GENE-040808-64,19633,biolink:Gene,biolink:interacts_with,biolink:PairwiseGeneToGeneInteraction,biolink:Gene,8350,ZFIN:ZDB-GENE-050417-399


In [93]:
# Scale the cluster to the desired number of workers
cluster.scale(jobs=1)  # Request 10 jobs, adjust based on your needs

In [None]:
__e['edge_key'] = da

In [18]:

# Prepare edge types and mappings efficiently using Dask
edge_types = edges['edge_category'].unique().compute()  # Compute edge types on scheduler

def group_and_create_edge_index(df):
    edge_type_mappings = defaultdict(list)
    for _, row in df.iterrows():
        edge_key = (row['subject_category'], row['predicate'], row['object_category'])
        # print(edge_key)
        # print((row['subject_id'], row['object_id']))
        edge_type_mappings[edge_key].append((row['subject_id'], row['object_id']))

    results = []
    # Create edge index for each edge type within partitions
    for edge_key, edge_indices in edge_type_mappings.items():
        # print(edge_key)
        # print(edge_indices)
        edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
        # logging.debug(edge_key, edge_index)
        # yield edge_key, edge_index
        results.append((edge_key, edge_index))
    return results  # Return a list of tuples instead of a generator

# # Parallelize edge index creation across partitions
# edge_index_futures = edges.map_partitions(group_and_create_edge_index, meta=('edge_key', object))
# # edge_index_futures = edges.map_partitions(group_and_create_edge_index, meta=('x', object))

# result = edge_index_futures.compute()

# Gather results, ensuring proper edge_key ordering
# edge_index_dict = dict(result)


# Gemini
import dask.bag as db

bag = db.from_delayed(db.from_sequence(edges.map_partitions(group_and_create_edge_index, meta=('edge_key', object))))



In [35]:
edge_index_dict

{0: <generator object group_and_create_edge_index at 0x2af52a977c10>,
 1: <generator object group_and_create_edge_index at 0x2af5309ad2e0>,
 2: <generator object group_and_create_edge_index at 0x2af5387ebd60>}

In [22]:
for edge_key, edge_index in edge_index_dict.items():
    print(edge_key)
    print(edge_index[0])
    # data[edge_index[0]].edge_index = edge_index

In [None]:

# Prepare edge types and mappings
edge_types = edges['edge_category'].unique()
edge_type_mappings = defaultdict(list)

for _, row in edges.iterrows():
    edge_key = (row['subject_category', row['predicate'], row['object_category'])
    edge_type_mappings[edge_key].append((row['subject_id'], row['object_id']))

# Add edges to the graph
for edge_key, edge_indices in edge_type_mappings.items():
    edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
    data[edge_key].edge_index = edge_index

In [None]:
def map_nodes(df, node_mapping):
    # Map 'subject' and 'object' columns to their indices using the `node_mapping`
    df['subject_idx'] = df['subject'].map(node_mapping)
    df['object_idx'] = df['object'].map(node_mapping)
    return df[['subject_category', 'predicate', 'object_category', 'subject_idx', 'object_idx']]

edges_mapped = edges_df.map_partitions(map_nodes, node_mapping=node_mapping, meta=edges_df)


In [None]:
def aggregate_edges(df):
    # Group by edge type and aggregate indices into lists
    grouped = df.groupby(['subject_category', 'predicate', 'object_category'])
    aggregated = grouped.apply(lambda x: list(zip(x['subject_idx'], x['object_idx'])), meta=('edges', 'object'))
    return aggregated.reset_index().rename(columns={0: 'edge_indices'})

edges_aggregated = edges_mapped.map_partitions(aggregate_edges, meta=edges_mapped)


In [23]:
import torch

In [14]:
node_mapping = {node_id: i for i, node_id in enumerate(nodes_df['id'].unique())}

In [24]:

edge_groups = edges.groupby(['subject_category', 'edge_category', 'object_category'])

def process_edge_group(group):
    subject_indices = group['subject_id'].compute()
    object_indices = group['object_id'].compute()
    edge_indices = torch.stack([subject_indices, object_indices], dim=0)
    # attributes = group[['attributes']].compute() if 'attributes' in group else None  # Load attributes with compute() if needed
    return dict(edge_index=edge_indices.contiguous())

# edge_data = dict(dask.dataframe.map_partitions(process_edge_group, edge_groups, meta={'edge_index': (torch.tensor, [None, None], torch.long))
#                 .compute())
                                                                                      
edge_data = dict(edge_groups.apply(process_edge_group, meta={'edge_index': (torch.tensor, [None, None], torch.long)}))
# edge_data = dict(map(process_edge_group, edge_groups))


In [None]:
# Prepare node mapping and node types
node_mapping = {node_id: i for i, node_id in enumerate(nodes_df['id'].unique())}
node_types = nodes_df['category'].unique()

# # Initialize HeteroData for the heterogeneous graph
# data = HeteroData()

# # Add nodes to the graph
# for node_type in node_types:
#     # get nodes of type `node_type`
#     mask = nodes_df['category'] == node_type
#     type_nodes = nodes_df[mask]
    
#     # node_features = torch.ones((type_nodes.shape[0], 1))
#     data[node_type].num_nodes = type_nodes.shape[0] # node_features # torch.tensor(type_nodes.index.to_list(), dtype=torch.long).view(len(type_nodes.index.to_list()),1)

#     # Add dummy features (e.g., a simple constant feature)
#     data[node_type].x = torch.ones((data[node_type].num_nodes, 1))  # Each user has a feature vector of [1]


# Define a function to apply to each partition of the DataFrame
def process_partition(partition, node_mapping):
    edge_type_mappings = defaultdict(list)
    for _, row in partition.iterrows():
        edge_key = (row['subject_category', row['predicate'], row['object_category'])
        edge_type_mappings[edge_key].append((node_mapping[row['subject']], node_mapping[row['object']]))
    return edge_type_mappings

# Use map_partitions to apply the function to each partition
results = edges_df.map_partitions(process_partition, node_mapping=node_mapping, meta='object').compute()

# Combine the results from all partitions
combined_edge_type_mappings = defaultdict(list)
for result in results:
    for key, value in result.items():
        combined_edge_type_mappings[key].extend(value)

# Convert edge mappings to PyTorch tensors and add to HeteroData
data = HeteroData()
for edge_key, edge_indices in combined_edge_type_mappings.items():
    edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
    data[edge_key].edge_index = edge_index

In [68]:
################## INITIALIZE NODES ##################

# Prepare node mapping and node types
node_mapping = {node_id: i for i, node_id in enumerate(nodes_df.index.unique())}
node_types = nodes_df['category'].unique()

# Initialize HeteroData for the heterogeneous graph
data = HeteroData()
    
# Prepare node mapping and node types
# node_mapping = {node_id: i for i, node_id in enumerate(nodes_df['id'].unique())}
node_types = nodes_df['category'].unique()

# Add nodes to the graph
for node_type in tqdm(node_types):
    # get nodes of type `node_type`
    mask = nodes_df['category'] == node_type
    type_nodes = nodes_df[mask].compute()
    
    # node_features = torch.ones((type_nodes.shape[0], 1))
    data[node_type].num_nodes = type_nodes.shape[0] # node_features # torch.tensor(type_nodes.index.to_list(), dtype=torch.long).view(len(type_nodes.index.to_list()),1)

    # Add dummy features (e.g., a simple constant feature)
    data[node_type].x = torch.ones((data[node_type].num_nodes, 1))  # Each user has a feature vector of [1]



100%|██████████| 88/88 [05:13<00:00,  3.57s/it]


In [97]:
cluster.scale(jobs=1)

In [99]:
edges = __e.copy()

# Convert the edge categories to categoricals for efficiency
edges['subject_category'] = edges['subject_category'].astype('category')
edges['predicate'] = edges['predicate'].astype('category')
edges['object_category'] = edges['object_category'].astype('category')
edges['edge_category'] = edges['edge_category'].astype('category')

# From unknown type categorical to known type categorical
edges = edges.categorize(columns=['subject_category','predicate','object_category','edge_category'])

# Prepare edge types and mappings
edge_types = edges['edge_category'].unique().compute()  # Compute edge types on scheduler

In [70]:
edges['subject_category'].cat.known

True

In [71]:
print(edges.dtypes)

In [72]:
edges.head()

Unnamed: 0,id,subject,subject_id,subject_category,predicate,edge_category,object_category,object_id,object
0,uuid:6ef055fb-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,518846,biolink:Gene,biolink:interacts_with,biolink:PairwiseGeneToGeneInteraction,biolink:Gene,517786,HGNC:989
1,uuid:6ef055fc-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,518846,biolink:Gene,biolink:interacts_with,biolink:PairwiseGeneToGeneInteraction,biolink:Gene,517786,HGNC:989
2,uuid:a77a4a50-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,518846,biolink:Gene,biolink:interacts_with,biolink:PairwiseGeneToGeneInteraction,biolink:Gene,517786,HGNC:989
3,uuid:c3ee0928-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,518846,biolink:Gene,biolink:interacts_with,biolink:PairwiseGeneToGeneInteraction,biolink:Gene,517786,HGNC:989
4,uuid:c9de5a17-9bab-11ee-b780-6b2918cfaf31,HGNC:16391,518846,biolink:Gene,biolink:interacts_with,biolink:PairwiseGeneToGeneInteraction,biolink:Gene,517786,HGNC:989


In [43]:
edges[['subject_category','predicate','object_category','edge_category']].isna().any().compute()

subject_category    False
predicate           False
object_category     False
edge_category       False
dtype: bool

In [39]:
# Function to apply to each partition
def create_edge_tuple(df):
    # Ensure the function always returns an iterable, even if df is empty
    return zip(df['subject_category'], df['predicate'], df['object_category'],
               df['subject_id'], df['object_id'])

# Convert DataFrame to Bag of tuples
# display(edges)
ebag = edges.map_partitions(create_edge_tuple)
# display(list(ebag.compute()[0]))
ebag = ebag.to_bag()
# display(ebag.compute())


[None, None, None]

In [48]:
edges.npartitions

3

In [33]:
eb = ebag.compute()
eb

[None, None, None]

In [100]:
# Function to convert DataFrame rows to tuples for the Bag
def create_edge_tuple(row):
    return (row['subject_category'], row['predicate'], row['object_category']), row['subject_id'], row['object_id']

# Create a Bag from the DataFrame, and map the conversion function to each row
edges_bag = edges.map_partitions(lambda df: df.apply(create_edge_tuple, axis=1)).to_bag()
# display(edges_bag.compute())

In [101]:
# Key for the edge type mapping
def key_function(x):
    # print(x)
    return (x[0], x[1], x[2])

def binop(accumulator, edge):
    if edge is not None:  # Check if edge is not None
        # Append the tuple to the accumulator
        accumulator.append((edge[1], edge[2]))  # edge[1] is subject_id, edge[2] is object_id
    return accumulator

def combine(accumulator1, accumulator2):
    # print(accumulator1)
    # Extend the first accumulator with the second
    accumulator1.extend(accumulator2)
    return accumulator1

# # accumulate the edges for a specific key
# def binop(accumulator, edge):
#     # accumulator is a list of (subject_id, object_id) tuples
#     # edge is the current item being processed
#     accumulator.append([(x[3], x[4])])
#     return accumulator

# # combine multiple list that are accumulated
# def combine(accumulator1, accumulator2):
#     # Combine the lists from two accumulators
#     return accumulator1 + accumulator2

# Use foldby with the process_tuples function
edge_type_mappings = edges_bag.foldby(key=lambda x: x[0], binop=binop, combine=combine, initial=[])
edge_type_mappings = edge_type_mappings.compute()  # Trigger the computation


In [102]:
edge_type_mappings

[(('biolink:Gene', 'biolink:expressed_in', 'biolink:GrossAnatomicalStructure'),
  [(214529, 2302),
   (255135, 209),
   (521263, 145),
   (277734, 6514),
   (536392, 536397),
   (277996, 3848),
   (526349, 9248),
   (261180, 11324),
   (285549, 4639),
   (132180, 406),
   (523251, 537659),
   (541378, 550690),
   (284140, 1210),
   (156420, 14197),
   (554164, 553328),
   (288937, 4538),
   (523095, 557837),
   (189348, 11267),
   (302668, 4478),
   (11541, 9208),
   (520411, 19469),
   (148445, 16628),
   (288756, 1150),
   (527915, 3324),
   (551535, 553018),
   (287027, 3888),
   (148752, 1082),
   (266971, 232),
   (178614, 21467),
   (4330, 2456),
   (17698, 16215),
   (526489, 3324),
   (3342, 1464),
   (160026, 24583),
   (300647, 1207),
   (133935, 897),
   (542122, 542132),
   (147143, 28018),
   (11951, 8621),
   (221998, 466),
   (284153, 285257),
   (294300, 4639),
   (543835, 517892),
   (277169, 299077),
   (542301, 553690),
   (523875, 7833),
   (154858, 12446),
   (5262

In [19]:
################## INITIALIZE EDGES ##################

# Prepare edge types and mappings
edge_types = edges['edge_category'].unique()

# Function to apply to each partition
def create_edge_tuple(df):
    df['edge_key'] = list(zip(df['subject_category'], df['predicate'], df['object_category']))
    return df

# Key for the edge type mapping
def key_function(edge):
    return edge['edge_key']

# accumulate the edges for a specific key
def binop(accumulator, edge):
    # accumulator is a list of (subject_id, object_id) tuples
    # edge is the current item being processed
    accumulator.append([edge['subject_id'], edge['object_id']])
    return accumulator

# combine multiple list that are accumulated
def combine(accumulator1, accumulator2):
    # Combine the lists from two accumulators
    return accumulator1 + accumulator2


# Apply the function to each partition
_edf = edges.map_partitions(create_edge_tuple)
_edf = _edf[['edge_key','subject_id','object_id']]
_ebag = _edf.to_bag(format='dict')
# Using foldby to group by 'edge_key' and consolidate (subject_id, object_id) pairs
edge_type_mappings = _ebag.foldby(key=key_function, binop=binop, combine=combine, initial=[]).compute()    # Computation Heavy


ERROR:tornado.application:Uncaught exception GET /info/call-stacks/tcp%3A%2F%2F10.29.81.147%3A46004.html (10.29.77.38)
HTTPServerRequest(protocol='http', host='localhost:8787', method='GET', uri='/info/call-stacks/tcp%3A%2F%2F10.29.81.147%3A46004.html', version='HTTP/1.1', remote_ip='10.29.77.38')
Traceback (most recent call last):
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/distributed/comm/core.py", line 328, in connect
    handshake = await asyncio.wait_for(comm.read(), time_left())
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2020/avx2/Core/python/3.8.10/lib/python3.8/asyncio/tasks.py", line 501, in wait_for
    raise exceptions.TimeoutError()
asyncio.exceptions.TimeoutError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/tornado/web.py", line 1704, in _execute
    result = await result
  File "/home/rahit/jupyter_py3/lib/python3.8/site-pa

ERROR:tornado.application:Uncaught exception GET /info/call-stacks/tcp%3A%2F%2F10.29.81.147%3A46004.html (10.29.77.38)
HTTPServerRequest(protocol='http', host='localhost:8787', method='GET', uri='/info/call-stacks/tcp%3A%2F%2F10.29.81.147%3A46004.html', version='HTTP/1.1', remote_ip='10.29.77.38')
Traceback (most recent call last):
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/distributed/comm/core.py", line 328, in connect
    handshake = await asyncio.wait_for(comm.read(), time_left())
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2020/avx2/Core/python/3.8.10/lib/python3.8/asyncio/tasks.py", line 501, in wait_for
    raise exceptions.TimeoutError()
asyncio.exceptions.TimeoutError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/tornado/web.py", line 1704, in _execute
    result = await result
  File "/home/rahit/jupyter_py3/lib/python3.8/site-pa

2024-02-13 03:35:36,979 - distributed.scheduler - ERROR - broadcast to tcp://10.29.81.147:46004 failed: OSError: Timed out during handshake while connecting to tcp://10.29.81.147:46004 after 30 s


ERROR:tornado.application:Uncaught exception GET /info/logs/tcp%3A%2F%2F10.29.81.147%3A46004.html (10.29.77.38)
HTTPServerRequest(protocol='http', host='localhost:8787', method='GET', uri='/info/logs/tcp%3A%2F%2F10.29.81.147%3A46004.html', version='HTTP/1.1', remote_ip='10.29.77.38')
Traceback (most recent call last):
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/distributed/comm/core.py", line 328, in connect
    handshake = await asyncio.wait_for(comm.read(), time_left())
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2020/avx2/Core/python/3.8.10/lib/python3.8/asyncio/tasks.py", line 501, in wait_for
    raise exceptions.TimeoutError()
asyncio.exceptions.TimeoutError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/tornado/web.py", line 1704, in _execute
    result = await result
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/distrib

2024-02-13 03:38:30,658 - distributed.scheduler - ERROR - broadcast to tcp://10.29.81.147:46004 failed: CommClosedError: in <TCP (closed) Scheduler Broadcast local=tcp://10.29.84.176:44546 remote=tcp://10.29.81.147:46004>: Stream is closed


ERROR:tornado.application:Uncaught exception GET /info/logs/tcp%3A%2F%2F10.29.81.147%3A46004.html (10.29.77.38)
HTTPServerRequest(protocol='http', host='localhost:8787', method='GET', uri='/info/logs/tcp%3A%2F%2F10.29.81.147%3A46004.html', version='HTTP/1.1', remote_ip='10.29.77.38')
Traceback (most recent call last):
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/distributed/comm/tcp.py", line 225, in read
    frames_nbytes = await stream.read_bytes(fmt_size)
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/tornado/web.py", line 1704, in _execute
    result = await result
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/distributed/utils.py", line 741, in wrapper
    return await func(*args, **kwargs)
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/distributed/http/scheduler/info.py

In [111]:
import sys

# Assuming 'result' is your Dask-computed variable
size_in_bytes = sys.getsizeof(data)
print(f"Size of the variable in bytes: {size_in_bytes}")

# Convert bytes to more readable units like MB
size_in_mb = size_in_bytes / (1024**2)
print(f"Size of the variable in MB: {size_in_mb}")


In [95]:
import pickle

with open('edge_type_mappings.pkl', 'wb') as f:
    pickle.dump(edge_type_mappings, f)

2024-02-15 03:47:05,435 - tornado.application - ERROR - Exception in callback functools.partial(<bound method IOLoop._discard_future_result of <zmq.eventloop.ioloop.ZMQIOLoop object at 0x2b05f41bb730>>, <Task finished name='Task-6068326' coro=<ProfileTimePlot.trigger_update.<locals>.cb() done, defined at /home/rahit/jupyter_py3/lib/python3.8/site-packages/distributed/utils.py:739> exception=AttributeError("'NoneType' object has no attribute 'add_next_tick_callback'")>)
Traceback (most recent call last):
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/tornado/ioloop.py", line 741, in _run_callback
    ret = callback()
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/tornado/ioloop.py", line 765, in _discard_future_result
    future.result()
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/distributed/utils.py", line 741, in wrapper
    return await func(*args, **kwargs)
  File "/home/rahit/jupyter_py3/lib/python3.8/site-packages/distributed/dashboard/co

2024-02-15 03:48:17,663 - distributed.scheduler - ERROR - Couldn't gather keys {"('foldby-b-b1c8781001549f00d8728f67e5dbce19', 0)": ['tcp://10.29.85.254:33416']} state: [None] workers: ['tcp://10.29.85.254:33416']
NoneType: None


2024-02-15 03:48:17,672 - distributed.scheduler - ERROR - Shut down workers that don't have promised key: ['tcp://10.29.85.254:33416'], ('foldby-b-b1c8781001549f00d8728f67e5dbce19', 0)
NoneType: None


In [104]:
# Add edges to the graph
for edge_key, edge_indices in edge_type_mappings:
    edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
    data[edge_key].edge_index = edge_index

In [109]:
len(data.edge_types)

43

In [113]:
# data.save('test.pt')
torch.save(data, 'test_hetero_data.pt')

In [21]:
# Prepare node mapping and node types
node_mapping = {node_id: i for i, node_id in enumerate(nodes_df['id'].unique())}
node_types = nodes_df['category'].unique()

# Initialize HeteroData for the heterogeneous graph
data = HeteroData()

# Add nodes to the graph
for node_type in node_types:
    # get nodes of type `node_type`
    mask = nodes_df['category'] == node_type
    type_nodes = nodes_df[mask]
    
    # node_features = torch.ones((type_nodes.shape[0], 1))
    data[node_type].num_nodes = type_nodes.shape[0] # node_features # torch.tensor(type_nodes.index.to_list(), dtype=torch.long).view(len(type_nodes.index.to_list()),1)

    # Add dummy features (e.g., a simple constant feature)
    data[node_type].x = torch.ones((data[node_type].num_nodes, 1))  # Each user has a feature vector of [1]


# Prepare edge types and mappings
edge_types = edges_df['edge_category'].unique()
edge_type_mappings = defaultdict(list)

for _, row in edges_df.iterrows():
    subject_type = nodes_df.loc[nodes_df['id'] == row['subject'], 'category'].values[0]
    object_type = nodes_df.loc[nodes_df['id'] == row['object'], 'category'].values[0]
    edge_key = (row['subject_category', row['predicate'], row['object_category'])
    edge_type_mappings[edge_key].append((node_mapping[row['subject']], node_mapping[row['object']]))

# Add edges to the graph
for edge_key, edge_indices in edge_type_mappings.items():
    edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
    data[edge_key].edge_index = edge_index


In [None]:
len(edge_types)
edge_type_mappings

In [82]:
cluster.close()

In [83]:
client.close()