In [20]:
# Kedro IPython line magic. NOTE(review): presumably refreshes the Kedro
# session objects for this kernel — confirm against the Kedro docs; nothing
# in the cells below visibly uses them.
%reload_kedro

In [21]:
import itertools
from collections import defaultdict
from rich import print
from IPython.display import display, HTML

import dask
import dask.dataframe as dd
import dask.array as da
from dask.distributed import Client, progress, performance_report
from dask_jobqueue import SLURMCluster

In [22]:
# Resource request for every SLURM job that backs the Dask cluster.
# Extend this mapping with any additional SLURM or Dask settings as needed.
_slurm_job_options = {
    'cores': 16,             # cores per job
    'processes': 1,          # presumably one worker process per job — confirm in dask-jobqueue docs
    'memory': '32GB',        # memory allocated to each worker
    'walltime': '00:30:00',  # walltime limit for each job
}
cluster = SLURMCluster(**_slurm_job_options)

In [23]:
# Attach a distributed client to the SLURM-backed cluster object created above.
client = Client(cluster)

In [24]:
# Scale the cluster to the desired number of SLURM jobs
cluster.scale(jobs=1)  # Requests a single job; raise `jobs` if more workers are needed

In [25]:
# Show the working directory: the relative `../../data/...` paths used below
# are resolved against it.
!pwd

/scratch/rahit/modspy-data/notebooks/monarch_gnn


In [26]:
# Columns pulled from each Monarch KG TSV. Every selected column is read with
# the `object` dtype, so the dtype maps are derived from the column lists.
_node_cols = ['id', 'category', 'name', 'in_taxon', 'in_taxon_label', 'symbol']
_edge_cols = ['id', 'original_subject', 'predicate', 'original_object', 'category', 'subject', 'object']

# Node table (lazy dask dataframe).
nodes_df = dd.read_csv(
    '../../data/01_raw/monarch/monarch-kg_nodes.tsv',
    sep='\t',
    usecols=_node_cols,
    dtype=dict.fromkeys(_node_cols, 'object'),
)

# Edge table (lazy dask dataframe).
edges_df = dd.read_csv(
    '../../data/01_raw/monarch/monarch-kg_edges.tsv',
    sep='\t',
    usecols=_edge_cols,
    dtype=dict.fromkeys(_edge_cols, 'object'),
)

In [8]:
# Preview the first rows of the edge table to sanity-check the loaded columns.
edges_df.head()

Unnamed: 0,id,original_subject,predicate,original_object,category,subject,object
0,uuid:68d6e706-9bb0-11ee-b780-6b2918cfaf31,NCBIGene:64170,biolink:causes,OMIM:212050,biolink:CausalGeneToDiseaseAssociation,HGNC:16391,MONDO:0008905
1,uuid:68d6e707-9bb0-11ee-b780-6b2918cfaf31,NCBIGene:51256,biolink:causes,OMIM:248000,biolink:CausalGeneToDiseaseAssociation,HGNC:21066,MONDO:0009544
2,uuid:68d6e708-9bb0-11ee-b780-6b2918cfaf31,NCBIGene:28981,biolink:causes,OMIM:617895,biolink:CausalGeneToDiseaseAssociation,HGNC:14313,MONDO:0033485
3,uuid:68d6e709-9bb0-11ee-b780-6b2918cfaf31,NCBIGene:8216,biolink:causes,OMIM:616564,biolink:CausalGeneToDiseaseAssociation,HGNC:6742,MONDO:0014693
4,uuid:68d6e70a-9bb0-11ee-b780-6b2918cfaf31,NCBIGene:6505,biolink:contributes_to,OMIM:615232,biolink:CorrelatedGeneToDiseaseAssociation,HGNC:10939,MONDO:0014092


Add each node's category to the edges by merging the node dataframe onto the edge dataframe twice: once on the `subject` column and once on the `object` column.

In [27]:
# Index the node table by `id` so the merges below can join on the index
# (`right_index=True`). The guard keeps this cell idempotent: once `id` has
# become the index it is no longer a column, and calling set_index('id')
# again on a re-run of the cell would fail.
if 'id' in nodes_df.columns:
    nodes_df = nodes_df.set_index('id')

In [10]:
# --- Subject side -----------------------------------------------------------
# Join each edge to its subject node (nodes_df is indexed by node id).
# `category` exists in both frames, so the suffixes disambiguate it. `suffixes`
# is (left, right): the left frame is the edge table and must get `_edf`, the
# right frame is the node table and must get `_ndf`. The previous order was
# reversed, which labelled the edge's own category as `subject_category` and
# the subject node's category as `edge_category`.
# No `how` is given, so the library's default join type applies
# (NOTE(review): inner join per the pandas docs — edges whose subject is not
# in nodes_df would be dropped; confirm this is intended).
_edf = edges_df.merge(nodes_df, left_on='subject', right_index=True, suffixes=('_edf', '_ndf'))
_edf = _edf.rename(columns={'category_ndf': 'subject_category', 'category_edf': 'edge_category'})
print(f"Columns after merging on subject category: {_edf.columns}")

# --- Object side ------------------------------------------------------------
# Join the result to the object node. After the rename above the left frame no
# longer has a plain `category` column, so the node's `category` arrives
# unsuffixed and is renamed directly. The remaining node columns (name,
# in_taxon, ...) do collide here: the `_ndf` copies are the ones already
# present from the subject merge, the `_edf` copies come from the object node.
# These suffixes are left as they were so existing column names do not change.
_edf = _edf.merge(nodes_df, left_on='object', right_index=True, suffixes=('_ndf', '_edf'))
_edf = _edf.rename(columns={'category': 'object_category'})
print(f"Columns after merging on object category: {_edf.columns}")

In [11]:
# Keep only the edge id, its two endpoints, the predicate and the three
# category columns; the extra node attributes brought in by the merges are dropped.
_kept_columns = [
    'id',
    'subject',
    'subject_category',
    'predicate',
    'edge_category',
    'object_category',
    'object',
]
edges = _edf[_kept_columns]
print(f"Columns after renaming and triming: {edges.columns}")

In [12]:
# TODO: derive an `edge_key` per row as the triple
# (subject_category, predicate, object_category). An earlier row-wise `apply`
# attempt was left disabled here; it was not valid Python (its `meta` argument)
# and relied on `pd`, which this notebook does not import.
# Drop the index carried over from the merges instead of keeping it as a column.
edges = edges.reset_index(drop=True)
# Display the final column set as the cell output.
edges.columns

Index(['id', 'subject', 'subject_category', 'predicate', 'edge_category',
       'object_category', 'object'],
      dtype='object')

In [13]:
# Persist the category-annotated edges as a parquet dataset (a directory of
# part files, as the listing in the next cell shows); the index is not written.
edges.to_parquet('../../data/02_intermediate/monarch/edges_with_node_cat', write_index=False)  

In [19]:
# List the parquet part files written to the output directory, with sizes.
!ls -ahl '../../data/02_intermediate/monarch/edges_with_node_cat'

total 168M
drwxr-x--- 2 rahit rahit  33K Feb  7 19:15 .
drwxr-x--- 3 rahit rahit  33K Feb  7 18:17 ..
-rw-r----- 1 rahit rahit 100M Feb  7 19:15 part.0.parquet
-rw-r----- 1 rahit rahit  16M Feb  7 19:15 part.1.parquet
-rw-r----- 1 rahit rahit  53M Feb  7 19:15 part.2.parquet


In [15]:
# Number of rows in `edges`. The frame is lazy, so the row count has to be
# materialised explicitly with .compute().
edges.shape[0].compute()

11412471

In [34]:
# Shut down the SLURM-backed cluster.
# NOTE(review): the client attached to this cluster is only closed in the
# following cell; consider closing the client first so it is not left
# connected to a cluster that has already gone away.
cluster.close()

In [35]:
# Close the distributed client that was created from the cluster.
client.close()