In [1]:
%reload_kedro

In [3]:
# import itertools
# import logging
# from collections import defaultdict

from tqdm import tqdm
from rich import print
from IPython.display import display, HTML

import dask
import dask.dataframe as dd
import dask.array as da
import dask.bag as db
from dask.distributed import Client, progress, performance_report
from dask_jobqueue import SLURMCluster

import torch
from torch_geometric.data import HeteroData

In [19]:
cluster = SLURMCluster(
    cores=1,  # Number of cores per job
    processes=1,
    memory='68GB',  # Memory allocated to each worker
    walltime='01:30:00',  # Walltime limit for each job
    # Specify any additional SLURM or Dask configurations as needed
)

In [20]:
client = Client(cluster)  # start distributed scheduler locally.

In [21]:
display(cluster)
display(client)

Tab(children=(HTML(value='<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-outpu…

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://10.29.86.12:8787/status,

0,1
Dashboard: http://10.29.86.12:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.29.86.12:33233,Workers: 0
Dashboard: http://10.29.86.12:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [22]:
# Scale the cluster to the desired number of workers
cluster.scale(jobs=4)  # Request 10 jobs, adjust based on your needs

In [7]:
!pwd

/home/rahit/scratch/modspy-data


In [4]:
# nodes_df = dd.read_csv('./data/01_raw/monarch/monarch-kg_nodes.tsv', sep='\t', usecols=['id', 'category', 'name', 'in_taxon', 'in_taxon_label', 'symbol'], dtype={'id':'object', 
#                         'category':'object', 'name':'object', 'in_taxon':'object', 'in_taxon_label':'object', 'symbol':'object'})
nodes_df = dd.read_parquet('./data/02_intermediate/monarch/nodes_with_type_idx')  
# edges_df = dd.read_parquet('./data/02_intermediate/monarch/edges_with_node_cat')

In [23]:

# Function to convert DataFrame rows to tuples for the Bag
def create_edge_tuple(row):
    return row['subject_category'], row['predicate'], row['object_category'], row['subject_id'], row['object_id']

def binop(accumulator, edge):
    if accumulator is None:
        accumulator = []  # Initialize accumulator if it's None
    accumulator.append((edge[3], edge[4]))
    return accumulator

# def combine(accumulator1, accumulator2):
#     return accumulator1.extend(accumulator2)
def combine(accumulator1, accumulator2):
    if accumulator1 is None:
        accumulator1 = []
    if accumulator2 is None:
        accumulator2 = []
    return accumulator1 + accumulator2  # Use + for list concatenation



In [38]:
edges_df = edges_df.repartition(npartitions=16)  # only 1/100th of the data
edges_df = edges_df.persist()  # if on a distributed system

In [None]:
############# CHANGE ME #######################
# Should be removed. It is added because the edge dataframe has erroneous column names.
edges_df = edges_df.rename(columns={'subject_category': 'e_category', 'edge_category': 'subject_category'}).rename(columns={'e_category': 'edge_category'})
###############################################
# display(edges_df.head())
_edf = edges_df.merge(nodes_df, left_on='subject', right_on='id', suffixes=('_ndf', '_edf'))
# print(f"Columns after merging on subject category: {_edf.columns}")
# display(_edf.head())
_edf = _edf.rename(columns={'type_index': 'subject_id'})
# print(f"Columns after merging on subject category: {_edf.columns}")
# display(_edf.head())
_edf = _edf.merge(nodes_df, left_on='object', right_on='id', suffixes=('_ndf', '_edf'))
# print(f"Columns after merging on object category: {_edf.columns}")
_edf = _edf.rename(columns={'type_index': 'object_id'})
# print(f"Columns after merging on object category: {_edf.columns}")
# display(_edf.head())

# Keep only the columns we need
edges = _edf[['id','subject', 'subject_id', 'subject_category', 'predicate', 'edge_category', 'object_category', 'object_id', 'object']].copy()
# print(f"Columns after renaming and trimming: {edges.columns}")
    
# logger.info(f"➡️ Creating edges")

################## INITIALIZE EDGES ##################
# Convert the edge categories to categoricals for efficiency
edges['subject_category'] = edges['subject_category'].astype('category')
edges['predicate'] = edges['predicate'].astype('category')
edges['object_category'] = edges['object_category'].astype('category')
# edges['edge_category'] = edges['edge_category'].astype('category')

# From unknown type categorical to known type categorical
edges = edges.categorize(columns=['subject_category','predicate','object_category'])

# Map ('subject_category','predicate','object_category') to integer index for optimization purpose
edges['edge_key'] = list(zip(edges['subject_category'], edges['predicate'], edges['object_category']))
mapping_dict = {k: i for i, k in enumerate(edges['edge_key'].unique().compute())}
edges['edge_key_id'] = edges['combined'].map(mapping_dict)
edges['edge_key_id'] = edges['edge_key_id'].astype('category')
edges = edges.categorize(columns=['edge_key_id'])   # From unknown type categorical to known type categorical

# Save edges dataframe at current state before reducing.
edges.to_parquet()


In [None]:

# Reduce edges dataframe to reduce memory overhead
edges = _edf[['edge_key_id','subject_id','object_id']].copy()


# # Prepare edge types and mappings
# edge_types = edges['edge_category'].unique().compute()  # Compute edge types on scheduler

# Create a Bag from the DataFrame, and map the conversion function to each row
edges_bag = edges.map_partitions(lambda df: df.apply(create_edge_tuple, axis=1)).to_bag()
# display(edges_bag.compute())
# # Use map_partitions with itertuples to convert each row to the desired tuple format
# edges_bag = edges.map_partitions(
#     lambda df: [create_edge_tuple_optimized(row) for row in df.itertuples(index=False)],
#     meta=(('edge_type', 'object'), ('subject_id', 'int64'), ('object_id', 'int64'))
# ).to_bag()

# # Function to convert a tuple to a string
# def tuple_to_string(t):
#     # Convert the tuple to a string format of your choice
#     # This example uses a simple comma-separated format
#     return ','.join(map(str, t))

# # Map the conversion function over the bag to convert tuples to strings
edges_bag = edges_bag.map(lambda x: str(x))


In [39]:

# # Persist the intermediate result
# edges_bag = edges_bag.persist()
edges_bag.to_textfiles('./data/02_intermediate/monarch/edges_bag/*.json.gz')



['/scratch/rahit/modspy-data/./data/02_intermediate/monarch/edges_bag/00.json.gz',
 '/scratch/rahit/modspy-data/./data/02_intermediate/monarch/edges_bag/01.json.gz',
 '/scratch/rahit/modspy-data/./data/02_intermediate/monarch/edges_bag/02.json.gz',
 '/scratch/rahit/modspy-data/./data/02_intermediate/monarch/edges_bag/03.json.gz',
 '/scratch/rahit/modspy-data/./data/02_intermediate/monarch/edges_bag/04.json.gz',
 '/scratch/rahit/modspy-data/./data/02_intermediate/monarch/edges_bag/05.json.gz',
 '/scratch/rahit/modspy-data/./data/02_intermediate/monarch/edges_bag/06.json.gz',
 '/scratch/rahit/modspy-data/./data/02_intermediate/monarch/edges_bag/07.json.gz',
 '/scratch/rahit/modspy-data/./data/02_intermediate/monarch/edges_bag/08.json.gz',
 '/scratch/rahit/modspy-data/./data/02_intermediate/monarch/edges_bag/09.json.gz',
 '/scratch/rahit/modspy-data/./data/02_intermediate/monarch/edges_bag/10.json.gz',
 '/scratch/rahit/modspy-data/./data/02_intermediate/monarch/edges_bag/11.json.gz',
 '/s

In [24]:
import ast

# Define a function to convert a line (string) back into a tuple
def parse_line(line):
    # Strip leading/trailing whitespace and newline characters
    line = line.strip()
    # Use `ast.literal_eval` to safely evaluate the string as a Python literal
    # This converts the string representation of a tuple back into an actual tuple
    return ast.literal_eval(line)

In [25]:
edges_bag = db.read_text('./data/02_intermediate/monarch/edges_bag/*.json.gz').map(parse_line).persist()

In [None]:
import json

# ebag = edges_bag.repartition(npartitions=32)

# Use foldby with the process_tuples function
edge_type_mappings = edges_bag.foldby(key=lambda x: (x[0],x[1],x[2]), binop=binop, combine=combine, initial=[], split_every=8)
# edge_type_mappings = edge_type_mappings.compute()  # Trigger the computation

# http://gra998.graham.sharcnet:8888/lab?token=acc796a9ad732594aeb992ab092a14b4313b11b43159228e
edge_type_mappings.map(json.dumps).to_textfiles('./data/02_intermediate/monarch/edges_index_v1/*.json.gz')
# etm = edge_type_mappings.compute()  # Trigger the computation

In [None]:
edges_type_index = db.read_text('./data/02_intermediate/monarch/edges_index/*.json.gz').map(parse_line).persist()


In [56]:
edges = dd.read_parquet('./data/02_intermediate/monarch/edges_pre_df_reduction_v2')

In [57]:
edge_kvs = edges['edge_key'].value_counts().compute()
display(edge_kvs)

biolink:Gene-biolink:interacts_with-biolink:Gene                                 4096780
biolink:Gene-biolink:expressed_in-biolink:GrossAnatomicalStructure               1472390
biolink:Gene-biolink:has_phenotype-biolink:PhenotypicFeature                      883206
biolink:Gene-biolink:enables-biolink:BiologicalProcessOrActivity                  841329
biolink:Gene-biolink:actively_involved_in-biolink:BiologicalProcessOrActivity     747796
                                                                                  ...   
biolink:MacromolecularComplex-biolink:related_to-biolink:RNAProduct                    1
biolink:MacromolecularComplex-biolink:related_to-biolink:Vertebrate                    1
biolink:GrossAnatomicalStructure-biolink:related_to-biolink:OrganismTaxon              1
biolink:PhenotypicFeature-biolink:subclass_of-biolink:PhenotypicQuality                1
biolink:Transcript-biolink:subclass_of-biolink:NucleicAcidEntity                       1
Name: edge_key, Lengt

In [12]:
from torch_geometric.datasets import AMiner
from torch_geometric.nn import MetaPath2Vec

dataset = AMiner(root='./data/01_raw/AMiner')
data = dataset[0]
display(data.edge_index_dict)

Downloading https://www.dropbox.com/s/1bnz8r7mofx0osf/net_aminer.zip?dl=1
Extracting data/01_raw/AMiner/net_aminer.zip
Downloading https://www.dropbox.com/s/nkocx16rpl4ydde/label.zip?dl=1
Extracting data/01_raw/AMiner/raw/label.zip
Processing...
Done!


{('paper',
  'written_by',
  'author'): tensor([[      0,       1,       2,  ..., 3194404, 3194404, 3194404],
         [      0,       1,       2,  ...,    4393,   21681,  317436]]),
 ('author',
  'writes',
  'paper'): tensor([[      0,       1,       2,  ...,    4393,   21681,  317436],
         [      0,       1,       2,  ..., 3194404, 3194404, 3194404]]),
 ('paper',
  'published_in',
  'venue'): tensor([[      0,       1,       2,  ..., 3194402, 3194403, 3194404],
         [   2190,    2190,    2190,  ...,    3148,    3148,    3148]]),
 ('venue',
  'publishes',
  'paper'): tensor([[   2190,    2190,    2190,  ...,    3148,    3148,    3148],
         [      0,       1,       2,  ..., 3194402, 3194403, 3194404]])}

In [5]:
nodes_df['category'].value_counts().compute()

category
biolink:Gene                              559272
biolink:PhenotypicFeature                 117145
biolink:BiologicalProcessOrActivity        38789
biolink:GrossAnatomicalStructure           28387
biolink:Disease                            26940
                                           ...  
biolink:SiRNA                                  1
biolink:Snv                                    1
biolink:StudyVariable                          1
biolink:TranscriptionFactorBindingSite         1
biolink:Zygosity                               1
Name: count, Length: 88, dtype: int64[pyarrow]

In [6]:

# Prepare node mapping and node types
# node_mapping = {node_id: i for i, node_id in enumerate(nodes_df['id'].unique())}
node_types = nodes_df['category'].unique()

# Add nodes to the graph
for node_type in tqdm(node_types):
    if node_type == 'biolink:Gene':
        mask = nodes_df['category'] == node_type
        type_nodes = nodes_df[mask].compute()
        print(type_nodes.shape[0])

  0%|          | 0/88 [00:00<?, ?it/s]

100%|██████████| 88/88 [00:00<00:00, 345.64it/s]


In [58]:
################## INITIALIZE NODES ##################

# Prepare node mapping and node types
# node_mapping = {node_id: i for i, node_id in enumerate(nodes_df.index.unique())}
# node_types = nodes_df['category'].unique()

# Initialize HeteroData for the heterogeneous graph
data = HeteroData()
    
# Prepare node mapping and node types
# node_mapping = {node_id: i for i, node_id in enumerate(nodes_df['id'].unique())}
node_types = nodes_df['category'].unique()

# Add nodes to the graph
for node_type in tqdm(node_types):
    # get nodes of type `node_type`
    mask = nodes_df['category'] == node_type
    type_nodes = nodes_df[mask].compute()
    
    # node_features = torch.ones((type_nodes.shape[0], 1))
    data[node_type].num_nodes = type_nodes.shape[0] # node_features # torch.tensor(type_nodes.index.to_list(), dtype=torch.long).view(len(type_nodes.index.to_list()),1)

    # Add dummy features (e.g., a simple constant feature)
    data[node_type].x = torch.ones((data[node_type].num_nodes, 1))  # Each user has a feature vector of [1]


################## INITIALIZE EDGES ##################

# Prepare edge types and mappings
edge_types = edges['edge_key'].unique().compute()

# Add edges to the graph
for i in tqdm(edge_types):
    edge_key = tuple(i.split('-'))
    edge_idx = edges[edges['edge_key'] == i][['subject_id','object_id']].compute()
    data[edge_key].edge_index = torch.tensor(edge_idx.values, dtype=torch.long).t().contiguous()
    

100%|██████████| 88/88 [01:54<00:00,  1.30s/it]
100%|██████████| 289/289 [03:56<00:00,  1.22it/s]


In [59]:
torch.save(data, './data/05_model_input/2024-02-monarch_heterodata_v1.pt')

In [69]:
data.has_isolated_nodes()

True

In [70]:
data.is_undirected()

False

In [71]:
data.validate()

True

In [17]:
cluster.close()

In [18]:
client.close()