# OBNB for UniEntrezDB benchmark

## Preprocessing data 
map to the UniEntrezDB benchmark gene set with gene annoataion, gene sequence and corresponding protein sequece data available

In [25]:
d = np.load("datasets/BioGRID/processed/data.npz")
print(max(d['edge_index'][0]))
len(d['node_ids'])

7402


7403

In [32]:
import numpy as np
import os
from tqdm import tqdm
import pandas as pd

# List of .npz files to process
npz_files = [
    "datasets/BioGRID/processed/obnb_data.npz",
    "datasets/BioPlex/processed/obnb_data.npz",
    "datasets/ComPPIHumanInt/processed/obnb_data.npz",
    "datasets/ConsensusPathDB/processed/obnb_data.npz",
    "datasets/FunCoup/processed/obnb_data.npz",
    "datasets/HIPPIE/processed/obnb_data.npz",
    "datasets/HuMAP/processed/obnb_data.npz",
    "datasets/HuRI/processed/obnb_data.npz",
    "datasets/OmniPath/processed/obnb_data.npz",
    "datasets/PCNet/processed/obnb_data.npz",
    "datasets/ProteomeHD/processed/obnb_data.npz",
    "datasets/SIGNOR/processed/obnb_data.npz",
    "datasets/STRING/processed/obnb_data.npz",
]

# npz_files = [
#     "datasets/BioGRID/processed/data.npz",
#     "datasets/BioPlex/processed/data.npz",
#     "datasets/ComPPIHumanInt/processed/data.npz",
#     "datasets/ConsensusPathDB/processed/data.npz",
#     "datasets/FunCoup/processed/data.npz",
#     "datasets/HIPPIE/processed/data.npz",
#     "datasets/HuMAP/processed/data.npz",
#     "datasets/HuRI/processed/data.npz",
#     "datasets/OmniPath/processed/data.npz",
#     "datasets/PCNet/processed/data.npz",
#     "datasets/ProteomeHD/processed/data.npz",
#     "datasets/SIGNOR/processed/data.npz",
#     "datasets/STRING/processed/data.npz",
# ]

# Load the external ID sets
goaid = pd.read_csv("Embeddings/GOA/gene_entrezID.csv", header=None)
dnabertid = pd.read_csv("Embeddings/DNABert/dnabert_allid.csv", header=None)
gene2vecid = pd.read_csv("Embeddings/Gene2Vec/gene2vec_id.csv", header=None)
ontoproteinid = pd.read_csv("Embeddings/ontoprotein/ids.csv", header=None)

# Create sets of IDs
goa_id = set([str(id) for id in list(goaid[0])])
dnabert_id = set([str(id) for id in list(dnabertid[0])])
gene2vec_id = set([str(id) for id in list(gene2vecid[0])])
ontoprotein_id = set([str(id) for id in list(ontoproteinid[0])])
overlap = goa_id & dnabert_id & gene2vec_id & ontoprotein_id
print(f"Initial overlap count: {len(overlap)}")

for file in npz_files:
    # Load data
    data = np.load(file)
    obnb_output_path = os.path.join(os.path.dirname(file), "obnb_data.npz")
    np.savez(obnb_output_path, **data)
    print(f"Original .npz file saved as {obnb_output_path}")


    node_ids = list(data['node_ids'])
    overlap_nodes = set(node_ids) & overlap
    edge_index = data['edge_index']
    
    # Convert edge indices to edges represented by node IDs
    edge_id_pairs = [(node_ids[i], node_ids[j]) for i, j in edge_index.T]
    
    # Filter edges to keep only those where both nodes are in the overlap set
    filtered_edge_id_pairs = [
        (u, v) for u, v in edge_id_pairs if u in overlap_nodes and v in overlap_nodes
    ]
    
    # Generate a new list of node IDs and their corresponding indices
    overlap_nodes_list = sorted(overlap_nodes)
    node_to_new_index = {node_id: i for i, node_id in enumerate(overlap_nodes_list)}
    # print(node_to_new_index)
    
    # Convert filtered edges back to edge indices based on the new node ID positions
    new_edge_index = np.array(
        [
            [node_to_new_index[u], node_to_new_index[v]]
            for u, v in filtered_edge_id_pairs
        ]
    ).T
    
    # Save the filtered data to a new .npz file
    output_path = file[:-13] + "data.npz"
    np.savez(output_path, edge_index=new_edge_index, node_ids=overlap_nodes_list)
    
    print(f"New .npz file created at {output_path} with {len(node_ids)} / {len(overlap_nodes_list)} nodes. and {len(edge_index[0])} / {len(new_edge_index)} edges, {max(new_edge_index[0])}")



Initial overlap count: 7403
Original .npz file saved as datasets/BioGRID/processed/obnb_data.npz
New .npz file created at datasets/BioGRID/processed/data.npz with 19765 / 5760 nodes. and 1554790 / 2 edges, 5759
Original .npz file saved as datasets/BioPlex/processed/obnb_data.npz
New .npz file created at datasets/BioPlex/processed/data.npz with 8108 / 2537 nodes. and 71004 / 2 edges, 2536
Original .npz file saved as datasets/ComPPIHumanInt/processed/obnb_data.npz
New .npz file created at datasets/ComPPIHumanInt/processed/data.npz with 17015 / 5487 nodes. and 699620 / 2 edges, 5486
Original .npz file saved as datasets/ConsensusPathDB/processed/obnb_data.npz
New .npz file created at datasets/ConsensusPathDB/processed/data.npz with 17735 / 5697 nodes. and 10611416 / 2 edges, 5696
Original .npz file saved as datasets/FunCoup/processed/obnb_data.npz
New .npz file created at datasets/FunCoup/processed/data.npz with 17892 / 5612 nodes. and 10037478 / 2 edges, 5611
Original .npz file saved as d

In [35]:
import os

# List of .npz files to rename
npz_files = [
    "datasets/BioGRID/processed/obnb_data.npz",
    "datasets/BioPlex/processed/obnb_data.npz",
    "datasets/ComPPIHumanInt/processed/obnb_data.npz",
    "datasets/ConsensusPathDB/processed/obnb_data.npz",
    "datasets/FunCoup/processed/obnb_data.npz",
    "datasets/HIPPIE/processed/obnb_data.npz",
    "datasets/HuMAP/processed/obnb_data.npz",
    "datasets/HuRI/processed/obnb_data.npz",
    "datasets/OmniPath/processed/obnb_data.npz",
    "datasets/PCNet/processed/obnb_data.npz",
    "datasets/ProteomeHD/processed/obnb_data.npz",
    "datasets/SIGNOR/processed/obnb_data.npz",
    "datasets/STRING/processed/obnb_data.npz",
]

for file in npz_files:
    # Extract the directory and the original filename
    directory, original_filename = os.path.split(file)
    
    # Create the new filename by removing the 'obnb_' prefix
    new_filename = original_filename.replace("obnb_", "")
    
    # Combine the directory with the new filename
    new_filepath = os.path.join(directory, new_filename)
    
    # Rename the file
    os.rename(file, new_filepath)
    
    print(f"Renamed '{file}' to '{new_filepath}'")

print("All files have been renamed.")


Renamed 'datasets/BioGRID/processed/obnb_data.npz' to 'datasets/BioGRID/processed/data.npz'
Renamed 'datasets/BioPlex/processed/obnb_data.npz' to 'datasets/BioPlex/processed/data.npz'
Renamed 'datasets/ComPPIHumanInt/processed/obnb_data.npz' to 'datasets/ComPPIHumanInt/processed/data.npz'
Renamed 'datasets/ConsensusPathDB/processed/obnb_data.npz' to 'datasets/ConsensusPathDB/processed/data.npz'
Renamed 'datasets/FunCoup/processed/obnb_data.npz' to 'datasets/FunCoup/processed/data.npz'
Renamed 'datasets/HIPPIE/processed/obnb_data.npz' to 'datasets/HIPPIE/processed/data.npz'
Renamed 'datasets/HuMAP/processed/obnb_data.npz' to 'datasets/HuMAP/processed/data.npz'
Renamed 'datasets/HuRI/processed/obnb_data.npz' to 'datasets/HuRI/processed/data.npz'
Renamed 'datasets/OmniPath/processed/obnb_data.npz' to 'datasets/OmniPath/processed/data.npz'
Renamed 'datasets/PCNet/processed/obnb_data.npz' to 'datasets/PCNet/processed/data.npz'
Renamed 'datasets/ProteomeHD/processed/obnb_data.npz' to 'datase

In [19]:
npz_files = [
    "datasets/BioGRID/processed/obnb_data.npz",
    "datasets/BioPlex/processed/obnb_data.npz",
    "datasets/ComPPIHumanInt/processed/obnb_data.npz",
    "datasets/ConsensusPathDB/processed/obnb_data.npz",
    "datasets/FunCoup/processed/obnb_data.npz",
    "datasets/HIPPIE/processed/obnb_data.npz",
    "datasets/HuMAP/processed/obnb_data.npz",
    "datasets/HuRI/processed/obnb_data.npz",
    "datasets/OmniPath/processed/obnb_data.npz",
    "datasets/PCNet/processed/obnb_data.npz",
    "datasets/ProteomeHD/processed/obnb_data.npz",
    "datasets/SIGNOR/processed/obnb_data.npz",
    "datasets/STRING/processed/obnb_data.npz",
]

# Load the external ID sets
goaid = pd.read_csv("Embeddings/GOA/gene_entrezID.csv", header=None)
dnabertid = pd.read_csv("Embeddings/DNABert/dnabert_allid.csv", header=None)
gene2vecid = pd.read_csv("Embeddings/Gene2Vec/gene2vec_id.csv", header=None)
ontoproteinid = pd.read_csv("Embeddings/ontoprotein/ids.csv", header=None)

# Create sets of IDs
goa_id = set([id for id in list(goaid[0])])
dnabert_id = set([id for id in list(dnabertid[0])])
gene2vec_id = set([id for id in list(gene2vecid[0])])
ontoprotein_id = set([id for id in list(ontoproteinid[0])])
overlap = goa_id & dnabert_id & gene2vec_id & ontoprotein_id
pd.Series(list(overlap)).to_csv("selected_genes.txt", header=False, index=False)
# data = np.load(file)
# node_ids = list(data['node_ids'])
# overlap_nodes = set(node_ids) & overlap
# print(f"Initial overlap count: {len(overlap)}")

In [22]:
gene_list = list(pd.read_csv("selected_genes.txt", header=None)[0])

In [34]:
d = np.load("datasets/BioGRID/processed/data.npz")
len(d['node_ids'])

5760

In [24]:
list(gene_list[0])

[1,
 2,
 10,
 12,
 393229,
 19,
 21,
 24,
 163882,
 47,
 49,
 54,
 58,
 60,
 98365,
 70,
 71,
 72,
 852042,
 393293,
 852053,
 88,
 89,
 90,
 91,
 92,
 93,
 95,
 852065,
 98,
 101,
 102,
 104,
 108,
 115,
 852088,
 120,
 124,
 125,
 126,
 127,
 852096,
 131,
 133,
 140,
 148,
 150,
 152,
 154,
 155,
 161,
 165,
 174,
 177,
 178,
 185,
 187,
 189,
 197,
 213,
 214,
 216,
 217,
 218,
 219,
 223,
 224,
 226,
 231,
 240,
 242,
 246,
 247,
 33015,
 249,
 250,
 266,
 267,
 271,
 272,
 274,
 196883,
 327957,
 280,
 283,
 285,
 286,
 287,
 288,
 852255,
 290,
 557348,
 33062,
 131368,
 301,
 302,
 131377,
 306,
 307,
 308,
 310,
 311,
 164153,
 316,
 320,
 321,
 323,
 325,
 327,
 852296,
 328,
 330,
 332,
 335,
 337,
 343,
 345,
 346,
 347,
 348,
 350,
 351,
 354,
 357,
 358,
 360,
 362,
 366,
 372,
 374,
 377,
 378,
 379,
 98682,
 382,
 387,
 389,
 390,
 393,
 395,
 397,
 852369,
 131474,
 403,
 852373,
 98711,
 408,
 409,
 328092,
 197021,
 421,
 65960,
 65963,
 427,
 432,
 433,
 852402,
 98

In [10]:
import numpy as np
import os
from tqdm import tqdm
import pandas as pd

# List of .npz files to process
npz_files = [
    "datasets/BioGRID/processed/data.npz",
    "datasets/BioPlex/processed/data.npz",
    "datasets/ComPPIHumanInt/processed/data.npz",
    "datasets/ConsensusPathDB/processed/data.npz",
    "datasets/FunCoup/processed/data.npz",
    "datasets/HIPPIE/processed/data.npz",
    "datasets/HumanNet/processed/data_xc.npz",
    "datasets/HuMAP/processed/data.npz",
    "datasets/HuRI/processed/data.npz",
    "datasets/OmniPath/processed/data.npz",
    "datasets/PCNet/processed/data.npz",
    "datasets/ProteomeHD/processed/data.npz",
    "datasets/SIGNOR/processed/data.npz",
    "datasets/STRING/processed/data.npz",
]
goaid = pd.read_csv("Embeddings/GOA/gene_entrezID.csv", header=None)
dnabertid = pd.read_csv("Embeddings/DNABert/dnabert_allid.csv", header=None)
gene2vecid = pd.read_csv("Embeddings/Gene2Vec/gene2vec_id.csv", header=None)
ontoproteinid = pd.read_csv("Embeddings/ontoprotein/ids.csv", header=None)

goa_id = set([str(id) for id in list(goaid[0])])
dnabert_id = set([str(id) for id in list(dnabertid[0])])
gene2vec_id = set([str(id) for id in list(gene2vecid[0])])
ontoprotein_id = set([str(id) for id in list(ontoproteinid[0])])
overlap = goa_id & dnabert_id & gene2vec_id & ontoprotein_id
print(len(overlap))



for file in npz_files:
    data = np.load(file)
    node_ids = set(list(data['node_ids']))
    overlap_nodes = overlap & node_ids  # Intersect with existing overlap nodes# Create a new edge index with only the overlapping nodes
    edge_index_list = []
    data = np.load(file)
    edge_index = data['edge_index']
    node_ids = data['node_ids']

    # Create a mapping from old indices to new indices (only for overlapping nodes)
    old_to_new = {old_id: i for i, old_id in enumerate(node_ids) if old_id in overlap_nodes}
    
    # Filter the edge index to include only the overlapping nodes
    filtered_edge_index = []
    for i, j in tqdm(edge_index.T, desc="remove unmapped edge"):
        if node_ids[i] in old_to_new and node_ids[j] in old_to_new:
            filtered_edge_index.append([old_to_new[node_ids[i]], old_to_new[node_ids[j]]])
    
    if filtered_edge_index:
        edge_index_list.append(np.array(filtered_edge_index).T)

    # Combine all filtered edge indicesif edge_index_list:
        combined_edge_index = np.concatenate(edge_index_list, axis=1)
    else:
        combined_edge_index = np.empty((2, 0), dtype=int)

    # Convert overlap_nodes to a sorted list (to maintain order)
    overlap_nodes_list = sorted(list(overlap_nodes))

    # Save the new .npz file
    output_path = file[:-8] + "data.npz"
    np.savez(output_path, edge_index=combined_edge_index, node_ids=overlap_nodes_list)

    print(f"New .npz file created at {output_path} with {len(overlap_nodes)} overlapping nodes.")


7403


remove unmapped edge: 100%|██████████| 1554790/1554790 [00:02<00:00, 646222.50it/s]


New .npz file created at datasets/BioGRID/processed/data.npz with 5760 overlapping nodes.


remove unmapped edge: 100%|██████████| 71004/71004 [00:00<00:00, 636598.77it/s]


New .npz file created at datasets/BioPlex/processed/data.npz with 2537 overlapping nodes.


remove unmapped edge: 100%|██████████| 699620/699620 [00:01<00:00, 631928.27it/s]


New .npz file created at datasets/ComPPIHumanInt/processed/data.npz with 5487 overlapping nodes.


remove unmapped edge: 100%|██████████| 10611416/10611416 [00:16<00:00, 639679.01it/s]


New .npz file created at datasets/ConsensusPathDB/processed/data.npz with 5697 overlapping nodes.


remove unmapped edge: 100%|██████████| 10037478/10037478 [00:14<00:00, 702368.29it/s]


New .npz file created at datasets/FunCoup/processed/data.npz with 5612 overlapping nodes.


remove unmapped edge: 100%|██████████| 1542044/1542044 [00:02<00:00, 646147.30it/s]


New .npz file created at datasets/HIPPIE/processed/data.npz with 5765 overlapping nodes.


remove unmapped edge: 100%|██████████| 2250780/2250780 [00:03<00:00, 634715.28it/s]


New .npz file created at datasets/HumanNet/processed/datdata.npz with 5825 overlapping nodes.


remove unmapped edge: 100%|██████████| 35052604/35052604 [00:52<00:00, 662809.06it/s]


New .npz file created at datasets/HuMAP/processed/data.npz with 4789 overlapping nodes.


remove unmapped edge: 100%|██████████| 103188/103188 [00:00<00:00, 568224.52it/s]


New .npz file created at datasets/HuRI/processed/data.npz with 2597 overlapping nodes.


remove unmapped edge: 100%|██████████| 289134/289134 [00:00<00:00, 684104.35it/s]


New .npz file created at datasets/OmniPath/processed/data.npz with 5200 overlapping nodes.


remove unmapped edge: 100%|██████████| 5365116/5365116 [00:07<00:00, 672004.80it/s]


New .npz file created at datasets/PCNet/processed/data.npz with 5737 overlapping nodes.


remove unmapped edge: 100%|██████████| 125172/125172 [00:00<00:00, 644003.96it/s]


New .npz file created at datasets/ProteomeHD/processed/data.npz with 651 overlapping nodes.


remove unmapped edge: 100%|██████████| 28676/28676 [00:00<00:00, 533655.73it/s]


New .npz file created at datasets/SIGNOR/processed/data.npz with 1940 overlapping nodes.


remove unmapped edge: 100%|██████████| 11019492/11019492 [00:17<00:00, 630982.16it/s]


New .npz file created at datasets/STRING/processed/data.npz with 5789 overlapping nodes.


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from obnb.data.network.base import BaseNDExData
from obnb.typing import Converter


class ProteomeHD(BaseNDExData):
    """The ProteomeHD Protein Protein Interaction network."""

    cx_uuid = "4cb4b0f3-83da-11e9-848d-0ac135e8bacf"

    def __init__(
        self,
        root: str,
        *,
        weighted: bool = False,
        directed: bool = False,
        largest_comp: bool = True,
        gene_id_converter: Converter = "HumanEntrez",
        **kwargs,
    ):
        """Initialize the ProteomeHD network data."""
        super().__init__(
            root,
            weighted=weighted,
            directed=directed,
            largest_comp=largest_comp,
            gene_id_converter=gene_id_converter,
            cx_kwargs={
                "interaction_types": ["correlates-with"],
                "node_id_prefix": "ncbigene",
                "default_edge_weight": 1.0,
                "edge_weight_attr_name": "score",
                "use_node_alias": True,
            },
            **kwargs,
        )

In [None]:
import torch
goaid = pd.read_csv("Embeddings/GOA/gene_entrezID.csv", header=None)
dnabertid = pd.read_csv("Embeddings/DNABert/dnabert_allid.csv", header=None)
gene2vecid = pd.read_csv("Embeddings/Gene2Vec/gene2vec_id.csv", header=None)
ontoproteinid = pd.read_csv("Embeddings/ontoprotein/ids.csv", header=None)

goa_id = set([str(id) for id in list(goaid[0])])
dnabert_id = set([str(id) for id in list(dnabertid[0])])
gene2vec_id = set([str(id) for id in list(gene2vecid[0])])
ontoprotein_id = set([str(id) for id in list(ontoproteinid[0])])
overlap = goa_id & dnabert_id & gene2vec_id & ontoprotein_id

goaemb = torch.load("Embeddings/GOA/mean_20001.pt")
dnabertemb = torch.load("Embeddings/DNABert/dnabert_all.pt")
gene2vecemb = torch.load("Embeddings/Gene2Vec/gene2vec.pt")
ontoproteinemb = torch.load("Embeddings/ontoprotein/ontoprotein.pt")

In [1]:
import torch
import json
import pandas as pd

# Load IDs from CSV files
goaid = pd.read_csv("Embeddings/GOA/gene_entrezID.csv", header=None)
dnabertid = pd.read_csv("Embeddings/DNABert/dnabert_allid.csv", header=None)
gene2vecid = pd.read_csv("Embeddings/Gene2Vec/gene2vec_id.csv", header=None)
ontoproteinid = pd.read_csv("Embeddings/ontoprotein/ids.csv", header=None)

# Create sets of IDs
goa_id = set([str(id) for id in list(goaid[0])])
dnabert_id = set([str(id) for id in list(dnabertid[0])])
gene2vec_id = set([str(id) for id in list(gene2vecid[0])])
ontoprotein_id = set([str(id) for id in list(ontoproteinid[0])])
overlap = goa_id & dnabert_id & gene2vec_id & ontoprotein_id

# Load embeddings
goaemb = torch.load("Embeddings/GOA/mean_20001.pt")
dnabertemb = torch.load("Embeddings/DNABert/dnabert_all.pt")
gene2vecemb = torch.load("Embeddings/Gene2Vec/gene2vec.pt")
ontoproteinemb = torch.load("Embeddings/ontoprotein/ontoprotein.pt")

# Create ID-to-embedding dictionaries
goa_dict = {str(goaid[0][i]): goaemb[i].tolist() for i in range(len(goaid))}
dnabert_dict = {str(dnabertid[0][i]): dnabertemb[i].tolist() for i in range(len(dnabertid))}
gene2vec_dict = {str(gene2vecid[0][i]): gene2vecemb[i].tolist() for i in range(len(gene2vecid))}
ontoprotein_dict = {str(ontoproteinid[0][i]): ontoproteinemb[i].tolist() for i in range(len(ontoproteinid))}
filtered_goa = {id_: goa_dict[id_] for id_ in overlap if id_ in goa_dict}
filtered_dnabert = {id_: dnabert_dict[id_] for id_ in overlap if id_ in dnabert_dict}
filtered_gene2vec = {id_: gene2vec_dict[id_] for id_ in overlap if id_ in gene2vec_dict}
filtered_ontoprotein = {id_: ontoprotein_dict[id_] for id_ in overlap if id_ in ontoprotein_dict}
filtered_goa_gene2vec = {id_: goa_dict[id_] + gene2vec_dict[id_] for id_ in overlap if id_ in goa_dict and id_ in gene2vec_dict}
filtered_goa_dnabert = {id_: goa_dict[id_] + dnabert_dict[id_] for id_ in overlap if id_ in goa_dict and id_ in dnabert_dict}

# Create a new dictionary with only the overlapping IDs and their embeddings
with open("filtered_goa_embeddings.json", "w") as json_file:
    json.dump(filtered_goa, json_file)
with open("filtered_dnabert_embeddings.json", "w") as json_file:
    json.dump(filtered_dnabert, json_file)
with open("filtered_gene2vec_embeddings.json", "w") as json_file:
    json.dump(filtered_gene2vec, json_file)
with open("filtered_ontoprotein_embeddings.json", "w") as json_file:
    json.dump(filtered_ontoprotein, json_file)
with open("filtered_goa_gene2vec_embeddings.json", "w") as json_file:
    json.dump(filtered_goa_gene2vec, json_file)
with open("filtered_goa_dnabert_embeddings.json", "w") as json_file:
    json.dump(filtered_goa_dnabert, json_file)


print(f"Filtered embeddings saved to filtered_embeddings.json")


  from .autonotebook import tqdm as notebook_tqdm


Filtered embeddings saved to filtered_embeddings.json


In [3]:
len(filtered_goa)

7403

In [8]:
len(list(filtered_goa_gene2vec.values())[0])

1224

In [5]:
import os
import numpy as np
import torch
import json
from tqdm import tqdm

# List of .npz files
npz_files = [
    "datasets/BioGRID/processed/data.npz",
    "datasets/BioPlex/processed/data.npz",
    "datasets/ComPPIHumanInt/processed/data.npz",
    "datasets/ConsensusPathDB/processed/data.npz",
    "datasets/FunCoup/processed/data.npz",
    "datasets/HIPPIE/processed/data.npz",
    "datasets/HumanNet/processed/data_xc.npz",
    "datasets/HuMAP/processed/data.npz",
    "datasets/HuRI/processed/data.npz",
    "datasets/OmniPath/processed/data.npz",
    "datasets/PCNet/processed/data.npz",
    "datasets/ProteomeHD/processed/data.npz",
    "datasets/SIGNOR/processed/data.npz",
    "datasets/STRING/processed/data.npz",
]
with open("filtered_goa_embeddings.json", "r") as json_file:
    filtered_goa = json.load(json_file)
with open("filtered_dnabert_embeddings.json", "r") as json_file:
    filtered_dnabert = json.load(json_file)
with open("filtered_gene2vec_embeddings.json", "r") as json_file:
    filtered_gene2vec = json.load(json_file)
with open("filtered_ontoprotein_embeddings.json", "r") as json_file:
    filtered_ontoprotein = json.load(json_file)
with open("filtered_goa_gene2vec_embeddings.json", "r") as json_file:
    filtered_goa_gene2vec = json.load(json_file)
with open("filtered_goa_dnabert_embeddings.json", "r") as json_file:
    filtered_goa_dnabert = json.load(json_file)

def generate_features(node_ids, embedding_dict, output_filename, embedding_dim):
    features = []
    for node_id in tqdm(node_ids):
        if node_id in embedding_dict:
            features.append(embedding_dict[node_id])
        else:
            features.append([0.0] * embedding_dim)
    
    # Convert to a PyTorch tensor
    features_tensor = torch.tensor(features)
    
    # Save the tensor as a .pt file
    torch.save(features_tensor, output_filename)
    print(f"Features saved to {output_filename}")

for file in tqdm(npz_files):
    # Load the .npz file and extract node IDs
    data = np.load(file)
    node_ids = list(data['node_ids'])

    # Determine the output directory
    output_dir = os.path.dirname(file)

    # Determine the embedding dimensions
    goa_dim = len(next(iter(filtered_goa.values())))
    dnabert_dim = len(next(iter(filtered_dnabert.values())))
    gene2vec_dim = len(next(iter(filtered_gene2vec.values())))
    ontoprotein_dim = len(next(iter(filtered_ontoprotein.values())))
    goa_gene2vec_dim = len(next(iter(filtered_goa_gene2vec.values())))
    goa_dnabert_dim = len(next(iter(filtered_goa_dnabert.values())))

    # Generate and save features for each dictionary
    generate_features(node_ids, filtered_goa, os.path.join(output_dir, "goa.pt"), goa_dim)
    generate_features(node_ids, filtered_dnabert, os.path.join(output_dir, "dnabert.pt"), dnabert_dim)
    generate_features(node_ids, filtered_gene2vec, os.path.join(output_dir, "gene2vec.pt"), gene2vec_dim)
    generate_features(node_ids, filtered_ontoprotein, os.path.join(output_dir, "ontoprotein.pt"), ontoprotein_dim)
    generate_features(node_ids, filtered_goa_gene2vec, os.path.join(output_dir, "goa_gene2vec.pt"), goa_gene2vec_dim)
    generate_features(node_ids, filtered_goa_dnabert, os.path.join(output_dir, "goa_dnabert.pt"), goa_dnabert_dim)

100%|██████████| 19765/19765 [00:00<00:00, 95305.69it/s] 


Features saved to datasets/BioGRID/processed/goa.pt


100%|██████████| 19765/19765 [00:00<00:00, 239436.73it/s]


Features saved to datasets/BioGRID/processed/dnabert.pt


100%|██████████| 19765/19765 [00:00<00:00, 605820.07it/s]


Features saved to datasets/BioGRID/processed/gene2vec.pt


100%|██████████| 19765/19765 [00:00<00:00, 141710.12it/s]


Features saved to datasets/BioGRID/processed/ontoprotein.pt


100%|██████████| 19765/19765 [00:01<00:00, 19534.25it/s]


Features saved to datasets/BioGRID/processed/goa_gene2vec.pt


100%|██████████| 19765/19765 [00:00<00:00, 95727.86it/s] 
  7%|▋         | 1/14 [00:06<01:28,  6.84s/it]

Features saved to datasets/BioGRID/processed/goa_dnabert.pt


100%|██████████| 8108/8108 [00:00<00:00, 239214.26it/s]


Features saved to datasets/BioPlex/processed/goa.pt


100%|██████████| 8108/8108 [00:00<00:00, 235015.29it/s]


Features saved to datasets/BioPlex/processed/dnabert.pt


100%|██████████| 8108/8108 [00:00<00:00, 735852.36it/s]


Features saved to datasets/BioPlex/processed/gene2vec.pt


100%|██████████| 8108/8108 [00:00<00:00, 248721.32it/s]


Features saved to datasets/BioPlex/processed/ontoprotein.pt


100%|██████████| 8108/8108 [00:00<00:00, 156125.52it/s]


Features saved to datasets/BioPlex/processed/goa_gene2vec.pt


100%|██████████| 8108/8108 [00:00<00:00, 168313.55it/s]
 14%|█▍        | 2/14 [00:09<00:50,  4.24s/it]

Features saved to datasets/BioPlex/processed/goa_dnabert.pt


100%|██████████| 17015/17015 [00:00<00:00, 155860.96it/s]


Features saved to datasets/ComPPIHumanInt/processed/goa.pt


100%|██████████| 17015/17015 [00:00<00:00, 201761.54it/s]


Features saved to datasets/ComPPIHumanInt/processed/dnabert.pt


100%|██████████| 17015/17015 [00:00<00:00, 585889.95it/s]


Features saved to datasets/ComPPIHumanInt/processed/gene2vec.pt


100%|██████████| 17015/17015 [00:00<00:00, 152939.66it/s]


Features saved to datasets/ComPPIHumanInt/processed/ontoprotein.pt


100%|██████████| 17015/17015 [00:00<00:00, 154053.89it/s]


Features saved to datasets/ComPPIHumanInt/processed/goa_gene2vec.pt


100%|██████████| 17015/17015 [00:00<00:00, 113017.58it/s]
 21%|██▏       | 3/14 [00:14<00:50,  4.63s/it]

Features saved to datasets/ComPPIHumanInt/processed/goa_dnabert.pt


100%|██████████| 17735/17735 [00:00<00:00, 17983.08it/s] 


Features saved to datasets/ConsensusPathDB/processed/goa.pt


100%|██████████| 17735/17735 [00:00<00:00, 240930.16it/s]


Features saved to datasets/ConsensusPathDB/processed/dnabert.pt


100%|██████████| 17735/17735 [00:00<00:00, 658347.11it/s]


Features saved to datasets/ConsensusPathDB/processed/gene2vec.pt


100%|██████████| 17735/17735 [00:00<00:00, 165917.92it/s]


Features saved to datasets/ConsensusPathDB/processed/ontoprotein.pt


100%|██████████| 17735/17735 [00:00<00:00, 164750.09it/s]


Features saved to datasets/ConsensusPathDB/processed/goa_gene2vec.pt


100%|██████████| 17735/17735 [00:00<00:00, 121204.29it/s]
 29%|██▊       | 4/14 [00:20<00:52,  5.30s/it]

Features saved to datasets/ConsensusPathDB/processed/goa_dnabert.pt


100%|██████████| 17892/17892 [00:00<00:00, 136442.79it/s]


Features saved to datasets/FunCoup/processed/goa.pt


100%|██████████| 17892/17892 [00:00<00:00, 223614.89it/s]


Features saved to datasets/FunCoup/processed/dnabert.pt


100%|██████████| 17892/17892 [00:00<00:00, 555843.92it/s]


Features saved to datasets/FunCoup/processed/gene2vec.pt


100%|██████████| 17892/17892 [00:00<00:00, 19949.96it/s]


Features saved to datasets/FunCoup/processed/ontoprotein.pt


100%|██████████| 17892/17892 [00:00<00:00, 161279.17it/s]


Features saved to datasets/FunCoup/processed/goa_gene2vec.pt


100%|██████████| 17892/17892 [00:00<00:00, 100681.12it/s]
 36%|███▌      | 5/14 [00:26<00:50,  5.60s/it]

Features saved to datasets/FunCoup/processed/goa_dnabert.pt


100%|██████████| 19338/19338 [00:00<00:00, 182367.60it/s]


Features saved to datasets/HIPPIE/processed/goa.pt


100%|██████████| 19338/19338 [00:00<00:00, 207262.94it/s]


Features saved to datasets/HIPPIE/processed/dnabert.pt


100%|██████████| 19338/19338 [00:00<00:00, 627209.29it/s]


Features saved to datasets/HIPPIE/processed/gene2vec.pt


100%|██████████| 19338/19338 [00:00<00:00, 155536.43it/s]


Features saved to datasets/HIPPIE/processed/ontoprotein.pt


100%|██████████| 19338/19338 [00:00<00:00, 121497.00it/s]


Features saved to datasets/HIPPIE/processed/goa_gene2vec.pt


100%|██████████| 19338/19338 [00:00<00:00, 104578.96it/s]
 43%|████▎     | 6/14 [00:32<00:45,  5.74s/it]

Features saved to datasets/HIPPIE/processed/goa_dnabert.pt


100%|██████████| 18591/18591 [00:00<00:00, 18887.11it/s] 


Features saved to datasets/HumanNet/processed/goa.pt


100%|██████████| 18591/18591 [00:00<00:00, 222934.67it/s]


Features saved to datasets/HumanNet/processed/dnabert.pt


100%|██████████| 18591/18591 [00:00<00:00, 442667.40it/s]


Features saved to datasets/HumanNet/processed/gene2vec.pt


100%|██████████| 18591/18591 [00:00<00:00, 158007.98it/s]


Features saved to datasets/HumanNet/processed/ontoprotein.pt


100%|██████████| 18591/18591 [00:00<00:00, 126151.98it/s]


Features saved to datasets/HumanNet/processed/goa_gene2vec.pt


100%|██████████| 18591/18591 [00:00<00:00, 99311.62it/s] 
 50%|█████     | 7/14 [00:40<00:44,  6.31s/it]

Features saved to datasets/HumanNet/processed/goa_dnabert.pt


100%|██████████| 15433/15433 [00:00<00:00, 206757.12it/s]


Features saved to datasets/HuMAP/processed/goa.pt


100%|██████████| 15433/15433 [00:00<00:00, 164233.16it/s]


Features saved to datasets/HuMAP/processed/dnabert.pt


100%|██████████| 15433/15433 [00:00<00:00, 468160.60it/s]


Features saved to datasets/HuMAP/processed/gene2vec.pt


100%|██████████| 15433/15433 [00:01<00:00, 13516.51it/s]


Features saved to datasets/HuMAP/processed/ontoprotein.pt


100%|██████████| 15433/15433 [00:00<00:00, 181910.06it/s]


Features saved to datasets/HuMAP/processed/goa_gene2vec.pt


100%|██████████| 15433/15433 [00:00<00:00, 82146.80it/s] 
 57%|█████▋    | 8/14 [00:47<00:38,  6.43s/it]

Features saved to datasets/HuMAP/processed/goa_dnabert.pt


100%|██████████| 8100/8100 [00:00<00:00, 191352.45it/s]


Features saved to datasets/HuRI/processed/goa.pt


100%|██████████| 8100/8100 [00:00<00:00, 163160.95it/s]


Features saved to datasets/HuRI/processed/dnabert.pt


100%|██████████| 8100/8100 [00:00<00:00, 1015539.62it/s]


Features saved to datasets/HuRI/processed/gene2vec.pt


100%|██████████| 8100/8100 [00:00<00:00, 196839.24it/s]


Features saved to datasets/HuRI/processed/ontoprotein.pt


100%|██████████| 8100/8100 [00:00<00:00, 217029.91it/s]


Features saved to datasets/HuRI/processed/goa_gene2vec.pt


100%|██████████| 8100/8100 [00:00<00:00, 148934.35it/s]
 64%|██████▍   | 9/14 [00:49<00:26,  5.27s/it]

Features saved to datasets/HuRI/processed/goa_dnabert.pt


100%|██████████| 16325/16325 [00:00<00:00, 169880.45it/s]


Features saved to datasets/OmniPath/processed/goa.pt


100%|██████████| 16325/16325 [00:00<00:00, 143486.73it/s]


Features saved to datasets/OmniPath/processed/dnabert.pt


100%|██████████| 16325/16325 [00:00<00:00, 612746.88it/s]


Features saved to datasets/OmniPath/processed/gene2vec.pt


100%|██████████| 16325/16325 [00:00<00:00, 189667.91it/s]


Features saved to datasets/OmniPath/processed/ontoprotein.pt


100%|██████████| 16325/16325 [00:00<00:00, 169684.27it/s]


Features saved to datasets/OmniPath/processed/goa_gene2vec.pt


100%|██████████| 16325/16325 [00:00<00:00, 97648.65it/s] 
 71%|███████▏  | 10/14 [00:55<00:21,  5.47s/it]

Features saved to datasets/OmniPath/processed/goa_dnabert.pt


100%|██████████| 18544/18544 [00:01<00:00, 14682.82it/s] 


Features saved to datasets/PCNet/processed/goa.pt


100%|██████████| 18544/18544 [00:00<00:00, 248010.04it/s]


Features saved to datasets/PCNet/processed/dnabert.pt


100%|██████████| 18544/18544 [00:00<00:00, 535245.32it/s]


Features saved to datasets/PCNet/processed/gene2vec.pt


100%|██████████| 18544/18544 [00:00<00:00, 220892.65it/s]


Features saved to datasets/PCNet/processed/ontoprotein.pt


100%|██████████| 18544/18544 [00:00<00:00, 149875.08it/s]


Features saved to datasets/PCNet/processed/goa_gene2vec.pt


100%|██████████| 18544/18544 [00:00<00:00, 122206.68it/s]
 79%|███████▊  | 11/14 [01:02<00:18,  6.02s/it]

Features saved to datasets/PCNet/processed/goa_dnabert.pt


100%|██████████| 2471/2471 [00:00<00:00, 369303.21it/s]


Features saved to datasets/ProteomeHD/processed/goa.pt


100%|██████████| 2471/2471 [00:00<00:00, 217095.21it/s]


Features saved to datasets/ProteomeHD/processed/dnabert.pt


100%|██████████| 2471/2471 [00:00<00:00, 590986.21it/s]


Features saved to datasets/ProteomeHD/processed/gene2vec.pt


100%|██████████| 2471/2471 [00:00<00:00, 147517.33it/s]


Features saved to datasets/ProteomeHD/processed/ontoprotein.pt


100%|██████████| 2471/2471 [00:00<00:00, 262236.86it/s]


Features saved to datasets/ProteomeHD/processed/goa_gene2vec.pt


100%|██████████| 2471/2471 [00:00<00:00, 157903.06it/s]


Features saved to datasets/ProteomeHD/processed/goa_dnabert.pt


100%|██████████| 5291/5291 [00:00<00:00, 178590.90it/s]


Features saved to datasets/SIGNOR/processed/goa.pt


100%|██████████| 5291/5291 [00:00<00:00, 306600.66it/s]


Features saved to datasets/SIGNOR/processed/dnabert.pt


100%|██████████| 5291/5291 [00:00<00:00, 709147.52it/s]


Features saved to datasets/SIGNOR/processed/gene2vec.pt


100%|██████████| 5291/5291 [00:00<00:00, 248580.93it/s]


Features saved to datasets/SIGNOR/processed/ontoprotein.pt


100%|██████████| 5291/5291 [00:00<00:00, 221325.26it/s]


Features saved to datasets/SIGNOR/processed/goa_gene2vec.pt


100%|██████████| 5291/5291 [00:00<00:00, 140082.83it/s]
 93%|█████████▎| 13/14 [01:05<00:03,  3.70s/it]

Features saved to datasets/SIGNOR/processed/goa_dnabert.pt


100%|██████████| 18480/18480 [00:00<00:00, 109015.59it/s]


Features saved to datasets/STRING/processed/goa.pt


100%|██████████| 18480/18480 [00:00<00:00, 258329.52it/s]


Features saved to datasets/STRING/processed/dnabert.pt


100%|██████████| 18480/18480 [00:00<00:00, 519387.13it/s]


Features saved to datasets/STRING/processed/gene2vec.pt


100%|██████████| 18480/18480 [00:00<00:00, 186566.07it/s]


Features saved to datasets/STRING/processed/ontoprotein.pt


100%|██████████| 18480/18480 [00:00<00:00, 138578.82it/s]


Features saved to datasets/STRING/processed/goa_gene2vec.pt


100%|██████████| 18480/18480 [00:00<00:00, 19187.89it/s]
100%|██████████| 14/14 [01:12<00:00,  5.21s/it]

Features saved to datasets/STRING/processed/goa_dnabert.pt





In [10]:
import os
import csv
import json

# Define the directory to search
directory = "results/dev"# Define the output CSV file
output_csv = "all_results.csv"# Initialize a list to store the results
results = []

for root, dirs, files in os.walk(directory):
    for file in files:
        if"final_scores"in file:
            filepath = os.path.join(root, file)
            with open(filepath, "r") as f:
                data = json.load(f)
                
                # Filter the data for "test" task_id and calculate averages
                test_scores = [entry for entry in data if entry['task_id'] == 'test']
                
                if test_scores:
                    ap_scores = [entry['score_value'] for entry in test_scores if entry['score_type'] == 'AP']
                    apop_scores = [entry['score_value'] for entry in test_scores if entry['score_type'] == 'APOP']
                    auroc_scores = [entry['score_value'] for entry in test_scores if entry['score_type'] == 'AUROC']
                    
                    avg_ap = sum(ap_scores) / len(ap_scores) if ap_scores else None
                    avg_apop = sum(apop_scores) / len(apop_scores) if apop_scores else None
                    avg_auroc = sum(auroc_scores) / len(auroc_scores) if auroc_scores else None
                    
                    results.append({
                        "filepath": filepath,
                        "test AP": avg_ap,
                        "test APOP": avg_apop,
                        "test AUROC": avg_auroc
                    })

with open(output_csv, "w", newline='') as csvfile:
    fieldnames = ["filepath", "test AP", "test APOP", "test AUROC"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for result in results:
        writer.writerow(result)

print(f"Results saved to {output_csv}")


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [54]:
# 
a =pd.read_csv("results/dev/SIGNOR-DISEASES-MLP+Node2vec/run_0/final_scores_goa_gene2vec.csv")
print('AP',np.mean(list(a[(a['split'] == 'test') & (a['score_type'] == 'AP')]['score_value'])))
print('APOP',np.mean(list(a[(a['split'] == 'test') & (a['score_type'] == 'APOP')]['score_value'])))
print('AUROC',np.mean(list(a[(a['split'] == 'test') & (a['score_type'] == 'AUROC')]['score_value'])))

FileNotFoundError: [Errno 2] No such file or directory: 'results/dev/SIGNOR-DISEASE-MLP+Node2vec/run_0/final_scores_goa_gene2vec.csv'

APOP 0.6054598269598751
AP 0.03931396241974209
AUROC 0.5282329824464075
