In [140]:

# source: https://ogb.stanford.edu/docs/linkprop/#ogbl-collab

from ogb.linkproppred import LinkPropPredDataset

dataset = LinkPropPredDataset(name = 'ogbl-biokg')

split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
graph = dataset[0]
graph_backup = graph.copy()

In [141]:
metadata = {
    "description": "An example of heterogeneous dataset.",
    "citation": "",
    "data": {
        "Node": {
        },
        "Edge": {

        },
        "Graph": {
            "_NodeList": {
                "file": "example.npz",
                "key": "node_list"
            },
            "_EdgeList": {
                "file": "example.npz",
                "key": "edge_list"
            }
        }
    }
}



In [142]:
import json
import numpy as np
import torch
import scipy.sparse as sparse

In [143]:
valid_edge['relation']
print(graph.keys())

dict_keys(['edge_index_dict', 'edge_feat_dict', 'node_feat_dict', 'num_nodes_dict', 'edge_reltype'])


In [144]:
print(graph.keys())
processed_keys = set()
node_count = dict()
edge_idx_dict = dict()
for pair in graph['edge_index_dict'].keys():
    processed_keys.add(pair[0])
    #for i in range
len(graph['edge_reltype'])

dict_keys(['edge_index_dict', 'edge_feat_dict', 'node_feat_dict', 'num_nodes_dict', 'edge_reltype'])


51

In [145]:
node_set = set()

for pair in graph['edge_index_dict'].keys():
    node_set.add(pair[0])
    node_set.add(pair[2])

In [146]:
print(graph['num_nodes_dict'])
node_dict = {}
node_cumsum = 0
for key in graph['num_nodes_dict'].keys():
    node_dict[key] = np.arange(node_cumsum, node_cumsum+graph['num_nodes_dict'][key])
    print(node_cumsum)
    node_cumsum = node_dict[key][-1]+1

{'disease': 10687, 'drug': 10533, 'function': 45085, 'protein': 17499, 'sideeffect': 9969}
0
10687
21220
66305
83804


In [147]:
sum(graph['edge_index_dict'][('disease','disease-protein','protein')][1,:] == 0)

2

In [148]:
link_dict = {}
for pair in graph['edge_index_dict']:
    link_dict[pair] = graph['edge_index_dict'][pair]
    link_dict[pair][0, :] += node_dict[pair[0]][0]
    link_dict[pair][1, :] += node_dict[pair[2]][0]

In [149]:
len(link_dict)

51

In [150]:
for key in node_dict.keys():
    metadata['data']['Node'][key] = {'_ID':{'file': 'ogbl-biokg.npz', 'key': key+'_id'}}
for key in link_dict.keys():
    metadata['data']['Edge'][key[1]] = {'_ID':{'file': 'ogbl-biokg.npz', 'key': key[1]+'_id'}, '_Edge': {'file': 'ogbl-biokg.npz', 'key':key[1]}}

In [151]:
data = {}
for key in node_dict.keys():
    data[key+'_id'] = node_dict[key]
edge_cumsum = 0
for key in link_dict.keys():
    data[key[1]+'_id'] = np.arange(edge_cumsum, edge_cumsum + link_dict[key].shape[1])
    edge_cumsum = data[key[1]+'_id'][-1]+1
    data[key[1]] = link_dict[key].T
data['node_list'] = np.ones((1,node_cumsum))

data['edge_list'] = np.ones((1, edge_cumsum))





In [72]:
biokg = np.load("ogbl-biokg.npz", allow_pickle=True)

In [71]:
json.dump(metadata, open("metadata.json", "w"), indent=4)

In [169]:
train_edge.keys()
train_set = []
edge_type_list = list(graph_backup['edge_reltype'])
for idx in range(len(train_edge['relation'])):
    edge_type = edge_type_list[train_edge['relation'][idx]]
    head_type = train_edge['head_type'][idx]
    head_id = data[head_type+'_id'][train_edge['head'][idx]]
    tail_type = train_edge['tail_type'][idx]
    tail_id = data[tail_type+'_id'][train_edge['tail'][idx]]
    edge_mat = data[edge_type[1]]
    edge_index = np.where((edge_mat[:,0] == head_id) & (edge_mat[:,1] == tail_id))
    train_set.append(edge_index[0][0])

KeyboardInterrupt: 

In [184]:
valid_edge.keys()
val_set = []
edge_type_list = list(graph_backup['edge_reltype'])
total_edge = len(data['edge_list'][0])
for idx in range(len(valid_edge['relation'])):
    edge_type = edge_type_list[valid_edge['relation'][idx]]
    head_type = valid_edge['head_type'][idx]
    head_id = data[head_type+'_id'][valid_edge['head'][idx]]
    tail_type = valid_edge['tail_type'][idx]
    tail_id = data[tail_type+'_id'][valid_edge['tail'][idx]]
    edge_mat = data[edge_type[1]]
    data[edge_type[1]] = np.append(edge_mat, [[head_id, tail_id]], axis=0)
    data[edge_type[1]+'_id'] = np.append(data[edge_type[1]+'_id'], [total_edge], axis=0)
    val_set.append(total_edge)
    total_edge += 1

('disease', 'disease-protein', 'protein')
3935
3935
1987
68292
[    0     1     2 ... 73544 73545 73546]
('disease', 'disease-protein', 'protein')
287
287
530
66835
[      0       1       2 ...   73545   73546 4762678]
('disease', 'disease-protein', 'protein')
1094
1094
17013
83318
[      0       1       2 ...   73546 4762678 4762679]
('disease', 'disease-protein', 'protein')
495
495
1521
67826
[      0       1       2 ... 4762678 4762679 4762680]
('disease', 'disease-protein', 'protein')
4794
4794
13055
79360
[      0       1       2 ... 4762679 4762680 4762681]
('disease', 'disease-protein', 'protein')
5477
5477
2973
69278
[      0       1       2 ... 4762680 4762681 4762682]
('disease', 'disease-protein', 'protein')
1602
1602
7288
73593
[      0       1       2 ... 4762681 4762682 4762683]
('disease', 'disease-protein', 'protein')
3523
3523
12138
78443
[      0       1       2 ... 4762682 4762683 4762684]
('disease', 'disease-protein', 'protein')
1036
1036
3786
70091
[      0       

In [189]:
test_edge.keys()
test_set = []
edge_type_list = list(graph_backup['edge_reltype'])
total_edge = len(data['edge_list'][0])
for idx in range(len(test_edge['relation'][0:10])):
    edge_type = edge_type_list[test_edge['relation'][idx]]
    head_type = test_edge['head_type'][idx]
    head_id = data[head_type+'_id'][test_edge['head'][idx]]
    tail_type = test_edge['tail_type'][idx]
    tail_id = data[tail_type+'_id'][test_edge['tail'][idx]]
    edge_mat = data[edge_type[1]]
    data[edge_type[1]] = np.append(edge_mat, [[head_id, tail_id]], axis=0)
    data[edge_type[1]+'_id'] = np.append(data[edge_type[1]+'_id'], [total_edge], axis=0)
    test_set.append(total_edge)
    total_edge += 1

In [190]:
data['edge_list'] = np.ones((1, total_edge))
np.savez_compressed("ogbl-biokg.npz", **data)

[4762678,
 4762679,
 4762680,
 4762681,
 4762682,
 4762683,
 4762684,
 4762685,
 4762686,
 4762687]

In [8]:
task = {
    "description": "The task is to predict new triplets given the training triplets. The evaluation protocol is exactly the same as ogbl-wikikg2, except that here we only consider ranking against entities of the same type. For instance, when corrupting head entities of the protein type, we only consider negative protein entities.",
    "type": "LinkPrediction",

    "target": "Edge/EdgeType",
    "train_set": {
        "file": "ogbl-biokg_task.npz",
        "key": "train"
    },
    "val_set": {
        "file": "ogbl-biokg_task.npz",
        "key": "val"
    },
    "test_set": {
        "file": "ogbl-biokg_task.npz",
        "key": "test"
    }
}

In [9]:
task_data = {
    "train": train_set,
    "val": val_set,
    "test": test_set
}
np.savez_compressed("ogbl-biokg_task.npz", **task_data)

with open("./task.json", "w") as fp:
    json.dump(task, fp, indent=4)

{'disease', 'drug', 'function', 'protein'}