In [1]:
import json
import numpy as np
import pandas as pd

In [2]:
kg = pd.read_csv('../data/MIMIC_KG.tsv', sep='\t')
entities = pd.read_csv('../data/entities.tsv', sep='\t')
relations = pd.read_csv('../data/relations.tsv', sep='\t')

feats = np.load('../data/MIMIC_KG_RotatE_relation.npy')

In [5]:
kg

Unnamed: 0,normal,5,normal.1
0,normal,5,pleural
1,normal,5,effusion
2,normal,5,pneumothorax
3,normal,5,lung
4,normal,5,lungs
...,...,...,...
14458,cicatricial,1,thorax
14459,cicatricial,1,hemidiaphragms
14460,cicatricial,1,bronchiectasis
14461,cicatricial,1,pleura


In [17]:
class KGDataset:
    def __init__(self, kg_path, entity_path, relation_path, delimiter='\t', skip_first_line=False):
        self.delimiter = delimiter
        self.entity2id, self.n_entities = self.read_entity(entity_path)
        self.relation2id, self.n_relations = self.read_relation(relation_path)
        self.train = self.read_triple(kg_path, "train", skip_first_line)
        
    def read_entity(self, entity_path):
        with open(entity_path) as f:
            entity2id = {}
            for line in f:
                eid, entity = line.strip().split(self.delimiter)
                entity2id[entity] = int(eid)

        return entity2id, len(entity2id)

    def read_relation(self, relation_path):
        with open(relation_path) as f:
            relation2id = {}
            for line in f:
                rid, relation = line.strip().split(self.delimiter)
                relation2id[relation] = int(rid)

        return relation2id, len(relation2id)

    def read_triple(self, path, mode='train', skip_first_line=False, format_order=[0,1,2]):
        # mode: train/valid/test
        if path is None:
            return None

        print('Reading {} triples....'.format(mode))
        heads = []
        tails = []
        rels = []
        with open(path) as f:
            if skip_first_line:
                _ = f.readline()
            for line in f:
                triple = line.strip().split(self.delimiter)
                h, r, t = triple[format_order[0]], triple[format_order[1]], triple[format_order[2]]
                heads.append(self.entity2id[h])
                rels.append(self.relation2id[r])
                tails.append(self.entity2id[t])

        heads = np.array(heads, dtype=np.int64)
        tails = np.array(tails, dtype=np.int64)
        rels = np.array(rels, dtype=np.int64)
        print('Finished. Read {} {} triples.'.format(len(heads), mode))

        return (heads, rels, tails)

In [18]:
kg_path = '../data/MIMIC_KG.tsv'
entity_path = '../data/entities.tsv'
relation_path = '../data/relations.tsv'
kg_dataset = KGDataset(kg_path, entity_path, relation_path)

Reading train triples....
Finished. Read 14464 train triples.


In [20]:
kg_dataset.train

(array([  0,   0,   0, ..., 123, 123, 123]),
 array([0, 0, 0, ..., 3, 3, 3]),
 array([  0,   1,   2, ..., 106, 108, 123]))

In [25]:
np.unique(kg_dataset.train[0])

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123])

In [33]:
n = len(np.unique(kg_dataset.train[0]))
edges_feats = np.load('../data/MIMIC_KG_RotatE_relation.npy')
feats = np.zeros((n, n, 200))

In [31]:
edges_feats.shape

(5, 200)

In [45]:
for (h, r, t) in zip(kg_dataset.train[0], kg_dataset.train[1], kg_dataset.train[2]):
    feats[h, t] = edges_feats[r]

In [48]:
feats.shape

(124, 124, 200)

In [50]:
np.save('../data/MIMIC_KG_RotatE_relation_feats.npy', feats)