## Reading in the data and putting it into adjacency matrices

In [1]:
# import packages
import scipy
import scipy.sparse
import torch
from rdflib import Graph, Literal

This is for the AIFB+ data, but I will make it more general later on, so everything works.

### Read in the graph:

**Note:** the provided training, validation and test sets seem to include only the labels, as triples of the form "Node" "Employs" "ID"; I think the ID is the class here.

In [2]:
# Load the knowledge graph with rdflib; the serialisation format is not passed
# explicitly, so rdflib has to work it out for the .nt file on its own.
graph_test = Graph()
graph_test.parse(source="data/aifb/aifb+.nt")

# sanity check: a Graph's length is its number of triples
print(len(graph_test))

29043


Note: this will be a very large adjacency matrix, but for the actual computation, it can be sliced.

In [3]:
# Collect the distinct nodes: sets deduplicate, so their sizes give the dimensions
# of the adjacency matrix. Relations are ignored here; they only matter once the
# R-GCN is built.
entities = set()
literals = set()
# every literal occurrence is counted, to compare the total against the unique count
nr_literals_total = 0

for head, relation, tail in graph_test:
    # a head can ONLY be an entity, so it never needs a type check
    entities.add(head)

    # the tail goes into whichever set matches its type
    if not isinstance(tail, Literal):
        entities.add(tail)
        continue

    literals.add(tail)
    nr_literals_total += 1

In [4]:
# report the node counts: unique literals vs. literal occurrences shows how often values repeat
for label, count in (
    ('Number of entities: ', len(entities)),
    ('Number of unique literals:', len(literals)),
    ('Number of literals in total: ', nr_literals_total),
):
    print(label, count)

Number of entities:  2835
Number of unique literals: 5468
Number of literals in total:  8705


### Put it into adjacency matrix (tensors) - disregarding literals:

In [5]:
# literals are disregarded in this variant: one row and one column per entity
number_nodes = len(entities)
adjacency_matrix = torch.zeros((number_nodes, number_nodes))

In [6]:
# display the freshly allocated matrix (nothing has been added to it yet)
adjacency_matrix

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [7]:
# Build both lookup directions once: calling '.index()' for every triple would be a
# linear scan each time (O(n^2) overall). The price is the memory for two dictionaries.
map_ent_to_ind = {}
map_ind_to_ent = {}

# start value; only survives the loop below if there are no entities at all
current = 0

for current, entity in enumerate(entities):
    map_ent_to_ind[entity] = current
    map_ind_to_ent[current] = entity

# leave 'current' at the next free index (= number of indices handed out)
current = len(map_ind_to_ent)

In [8]:
# Count every relational (entity -> entity) triple in the dense adjacency matrix.
# Rows are heads, columns are tails; triples with a Literal tail are skipped.
# NOTE: this adds onto the existing matrix, so re-running the cell without
# re-creating the zero matrix first counts every triple twice.
for head, relation, tail in graph_test:
    if isinstance(tail, Literal):
        continue

    row_selected = map_ent_to_ind[head]
    column_selected = map_ent_to_ind[tail]
    # '+= 1' rather than '= 1': several triples may connect the same pair of nodes
    adjacency_matrix[row_selected, column_selected] += 1

In [9]:
# total number of counted triples; Tensor.sum() reduces all elements in one call
# instead of looping over the rows with Python's builtin sum
adjacency_matrix.sum()

tensor(20338.)

In [10]:
# column sums: rows are heads and columns are tails, so this is the number of
# incoming triples per node
adjacency_matrix.sum(dim=0)

tensor([ 3.,  1., 23.,  ..., 12.,  5., 27.])

**Sparse version - example, similar process done for each technique:**

In [11]:
# the sparse example again uses one row/column per entity
number_nodes = len(entities)

In [12]:
# Build both lookup directions once: calling '.index()' for every triple would be a
# linear scan each time (O(n^2) overall). The price is the memory for two dictionaries.
map_ent_to_ind = {}
map_ind_to_ent = {}

# start value; only survives the loop below if there are no entities at all
current = 0

for current, entity in enumerate(entities):
    map_ent_to_ind[entity] = current
    map_ind_to_ent[current] = entity

# leave 'current' at the next free index (= number of indices handed out)
current = len(map_ind_to_ent)

In [13]:
# Parallel lists in coordinate (COO) layout: entry k says
# "add values[k] at row heads[k], column tails[k]".
heads = []
tails = []
values = []

for head, relation, tail in graph_test:
    # only relational triples are kept; literal tails are skipped
    if isinstance(tail, Literal):
        continue

    # translate both ends of the edge into matrix coordinates
    row_selected = map_ent_to_ind[head]
    column_selected = map_ent_to_ind[tail]

    heads.append(row_selected)
    tails.append(column_selected)
    values.append(1.)

In [14]:
# assemble the sparse matrix from the (value, (row, column)) triplets collected above
adjacency_matrix = scipy.sparse.coo_matrix(
    (values, (heads, tails)),
    shape=(number_nodes, number_nodes),
)

In [15]:
# merge repeated (row, column) entries by adding their values; this changes the matrix in place
adjacency_matrix.sum_duplicates()

In [16]:
# inspect the stored (explicit) values of the sparse matrix
adjacency_matrix.data

array([1., 1., 1., ..., 1., 1., 1.])

In [17]:
# column sums (incoming triples per node), computed on the sparse matrix itself
# instead of densifying it first; '.A1' flattens the 1 x N numpy matrix that
# 'sum(axis=0)' returns into a plain 1-D array
adjacency_matrix.sum(axis=0).A1

array([ 3.,  1., 23., ..., 12.,  5., 27.])

In [18]:
# total over all stored entries; no dense copy and no Python-level loop needed
adjacency_matrix.sum()

20338.0

### Put it into adjacency matrix (tensors) - collapsing literals:

In [19]:
# collapsed literals: one node per entity plus one node per *unique* literal
number_nodes = len(entities) + len(literals)
adjacency_matrix = torch.zeros((number_nodes, number_nodes))

In [20]:
# display the freshly allocated matrix (nothing has been added to it yet)
adjacency_matrix

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [21]:
# One shared index space: entities get the first indices, unique literals follow.
map_ent_to_ind = {}
map_ind_to_ent = {}

# start value; only survives the loops below if both sets are empty
current = 0

for current, entity in enumerate(entities):
    map_ent_to_ind[entity] = current
    map_ind_to_ent[current] = entity

# literals continue counting where the entities stopped
for current, literal in enumerate(literals, start=len(map_ind_to_ent)):
    map_ent_to_ind[literal] = current
    map_ind_to_ent[current] = literal

# leave 'current' at the next free index (= number of indices handed out)
current = len(map_ind_to_ent)

In [22]:
# Entities and literals share one index map here, so every triple is handled the same way.
for head, relation, tail in graph_test:
    row_selected, column_selected = map_ent_to_ind[head], map_ent_to_ind[tail]
    adjacency_matrix[row_selected, column_selected] += 1

In [23]:
# column sums: number of triples pointing at each node (entity indices first, literal indices after)
adjacency_matrix.sum(dim=0)

tensor([ 3.,  1., 23.,  ...,  1.,  1.,  1.])

### Put it into adjacency matrix (tensors) - separating literals:

In [24]:
# separated literals: one node per entity plus one node per literal *occurrence*
number_nodes = len(entities) + nr_literals_total
adjacency_matrix = torch.zeros((number_nodes, number_nodes))

In [25]:
# Only the entities are indexed up front in this variant.
map_ent_to_ind = {}
map_ind_to_ent = {}

# start value; only survives the loop below if there are no entities at all
current = 0

for current, entity in enumerate(entities):
    map_ent_to_ind[entity] = current
    map_ind_to_ent[current] = entity

# leave 'current' at the next free index; the cell that fills the matrix keeps counting from it
current = len(map_ind_to_ent)

In [26]:
# Add every triple to the adjacency matrix. Entities keep the indices assigned
# earlier; each literal occurrence gets a fresh index, continuing from the same
# 'current' that the entity mapping left behind.
for head, relation, tail in graph_test:
    # heads are always entities, so the row lookup is the same for both kinds of triple
    row_selected = map_ent_to_ind[head]

    if not isinstance(tail, Literal):
        # relational triple: the tail is an entity that already has an index
        column_selected = map_ent_to_ind[tail]
    else:
        # literal triple: this occurrence gets its own new column
        # (a separated literal node has exactly one connection)
        column_selected = current
        current += 1

        # Register the literal itself. Storing the stale loop variable 'entity' here
        # (as was done before) overwrote that entity's index and never recorded the literal.
        map_ind_to_ent[column_selected] = tail
        # the same literal value can occur several times, so this keeps its most recent index
        map_ent_to_ind[tail] = column_selected

    adjacency_matrix[row_selected, column_selected] += 1

In [27]:
# column sums: number of triples pointing at each node (entity indices first, literal occurrences after)
adjacency_matrix.sum(dim=0)

tensor([ 3.,  1., 23.,  ...,  1.,  1.,  1.])

### Try it for multiple datasets, so everything works smoothly:

I moved the code into a separate module (`reading_data`); here I test whether it works for both AIFB+ and MUTAG.

In [28]:
import reading_data

### AIFB+

**Not relational:**

In [29]:
# AIFB+, dense, literal_representation="filtered"
adj_mat_fil, map_ent_fil = reading_data.create_adjacency_matrix_nt_dense(
    "data/aifb/aifb+.nt",
    literal_representation="filtered",
)
print(adj_mat_fil.shape)
adj_mat_fil.sum()

torch.Size([2835, 2835])


tensor(20338.)

In [30]:
# AIFB+, dense, literal_representation="all-to-one"
adj_mat_all, map_ent_all = reading_data.create_adjacency_matrix_nt_dense(
    "data/aifb/aifb+.nt",
    literal_representation="all-to-one",
)
print(adj_mat_all.shape)
adj_mat_all.sum()

torch.Size([2836, 2836])


tensor(29043.)

In [31]:
# AIFB+, dense, literal_representation="collapsed"
adj_mat_col, map_ent_col = reading_data.create_adjacency_matrix_nt_dense(
    "data/aifb/aifb+.nt",
    literal_representation="collapsed",
)
print(adj_mat_col.shape)
adj_mat_col.sum()

torch.Size([8303, 8303])


tensor(29043.)

In [32]:
# AIFB+, dense, literal_representation="separate"
adj_mat_sep, map_ent_sep = reading_data.create_adjacency_matrix_nt_dense(
    "data/aifb/aifb+.nt",
    literal_representation="separate",
)
print(adj_mat_sep.shape)
adj_mat_sep.sum()

torch.Size([11540, 11540])


tensor(29043.)

**Relational:**

In [33]:
# AIFB+, dense, relational, literal_representation="filtered";
# with relational=True a relation mapping is returned as a third value
adj_mat_fil, map_ent_fil, map_rel_fil = reading_data.create_adjacency_matrix_nt_dense(
    "data/aifb/aifb+.nt",
    literal_representation="filtered",
    relational=True,
)
print(adj_mat_fil.shape)
adj_mat_fil.sum()

torch.Size([2835, 2835, 22])


tensor(20338.)

In [34]:
# reduce over the two node axes; what remains is one total per entry of the last axis
adj_mat_fil.sum(dim=[0,1])

tensor([1.9900e+02, 1.9000e+02, 3.9860e+03, 5.7100e+02, 9.5200e+02, 1.0000e+01,
        6.8000e+01, 2.0000e+02, 1.2170e+03, 4.1630e+03, 1.5200e+02, 4.1240e+03,
        9.5200e+02, 3.5700e+02, 5.0000e+00, 3.3900e+02, 6.5000e+01, 7.9000e+01,
        1.5200e+02, 7.9000e+01, 2.4770e+03, 1.0000e+00])

In [35]:
# AIFB+, dense, relational, literal_representation="all-to-one"
adj_mat_all, map_ent_all, map_rel_all = reading_data.create_adjacency_matrix_nt_dense(
    "data/aifb/aifb+.nt",
    literal_representation="all-to-one",
    relational=True,
)
print(adj_mat_all.shape)
adj_mat_all.sum()

torch.Size([2836, 2836, 45])


tensor(29043.)

In [36]:
# reduce over the two node axes; what remains is one total per entry of the last axis
adj_mat_all.sum(dim=[0,1])

tensor([1.6100e+02, 1.9900e+02, 1.9000e+02, 2.9800e+02, 3.9860e+03, 5.7100e+02,
        7.9000e+01, 9.5200e+02, 1.0000e+01, 1.2000e+01, 1.4800e+02, 6.8000e+01,
        2.0000e+02, 3.1100e+02, 4.9000e+01, 1.2170e+03, 4.1630e+03, 2.2700e+02,
        1.5200e+02, 4.1240e+03, 1.5000e+01, 2.3900e+02, 5.0000e+01, 9.5200e+02,
        1.2270e+03, 1.6000e+01, 7.6500e+02, 1.1400e+02, 5.3400e+02, 3.5700e+02,
        5.0000e+00, 3.3900e+02, 6.5000e+01, 2.2700e+02, 2.0200e+02, 7.5900e+02,
        1.2270e+03, 7.9000e+01, 1.4500e+02, 1.5200e+02, 1.3020e+03, 1.2900e+02,
        2.4770e+03, 5.4800e+02, 1.0000e+00])

In [37]:
# AIFB+, dense, relational, literal_representation="collapsed"
adj_mat_col, map_ent_col, map_rel_col = reading_data.create_adjacency_matrix_nt_dense(
    "data/aifb/aifb+.nt",
    literal_representation="collapsed",
    relational=True,
)
print(adj_mat_col.shape)
adj_mat_col.sum()

torch.Size([8303, 8303, 45])


tensor(29043.)

In [38]:
# reduce over the two node axes; what remains is one total per entry of the last axis
adj_mat_col.sum(dim=[0,1])

tensor([1.6100e+02, 1.9900e+02, 1.9000e+02, 2.9800e+02, 3.9860e+03, 5.7100e+02,
        7.9000e+01, 9.5200e+02, 1.0000e+01, 1.2000e+01, 1.4800e+02, 6.8000e+01,
        2.0000e+02, 3.1100e+02, 4.9000e+01, 1.2170e+03, 4.1630e+03, 2.2700e+02,
        1.5200e+02, 4.1240e+03, 1.5000e+01, 2.3900e+02, 5.0000e+01, 9.5200e+02,
        1.2270e+03, 1.6000e+01, 7.6500e+02, 1.1400e+02, 5.3400e+02, 3.5700e+02,
        5.0000e+00, 3.3900e+02, 6.5000e+01, 2.2700e+02, 2.0200e+02, 7.5900e+02,
        1.2270e+03, 7.9000e+01, 1.4500e+02, 1.5200e+02, 1.3020e+03, 1.2900e+02,
        2.4770e+03, 5.4800e+02, 1.0000e+00])

In [39]:
# AIFB+, dense, relational, literal_representation="separate"
adj_mat_sep, map_ent_sep, map_rel_sep = reading_data.create_adjacency_matrix_nt_dense(
    "data/aifb/aifb+.nt",
    literal_representation="separate",
    relational=True,
)
print(adj_mat_sep.shape)
adj_mat_sep.sum()

torch.Size([11540, 11540, 45])


tensor(29043.)

In [40]:
# reduce over the two node axes; what remains is one total per entry of the last axis
adj_mat_sep.sum(dim=[0,1])

tensor([1.6100e+02, 1.9900e+02, 1.9000e+02, 2.9800e+02, 3.9860e+03, 5.7100e+02,
        7.9000e+01, 9.5200e+02, 1.0000e+01, 1.2000e+01, 1.4800e+02, 6.8000e+01,
        2.0000e+02, 3.1100e+02, 4.9000e+01, 1.2170e+03, 4.1630e+03, 2.2700e+02,
        1.5200e+02, 4.1240e+03, 1.5000e+01, 2.3900e+02, 5.0000e+01, 9.5200e+02,
        1.2270e+03, 1.6000e+01, 7.6500e+02, 1.1400e+02, 5.3400e+02, 3.5700e+02,
        5.0000e+00, 3.3900e+02, 6.5000e+01, 2.2700e+02, 2.0200e+02, 7.5900e+02,
        1.2270e+03, 7.9000e+01, 1.4500e+02, 1.5200e+02, 1.3020e+03, 1.2900e+02,
        2.4770e+03, 5.4800e+02, 1.0000e+00])

### MUTAG

**Not relational:**

In [41]:
# MUTAG, dense, literal_representation="filtered"
adj_mat_fil, map_ent_fil = reading_data.create_adjacency_matrix_nt_dense(
    "data/mutag/mutag.nt",
    literal_representation="filtered",
)
print(adj_mat_fil.shape)
adj_mat_fil.sum()

torch.Size([22540, 22540])


tensor(63382.)

In [None]:
# MUTAG, dense, literal_representation="all-to-one"
adj_mat_all, map_ent_all = reading_data.create_adjacency_matrix_nt_dense(
    "data/mutag/mutag.nt",
    literal_representation="all-to-one",
)
print(adj_mat_all.shape)
adj_mat_all.sum()

In [None]:
# MUTAG, dense, literal_representation="collapsed"
adj_mat_col, map_ent_col = reading_data.create_adjacency_matrix_nt_dense(
    "data/mutag/mutag.nt",
    literal_representation="collapsed",
)
print(adj_mat_col.shape)
adj_mat_col.sum()

In [None]:
# MUTAG, dense, literal_representation="separate"
adj_mat_sep, map_ent_sep = reading_data.create_adjacency_matrix_nt_dense(
    "data/mutag/mutag.nt",
    literal_representation="separate",
)
print(adj_mat_sep.shape)
adj_mat_sep.sum()

**Relational:**

In [None]:
# MUTAG, dense, relational, literal_representation="filtered"
adj_mat_fil, map_ent_fil, map_rel_fil = reading_data.create_adjacency_matrix_nt_dense(
    "data/mutag/mutag.nt",
    literal_representation="filtered",
    relational=True,
)
print(adj_mat_fil.shape)
adj_mat_fil.sum()

In [None]:
# reduce over the two node axes; what remains is one total per entry of the last axis
adj_mat_fil.sum(dim=[0,1])

**The kernel keeps dying when the following code is run, due to excessive memory use — switch to sparse matrices to fix this error!**

In [None]:
# MUTAG, dense, relational, literal_representation="collapsed"
adj_mat_col, map_ent_col, map_rel_col = reading_data.create_adjacency_matrix_nt_dense(
    "data/mutag/mutag.nt",
    literal_representation="collapsed",
    relational=True,
)
print(adj_mat_col.shape)
adj_mat_col.sum()

In [None]:
# reduce over the two node axes; what remains is one total per entry of the last axis
adj_mat_col.sum(dim=[0,1])

In [None]:
# MUTAG, dense, relational, literal_representation="separate"
adj_mat_sep, map_ent_sep, map_rel_sep = reading_data.create_adjacency_matrix_nt_dense(
    "data/mutag/mutag.nt",
    literal_representation="separate",
    relational=True,
)
print(adj_mat_sep.shape)
adj_mat_sep.sum()

In [None]:
# reduce over the two node axes; what remains is one total per entry of the last axis
adj_mat_sep.sum(dim=[0,1])

### dmg777k

In [None]:
import reading_data
# dmg777k (stripped), dense, literal_representation="filtered"
adj_mat_fil, map_ent_fil = reading_data.create_adjacency_matrix_nt_dense(
    "data/dmg777k/dmg777k_stripped.nt",
    literal_representation="filtered",
)
print(adj_mat_fil.shape)
# total over all entries, reduced along the rows first
adj_mat_fil.sum(dim=0).sum()

In [None]:
# dmg777k (stripped), dense, literal_representation="collapsed"
adj_mat_col, map_ent_col = reading_data.create_adjacency_matrix_nt_dense(
    "data/dmg777k/dmg777k_stripped.nt",
    literal_representation="collapsed",
)
print(adj_mat_col.shape)
# total over all entries, reduced along the rows first
adj_mat_col.sum(dim=0).sum()

In [None]:
# dmg777k (stripped), dense, literal_representation="separate"
adj_mat_sep, map_ent_sep = reading_data.create_adjacency_matrix_nt_dense(
    "data/dmg777k/dmg777k_stripped.nt",
    literal_representation="separate",
)
print(adj_mat_sep.shape)
# Total over all entries, written like the other two dmg777k cells. The previous
# '.sum().sum()' reduced an already fully reduced result a second time.
adj_mat_sep.sum(dim=0).sum()