## Reading in the data and putting it into adjacency matrices

In [1]:
# import packages
import torch
from rdflib import Graph, Literal

This is for the AIFB+ data, but I will make it more general later on, so everything works.

### Read in the graph:

**Note:** when I use the given training, validation and test sets, they only seem to include the labels ("Node", "Employs", "ID"); I think "ID" is the class here.

In [2]:
# create a graph with rdflib --> reads .nt files automatically
# `graph_test` is the single source of triples iterated by the cells below
graph_test = Graph()
graph_test.parse("data/aifb/aifb+.nt")

# print the amount of triples as a test
print(len(graph_test))

29043


Note: this will be a very large adjacency matrix, but for the actual computation, it can be sliced.

In [3]:
# Collect the unique entities and unique literals in two sets; their sizes
# determine the dimensions of the adjacency matrices built further down.
# Relations are ignored for now --> they come back when the R-GCN is created.
entities, literals = set(), set()
nr_literals_total = 0  # every literal occurrence, so unique vs. total can be compared

for subj, _pred, obj in graph_test:
    # the head of a triple can ONLY be an entity
    entities.add(subj)

    # the tail is either another entity ...
    if not isinstance(obj, Literal):
        entities.add(obj)
        continue

    # ... or a literal, which is tracked separately and counted per occurrence
    literals.add(obj)
    nr_literals_total += 1

In [4]:
# Report the three counts gathered above, one per line.
for label, count in (
    ('Number of entities: ', len(entities)),
    ('Number of unique literals:', len(literals)),
    ('Number of literals in total: ', nr_literals_total),
):
    print(label, count)

Number of entities:  2835
Number of unique literals: 5468
Number of literals in total:  8705


### Put it into adjacency matrix (tensors) - disregarding literals:

In [5]:
# Literals are disregarded here, so the matrix has one row/column per entity.
number_nodes = len(entities)
adjacency_matrix = torch.zeros((number_nodes, number_nodes))

In [6]:
# display the freshly allocated matrix (all zeros at this point)
adjacency_matrix

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [7]:
# Looking positions up with '.index()' for every triple would be O(n^2), so
# both directions (entity -> index and index -> entity) are precomputed as dicts.
# There might be a better way to do this --> the two dicts take up memory!
map_ent_to_ind = {}
map_ind_to_ent = {}

# enumerate hands every entity its own consecutive index
for current, entity in enumerate(entities):
    map_ent_to_ind[entity] = current
    map_ind_to_ent[current] = entity

# leave 'current' at the first unused index
current = len(entities)

In [8]:
# Only relational (entity -> entity) triples are counted in this variant.
for subj, _pred, obj in graph_test:
    if isinstance(obj, Literal):
        continue
    # row = head, column = tail; repeated head/tail pairs accumulate
    adjacency_matrix[map_ent_to_ind[subj], map_ent_to_ind[obj]] += 1

In [9]:
# to show there are relations being added:
# summing over dim=0 collapses the rows (heads), leaving one total per column (tail),
# i.e. how many counted triples point at each node
adjacency_matrix.sum(dim=0)

tensor([13.,  1.,  5.,  ...,  9.,  1.,  3.])

### Put it into adjacency matrix (tensors) - collapsing literals:

In [10]:
# Collapsed variant: one node per entity plus one node per *unique* literal.
number_nodes = len(entities) + len(literals)
adjacency_matrix = torch.zeros((number_nodes, number_nodes))

In [11]:
# display the freshly allocated matrix (all zeros at this point)
adjacency_matrix

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [12]:
# Rebuild both lookup tables: entities take the first block of indices,
# unique literals are numbered directly after them.
map_ent_to_ind = {}
map_ind_to_ent = {}

for current, entity in enumerate(entities):
    map_ent_to_ind[entity] = current
    map_ind_to_ent[current] = entity

for current, literal in enumerate(literals, start=len(entities)):
    map_ent_to_ind[literal] = current
    map_ind_to_ent[current] = literal

# leave 'current' at the first unused index
current = len(entities) + len(literals)

In [13]:
# Literals have their own index now, so every triple is handled the same way:
# row = head, column = tail (entity or literal).
for subj, _pred, obj in graph_test:
    adjacency_matrix[map_ent_to_ind[subj], map_ent_to_ind[obj]] += 1

In [14]:
# per-column totals: how many triples point at each node (entity or unique literal)
adjacency_matrix.sum(dim=0)

tensor([13.,  1.,  5.,  ...,  1.,  1.,  1.])

### Put it into adjacency matrix (tensors) - separating literals:

In [15]:
# Separate variant: one node per entity plus one node per literal *occurrence*.
number_nodes = len(entities) + nr_literals_total
adjacency_matrix = torch.zeros((number_nodes, number_nodes))

In [16]:
# Start over with entity-only lookup tables; literals get their indices
# while the matrix is filled in the next cell.
map_ent_to_ind = {}
map_ind_to_ent = {}

for current, entity in enumerate(entities):
    map_ent_to_ind[entity] = current
    map_ind_to_ent[current] = entity

# the next cell keeps counting from here, so 'current' must be the first free index
current = len(entities)

In [17]:
# add everything to the adjacency matrix - use the same 'current' as before,
# i.e. literal nodes are numbered directly after the entities
for head, relation, tail in graph_test:
    row_selected = map_ent_to_ind[head]

    # check whether this is a relational triple:
    if not isinstance(tail, Literal):
        column_selected = map_ent_to_ind[tail]
    # else, give this literal occurrence a new index (literals only have one connection):
    else:
        column_selected = current

        # register the literal itself (the 'tail'), not the loop variable 'entity'
        # left over from the mapping cell -- that would overwrite the index of the
        # last entity on every literal triple
        # NOTE: a literal value that occurs several times gets several indices;
        # map_ent_to_ind can only remember the most recent one, map_ind_to_ent keeps all
        map_ent_to_ind[tail] = current
        map_ind_to_ent[current] = tail
        current += 1

    adjacency_matrix[row_selected, column_selected] += 1

In [18]:
# per-column totals: how many triples point at each node (entity or literal occurrence)
adjacency_matrix.sum(dim=0)

tensor([13.,  1.,  5.,  ...,  1.,  1.,  1.])

### Try it for multiple datasets, so everything works smoothly:

I moved this logic into a separate file (`reading_data`). Here I test whether it works for both AIFB+ and MUTAG.

In [19]:
import reading_data

### AIFB+

In [26]:
# AIFB+ with literals filtered out: build the matrix via the helper module,
# then show its shape and the sum over all entries.
adj_mat_fil, map_ent_fil = reading_data.create_adjacency_matrix_nt(
    "data/aifb/aifb+.nt",
    literal_representation="filtered",
)
print(adj_mat_fil.shape)
adj_mat_fil.sum(dim=0).sum()

torch.Size([2835, 2835])


tensor(20338.)

In [27]:
# AIFB+ with collapsed literals: build the matrix via the helper module,
# then show its shape and the sum over all entries.
adj_mat_col, map_ent_col = reading_data.create_adjacency_matrix_nt(
    "data/aifb/aifb+.nt",
    literal_representation="collapsed",
)
print(adj_mat_col.shape)
adj_mat_col.sum(dim=0).sum()

torch.Size([8303, 8303])


tensor(29043.)

In [28]:
# AIFB+ with separate literals: build the matrix via the helper module,
# then show its shape and the sum over all entries.
adj_mat_sep, map_ent_sep = reading_data.create_adjacency_matrix_nt(
    "data/aifb/aifb+.nt",
    literal_representation="separate",
)
print(adj_mat_sep.shape)
adj_mat_sep.sum().sum()

torch.Size([11540, 11540])


tensor(29043.)

### MUTAG

In [29]:
# MUTAG with literals filtered out: build the matrix via the helper module,
# then show its shape and the sum over all entries.
adj_mat_fil, map_ent_fil = reading_data.create_adjacency_matrix_nt(
    "data/mutag/mutag.nt",
    literal_representation="filtered",
)
print(adj_mat_fil.shape)
adj_mat_fil.sum(dim=0).sum()

torch.Size([22540, 22540])


tensor(63382.)

In [30]:
# MUTAG with collapsed literals: build the matrix via the helper module,
# then show its shape and the sum over all entries.
adj_mat_col, map_ent_col = reading_data.create_adjacency_matrix_nt(
    "data/mutag/mutag.nt",
    literal_representation="collapsed",
)
print(adj_mat_col.shape)
adj_mat_col.sum(dim=0).sum()

torch.Size([23644, 23644])


tensor(74227.)

In [31]:
# MUTAG with separate literals: build the matrix via the helper module,
# then show its shape and the sum over all entries.
adj_mat_sep, map_ent_sep = reading_data.create_adjacency_matrix_nt(
    "data/mutag/mutag.nt",
    literal_representation="separate",
)
print(adj_mat_sep.shape)
adj_mat_sep.sum().sum()

torch.Size([33385, 33385])


tensor(74227.)

## Separately for the kgbench - as it uses the data differently

In [None]:
# TODO: need to ask help from supervisor for this!