## Reading data - putting it into adjacency matrices and performing some statistic tests

In [1]:
# import packages
import torch
from rdflib import Graph, Literal
import gzip

# import python files
import reading_data

This is for the AIFB+ data, but I will make it more general later on, so everthing works.

### Read in the graph:

**Note:** when I use the training, validation and test set given, it only includes the labels? "Node" "Employs" "ID", ID is the class here I think.

In [3]:
# create a graph with rdflib --> reads .nt files automatically
graph_test = Graph()
#graph_test.parse("data/aifb/aifb+.nt")

with gzip.open("data/aifb/gz_files/aifb+.nt.gz", 'r') as gf:
  graph_test.parse(data=gf.read(), format='nt')

# print the amount of triples as a test
print(len(graph_test))

29043


Note: this will be a very large adjacency matrix, but for the actual computation, it can be sliced.

In [4]:
# make two sets(!) to find out the amount of unique nodes and literals --> to create the adjacency matrix
# for now, disregard the relations --> are included later on, when R-GCN is created
entities = set()
literals = set()
nr_literals_total = 0 # count -- so the ratio between unique and total is clear

for head, relation, tail in graph_test:
    # note: heads can ONLY be entities
    
    entities.add(head)
    
    # add the tail to its respective set
    if isinstance(tail, Literal):
        literals.add(tail)
        nr_literals_total += 1
    else:
        entities.add(tail)

In [5]:
print('Number of entities: ', len(entities))
print('Number of unique literals:', len(literals))
print('Number of literals in total: ', nr_literals_total)

Number of entities:  2835
Number of unique literals: 5468
Number of literals in total:  8705


### Put it into adjacency matrix (tensors) - disregarding literals:

In [5]:
number_nodes = len(entities)
adjacency_matrix = torch.zeros(number_nodes, number_nodes)

In [6]:
adjacency_matrix

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [7]:
# as calling '.index()' every time would result in a O(n^2) complexity, first the mapping is created
# so it's faster to look up both from entity to index and from index to entity
# there might be a better way to do this --> this takes up memory!
map_ent_to_ind = dict()
map_ind_to_ent = dict()

# increments so every entity gets a different index
current = 0

for entity in list(entities):
    map_ent_to_ind[entity] = current
    map_ind_to_ent[current] = entity
    current += 1

In [8]:
indices = [[],[]]
values = []
size = [number_nodes, number_nodes]

for head, relation, tail in graph_test:
    # check whether this is a relational triple:
    if not isinstance(tail, Literal):
        # find out where one needs to be added:
        row_selected = map_ent_to_ind[head]
        column_selected = map_ent_to_ind[tail]
        adjacency_matrix[row_selected, column_selected] += 1

In [9]:
sum(sum(adjacency_matrix))

tensor(20338.)

In [10]:
sum(adjacency_matrix)

tensor([1., 2., 1.,  ..., 1., 0., 2.])

### Put it into adjacency matrix (sparse) - disregarding literals:

#### Will not show this for all techniques, as the general technique is the same --> creating lists, making a sparse tensor out of that list.

In [11]:
number_nodes = len(entities)

In [12]:
# as calling '.index()' every time would result in a O(n^2) complexity, first the mapping is created
# so it's faster to look up both from entity to index and from index to entity
# there might be a better way to do this --> this takes up memory!
map_ent_to_ind = dict()
map_ind_to_ent = dict()

# increments so every entity gets a different index
current = 0

for entity in list(entities):
    map_ent_to_ind[entity] = current
    map_ind_to_ent[current] = entity
    current += 1

In [13]:
# named differently in the function itself, in the separate file
heads = list()
tails = list()
values = list()

for head, relation, tail in graph_test:
    # check whether this is a relational triple:
    if not isinstance(tail, Literal):
        # find out where one needs to be added:
        row_selected = map_ent_to_ind[head]
        column_selected = map_ent_to_ind[tail]
        
        heads.append(row_selected)
        tails.append(column_selected)
        values.append(1.)

In [14]:
adjacency_matrix = adjacency_matrix = torch.sparse_coo_tensor(indices=torch.tensor([heads, tails]),values=torch.tensor(values), size=(number_nodes, number_nodes))

In [15]:
adjacency_matrix

tensor(indices=tensor([[2777,  330, 1626,  ..., 2112,  267,  474],
                       [1158, 1676, 1026,  ..., 1418, 2269, 2069]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(2835, 2835), nnz=20338, layout=torch.sparse_coo)

In [16]:
torch.sparse.sum(adjacency_matrix)

tensor(20338.)

In [17]:
torch.sparse.sum(adjacency_matrix, dim=0)

tensor(indices=tensor([[   0,    1,    2,  ..., 2831, 2832, 2834]]),
       values=tensor([ 1.,  2.,  1.,  ..., 35.,  1.,  2.]),
       size=(2835,), nnz=2640, layout=torch.sparse_coo)

### Put it into adjacency matrix (tensors) - collapsing literals:

In [18]:
number_nodes = len(entities) + len(literals)
adjacency_matrix = torch.zeros(number_nodes, number_nodes)

In [19]:
adjacency_matrix

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [20]:
map_ent_to_ind = dict()
map_ind_to_ent = dict()

# increments so every entity gets a different index
current = 0

for entity in list(entities):
    map_ent_to_ind[entity] = current
    map_ind_to_ent[current] = entity
    current += 1

for literal in list(literals):
    map_ent_to_ind[literal] = current
    map_ind_to_ent[current] = literal
    current += 1

In [21]:
# treat everything the same
for head, relation, tail in graph_test:
    row_selected = map_ent_to_ind[head]
    column_selected = map_ent_to_ind[tail]
    adjacency_matrix[row_selected, column_selected] += 1

In [22]:
adjacency_matrix.sum(dim=0)

tensor([1., 2., 1.,  ..., 1., 1., 1.])

### Put it into adjacency matrix (tensors) - separating literals:

In [23]:
number_nodes = len(entities) + nr_literals_total
adjacency_matrix = torch.zeros(number_nodes, number_nodes)

In [24]:
map_ent_to_ind = dict()
map_ind_to_ent = dict()

# increments so every entity gets a different index
current = 0

for entity in list(entities):
    map_ent_to_ind[entity] = current
    map_ind_to_ent[current] = entity
    current += 1

In [25]:
# add everything to the adjency matrix - use the same 'current' as before
for head, relation, tail in graph_test:
    # check whether this is a relational triple:
    if not isinstance(tail, Literal):
        row_selected = map_ent_to_ind[head]
        column_selected = map_ent_to_ind[tail]
        adjacency_matrix[row_selected, column_selected] += 1
    # else, add it in a new row (literals only have one connection):
    else:
        row_selected = map_ent_to_ind[head]
        column_selected = current
        
        map_ent_to_ind[entity] = current
        map_ind_to_ent[current] = entity
        
        adjacency_matrix[row_selected, column_selected] += 1
        current += 1

In [26]:
adjacency_matrix.sum(dim=0)

tensor([1., 2., 1.,  ..., 1., 1., 1.])

### Try it for multiple datasets, so everything works smoothly:

I put it in a separate file - testing this method here as well - whether it works for both AIFB+ and MUTAG. 

## AIFB+

### Not relational:

**Filtered:**

In [27]:
# dense
adj_mat_fil, map_ent_fil, map_ent_to_ind_fil = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="filtered")
print(adj_mat_fil.shape)
adj_mat_fil.sum()

torch.Size([2835, 2835])


tensor(20338.)

In [28]:
# sparse
adj_mat_fil, map_ent_fil, map_ent_to_ind_fil = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="filtered", sparse=True)
print(adj_mat_fil.size())
torch.sparse.sum(adj_mat_fil)

torch.Size([2835, 2835])


tensor(20338.)

**All-to-one:**

In [29]:
# dense
adj_mat_all, map_ent_all, map_ent_to_ind_all = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="all-to-one")
print(adj_mat_all.shape)
adj_mat_all.sum()

torch.Size([2836, 2836])


tensor(29043.)

In [30]:
# sparse
adj_mat_all, map_ent_all, map_ent_to_ind_all = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="all-to-one", sparse=True)
print(adj_mat_all.size())
torch.sparse.sum(adj_mat_all)

torch.Size([2836, 2836])


tensor(29043.)

**Collapsed:**

In [31]:
# dense
adj_mat_col, map_ent_col, map_ent_to_ind_col = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="collapsed")
print(adj_mat_col.shape)
adj_mat_col.sum()

torch.Size([8303, 8303])


tensor(29043.)

In [32]:
# sparse
adj_mat_col, map_ent_col, map_ent_to_ind_col = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="collapsed", sparse=True)
print(adj_mat_col.size())
torch.sparse.sum(adj_mat_col)

torch.Size([8303, 8303])


tensor(29043.)

**Separate:**

In [33]:
# dense
adj_mat_sep, map_ent_sep, map_ent_to_ind_sep = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="separate")
print(adj_mat_sep.shape)
adj_mat_sep.sum()

torch.Size([11540, 11540])


tensor(29043.)

In [34]:
# sparse
adj_mat_sep, map_ent_sep, map_ent_to_ind_sep= reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="separate", sparse=True)
print(adj_mat_sep.size())
torch.sparse.sum(adj_mat_sep)

torch.Size([11540, 11540])


tensor(29043.)

### Relational:

As a quick test, I look whether the number of relations is correct --> very hard to check whether it all gets to the same thing, so if it is correct, but this is a quick check to see whether everything works properly, and is mapped properly.

**Filtered:**

In [35]:
# dense
adj_mat_fil, map_ent_fil, map_ent_to_ind_fil, map_rel_fil = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="filtered", relational=True)
print(adj_mat_fil.shape)
print(adj_mat_fil.sum().item())
adj_mat_fil.sum(dim=[0,1])

torch.Size([2835, 2835, 22])
20338.0


tensor([1.5200e+02, 3.3900e+02, 9.5200e+02, 7.9000e+01, 4.1630e+03, 2.0000e+02,
        1.0000e+00, 5.7100e+02, 6.8000e+01, 4.1240e+03, 1.9900e+02, 6.5000e+01,
        5.0000e+00, 1.2170e+03, 3.5700e+02, 1.5200e+02, 7.9000e+01, 3.9860e+03,
        2.4770e+03, 9.5200e+02, 1.9000e+02, 1.0000e+01])

In [36]:
# sparse
adj_mat_fil, map_ent_fil, map_ent_to_ind_fil, map_rel_fil = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="filtered", relational=True, sparse=True)
print(adj_mat_fil.size())
print(torch.sparse.sum(adj_mat_fil).item())
torch.sparse.sum(adj_mat_fil, dim=[0,1]).values()

torch.Size([2835, 2835, 22])
20338.0


tensor([1.5200e+02, 3.3900e+02, 9.5200e+02, 7.9000e+01, 4.1630e+03, 2.0000e+02,
        1.0000e+00, 5.7100e+02, 6.8000e+01, 4.1240e+03, 1.9900e+02, 6.5000e+01,
        5.0000e+00, 1.2170e+03, 1.5200e+02, 3.5700e+02, 7.9000e+01, 3.9860e+03,
        2.4770e+03, 9.5200e+02, 1.9000e+02, 1.0000e+01])

**All-to-one:**

In [37]:
# dense
adj_mat_all, map_ent_all, map_ent_to_ind_all, map_rel_all = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="all-to-one", relational=True)
print(adj_mat_all.shape)
print(adj_mat_all.sum().item())
adj_mat_all.sum(dim=[0,1])

torch.Size([2836, 2836, 45])
29043.0


tensor([1.3020e+03, 1.5200e+02, 3.3900e+02, 9.5200e+02, 2.3900e+02, 7.9000e+01,
        4.1630e+03, 2.2700e+02, 5.4800e+02, 2.0000e+02, 1.2270e+03, 4.9000e+01,
        1.0000e+00, 1.5000e+01, 2.2700e+02, 1.1400e+02, 5.7100e+02, 7.5900e+02,
        6.8000e+01, 1.4500e+02, 5.0000e+01, 1.0000e+01, 1.9900e+02, 4.1240e+03,
        1.2270e+03, 6.5000e+01, 1.2900e+02, 5.0000e+00, 3.1100e+02, 1.2170e+03,
        3.5700e+02, 7.9000e+01, 1.5200e+02, 1.6000e+01, 7.6500e+02, 3.9860e+03,
        2.9800e+02, 1.6100e+02, 2.4770e+03, 2.0200e+02, 1.2000e+01, 9.5200e+02,
        1.9000e+02, 1.4800e+02, 5.3400e+02])

In [38]:
# sparse
adj_mat_all, map_ent_all, map_ent_to_ind_all, map_rel_all = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="all-to-one", relational=True, sparse=True)
print(adj_mat_all.size())
print(torch.sparse.sum(adj_mat_all).item())
torch.sparse.sum(adj_mat_all, dim=[0,1]).values()

torch.Size([2836, 2836, 45])
29043.0


tensor([1.3020e+03, 1.5200e+02, 3.3900e+02, 9.5200e+02, 2.3900e+02, 7.9000e+01,
        4.1630e+03, 2.2700e+02, 5.4800e+02, 2.0000e+02, 1.2270e+03, 4.9000e+01,
        1.0000e+00, 1.5000e+01, 2.2700e+02, 1.1400e+02, 5.7100e+02, 7.5900e+02,
        6.8000e+01, 1.4500e+02, 5.0000e+01, 1.0000e+01, 4.1240e+03, 1.9900e+02,
        1.2270e+03, 6.5000e+01, 1.2900e+02, 5.0000e+00, 3.1100e+02, 1.2170e+03,
        3.5700e+02, 1.5200e+02, 7.9000e+01, 1.6000e+01, 7.6500e+02, 3.9860e+03,
        2.9800e+02, 1.6100e+02, 2.4770e+03, 2.0200e+02, 1.2000e+01, 9.5200e+02,
        1.9000e+02, 1.4800e+02, 5.3400e+02])

**Collapsed:**

In [39]:
# dense
adj_mat_col, map_ent_col, map_ent_to_ind_col, map_rel_col = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="collapsed", relational=True)
print(adj_mat_col.shape)
print(adj_mat_col.sum().item())
adj_mat_col.sum(dim=[0,1])

torch.Size([8303, 8303, 45])
29043.0


tensor([1.3020e+03, 1.5200e+02, 3.3900e+02, 9.5200e+02, 2.3900e+02, 7.9000e+01,
        4.1630e+03, 2.2700e+02, 5.4800e+02, 2.0000e+02, 1.2270e+03, 4.9000e+01,
        1.0000e+00, 1.5000e+01, 2.2700e+02, 1.1400e+02, 5.7100e+02, 7.5900e+02,
        6.8000e+01, 1.4500e+02, 5.0000e+01, 1.0000e+01, 1.9900e+02, 4.1240e+03,
        1.2270e+03, 6.5000e+01, 1.2900e+02, 5.0000e+00, 3.1100e+02, 1.2170e+03,
        1.5200e+02, 3.5700e+02, 7.9000e+01, 1.6000e+01, 7.6500e+02, 3.9860e+03,
        2.9800e+02, 1.6100e+02, 2.4770e+03, 2.0200e+02, 1.2000e+01, 9.5200e+02,
        1.9000e+02, 1.4800e+02, 5.3400e+02])

In [40]:
# sparse
adj_mat_col, map_ent_col, map_ent_to_ind_col, map_rel_col = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="collapsed", relational=True, sparse=True)
print(adj_mat_col.size())
print(torch.sparse.sum(adj_mat_col).item())
torch.sparse.sum(adj_mat_col, dim=[0,1]).values()

torch.Size([8303, 8303, 45])
29043.0


tensor([1.3020e+03, 1.5200e+02, 3.3900e+02, 9.5200e+02, 2.3900e+02, 7.9000e+01,
        4.1630e+03, 2.2700e+02, 5.4800e+02, 2.0000e+02, 1.2270e+03, 4.9000e+01,
        1.0000e+00, 1.5000e+01, 2.2700e+02, 1.1400e+02, 5.7100e+02, 7.5900e+02,
        6.8000e+01, 1.4500e+02, 5.0000e+01, 1.0000e+01, 4.1240e+03, 1.9900e+02,
        1.2270e+03, 6.5000e+01, 1.2900e+02, 5.0000e+00, 3.1100e+02, 1.2170e+03,
        3.5700e+02, 1.5200e+02, 7.9000e+01, 1.6000e+01, 7.6500e+02, 3.9860e+03,
        2.9800e+02, 1.6100e+02, 2.4770e+03, 2.0200e+02, 1.2000e+01, 9.5200e+02,
        1.9000e+02, 1.4800e+02, 5.3400e+02])

**Separate:**

In [41]:
# dense
adj_mat_sep, map_ent_sep, map_ent_to_ind_sep, map_rel_sep = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="separate", relational=True)
print(adj_mat_sep.shape)
print(adj_mat_sep.sum().item())
adj_mat_sep.sum(dim=[0,1])

torch.Size([11540, 11540, 45])
29043.0


tensor([1.3020e+03, 1.5200e+02, 3.3900e+02, 9.5200e+02, 2.3900e+02, 7.9000e+01,
        4.1630e+03, 2.2700e+02, 5.4800e+02, 2.0000e+02, 1.2270e+03, 4.9000e+01,
        1.0000e+00, 1.5000e+01, 2.2700e+02, 1.1400e+02, 5.7100e+02, 7.5900e+02,
        6.8000e+01, 1.4500e+02, 5.0000e+01, 1.0000e+01, 4.1240e+03, 1.9900e+02,
        1.2270e+03, 6.5000e+01, 1.2900e+02, 5.0000e+00, 3.1100e+02, 1.2170e+03,
        3.5700e+02, 1.5200e+02, 7.9000e+01, 1.6000e+01, 7.6500e+02, 3.9860e+03,
        2.9800e+02, 1.6100e+02, 2.4770e+03, 2.0200e+02, 1.2000e+01, 9.5200e+02,
        1.9000e+02, 1.4800e+02, 5.3400e+02])

In [42]:
# sparse
adj_mat_sep, map_ent_sep, map_ent_to_ind_sep,  map_rel_sep = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="separate", relational=True, sparse=True)
print(adj_mat_sep.size())
print(torch.sparse.sum(adj_mat_sep).item())
torch.sparse.sum(adj_mat_sep, dim=[0,1]).values()

torch.Size([11540, 11540, 45])
29043.0


tensor([1.3020e+03, 1.5200e+02, 3.3900e+02, 9.5200e+02, 2.3900e+02, 7.9000e+01,
        4.1630e+03, 2.2700e+02, 5.4800e+02, 2.0000e+02, 1.2270e+03, 4.9000e+01,
        1.0000e+00, 1.5000e+01, 2.2700e+02, 1.1400e+02, 5.7100e+02, 7.5900e+02,
        6.8000e+01, 1.4500e+02, 5.0000e+01, 1.0000e+01, 4.1240e+03, 1.9900e+02,
        1.2270e+03, 6.5000e+01, 1.2900e+02, 5.0000e+00, 3.1100e+02, 1.2170e+03,
        3.5700e+02, 1.5200e+02, 7.9000e+01, 1.6000e+01, 7.6500e+02, 3.9860e+03,
        2.9800e+02, 1.6100e+02, 2.4770e+03, 2.0200e+02, 1.2000e+01, 9.5200e+02,
        1.9000e+02, 1.4800e+02, 5.3400e+02])

## MUTAG

As the dense versions of MUTAG take long to create, and sometimes crash due to the excessive memory use, for the MUTAG part, only the sparse COO matrices are used to demonstrate that they are correctly read in.

### Not relational:

In [43]:
adj_mat_fil, map_ent_to_ind_fil, map_ent_fil = reading_data.create_adjacency_matrix_nt("data/mutag/mutag.nt", literal_representation="filtered", sparse=True)
print(adj_mat_fil.size())
torch.sparse.sum(adj_mat_fil)

torch.Size([22540, 22540])


tensor(63382.)

In [44]:
adj_mat_all, map_ent_to_ind_all, map_ent_all = reading_data.create_adjacency_matrix_nt("data/mutag/mutag.nt", literal_representation="all-to-one", sparse=True)
print(adj_mat_all.size())
torch.sparse.sum(adj_mat_all)

torch.Size([22541, 22541])


tensor(74227.)

In [45]:
adj_mat_col, map_ent_to_ind_col, map_ent_col = reading_data.create_adjacency_matrix_nt("data/mutag/mutag.nt", literal_representation="collapsed", sparse=True)
print(adj_mat_col.size())
torch.sparse.sum(adj_mat_col)

torch.Size([23644, 23644])


tensor(74227.)

In [46]:
adj_mat_sep, map_ent_to_ind_sep, map_ent_sep = reading_data.create_adjacency_matrix_nt("data/mutag/mutag.nt", literal_representation="separate", sparse=True)
print(adj_mat_sep.size())
torch.sparse.sum(adj_mat_sep)

torch.Size([33385, 33385])


tensor(74227.)

### Relational:

**Filtered:**

In [47]:
adj_mat_fil, map_ent_fil, map_ent_to_ind_fil, map_rel_fil = reading_data.create_adjacency_matrix_nt("data/mutag/mutag.nt", literal_representation="filtered", relational=True, sparse=True)
print(adj_mat_fil.size())
print(torch.sparse.sum(adj_mat_fil).item())
torch.sparse.sum(adj_mat_fil, dim=[0,1])

torch.Size([22540, 22540, 9])
63382.0


tensor(indices=tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8]]),
       values=tensor([6.0000e+00, 1.9000e+01, 2.2534e+04, 1.3800e+02,
                      3.5260e+03, 1.9000e+01, 9.1890e+03, 1.8634e+04,
                      9.3170e+03]),
       size=(9,), nnz=9, layout=torch.sparse_coo)

**All-to-one:**

In [48]:
adj_mat_all, map_ent_all, map_ent_to_ind_all, map_rel_all = reading_data.create_adjacency_matrix_nt("data/mutag/mutag.nt", literal_representation="all-to-one", relational=True, sparse=True)
print(adj_mat_all.size())
print(torch.sparse.sum(adj_mat_all).item())
torch.sparse.sum(adj_mat_all, dim=[0,1])

torch.Size([22541, 22541, 23])
74227.0


tensor(indices=tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
                        14, 15, 16, 17, 18, 19, 20, 21, 22]]),
       values=tensor([6.0000e+00, 1.9000e+01, 1.3000e+01, 6.1000e+01,
                      3.0500e+02, 2.8300e+02, 5.0000e+00, 3.5000e+01,
                      2.9300e+02, 3.5260e+03, 3.4000e+02, 9.1890e+03,
                      1.9000e+01, 1.3000e+01, 2.2534e+04, 2.0300e+02,
                      1.3800e+02, 2.5000e+01, 9.3170e+03, 5.5000e+01,
                      2.5000e+01, 9.1890e+03, 1.8634e+04]),
       size=(23,), nnz=23, layout=torch.sparse_coo)

**Collapsed:**

In [49]:
adj_mat_col, map_ent_col, map_ent_to_ind_col, map_rel_col = reading_data.create_adjacency_matrix_nt("data/mutag/mutag.nt", literal_representation="collapsed", relational=True, sparse=True)
print(adj_mat_col.size())
print(torch.sparse.sum(adj_mat_col).item())
torch.sparse.sum(adj_mat_col, dim=[0,1])

torch.Size([23644, 23644, 23])
74227.0


tensor(indices=tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
                        14, 15, 16, 17, 18, 19, 20, 21, 22]]),
       values=tensor([6.0000e+00, 1.9000e+01, 1.3000e+01, 6.1000e+01,
                      3.0500e+02, 2.8300e+02, 5.0000e+00, 3.5000e+01,
                      2.9300e+02, 3.5260e+03, 3.4000e+02, 9.1890e+03,
                      1.9000e+01, 1.3000e+01, 2.2534e+04, 2.0300e+02,
                      1.3800e+02, 2.5000e+01, 9.3170e+03, 5.5000e+01,
                      2.5000e+01, 9.1890e+03, 1.8634e+04]),
       size=(23,), nnz=23, layout=torch.sparse_coo)

**Separate:**

In [50]:
adj_mat_sep, map_ent_sep, map_rel_sep = reading_data.create_adjacency_matrix_nt("data/mutag/mutag.nt", literal_representation="separate", relational=True, sparse=True)
print(adj_mat_sep.size())
print(torch.sparse.sum(adj_mat_sep).item())
torch.sparse.sum(adj_mat_sep, dim=[0,1])

torch.Size([33385, 33385, 23])
74227.0


tensor(indices=tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
                        14, 15, 16, 17, 18, 19, 20, 21, 22]]),
       values=tensor([6.0000e+00, 1.9000e+01, 1.3000e+01, 6.1000e+01,
                      3.0500e+02, 2.8300e+02, 5.0000e+00, 3.5000e+01,
                      2.9300e+02, 3.5260e+03, 3.4000e+02, 9.1890e+03,
                      1.9000e+01, 1.3000e+01, 2.2534e+04, 2.0300e+02,
                      1.3800e+02, 2.5000e+01, 9.3170e+03, 5.5000e+01,
                      2.5000e+01, 9.1890e+03, 1.8634e+04]),
       size=(23,), nnz=23, layout=torch.sparse_coo)

### dmg777k

**Not relational:**

In [2]:
adj_mat_fil, map_ent_fil, map_ent_to_ind_fil = reading_data.create_adjacency_matrix_nt("data/dmg777k/dmg777k_stripped.nt", literal_representation="filtered", sparse=True)
print(adj_mat_fil.size())
torch.sparse.sum(adj_mat_fil, dim=[0,1])

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: month must be in 1..12
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # typ

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: day is out of range for month
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)

torch.Size([148127, 148127])


tensor(288379.)

In [6]:
adj_mat_fil

tensor(indices=tensor([[135729,  13622,  17660,  ...,  92671,  14010,  74795],
                       [ 18314, 144158,  18314,  ...,   8724,  18314,    608]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(148127, 148127), nnz=288379, layout=torch.sparse_coo)

In [3]:
adj_mat_all, map_ent_all, map_ent_to_ind_all = reading_data.create_adjacency_matrix_nt("data/dmg777k/dmg777k_stripped.nt", literal_representation="all-to-one", sparse=True)
print(adj_mat_all.size())
torch.sparse.sum(adj_mat_all, dim=[0,1])

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: month must be in 1..12
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # typ

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: day is out of range for month
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)

torch.Size([148128, 148128])


tensor(777124.)

In [7]:
adj_mat_all

tensor(indices=tensor([[ 30118, 135729,  97534,  ...,  74795, 115950,  14051],
                       [148127,  18314, 148127,  ...,    608, 148127, 148127]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(148128, 148128), nnz=777124, layout=torch.sparse_coo)

In [4]:
adj_mat_col, map_ent_col, map_ent_to_ind_col = reading_data.create_adjacency_matrix_nt("data/dmg777k/dmg777k_stripped.nt", literal_representation="collapsed", sparse=True)
print(adj_mat_col.size())
torch.sparse.sum(adj_mat_col, dim=[0,1])

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: month must be in 1..12
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # typ

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: day is out of range for month
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)

torch.Size([343219, 343219])


tensor(777124.)

In [8]:
adj_mat_col

tensor(indices=tensor([[ 30118, 135729,  97534,  ...,  74795, 115950,  14051],
                       [265629,  18314, 171189,  ...,    608, 247682, 249376]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(343219, 343219), nnz=777124, layout=torch.sparse_coo)

In [5]:
adj_mat_sep, map_ent_sep, map_ent_to_ind_sep = reading_data.create_adjacency_matrix_nt("data/dmg777k/dmg777k_stripped.nt", literal_representation="separate", sparse=True)
print(adj_mat_sep.size())
torch.sparse.sum(adj_mat_sep, dim=[0,1])

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: month must be in 1..12
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # typ

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: day is out of range for month
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)

torch.Size([636872, 636872])


tensor(777124.)

In [9]:
adj_mat_sep

tensor(indices=tensor([[ 30118, 135729,  97534,  ...,  74795, 115950,  14051],
                       [148127,  18314, 148128,  ...,    608, 636870, 636871]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(636872, 636872), nnz=777124, layout=torch.sparse_coo)

**Relational:**

In [11]:
adj_mat_fil, map_ent_fil, map_ent_to_ind_fil, map_rel_fil = reading_data.create_adjacency_matrix_nt("data/dmg777k/dmg777k_stripped.nt", literal_representation="filtered", sparse=True, relational=True)
print(adj_mat_fil.size())
print(torch.sparse.sum(adj_mat_fil).item())
torch.sparse.sum(adj_mat_fil, dim=[0,1])

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: month must be in 1..12
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # typ

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: day is out of range for month
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)

torch.Size([148127, 148127, 16])
288379.0


tensor(indices=tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
                        14, 15]]),
       values=tensor([  5755.,   2407., 130126.,   2407.,   2407.,   8396.,
                       21039.,   2376.,  47872.,   1079.,   8783.,   2373.,
                        1540.,   2407.,   1540.,  47872.]),
       size=(16,), nnz=16, layout=torch.sparse_coo)

In [12]:
adj_mat_all, map_ent_all, map_ent_to_ind_all, map_rel_all = reading_data.create_adjacency_matrix_nt("data/dmg777k/dmg777k_stripped.nt", literal_representation="all-to-one", sparse=True, relational=True)
print(adj_mat_all.size())
print(torch.sparse.sum(adj_mat_all).item())
torch.sparse.sum(adj_mat_all, dim=[0,1])

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: month must be in 1..12
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # typ

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: day is out of range for month
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)

torch.Size([148128, 148128, 60])
777124.0


tensor(indices=tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
                        14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
                        28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
                        42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                        56, 57, 58, 59]]),
       values=tensor([5.7042e+04, 5.7550e+03, 9.3300e+02, 4.6123e+04,
                      8.3960e+03, 3.7200e+02, 2.1200e+02, 2.4070e+03,
                      2.4070e+03, 7.0000e+00, 1.5400e+03, 8.3960e+03,
                      4.3964e+04, 1.0790e+03, 4.4216e+04, 8.7830e+03,
                      1.0000e+00, 8.3960e+03, 1.2520e+03, 3.4865e+04,
                      2.4070e+03, 1.3013e+05, 4.7384e+04, 1.8260e+04,
                      6.3000e+02, 2.3730e+03, 2.5800e+02, 4.5111e+04,
                      8.3960e+03, 8.3960e+03, 2.0866e+04, 2.6000e+02,
                      3.7100e+02, 8.3960e+03, 8.3960e+03, 1.6150e+03,
       

In [13]:
adj_mat_col, map_ent_col, map_ent_to_ind_col, map_rel_col = reading_data.create_adjacency_matrix_nt("data/dmg777k/dmg777k_stripped.nt", literal_representation="collapsed", sparse=True, relational=True)
print(adj_mat_col.size())
print(torch.sparse.sum(adj_mat_col).item())
torch.sparse.sum(adj_mat_col, dim=[0,1])

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: month must be in 1..12
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # typ

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: day is out of range for month
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)

torch.Size([343219, 343219, 60])
777124.0


tensor(indices=tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
                        14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
                        28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
                        42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                        56, 57, 58, 59]]),
       values=tensor([5.7042e+04, 5.7550e+03, 9.3300e+02, 4.6123e+04,
                      8.3960e+03, 3.7200e+02, 2.1200e+02, 2.4070e+03,
                      2.4070e+03, 7.0000e+00, 1.5400e+03, 8.3960e+03,
                      4.3964e+04, 1.0790e+03, 4.4216e+04, 8.7830e+03,
                      1.0000e+00, 8.3960e+03, 1.2520e+03, 3.4865e+04,
                      2.4070e+03, 1.3013e+05, 4.7384e+04, 1.8260e+04,
                      6.3000e+02, 2.3730e+03, 2.5800e+02, 4.5111e+04,
                      8.3960e+03, 8.3960e+03, 2.0866e+04, 2.6000e+02,
                      3.7100e+02, 8.3960e+03, 8.3960e+03, 1.6150e+03,
       

In [14]:
adj_mat_sep, map_ent_sep, map_ent_to_ind_sep, map_rel_sep = reading_data.create_adjacency_matrix_nt("data/dmg777k/dmg777k_stripped.nt", literal_representation="separate", sparse=True, relational=True)
print(adj_mat_sep.size())
print(torch.sparse.sum(adj_mat_sep).item())
torch.sparse.sum(adj_mat_sep, dim=[0,1])

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: month must be in 1..12
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # typ

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/isodate/isodates.py", line 201, in parse_date
    return date(sign * int(groups['year']),
ValueError: day is out of range for month
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x7fb6dac4c5e0>
Traceback (most recent call last):
  File "/Users/janneke/opt/anaconda3/envs/Master-Thesis-Graph-Representation/lib/python3.9/site-packages/rdflib/term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)

torch.Size([636872, 636872, 60])
777124.0


tensor(indices=tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
                        14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
                        28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
                        42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                        56, 57, 58, 59]]),
       values=tensor([5.7042e+04, 5.7550e+03, 9.3300e+02, 4.6123e+04,
                      8.3960e+03, 3.7200e+02, 2.1200e+02, 2.4070e+03,
                      2.4070e+03, 7.0000e+00, 1.5400e+03, 8.3960e+03,
                      4.3964e+04, 1.0790e+03, 4.4216e+04, 8.7830e+03,
                      1.0000e+00, 8.3960e+03, 1.2520e+03, 3.4865e+04,
                      2.4070e+03, 1.3013e+05, 4.7384e+04, 1.8260e+04,
                      6.3000e+02, 2.3730e+03, 2.5800e+02, 4.5111e+04,
                      8.3960e+03, 8.3960e+03, 2.0866e+04, 2.6000e+02,
                      3.7100e+02, 8.3960e+03, 8.3960e+03, 1.6150e+03,
       

## Reading in data - classes for the testing process --> so it can be used as a mask during training

The cells below are for developing the function.

In [3]:
# read in the train, valid and test set indices
graph_train = Graph()
graph_valid = Graph()
graph_test = Graph()

with gzip.open("data/aifb/gz_files/aifb+_train_set.nt.gz", 'r') as gf:
    graph_train.parse(data=gf.read(), format='nt')

with gzip.open("data/aifb/gz_files/aifb+_valid_set.nt.gz", 'r') as gf:
    graph_valid.parse(data=gf.read(), format='nt')
    
with gzip.open("data/aifb/gz_files/aifb+_test_set.nt.gz", 'r') as gf:
    graph_test.parse(data=gf.read(), format='nt')
    
graph_all = graph_train + graph_valid + graph_test

In [5]:
# read the matrix (to get the mapping and the number of nodes)
adj_mat_fil, map_ent_fil, map_ent_to_ind_fil = reading_data.create_adjacency_matrix_nt("data/aifb/aifb+.nt", literal_representation="filtered")
print(adj_mat_fil.shape)
print(adj_mat_fil.sum())

torch.Size([2835, 2835])
tensor(20338.)


In [6]:
# make a tensor of zeroes (everything without a class is seen as a zero)
labels = torch.zeros(adj_mat_fil.shape[0])
class_mapping = dict()
current = 1

# put these enumerators together, maybe we can append them in some way?
for head, relation, tail in graph_all:
    # get the index of head:
    head_index = map_ent_to_ind_fil[head]

    # if current class not yet in map, add it and increment
    if tail not in class_mapping:
        class_mapping[tail] = current
        labels[head_index] = current

        current += 1

    # otherwise, add the class index to the tensor at that point
    else:
        label = class_mapping[tail]
        labels[head_index] = label

### Testing this function:

In [13]:
# add the file names
file_name_train = "data/mutag/gz_files/mutag_train_set.nt.gz"
file_name_valid = "data/mutag/gz_files/mutag_valid_set.nt.gz"
file_name_test = "data/mutag/gz_files/mutag_test_set.nt.gz"

# read in graph
adj_mat_fil, map_ent_to_ind_fil, map_ent_fil = reading_data.create_adjacency_matrix_nt("data/mutag/gz_files/mutag.nt.gz", literal_representation="separate", sparse=True)

In [14]:
labels, train_entities, valid_entities, test_entities = reading_data.training_valid_test_set(file_name_train, file_name_valid, file_name_test, map_ent_fil, adj_mat_fil.size()[0])

In [15]:
labels[train_entities]

tensor([1., 1., 1., 1., 1., 2., 1., 2., 2., 1., 2., 2., 1., 1., 1., 1., 2., 1.,
        1., 2., 2., 1., 1., 1., 2., 1., 2., 1., 2., 2., 1., 2., 2., 1., 2., 2.,
        2., 2., 1., 1., 1., 1., 2., 2., 2., 2., 1., 1., 2., 2., 1., 1., 1., 1.,
        1., 2., 1., 1., 1., 1., 1., 2., 1., 1., 2., 2., 1., 1., 1., 1., 2., 2.,
        2., 2., 2., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 1., 2., 2.,
        1., 1., 1., 2., 1., 1., 2., 2., 2., 2., 1., 1., 1., 1., 2., 1., 1., 1.,
        2., 2., 2., 2., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1.,
        1., 2., 1., 1., 1., 2., 2., 1., 2., 2., 1., 2., 2., 1., 1., 2., 1., 1.,
        2., 1., 2., 1., 1., 1., 2., 1., 1., 2., 1., 1., 1., 1., 2., 2., 2., 1.,
        1., 1., 1., 2., 1., 2., 1., 2., 1., 1., 2., 1., 1., 1., 1., 2., 2., 1.,
        1., 2., 2., 1., 2., 1., 2., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        2., 1., 1., 1., 2., 1., 2., 1., 1., 1., 1., 1., 2., 1., 2., 1., 1., 2.,
        1., 1.])

In [16]:
labels[valid_entities]

tensor([1., 2., 2., 1., 1., 1., 2., 2., 1., 1., 2., 1., 2., 1., 1., 1., 1., 1.,
        1., 1., 1., 2., 1., 1., 1., 2., 2., 1., 1., 1., 1., 1., 2., 2., 2., 1.,
        2., 1., 2., 2., 2., 1., 2., 1., 1., 1., 1., 1., 2., 1., 2., 2., 2., 1.])

In [17]:
labels[test_entities]

tensor([1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 1.,
        1., 2., 2., 2., 2., 1., 1., 2., 1., 1., 2., 2., 1., 1., 2., 1., 1., 1.,
        2., 1., 1., 2., 2., 2., 1., 2., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
        2., 2., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 2.])