## Data preprocessing 

In [9]:
import ast
import json
import os

import pandas as pd

In [3]:
# Read the edge and node tables of the property graph from their CSV files.
# NOTE(review): the displayed frames carry an "Unnamed: 0" column, which
# suggests the CSVs were written with their index; passing index_col=0 would
# drop it — confirm nothing relies on that column before changing the call.
edges_csv = "data/pg/edges.csv"
nodes_csv = "data/pg/nodes.csv"

edges = pd.read_csv(edges_csv)
nodes = pd.read_csv(nodes_csv)


In [5]:
# Display the edges DataFrame as this cell's output.
edges

Unnamed: 0,start,end,type,properties
0,1,3,DOCTORAL_ADVISOR,{}
1,1,4,EMPLOYER,{}
2,1,8,EMPLOYER,{}
3,1,6,KNOWN_FOR,{}
4,1,7,AWARDED,{}
5,3,9,EMPLOYER,{}
6,4,5,LOCATED_IN,{}
7,8,5,LOCATED_IN,{}


In [6]:
# Display the nodes DataFrame as this cell's output.
nodes

Unnamed: 0,id,label,name,type,properties
0,1,Person,Albert Einstein,ENTITY,"{""birthPlace"":""Ulm""}"
1,2,City,Ulm,ENTITY,"{""country"":""Germany""}"
2,3,Person,Alfred Kleiner,ENTITY,{}
3,4,University,University of Bern,ENTITY,"{""location"":""Bern""}"
4,5,City,Bern,ENTITY,"{""country"":""Switzerland""}"
5,6,ScientificTheory,Theory of Relativity,ENTITY,"{""field"":""Physics""}"
6,7,Award,Nobel Prize in Physics,ENTITY,"{""year"":1921}"
7,8,Organization,Patent Office Bern,ENTITY,"{""location"":""Bern""}"
8,9,University,University of Zurich,ENTITY,{}


In [10]:
def _parse_properties(raw):
    """Turn one raw ``properties`` cell into a Python object.

    Non-string cells (already-parsed dicts, missing values) are returned
    unchanged, so re-running this cell is harmless. The strings in the CSV
    appear to be JSON-encoded, so ``json.loads`` is tried first: unlike
    ``ast.literal_eval`` it understands ``true``/``false``/``null``. Strings
    that are not valid JSON fall back to ``ast.literal_eval`` so Python-literal
    dicts keep working.

    Args:
        raw: The cell value read from the ``properties`` column.

    Returns:
        The parsed object for string input, otherwise ``raw`` itself.

    Raises:
        ValueError, SyntaxError: If a string is neither valid JSON nor a
            valid Python literal (raised by ``ast.literal_eval``).
    """
    if not isinstance(raw, str):
        return raw
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return ast.literal_eval(raw)


# Convert string properties to dictionaries
edges["properties"] = edges["properties"].apply(_parse_properties)
nodes["properties"] = nodes["properties"].apply(_parse_properties)

# Create a set to store vocabulary
vocab = set()

# Add edge types
vocab.update(edges["type"].unique())

# Add edge property keys (cells that are not dicts, e.g. missing values, are skipped)
for prop in edges["properties"]:
    if isinstance(prop, dict):
        vocab.update(prop.keys())

# Add node labels
vocab.update(nodes["label"].unique())

# Add node property keys and values (non-dict cells are skipped as above)
for prop in nodes["properties"]:
    if not isinstance(prop, dict):
        continue
    for key, value in prop.items():
        vocab.add(key)
        vocab.add(str(value))  # Convert values to string for set compatibility

# Print vocabulary
print(vocab)

{'field', 'AWARDED', 'KNOWN_FOR', 'birthPlace', 'University', 'EMPLOYER', 'LOCATED_IN', 'country', 'DOCTORAL_ADVISOR', 'City', 'Person', 'Bern', 'Organization', 'Ulm', '1921', 'ScientificTheory', 'Germany', 'Physics', 'location', 'year', 'Award', 'Switzerland'}


## Tokenization and embeddings

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Gather the distinct tokens the tokenizer produces across every vocabulary entry
tokenized_vocab = {
    piece
    for entry in vocab
    for piece in tokenizer.tokenize(entry)
}

# Show the resulting set of tokens
print(tokenized_vocab)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Fetch the tokenizer and the encoder from the same pretrained checkpoint
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

def get_embedding(text, dim=512):
    """Embed ``text`` as a fixed-length NumPy vector.

    ``text`` is encoded with the module-level ``tokenizer`` and passed through
    the module-level ``model``. The model's ``last_hidden_state`` is averaged
    along axis 1 (assumed to be the token axis — TODO confirm), and the
    resulting feature vector is resampled to ``dim`` entries with adaptive
    average pooling.

    Args:
        text: Input handed to ``tokenizer``; this notebook passes a single
            string per call.
        dim: Length of the returned vector. Defaults to 512, the value that
            was previously hard-coded.

    Returns:
        A NumPy array of shape ``(dim,)`` for a single input text.

    Raises:
        ValueError: If ``dim`` is smaller than 1.
    """
    if dim < 1:
        raise ValueError(f"dim must be a positive integer, got {dim!r}")

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    # Average along axis 1, then drop the singleton batch axis.
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze()

    # adaptive_avg_pool1d pools over the last axis and needs a leading axis.
    pooled = F.adaptive_avg_pool1d(embedding.unsqueeze(0), dim)

    # Remove only the axis added above, so dim == 1 still yields shape (1,).
    return pooled.squeeze(0).numpy()

# Convert string properties to dictionaries (cells that are already parsed pass through unchanged)
nodes["properties"] = nodes["properties"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Length of the vectors returned by get_embedding; the zero placeholder must match it.
EMBEDDING_DIM = 512


def _embed_properties(props, select):
    """Embed the keys or the values of one node's properties.

    Args:
        props: The ``properties`` cell of a node, expected to be a dict.
        select: Callable returning the items to embed from ``props``
            (``dict.keys`` or ``dict.values``).

    Returns:
        The embedding of the selected items joined by single spaces, or a
        zero vector of length ``EMBEDDING_DIM`` when ``props`` is empty or is
        not a dict (for example a missing value).
    """
    if not isinstance(props, dict) or not props:
        return torch.zeros(EMBEDDING_DIM).numpy()
    # str() keeps the join working for non-string keys or values.
    return get_embedding(" ".join(str(item) for item in select(props)))


# Compute embeddings
label_embeddings = nodes["label"].apply(get_embedding)
key_embeddings = nodes["properties"].apply(lambda props: _embed_properties(props, dict.keys))
value_embeddings = nodes["properties"].apply(lambda props: _embed_properties(props, dict.values))

# Add embeddings to DataFrame
nodes["label_embedding"] = label_embeddings
nodes["key_embedding"] = key_embeddings
nodes["value_embedding"] = value_embeddings

# Print result
print(nodes.head())