In [1]:
import json

import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Select the compute device: CUDA when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# MiniLM sentence encoder, loaded through sentence-transformers.
model_minilm = SentenceTransformer("all-MiniLM-L6-v2").to(device)

# LaBSE tokenizer and encoder, both loaded through transformers from the same
# checkpoint id.
labse_checkpoint = "sentence-transformers/LaBSE"
tokenizer_labse = AutoTokenizer.from_pretrained(labse_checkpoint)
model_labse = AutoModel.from_pretrained(labse_checkpoint).to(device)



In [106]:
# Attention-mask-weighted mean pooling over token embeddings
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, counting only unmasked positions.

    Args:
        model_output: Indexable object whose element 0 holds the token
            embeddings (presumably shaped (batch, tokens, hidden) -- confirm
            against the caller).
        attention_mask: Tensor with one fewer trailing dimension than the
            token embeddings; positions with value 0 contribute nothing.

    Returns:
        The masked sum over dim 1 divided by the mask total over dim 1. The
        divisor is clamped to at least 1e-9, so a fully masked row yields
        zeros instead of a division by zero.
    """
    token_embeddings = model_output[0]
    # Broadcast the mask across the trailing embedding dimension.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_total = (token_embeddings * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / mask_total

def _read_json(path):
    """Parse the JSON file at ``path``, decoding with the "utf-8-sig" codec."""
    with open(path, "r", encoding="utf-8-sig") as handle:
        return json.load(handle)


# Entity records for the English and French sides.
en_data = _read_json("data/records_dbp15k_en_clean.json")
fr_data = _read_json("data/records_dbp15k_fr_clean.json")

# Relationship records for the same two sides.
en_rel_data = _read_json("data/rel_dbp15k_en.json")
fr_rel_data = _read_json("data/rel_dbp15k_fr.json")

# Index the relationship records by their "source_uri" for direct lookup; when
# two records share a URI, the later one replaces the earlier one.
en_rel_dict = dict((rel["source_uri"], rel) for rel in en_rel_data)
fr_rel_dict = dict((rel["source_uri"], rel) for rel in fr_rel_data)


# Prepare sentences for property embeddings
def prepare_property_sentence(entry):
    """Render one entity record as the text that gets embedded for its properties.

    Args:
        entry: Mapping with an "n.uri" string and a "properties(n)" mapping of
            property name to value.

    Returns:
        A string of the form::

            Node name: <last URI path segment, underscores replaced by spaces>
            Properties:
            <TAB>- <key>: <value>.

        with one property line per item, in the mapping's iteration order.
        An empty property mapping leaves nothing after "Properties:\n".
    """
    # The name is computed outside the f-string: reusing the enclosing quote
    # character inside a replacement field only parses on Python 3.12+.
    node_name = entry["n.uri"].split("/")[-1].replace("_", " ")
    property_lines = "\n".join(
        f"\t- {key}: {value}." for key, value in entry["properties(n)"].items()
    )
    return f"Node name: {node_name}\nProperties:\n{property_lines}"


# Prepare sentences for relationship embeddings
def prepare_relationship_sentence(rel_data):
    """Render an entity's relationships as the text that gets embedded.

    Args:
        rel_data: Mapping that may contain "outgoing_relationships" and
            "incoming_relationships", each a list of mappings with a "type"
            and a "neighbor_uri". A missing key is treated as an empty list,
            so an empty mapping is accepted.

    Returns:
        A string of the form::

            Outgoing relations:
            <TAB>type: <type>, value: <neighbor name>.
            Incoming relations:
            <TAB>type: <type>, value: <neighbor name>.

        where the neighbor name is the last path segment of "neighbor_uri"
        with underscores replaced by spaces, or ``None`` when the URI is
        falsy.
    """

    def _format_relations(relations):
        # One line per relation. The value is computed outside the f-string so
        # the code does not rely on the Python 3.12 f-string grammar
        # (multi-line replacement fields, reused quote characters).
        lines = []
        for relation in relations:
            neighbor_uri = relation["neighbor_uri"]
            if neighbor_uri:
                value = neighbor_uri.split("/")[-1].replace("_", " ")
            else:
                value = None
            lines.append(f"\ttype: {relation['type']}, value: {value}.")
        return "\n".join(lines)

    outgoing = _format_relations(rel_data.get("outgoing_relationships", []))
    incoming = _format_relations(rel_data.get("incoming_relationships", []))
    return f"Outgoing relations:\n{outgoing}\nIncoming relations:\n{incoming}"

In [107]:
# Property text for every record, in the same order as the loaded data.
en_property_sentences = list(map(prepare_property_sentence, en_data))
fr_property_sentences = list(map(prepare_property_sentence, fr_data))

# Relationship text for every record, looked up by the record's "n.uri".
# A record with no relationship entry falls back to an empty mapping.
en_relationship_sentences = [
    prepare_relationship_sentence(en_rel_dict.get(uri, {}))
    for uri in (record["n.uri"] for record in en_data)
]
fr_relationship_sentences = [
    prepare_relationship_sentence(fr_rel_dict.get(uri, {}))
    for uri in (record["n.uri"] for record in fr_data)
]

In [108]:
# Show the first three English property sentences, separated by "---" lines.
print(
    en_property_sentences[0],
    en_property_sentences[1],
    en_property_sentences[2],
    sep="\n---\n",
)

Node name: Ferdinand VII of Spain
Properties:
	- succession: King of Spain.
	- reignType: 1.
	- spouse: Maria Isabel of Portugal.
	- issue: Infanta Luisa Fernanda, Duchess of Montpensier.
	- name: Ferdinand VII.
	- reign: --03-19.
	- caption: Portrait by Goya, ca. 1815.
	- birthDate: 1784-10-14.
	- years: 1788.
	- deathDate: 1833-09-29.
	- uri: http://dbpedia.org/resource/Ferdinand_VII_of_Spain.
	- signature: Firma de Fernando VII.png.
---
Node name: Pete Rock
Properties:
	- occupation: Music producer, disc jockey, rapper.
	- associatedActs: CL Smooth, Grap Luva, Marley Marl, INI, Heavy D, The UN, YGz, DJ Premier, Ed O.G., 9th Wonder, DJ Green Lantern, Rakim, Nas, J Dilla, AZ, Wu-Tang Clan, Pharoahe Monch,  Kanye West, Smif-N-Wessun, DJ Jazzy Jeff and The Fresh Prince, Ill Bill, Non Phixion.
	- alias: The Chocolate Boy Wonder.
	- origin: United States.
	- caption: Pete Rock performing at Marvel/Stussy launch party in Los Angeles, 2011.
	- label: Elektra, Loud, Rapster/BBE, Nature Sound

In [101]:
# Inspect the relationship sentence generated for the first English record.
print(en_relationship_sentences[0])

Outgoing relations:
	type: title, value: List of Spanish monarchs.
	type: title, value: Prince of Asturias.
	type: issue, value: Isabel II of Spain.
	type: deathPlace, value: Madrid.
	type: predecessor, value: Charles IV of Spain.
	type: predecessor, value: Joseph Bonaparte.
	type: successor, value: Isabel II of Spain.
	type: successor, value: Joseph Bonaparte.
	type: father, value: Charles IV of Spain.
	type: house, value: House of Bourbon.
	type: placeOfBurial, value: El Escorial.
Incoming relations:
	type: leader, value: Captaincy General of Chile.
	type: leader, value: Viceroyalty of Peru.
	type: predecessor, value: Joseph Bonaparte.
	type: predecessor, value: Isabel II of Spain.
	type: spouse, value: Maria Christina of the Two Sicilies.
	type: successor, value: Charles IV of Spain.
	type: successor, value: Joseph Bonaparte.
	type: father, value: Infanta Luisa Fernanda, Duchess of Montpensier.
	type: father, value: Isabel II of Spain.
	type: monarch, value: Francisco Javier Venegas

In [111]:
# Options shared by every MiniLM encode call below: return a tensor and run
# on the selected device.
minilm_encode_options = {"convert_to_tensor": True, "device": device}

# Property sentences, English then French.
en_property_embeddings_minilm = model_minilm.encode(
    en_property_sentences, **minilm_encode_options
)
fr_property_embeddings_minilm = model_minilm.encode(
    fr_property_sentences, **minilm_encode_options
)

# Relationship sentences, English then French.
en_relationship_embeddings = model_minilm.encode(
    en_relationship_sentences, **minilm_encode_options
)
fr_relationship_embeddings = model_minilm.encode(
    fr_relationship_sentences, **minilm_encode_options
)


In [110]:
def _l2_normalize_dim1(embeddings):
    """Return ``embeddings`` L2-normalized along dim 1."""
    return torch.nn.functional.normalize(embeddings, p=2, dim=1)


# Property embeddings.
en_property_embeddings_minilm = _l2_normalize_dim1(en_property_embeddings_minilm)
fr_property_embeddings_minilm = _l2_normalize_dim1(fr_property_embeddings_minilm)

# Relationship embeddings.
en_relationship_embeddings = _l2_normalize_dim1(en_relationship_embeddings)
fr_relationship_embeddings = _l2_normalize_dim1(fr_relationship_embeddings)

In [112]:
# Pairwise dot products between every English and every French embedding,
# computed separately for properties and relationships, then copied to the CPU
# as NumPy arrays.
property_similarity_matrix_minilm = (
    (en_property_embeddings_minilm @ fr_property_embeddings_minilm.T).cpu().numpy()
)

relationship_similarity_matrix = (
    (en_relationship_embeddings @ fr_relationship_embeddings.T).cpu().numpy()
)

In [None]:
def compute_labse_embeddings(sentences, batch_size=32):
    """Embed ``sentences`` with the LaBSE encoder using mean pooling.

    Sentences are tokenized in batches (padded, truncated to 512 tokens),
    passed through ``model_labse`` without gradient tracking, and pooled with
    ``mean_pooling`` using the batch attention mask.

    NOTE(review): the sentence-transformers LaBSE checkpoint appears to be
    configured for CLS pooling followed by a dense layer, not mean pooling --
    confirm that mean pooling is intended here.

    Args:
        sentences: Sequence of strings to embed.
        batch_size: Number of sentences tokenized and encoded per forward
            pass. Defaults to 32, the value previously hard-coded.

    Returns:
        A tensor on ``device`` with one row per sentence, in input order. An
        empty input yields a tensor with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(sentences) == 0:
        # torch.cat rejects an empty list, so return an explicit zero-row
        # tensor whose width matches the encoder's hidden size.
        return torch.empty((0, model_labse.config.hidden_size), device=device)

    embeddings = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start : start + batch_size]
        encoded_input = tokenizer_labse(
            batch, padding=True, truncation=True, max_length=512, return_tensors="pt"
        ).to(device)
        # Inference only: neither the forward pass nor the pooling needs gradients.
        with torch.no_grad():
            model_output = model_labse(**encoded_input)
            batch_embeddings = mean_pooling(
                model_output, encoded_input["attention_mask"]
            )
        embeddings.append(batch_embeddings)
    return torch.cat(embeddings)


# Encode the property sentences with LaBSE and L2-normalize along dim 1.
en_property_embeddings_labse = torch.nn.functional.normalize(
    compute_labse_embeddings(en_property_sentences), p=2, dim=1
)
fr_property_embeddings_labse = torch.nn.functional.normalize(
    compute_labse_embeddings(fr_property_sentences), p=2, dim=1
)

# Pairwise dot products between English and French LaBSE embeddings, copied to
# the CPU as a NumPy array.
property_similarity_matrix_labse = (
    (en_property_embeddings_labse @ fr_property_embeddings_labse.T).cpu().numpy()
)

# Blend the three similarity matrices with fixed weights:
# MiniLM properties, LaBSE properties, then relationships.
w_minilm_property, w_labse_property, w_relationship = 0.3, 0.5, 0.2
combined_similarity_matrix = (
    w_minilm_property * property_similarity_matrix_minilm
    + w_labse_property * property_similarity_matrix_labse
    + w_relationship * relationship_similarity_matrix
)

In [113]:
# MiniLM-only combination: weighted sum of the property and relationship
# similarity matrices.
# NOTE(review): this assigns the same name as the LaBSE-weighted combination in
# the previous cell, so when the notebook runs top to bottom this is the
# version used downstream -- confirm that is intended.
minilm_property_weight = 0.6
relationship_weight = 0.4
combined_similarity_matrix = (
    minilm_property_weight * property_similarity_matrix_minilm
    + relationship_weight * relationship_similarity_matrix
)

In [122]:
# Keep the 20 highest-scoring French candidates for every English entity.
top_k = 20
direct_matches = []
for row, source in enumerate(en_data):
    scores = combined_similarity_matrix[row]
    # Negating the scores turns argsort's ascending order into a descending
    # ranking; the slice keeps at most top_k candidates.
    best = np.argsort(-scores)[:top_k]
    candidates = []
    for col in best:
        candidates.append(
            {"uri": fr_data[col]["n.uri"], "score": float(scores[col])}
        )
    direct_matches.append({"n.uri": source["n.uri"], "topSimilarNodes": candidates})

In [123]:
# Write the ranked matches to disk as indented JSON.
# NOTE(review): the "utf-8-sig" codec prepends a byte order mark when writing;
# confirm that downstream readers of this file expect one.
# NOTE(review): the file name says "with_labse" -- confirm that the similarity
# matrix used for these matches actually includes the LaBSE term.
results_path = "Results/idriss_similar_entities_dbp15k_with_labse.json"
with open(results_path, "w", encoding="utf-8-sig") as out_file:
    json.dump(direct_matches, out_file, indent=2)

# Prepare data

In [135]:
import json

# One {"name": ...} record per English entity: the last path segment of the
# entity's "n.uri" with underscores replaced by spaces.
meta_data_array = [
    {"name": record["n.uri"].rsplit("/", 1)[-1].replace("_", " ")}
    for record in en_data
]

In [137]:
# Display the name records built in the previous cell.
meta_data_array

[{'name': 'Ferdinand VII of Spain'},
 {'name': 'Pete Rock'},
 {'name': 'Waldemar Pawlak'},
 {'name': 'Metallica'},
 {'name': 'Ardennes (department)'},
 {'name': 'Feel Good Inc.'},
 {'name': 'Marylebone'},
 {'name': 'Minnesota United FC'},
 {'name': 'Ride (Lana Del Rey song)'},
 {'name': 'Princess Märtha of Sweden'},
 {'name': 'Paddy Ashdown'},
 {'name': 'Teen pop'},
 {'name': 'Charles III of Naples'},
 {'name': 'Will.i.am'},
 {'name': 'Fiat Doblò'},
 {'name': 'University of Prince Edward Island'},
 {'name': 'Sylvia Pinel'},
 {'name': 'Oscar II of Sweden'},
 {'name': "Sainte-Émélie-de-l'Énergie, Quebec"},
 {'name': 'Bob Kerrey'},
 {'name': 'John Bercow'},
 {'name': 'Gord Dineen'},
 {'name': 'Sergio Mattarella'},
 {'name': 'Mosnang'},
 {'name': 'Margaret of Austria, Queen of Spain'},
 {'name': 'Lac-Huron, Quebec'},
 {'name': 'Jewelry Box (T-ara album)'},
 {'name': 'K.V.C. Westerlo'},
 {'name': 'James K. Polk'},
 {'name': 'Physical Graffiti'},
 {'name': 'Maltese Government 2008–13'},
 {'n