In [42]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import QueryRequest
import qdrant_client.models as models
from sentence_transformers import SentenceTransformer
import json
from tqdm import tqdm
import torch

In [43]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [44]:
# Encoder model: load the checkpoint first, then move it onto the selected device.
encoder = SentenceTransformer('intfloat/multilingual-e5-large-instruct')
encoder = encoder.to(device)

In [52]:
documents = []  # populated by the filtering cell further down

# Parse ANDRA_CLASSIFICATION.json into `d`. The `with` block closes the file
# on exit, so no explicit close() call is needed.
with open('ANDRA_CLASSIFICATION.json', encoding='utf8') as f:
    d = json.load(f)

In [53]:
# Longest description (in characters) kept per entry.
MAX_DESCRIPTION_CHARS = 1000

# Rebuild the list from scratch on every run. Appending to a list created in
# another cell duplicated every entry whenever this cell was re-executed,
# which breaks the position-based link between `documents` and anything
# computed from it (embeddings, point ids).
documents = []
for ent in d:
    if ent["type"] != "person":
        continue
    description = ent["description"]
    if len(description) > MAX_DESCRIPTION_CHARS:
        # Truncate on a shallow copy so the entry inside `d` keeps its full text.
        ent = {**ent, "description": description[:MAX_DESCRIPTION_CHARS]}
    documents.append(ent)

In [54]:
# Kept person entries first, then the size of the full dump, one per line.
print(len(documents), len(d), sep="\n")

43795
197487


In [55]:
# Client for the Qdrant server addressed as "localhost" on port 6333.
client = QdrantClient("localhost", port=6333)

In [11]:
# Remove any collection with this name; kept as the last expression so the
# call's return value is displayed under the cell.
client.delete_collection("upplaga_2_collection")

False

In [56]:
# Display whether the collection is currently present on the server.
client.collection_exists("upplaga_2_collection")

False

In [57]:
# The vector size is dictated by the encoder, so ask the model for it
# instead of hard-coding a number.
embedding_dim = encoder.get_sentence_embedding_dimension()

# Cosine distance is used here (models.Distance.DOT was the alternative considered).
client.create_collection(
    collection_name="upplaga_2_collection",
    vectors_config=models.VectorParams(size=embedding_dim, distance=models.Distance.COSINE),
)

True

In [58]:
batch_size = 2
u2_embeddings = []

# Encode the descriptions slice by slice so tqdm can report progress; the
# vectors are collected in the same order as `documents`.
for start in tqdm(range(0, len(documents), batch_size)):
    texts = [entry["description"] for entry in documents[start:start + batch_size]]
    u2_embeddings.extend(encoder.encode(texts, batch_size=batch_size))

100%|██████████| 21898/21898 [27:16<00:00, 13.38it/s] 


In [59]:
# The upload cell uses each embedding's position as an index into
# `documents`, so the two lists must line up one-to-one.
assert len(u2_embeddings) == len(documents), (
    f"{len(u2_embeddings)} embeddings for {len(documents)} documents"
)
print(len(u2_embeddings))

43795


In [61]:
# Build one point per embedding: the list position serves both as the point
# id and as the index of the document stored as payload.
points = []
for idx, emb in enumerate(tqdm(u2_embeddings)):
    points.append(models.PointStruct(id=idx, vector=emb, payload=documents[idx]))

client.upload_points(collection_name="upplaga_2_collection", points=points)

100%|██████████| 43795/43795 [02:52<00:00, 254.12it/s]


In [62]:
u3_documents = []

# Parse TREDJE_CLASSIFICATION.json into `s`. The `with` block closes the file
# on exit, so no explicit close() call is needed.
with open('TREDJE_CLASSIFICATION.json', encoding='utf8') as f:
    s = json.load(f)

# Keep only person entries, reduced to id / headword / description.
# Slicing to 1000 characters leaves shorter descriptions untouched, so one
# code path covers both the long and the short case.
for ent in s:
    if ent["type"] == "person":
        u3_documents.append({
            "id": ent["id"],
            "headword": ent["headword"],
            "description": ent["description"][:1000],
        })

print(len(u3_documents))

35241


In [63]:
batch_size = 4
u3_embeddings = []

# Walk over `u3_documents` in slices of `batch_size` and embed each slice's
# descriptions, preserving document order in `u3_embeddings`.
for offset in tqdm(range(0, len(u3_documents), batch_size)):
    chunk = u3_documents[offset:offset + batch_size]
    u3_embeddings.extend(
        encoder.encode([entry["description"] for entry in chunk], batch_size=batch_size)
    )

100%|██████████| 8811/8811 [14:28<00:00, 10.15it/s]


In [64]:
# One nearest-neighbour query per embedding: only the top hit is requested,
# and its payload is returned with it.
requests = [
    models.QueryRequest(query=vec, limit=1, with_payload=True)
    for vec in tqdm(u3_embeddings)
]

100%|██████████| 35241/35241 [02:14<00:00, 262.41it/s]


In [65]:
# Every entry in `u3_documents` must have exactly one query; otherwise the
# results can no longer be matched back to their documents by position.
assert len(requests) == len(u3_documents), (
    f"{len(requests)} requests for {len(u3_documents)} documents"
)
print(len(requests))

35241


In [66]:
batch_size = 64
res = []

# Send the queries in slices of `batch_size`; responses are appended in
# request order, so res[k] answers requests[k].
for offset in tqdm(range(0, len(requests), batch_size)):
    res.extend(
        client.query_batch_points(
            collection_name="upplaga_2_collection",
            requests=requests[offset:offset + batch_size],
        )
    )


100%|██████████| 551/551 [02:30<00:00,  3.66it/s]


In [67]:
# The next cell joins `res` to `u3_documents` purely by position, so a
# length mismatch would pair documents with the wrong hits.
assert len(res) == len(u3_documents), (
    f"{len(res)} results for {len(u3_documents)} documents"
)
print(len(res))

35241


In [69]:
matches = []

# Pair every query document with the single hit returned for it; the two
# lists are aligned by position.
for position, response in enumerate(res):
    source = u3_documents[position]
    best = response.points[0]  # only one point was requested per query
    matches.append({
        "entity_id": source["id"],
        "entity_headword": source["headword"],
        "entity_description": source["description"],
        "match_id": best.payload["id"],
        "match_headword": best.payload["headword"],
        "match_description": best.payload["description"],
        "similarity_score": best.score
    })

In [70]:
nbr = 3102  # position in `matches` of the pair to inspect

# Show the query description, the description of its best match and the
# similarity score. The original line indexed with `numba`, a name that is
# never defined in this notebook and raises NameError on a fresh kernel.
print(matches[nbr]["entity_description"],"\n\n",matches[nbr]["match_description"], "\n\n", matches[nbr]["similarity_score"])

Borda [bårda'], Jean Charles de, fransk
matematiker, sjöfarande (1733—99). Var först
ingenjörofficer, därpå från 1767 sjöofficer och
företog resor till Amerika och Afrikas
västkust för att pröva sjöurens tillförlitlighet,
upprätta kartor o. s. v. Han deltog som
generalmajor vid franska sjötrupperna 1777—78
i nordamerikanska frihetskriget. B. införde
likformighet i de franska krigsskeppens
byggnad och i flottans manövrer. Han uppfann
flera efter honom uppkallade instrument och
deltog i reglerandet av det nya franska
mått-och viktsystemet (metersystemet).
 

 Borda, Jean Charles de, fransk matematiker och
sjöfarande, f. 1733, fick sin uppfostran bland
jesuiterna och blef därefter ingenjörofficer. Redan
1756 kallades han, med anledning af sin Mémoire
sur le mouvement des projectiles, till medlem af
franska vetenskapsakademien. 1767 öfvergick han till
sjövapnet, och 1771 gjorde han såsom eskaderchef en
resa till Amerika för att pröfva sjöurens pålitlighet,
hvarvid han samtidigt bestämde de

In [71]:
# Write the matches as indented UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters readable instead of \u-escaping them. The `with`
# block closes the file on exit, so no explicit close() call is needed.
with open('3_TO_2_MATCHES.json', 'w', encoding='utf-8') as f:
    json.dump(matches, f, ensure_ascii=False, indent=4)