In [1]:
import pickle
from pathlib import Path

import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from data_repository import DataRepository

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model by name.
# NOTE(review): presumably fetched from the model hub on first use and cached
# afterwards -- confirm before running this notebook offline.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
# Project-local accessor object: the cells below read the entity-label mapping,
# the stored NER embeddings, the NER entity list and the NER model through it.
data_repository = DataRepository()



In [4]:
# Keep only the values of the ent2lbl mapping as a plain list
# (presumably entity id -> human-readable label; verify against DataRepository).
entity_list = [*data_repository.get_ent2lbl().values()]

In [6]:
# Fetch the precomputed NER artefacts from the repository:
#   ner_embeddings    - matrix the query vector is compared against with cosine_similarity
#   ner_entities_list - entities looked up by the same indices as ner_embeddings
#                       (assumed to be position-aligned with it -- TODO confirm)
#   torch_ner_model   - encoder whose .encode() output is compared to ner_embeddings
ner_embeddings = data_repository.get_ner_embeddings()
ner_entities_list = data_repository.get_ner_entities_list()
torch_ner_model = data_repository.get_torch_ner_model()

In [10]:
# Rank the stored NER entities by cosine similarity to a free-text query.
top_k = 5  # how many nearest entities to print
query_embed = torch_ner_model.encode("inception")

# cosine_similarity works on 2-D inputs, so the single query vector is wrapped
# in a list; row 0 of the result holds its similarity to every stored embedding.
similarities = cosine_similarity([query_embed], ner_embeddings)[0]

# argsort is ascending: take the last top_k indices and reverse them so the
# best match comes first.
top_indices = np.argsort(similarities)[-top_k:][::-1]

# Materialise the entity collection once, rather than rebuilding the whole
# list for every match inside the comprehension.
ner_entities = list(ner_entities_list)
matches = [(ner_entities[idx], similarities[idx]) for idx in top_indices]

for entity, score in matches:
    print(f"Entity: {entity}, Similarity: {score:.4f}")

Entity: Inception, Similarity: 1.0000
Entity: Inception, Similarity: 1.0000
Entity: Inception, Similarity: 1.0000
Entity: Deep End, Similarity: 0.6156
Entity: The Deep End, Similarity: 0.5774


In [7]:
# Embed every entity label with the sentence-transformer model.
# Encoding a large label set can take a long time, so the progress bar is
# enabled to make it visible that the cell is working rather than hung.
# entity_list is already a list, so it is passed as-is (no extra copy).
embeddings = model.encode(entity_list, show_progress_bar=True)

KeyboardInterrupt: 

In [None]:
# Rank the entity labels by cosine similarity to a free-text query.
top_k = 5  # how many nearest labels to print
query_embed = model.encode("kungg Fu Panda")

# cosine_similarity works on 2-D inputs, so the single query vector is wrapped
# in a list; row 0 of the result holds its similarity to every label embedding.
similarities = cosine_similarity([query_embed], embeddings)[0]

# argsort is ascending: take the last top_k indices and reverse them so the
# best match comes first.
top_indices = np.argsort(similarities)[-top_k:][::-1]

# entity_list is already a list, so index it directly instead of copying the
# whole list once per match.
matches = [(entity_list[idx], similarities[idx]) for idx in top_indices]

for entity, score in matches:
    print(f"Entity: {entity}, Similarity: {score:.4f}")

In [12]:
# Save the label embeddings together with the entity labels they were computed
# from, as a single (embeddings, entity_list) tuple, so both are restored
# from one file. The model itself is saved separately.
# NOTE: reading this file back requires pickle.load, which can execute
# arbitrary code -- only load it from a trusted location.
embeddings_path = Path("data/embeddings_and_entities.pkl")

# open(..., "wb") does not create missing directories, so make sure data/ exists.
embeddings_path.parent.mkdir(parents=True, exist_ok=True)

with embeddings_path.open("wb") as f:
    pickle.dump((embeddings, entity_list), f)

In [13]:
# Save the model.
# torch.save on the model object pickles the whole SentenceTransformer instance
# (not just its weights), so loading it back needs the same classes importable.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled objects -- confirm the loader passes
# weights_only=False for this file.
torch.save(model, "data/ner_embedding_model.pt")