In [12]:
import json
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import joblib

# Load hospital Names from a Text File
def load_hospital_names(text_file):
    """Read one hospital name per line from a UTF-8 text file.

    Args:
        text_file: Path of the text file to read.

    Returns:
        list[str]: The lines of the file with surrounding whitespace
        stripped, in file order. Lines that are empty after stripping are
        skipped.
    """
    names = []
    with open(text_file, mode='r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            # Blank / whitespace-only lines carry no name.
            if cleaned:
                names.append(cleaned)
    return names

# Compute Embeddings
def compute_embeddings(model, sentences, batch_size=512):
    """Encode ``sentences`` in consecutive batches and stack the results.

    Args:
        model: Object exposing ``encode(batch, show_progress_bar=...,
            convert_to_numpy=...)``; presumably a ``SentenceTransformer``
            (verify against caller).
        sentences: Sliceable sequence of inputs; it is cut into slices of at
            most ``batch_size`` items.
        batch_size: Number of sentences handed to ``model.encode`` per call.

    Returns:
        The per-batch outputs of ``model.encode`` combined row-wise with
        ``np.vstack``.

    Note:
        ``np.vstack`` is called on the list of batch results, so an empty
        ``sentences`` (no batches) makes it raise.
    """
    batch_starts = range(0, len(sentences), batch_size)
    encoded_batches = [
        model.encode(
            sentences[start:start + batch_size],
            show_progress_bar=False,  # tqdm below already reports progress
            convert_to_numpy=True,
        )
        for start in tqdm(batch_starts, desc="Computing Embeddings")
    ]
    return np.vstack(encoded_batches)

# Fit NearestNeighbors Model
# Instance-based learning is a category of machine learning that stores the training data and makes predictions by directly comparing new instances with the stored data.
def fit_nearest_neighbors(embeddings, n_neighbors=10, metric='cosine'):
    """Build a ``NearestNeighbors`` searcher and fit it on ``embeddings``.

    Args:
        embeddings: Data passed unchanged to ``NearestNeighbors.fit``.
        n_neighbors: Default neighbour count configured on the searcher.
        metric: Distance metric name configured on the searcher.

    Returns:
        The fitted ``NearestNeighbors`` instance.
    """
    index_options = {'n_neighbors': n_neighbors, 'metric': metric}
    neighbor_index = NearestNeighbors(**index_options)
    # Fitting registers the embeddings so later kneighbors() calls can search them.
    neighbor_index.fit(embeddings)
    return neighbor_index

# Query Function
def query_similar_hospitals(target_sentence, model, nn_model, embeddings, hospital_names, top_k=5):
    """Return the hospital names most similar to ``target_sentence``.

    Args:
        target_sentence: Query text to embed and search for.
        model: Encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
        nn_model: Fitted searcher exposing ``kneighbors(X, n_neighbors=...)``
            that returns ``(distances, indices)``.
        embeddings: Unused by this function; kept so existing callers that
            pass it positionally keep working.
        hospital_names: List of names; the indices returned by ``nn_model``
            are looked up in it, so it is assumed to be in the same order as
            the rows ``nn_model`` was fitted on (verify against caller).
        top_k: Maximum number of results. Clamped to ``len(hospital_names)``
            so a request for more neighbours than there are names yields all
            of them instead of an error from ``kneighbors``.

    Returns:
        list[tuple[str, float]]: ``(name, similarity)`` pairs in the order
        returned by ``nn_model.kneighbors``; empty when ``hospital_names`` is
        empty.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k}")
    if not hospital_names:
        return []

    # Never ask the index for more neighbours than there are names to return.
    n_results = min(top_k, len(hospital_names))

    # Compute embedding for the target sentence
    target_embedding = model.encode([target_sentence], convert_to_numpy=True)

    # Perform similarity search; only one query row, so read row 0 of each result
    distances, indices = nn_model.kneighbors(target_embedding, n_neighbors=n_results)

    # `1 - distance` is a similarity only if nn_model uses cosine distance
    # (the default in fit_nearest_neighbors) — confirm when changing the metric.
    return [
        (hospital_names[idx], float(1 - distance))
        for idx, distance in zip(indices[0], distances[0])
    ]

In [82]:
# Paths (everything hangs off one base directory, so only this line needs
# editing when the notebook runs on another machine)
data_dir = '/home/mseiferling/vector_search/data'
data_path = os.path.join(data_dir, 'OpenStreetMap_data', 'Combined_healthcare_facilities.txt')
embeddings_path = os.path.join(data_dir, 'hospital_embeddings.npy')
nearest_neighbors_model_path = os.path.join(data_dir, 'nearest_neighbors_model.joblib')

# model
embedding_model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# The names and the encoder are needed whether or not a cache exists,
# so load them once up front.
hospital_names = load_hospital_names(data_path)
if not hospital_names:
    raise ValueError(f"No hospital names found in '{data_path}'")
model = SentenceTransformer(embedding_model_name)

# Reuse cached artifacts only if they still line up with the names file.
embeddings = None
nn_model = None
if os.path.exists(embeddings_path) and os.path.exists(nearest_neighbors_model_path):
    cached_embeddings = np.load(embeddings_path)
    # Results are looked up as hospital_names[idx], so the cache is only valid
    # when it has one row per name. This checks the count, not the content:
    # delete the cache files by hand after editing names in place.
    if cached_embeddings.ndim == 2 and cached_embeddings.shape[0] == len(hospital_names):
        embeddings = cached_embeddings
        # NOTE: joblib.load unpickles the file and can execute arbitrary code;
        # only load model files you created yourself.
        nn_model = joblib.load(nearest_neighbors_model_path)
    else:
        print("Cached embeddings do not match the hospital names file; rebuilding.")

if embeddings is None or nn_model is None:
    # Compute embeddings
    embeddings = compute_embeddings(model, hospital_names)

    # Save embeddings
    np.save(embeddings_path, embeddings)

    # Fit NearestNeighbors model
    nn_model = fit_nearest_neighbors(embeddings, n_neighbors=10, metric='cosine')

    # Save NearestNeighbors model
    joblib.dump(nn_model, nearest_neighbors_model_path)


# Query
target_word = "Allgemeinarztpraxis Dr. Killer und Kollegen"
top_k = 10

print(f"\nQuerying for top {top_k} similar hospitals to '{target_word}':")
results = query_similar_hospitals(target_word, model, nn_model, embeddings, hospital_names, top_k=top_k)

print("\nTop similar hospitals:")
for hospital, score in results:
    print(f"{hospital} (Similarity Score: {score:.4f})")


Querying for top 10 similar hospitals to 'Allgemeinarztpraxis Dr. Killer und Kollegen':

Top similar hospitals:
Allgemeinarztpraxis Dr. Killer und Kollegen (Similarity Score: 1.0000)
Arztpraxis Dörverden (Similarity Score: 0.8401)
Arztpraxis Dr. Sevinç Çağlar​ (Similarity Score: 0.8348)
Fachärzte für Freuenheilkunde und Gebursthilfe (Similarity Score: 0.8286)
Facharzt-Zentrum Viersen (Similarity Score: 0.8234)
Allgemeinarztpraxis Drechsel (Similarity Score: 0.8171)
Arztpraxis Greten & Kollegen (Similarity Score: 0.8157)
Fachärztinnen für Allgemeinmedizin (Similarity Score: 0.8152)
Facharzt für Allgemeinmedizin (Similarity Score: 0.8147)
Dr. Jüstel (Hautarzt) (Similarity Score: 0.8136)
