In [21]:
import json
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import joblib

# Paths
# Every input and cached artifact lives in one data directory. Set the
# VECTOR_SEARCH_DATA_DIR environment variable to relocate it; when the variable
# is unset the original location is used, so existing setups keep working.
data_dir = os.environ.get('VECTOR_SEARCH_DATA_DIR', '/home/mseiferling/vector_search/data')
data_path = os.path.join(data_dir, 'Krankenhaus.txt')  # Input text file, one hospital name per line
embedding_model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
embeddings_path = os.path.join(data_dir, 'hospital_embeddings.npy')  # Cached embedding matrix
nearest_neighbors_model_path = os.path.join(data_dir, 'nearest_neighbors_model.joblib')  # Cached fitted index
metadata_path = os.path.join(data_dir, 'hospital_metadata.csv')  # Cached hospital names

# Load hospital Names from a Text File
def load_hospital_names(text_file):
    """Read hospital names from a UTF-8 text file, one name per line.

    Args:
        text_file: Path of the text file to read.

    Returns:
        list[str]: Every line with surrounding whitespace stripped, in file
        order. Lines that are empty after stripping are dropped.
    """
    names = []
    with open(text_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                names.append(cleaned)
    return names

# Compute Embeddings
def compute_embeddings(model, sentences, batch_size=512):
    """Encode ``sentences`` in batches and stack the results into one array.

    Args:
        model: Encoder exposing ``encode(list_of_str, show_progress_bar=...,
            convert_to_numpy=...)``; the notebook passes a SentenceTransformer.
        sentences: Sequence of strings to embed (must support ``len`` and slicing).
        batch_size: Number of sentences handed to ``model.encode`` per call.
            Must be at least 1.

    Returns:
        np.ndarray: The per-batch outputs of ``model.encode`` stacked
        vertically, in input order. For empty input an empty float32 array of
        shape ``(0, dim)`` is returned, where ``dim`` comes from
        ``model.get_sentence_embedding_dimension()`` when the model provides
        it (as SentenceTransformer does) and is 0 otherwise.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A zero step makes range() fail and a negative step silently yields
        # no batches; report the real problem instead.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if len(sentences) == 0:
        # np.vstack([]) raises "need at least one array to concatenate", so
        # empty input is answered directly with an empty matrix.
        dim_getter = getattr(model, "get_sentence_embedding_dimension", None)
        dim = dim_getter() if callable(dim_getter) else None
        return np.empty((0, dim or 0), dtype=np.float32)

    batches = []
    for start in tqdm(range(0, len(sentences), batch_size), desc="Computing Embeddings"):
        batch = sentences[start:start + batch_size]
        # The outer tqdm bar already reports progress; keep encode() quiet.
        batches.append(model.encode(batch, show_progress_bar=False, convert_to_numpy=True))
    return np.vstack(batches)

# Save Metadata
def save_metadata(hospital_names, metadata_path):
    """Write the hospital names to a CSV file with one ``hospital_name`` column.

    Args:
        hospital_names: Names to store, one per row.
        metadata_path: Destination of the CSV file. It is written as UTF-8
            and without the DataFrame index column.
    """
    columns = {'hospital_name': hospital_names}
    pd.DataFrame(columns).to_csv(metadata_path, index=False, encoding='utf-8')

# Fit NearestNeighbors Model (saving is done by the caller)
def fit_nearest_neighbors(embeddings, n_neighbors=10, metric='cosine'):
    """Build a NearestNeighbors index over ``embeddings`` and return it.

    Args:
        embeddings: Data passed to ``NearestNeighbors.fit``.
        n_neighbors: Default neighbor count stored on the index.
        metric: Distance metric handed to ``NearestNeighbors``.

    Returns:
        The fitted ``NearestNeighbors`` instance. Nothing is written to disk
        here.
    """
    index_options = {'n_neighbors': n_neighbors, 'metric': metric}
    fitted_index = NearestNeighbors(**index_options)
    fitted_index.fit(embeddings)
    return fitted_index

# Load Metadata
def load_metadata(metadata_path):
    """Read the hospital metadata CSV back into a DataFrame.

    The ``hospital_name`` column is read as plain strings with pandas' default
    NA detection switched off. Without that, names such as "NA" or "null" come
    back as NaN and all-numeric names come back as numbers, so the names would
    not match what was originally saved.

    Args:
        metadata_path: Path of the UTF-8 CSV file to read.

    Returns:
        pandas.DataFrame: The file contents. Because NA detection is disabled
        for the whole file, empty fields in any column are kept as empty
        strings rather than NaN.
    """
    return pd.read_csv(
        metadata_path,
        encoding='utf-8',
        dtype={'hospital_name': str},
        keep_default_na=False,
    )

# Query Function
def query_similar_hospitals(target_sentence, model, nn_model, embeddings, metadata, top_k=5):
    """Return the hospitals whose embeddings are nearest to ``target_sentence``.

    Args:
        target_sentence: Free-text query to embed.
        model: Encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
        nn_model: Fitted neighbor index exposing
            ``kneighbors(X, n_neighbors=...)``. The score is computed as
            ``1 - distance``, which is a cosine similarity only if the index
            was fit with the cosine metric (as this notebook does).
        embeddings: Not used by this function; kept so existing callers that
            pass it keep working.
        metadata: DataFrame with a ``hospital_name`` column. Neighbor indices
            are used as row positions into it, so its rows must be in the same
            order as the data the index was fit on.
        top_k: Maximum number of results. Values larger than the number of
            indexed hospitals are clamped instead of making ``kneighbors``
            raise.

    Returns:
        list[tuple[str, float]]: ``(hospital_name, similarity)`` pairs in the
        order ``kneighbors`` returns them. Empty when nothing is indexed.
    """
    # kneighbors() rejects n_neighbors larger than the fitted sample count, so
    # never ask for more rows than the index and the metadata both hold.
    available = min(len(metadata), getattr(nn_model, 'n_samples_fit_', len(metadata)))
    if available == 0:
        return []
    k = min(top_k, available)

    # Compute embedding for the target sentence
    target_embedding = model.encode([target_sentence], convert_to_numpy=True)

    # Perform similarity search; only one query row was passed, hence [0] below
    distances, indices = nn_model.kneighbors(target_embedding, n_neighbors=k)

    # Look the column up once instead of once per result row
    names = metadata['hospital_name']
    return [
        (names.iloc[idx], float(1 - distance))  # Convert cosine distance to similarity
        for idx, distance in zip(indices[0], distances[0])
    ]


In [28]:
# Check if embeddings and model already exist
# All three cached artifacts must be present; if any one is missing, everything
# is rebuilt from the raw text file.
if os.path.exists(embeddings_path) and os.path.exists(nearest_neighbors_model_path) and os.path.exists(metadata_path):
    print("Loading existing embeddings, metadata, and NearestNeighbors model...")
    embeddings = np.load(embeddings_path)
    metadata = load_metadata(metadata_path)
    # NOTE(review): joblib.load unpickles the file, which can execute code;
    # only load model files that this notebook wrote itself.
    nn_model = joblib.load(nearest_neighbors_model_path)
else:
    # Load hospital names
    print("Loading hospital names...")
    hospital_names = load_hospital_names(data_path)
    
    # Save metadata
    print("Saving metadata...")
    save_metadata(hospital_names, metadata_path)
    
    # Load embedding model
    print("Loading embedding model...")
    model = SentenceTransformer(embedding_model_name)
    
    # Compute embeddings
    print("Computing embeddings...")
    embeddings = compute_embeddings(model, hospital_names)
    
    # Save embeddings
    print("Saving embeddings...")
    np.save(embeddings_path, embeddings)
    
    # Fit NearestNeighbors model
    print("Fitting NearestNeighbors model...")
    nn_model = fit_nearest_neighbors(embeddings, n_neighbors=10, metric='cosine')
    
    # Save NearestNeighbors model
    print("Saving NearestNeighbors model...")
    joblib.dump(nn_model, nearest_neighbors_model_path)
    
    # Load metadata for querying (re-read from disk so both branches use the same representation)
    metadata = load_metadata(metadata_path)

# Neighbor indices returned by nn_model are used as row positions into
# `metadata`, so the three artifacts must describe the same number of
# hospitals. A stale or partially regenerated cache would otherwise silently
# map results to the wrong names.
if not (len(embeddings) == len(metadata) == nn_model.n_samples_fit_):
    raise RuntimeError(
        f"Cached artifacts are inconsistent: {len(embeddings)} embeddings, "
        f"{len(metadata)} metadata rows, {nn_model.n_samples_fit_} indexed samples. "
        "Delete the cached files and re-run this cell to rebuild them."
    )

# Load embedding model (if not already loaded)
# NOTE: this deliberately reuses a `model` left in the kernel by the branch
# above or by an earlier run of this cell, so re-running does not reload it.
# On a fresh kernel with a warm cache the model is loaded here.
if 'model' not in locals():
    print("Loading embedding model for querying...")
    model = SentenceTransformer(embedding_model_name)

# Query
target_word = "kardiologie Berlin"  # Replace with your own free-text query
top_k = 10  # Change this value as needed

print(f"\nQuerying for top {top_k} similar hospitals to '{target_word}':")
results = query_similar_hospitals(target_word, model, nn_model, embeddings, metadata, top_k=top_k)

print("\nTop similar hospitals:")
for hospital, score in results:
    print(f"{hospital} (Similarity Score: {score:.4f})")

Loading existing embeddings, metadata, and NearestNeighbors model...

Querying for top 10 similar hospitals to 'kardiologie Berlin':

Top similar hospitals:
Deutsches Herzzentrum München (Similarity Score: 0.8216)
Deutsches Herzzentrum Berlin (Similarity Score: 0.8164)
Herzzentrum Leipzig (Similarity Score: 0.7582)
Klinik für Herzchirurgie Karlsruhe (Similarity Score: 0.7574)
Unfallkrankenhaus Berlin (Similarity Score: 0.7159)
Krankenhaus des Maßregelvollzugs Berlin (Similarity Score: 0.7096)
Justizvollzugskrankenhaus Berlin (Similarity Score: 0.6886)
Bundeswehrkrankenhaus Berlin (Similarity Score: 0.6797)
Bundeswehrkrankenhaus Hamburg (Similarity Score: 0.6742)
Gertraudenhospital (Berlin) (Similarity Score: 0.6729)
