# Semantic search

Ready-to-use models.

| Model Name                                                               | Type                | Dim | Quality (Biomedical)                | Speed (CPU)    | Memory Usage            | Sentence-Level Optimized |
| ------------------------------------------------------------------------ | ------------------- | --- | ----------------------------------- | -------------- | ----------------------- | ------------------------ |
| **BioWordVec (BioSentVec)**<br>`BioWordVec_PubMed_MIMICIII_d200.vec.bin` | Static (word-level) | 200 | ⚠️ Low–Moderate                     | ✅✅✅ Very Fast  | ✅ Very Low (\~1 GB RAM) | ❌ No                     |
| **`all-MiniLM-L6-v2`**                                                   | SBERT (MiniLM)      | 384 | ✅ Moderate (general)                | ✅✅✅ Very Fast  | ✅ Low (\~80 MB)         | ✅ Yes                    |
| **`pritamdeka/S-PubMedBert-MS-MARCO`**                                   | SBERT (PubMedBERT)  | 768 | ✅✅✅ Excellent                       | ⚠️ Medium      | ⚠️ Moderate-High        | ✅ Yes                    |
| **`thenlper/gte-base`**                                                  | GTE (BERT)          | 768 | ✅✅ Good                             | ✅✅ Fast        | ✅ Moderate (\~400 MB)   | ✅ Yes                    |
| **`nomic-ai/nomic-embed-text-v1.5`**                                     | OpenCLIP-style      | 768 | ✅✅ Very Good (general + scientific) | ⚠️ Medium-Slow | ❗ High (\~1 GB+)        | ⚠️ Partial (CLS token)   |
| **`microsoft/BiomedNLP-PubMedBERT...`**                                  | Raw BERT            | 768 | ✅✅✅ Best-in-domain                  | 🐢 Slow        | ❗ High (\~1.2 GB)       | ❌ No (needs pooling)     |


In [None]:
import logging

import numpy as np
import pandas as pd
import psycopg2

from pysrc.config import PubtrendsConfig

# Notebook-wide project configuration; test=False is passed explicitly
# (presumably selects the non-test settings — confirm in PubtrendsConfig).
config = PubtrendsConfig(test=False)

# Emit INFO-level (and above) log records with a "timestamp LEVEL: message" layout.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
logger = logging.getLogger('notebook')

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Chunking

In [None]:
from pysrc.papers.analysis.text import get_chunks

# Token budget handed to get_chunks (and reused later when chunking publications).
MAX_TOKENS = 128

# Sample abstract used to eyeball how the chunker splits a realistic text.
text = "Staphylococcus aureus is a rare cause of postinfectious glomerulonephritis, and Staphylococcus-related glo-merulonephritis primarily occurs in middle-aged or elderly patients. Patients with Staphylococcus-related glomerulonephritis also present with hematuria, proteinuria of varying degrees, rising serum creatinine levels, and/or edema. The severity of renal insufficiency is proportional to the degree of proliferation and crescent formation. Here, we present a diabetic patient admitted with a history of 1 week of left elbow pain. Laboratory results revealed that erythrocyte sedimentation rate was 110 mm/hour, serum creatinine level was 1 mg/dL, C-reactive protein level was 150 mg/L, and magnetic resonance imaging showed signal changes in favor of osteomyelitis at the olecranon level, with diffuse edematous appearance in the elbow skin tissue and increased intra-articular effusion. After diagnosis of osteomyelitis, ampicillin/sulbactam and teicoplanin were administered. After day 7 of admission, the patient developed acute kidney injury requiring hemodialysis under antibiotic treatment. Kidney biopsy was performed to determine the underlying cause, which showed Staphylococcus-related glomerulonephritis. Recovery of renal func-tions was observed after antibiotic and supportive treatment."

chunks = get_chunks(text, MAX_TOKENS)
print(f"Number of chunks: {len(chunks)}")

# One print call per chunk: a header line (preceded by a blank line), then the chunk itself.
for i, chunk in enumerate(chunks):
    print(f"\nChunk {i + 1}:", chunk, sep="\n")

# Embeddings with Sentence Transformer

In [None]:
from pysrc.endpoints.embeddings.sentence_transformer.sentence_transformer import SentenceTransformerModel

# Instantiate the project's sentence-transformer wrapper.
sentence_transformer_model = SentenceTransformerModel()
# Bare attribute access with no call: presumably a lazy/cached property whose first access
# downloads and loads the model — TODO confirm in SentenceTransformerModel.
# noinspection PyStatementEffect
sentence_transformer_model.download_and_load_model
# Smoke test: encode two short sentences and print the shape of the result.
emb = sentence_transformer_model.encode(['This is a test.', 'This is a test2'])
print(emb.shape)

In [None]:
# Embedding width, read from the second axis of the sample batch encoded earlier.
embedding_dimension = emb.shape[1]
embeddings_model = sentence_transformer_model


def text_embedding(t):
    """Encode `t` (a query text) with the sentence-transformer model."""
    return sentence_transformer_model.encode(t)


def batch_texts_embeddings(t):
    """Encode `t` (a collection of texts) with the sentence-transformer model."""
    return sentence_transformer_model.encode(t)

In [None]:
# Device reported by the sentence-transformer wrapper.
device = sentence_transformer_model.device
# Interpolated into the table name and index name of the SQL statements in this notebook.
embeddings_model_name = 'all_MiniLM_L6_v2'
# Hard-coded width; replaces the value previously read from emb.shape[1].
embedding_dimension = 384

# Embeddings with Hugging Face Wrapper model

In [None]:
# from more_itertools import sliced
# import numpy as np
# import torch
# from transformers import AutoModel, AutoTokenizer
#
# if torch.backends.mps.is_available() and torch.backends.mps.is_built():
#     device = 'mps'
# elif torch.cuda.is_available():
#     device = 'cuda'
# else:
#     device = 'cpu'
#
# class SentenceTransformerWrapper:
#     def __init__(self, model_name, attention):
#         print(f'Loading model into {device}')
#         self.device = device
#         self.attention = attention
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
#         self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(device)
#         self.model.eval()
#
#     @staticmethod
#     def mean_pooling(model_output, attention_mask):
#         token_embeddings = model_output.last_hidden_state  # (batch_size, seq_len, hidden_size)
#         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
#         summed = torch.sum(token_embeddings * input_mask_expanded, dim=1)
#         summed_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
#         return summed / summed_mask
#
#     def encode(self, sentences, batch_size=32):
#         all_embeddings = []
#
#         with torch.no_grad():
#             for batch in tqdm(list(sliced(sentences, batch_size))):
#                 inputs = self.tokenizer(
#                     batch,
#                     return_tensors="pt",
#                     padding=True,
#                     truncation=True,
#                     max_length=1024,
#                 ).to(self.device)
#
#                 outputs = self.model(**inputs)
#                 if self.attention:
#                     embeddings = SentenceTransformerWrapper.mean_pooling(outputs, inputs['attention_mask'])
#                 else:
#                     embeddings = outputs.last_hidden_state[:, 0, :]
#
#                 all_embeddings.append(embeddings.cpu().numpy())
#
#         return np.vstack(all_embeddings)

In [None]:
# # Decent model for biomedical embeddings
# # wrapped_model = SentenceTransformerWrapper("nomic-ai/nomic-embed-text-v1.5", False)
# # Also good, and slightly faster than nomic-embed
# wrapped_model = SentenceTransformerWrapper("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext", True)
# embeddings = wrapped_model.encode('Test sentence')
# embeddings.shape

In [None]:
# from more_itertools import sliced
# from math import ceil
# import concurrent
# import multiprocessing
# import numpy as np
#
# def parallel_texts_embeddings_wrapper(texts):
#     if device != 'cpu':
#         return wrapped_model.encode(texts)
#     # Default to number of CPUs for max workers
#     max_workers = multiprocessing.cpu_count()
#     # Compute parallel on different threads, since we use the same fasttext model
#     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
#         futures = [
#             executor.submit(lambda ts: wrapped_model.encode(ts), ts)
#                    for ts in sliced(texts, int(ceil(len(texts) / max_workers)))
#         ]
#         # Important: keep order of results!!!
#         return np.vstack([future.result() for future in futures])

In [None]:
# embeddings_model_name = 'BiomedNLP_PubMedBERT'
# text_embedding = lambda t: wrapped_model.encode([t])
# batch_texts_embeddings = parallel_texts_embeddings_wrapper
# embeddings_model = wrapped_model
# embedding_dimension = embeddings.shape[0]

# Prepare PostgreSQL + pgvector for embeddings

Create a PostgreSQL database with the pgvector extension:
```
docker run --rm --name pubtrends-postgres -p 5430:5432 \
        -m 32G \
        -e POSTGRES_USER=biolabs -e POSTGRES_PASSWORD=mysecretpassword \
        -e POSTGRES_DB=pubtrends \
        -v ~/pgvector/:/var/lib/postgresql/data \
        -e PGDATA=/var/lib/postgresql/data/pgdata \
        -d pgvector/pgvector:pg17
```

# Semantic search with PostgreSQL

In [None]:
from pysrc.preprocess.embeddings.embeddings_model_connector import EmbeddingsModelConnector
from pysrc.preprocess.embeddings.embeddings_db_connector import EmbeddingsDBConnector

# Connector for the embeddings model; it supplies the model name and dimension passed below.
embeddings_model_connector = EmbeddingsModelConnector()

# Host port 5430 and the credentials match the `docker run` command shown above.
embeddings_db_connector = EmbeddingsDBConnector(
    host='localhost',
    port=5430,
    database='pubtrends',
    user='biolabs',
    password='mysecretpassword',
    embeddings_model_name=embeddings_model_connector.embeddings_model_name,
    embedding_dimension=embeddings_model_connector.embeddings_dimension
)


In [None]:
# Create an index for fast vector similarity search using cosine distance
# Index may slightly change results vs exact match search, but it's much faster!
#
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE INDEX raises once the index exists.
# psycopg2's `with connection` only commits (or rolls back) the transaction and leaves the
# connection open, so the connection is closed explicitly in `finally`.
connection = psycopg2.connect(embeddings_db_connector.connection_string)
try:
    connection.set_session(readonly=False)
    query = f'''
                CREATE INDEX IF NOT EXISTS embedding_idx_{embeddings_model_name}
                ON {embeddings_model_name}
                USING ivfflat (embedding vector_cosine_ops)
                WITH (lists = 100);
            '''
    # Leaving the `with connection` block commits the statement.
    with connection, connection.cursor() as cursor:
        cursor.execute(query)
finally:
    connection.close()

In [None]:
from pysrc.endpoints.semantic_search.semantic_search import l2norm


def semantic_search_postgresql(query, k):
    """Return the `k` stored chunks nearest to `query` according to pgvector.

    The query is embedded with `text_embedding`, passed through `l2norm`, and compared
    against the `embedding` column of the `embeddings_model_name` table using the
    pgvector `<=>` (cosine distance) operator.

    :param query: free-text search query
    :param k: maximum number of rows to return
    :return: DataFrame with columns `pmid`, `chunk`, `distance`, ordered by ascending distance
    """
    query_vector = text_embedding(query)
    # Normalize embeddings if using cosine similarity
    embedding = l2norm(query_vector).tolist()

    # psycopg2's `with connection` ends the transaction but does not close the connection,
    # so without an explicit close() every search would leak one open connection.
    connection = psycopg2.connect(embeddings_db_connector.connection_string)
    try:
        with connection, connection.cursor() as cursor:
            # The table name cannot be a bound parameter; it comes from a notebook constant.
            cursor.execute(f"""
                   SELECT pmid, chunk, embedding <=> %s::vector AS distance
                   FROM {embeddings_model_name}
                   ORDER BY distance
                   LIMIT %s
                   """, (embedding, k))
            results = cursor.fetchall()
    finally:
        connection.close()

    return pd.DataFrame(data=results, columns=['pmid', 'chunk', 'distance'])

In [None]:
# Fetch up to 1000 nearest chunks from Postgres for a sample query; the bare name displays the frame.
search_pg = semantic_search_postgresql("epigenetic human aging", 1000)
search_pg

# Semantic search with Faiss

In [None]:
from pysrc.preprocess.embeddings.faiss_connector import FaissConnector

# Faiss connector configured with the model name and dimension reported by embeddings_model_connector.
faiss_connector = FaissConnector(
    embeddings_model_name=embeddings_model_connector.embeddings_model_name,
    embeddings_dimension=embeddings_model_connector.embeddings_dimension
)
# Presumably loads a previously saved index or builds a fresh one — confirm in FaissConnector.
faiss_connector.create_or_load_faiss()

In [None]:
import faiss


def semantic_search_faiss(query_text, k):
    """Return up to `k` nearest entries of the Faiss index for `query_text`.

    :param query_text: free-text search query
    :param k: maximum number of neighbours to request from the index
    :return: the rows of `faiss_connector.pids_idx` selected by the returned indices, in
        index order, with an added `similarity` column holding the scores reported by
        Faiss (their exact meaning depends on the index metric — confirm in FaissConnector)
    """
    # faiss.normalize_L2 works in place and expects a C-contiguous float32 matrix,
    # so the conversion has to happen before normalization, not after it.
    query_vector = np.ascontiguousarray(text_embedding(query_text), dtype='float32').reshape(1, -1)
    # Normalize embeddings if using cosine similarity
    faiss.normalize_L2(query_vector)
    similarities, indices = faiss_connector.faiss_index.search(query_vector, k)
    # Faiss pads the result with -1 when fewer than k neighbours are found;
    # passing -1 to .iloc would silently select the last row, so drop those slots.
    found = indices[0] >= 0
    t = faiss_connector.pids_idx.iloc[indices[0][found]].copy().reset_index(drop=True)
    t['similarity'] = similarities[0][found]
    return t

In [None]:
# Same sample query against the Faiss index, requesting 10,000 neighbours; the bare name displays the frame.
search_fs = semantic_search_faiss("epigenetic human aging", 10_000)
search_fs

# Comparison Postgresql vs Faiss semantic search

In [None]:
# print(f'Postgresql {len(pmids_pg.unique())}')
# print(f'Faiss {len(pmids_fs.unique())}')
# overlap = set(list(pmids_pg)) & set(list(pmids_fs))
# print(f'Overlap {len(overlap)}')

# Apply additional semantic filtering on search results

In [None]:
# Retrieve up to 1000 nearest Faiss hits for the topic to be filtered further.
search = semantic_search_faiss(
    "epigenetic changes in stem cell differentiation in human",
    1000
)
search_ids = search['pmid']
# Number of distinct pmids among the hits.
print(len(search_ids.unique()))
search

In [None]:
from pysrc.preprocess.embeddings.publications_db_connector import PublicationsDBConnector

publications_db_connector = PublicationsDBConnector()

# Load the publication records for the pmids returned by the search.
publications = publications_db_connector.load_publications(search_ids)
# Rebind search_ids to the ids of the publications that were actually loaded.
search_ids = publications['id']
publications.head(5)

In [None]:
from pysrc.papers.analysis.text import parallel_collect_chunks


def collect_chunks_embeddings(df):
    """Chunk every publication in `df` and embed all resulting chunks.

    Each publication text is built as '<title>. <abstract>' from the `title` and
    `abstract` columns and chunked together with its `id` under the MAX_TOKENS limit.

    :param df: DataFrame with `id`, `title` and `abstract` columns
    :return: tuple (chunk_embeddings, chunk_idx) — the output of `batch_texts_embeddings`
        for all chunks, and the chunk index returned by `parallel_collect_chunks`
    """
    print('\rCollecting chunks           ', end='')
    publication_ids = list(df['id'])
    titles_and_abstracts = zip(df['title'], df['abstract'])
    texts = [f'{title}. {abstract}' for title, abstract in titles_and_abstracts]
    chunks, chunk_idx = parallel_collect_chunks(publication_ids, texts, MAX_TOKENS)
    # Progress messages start with '\r' so that they overwrite each other on one line.
    print(f'\rComputing {len(chunks)} embeddings   ', end='')
    return batch_texts_embeddings(chunks), chunk_idx

In [None]:
print('Compute documents embeddings')
embeddings, chunk_idx = collect_chunks_embeddings(publications)
# Pass every chunk embedding through l2norm (presumably unit-length scaling — confirm in l2norm),
# so that plain dot products can serve as similarity scores.
embeddings = [l2norm(e) for e in embeddings]

In [None]:
print('Compute filters embeddings')

# Terms a relevant chunk should be close to / far from.
# 'homo sapiens' was previously misspelled as 'homo sapience', which distorts its embedding.
positive_filters = ['homo sapiens', 'human', 'mammal', 'human cell']
negative_filters = ['cancer', 'tumor', 'tumor genesis', 'adenoma', 'carcinoma', 'mouse']

print('Computing filters embeddings embeddings')
# Each list is built from its own filter terms. The two assignments used to be crossed,
# so the "negative" scores were in fact computed against the positive terms and vice versa.
positive_filters_embeddings = [l2norm(e) for e in batch_texts_embeddings(positive_filters)]
negative_filters_embeddings = [l2norm(e) for e in batch_texts_embeddings(negative_filters)]

# One (pmid, score) pair per chunk_idx entry.
# Negative score: the highest dot product with any negative term.
negative_filters_scores = [(pmid, max(np.dot(e, ne) for ne in negative_filters_embeddings))
                           for (pmid, _), e in zip(chunk_idx, embeddings)]
# Positive score: the lowest dot product across all positive terms.
positive_filters_scores = [(pmid, min(np.dot(e, pe) for pe in positive_filters_embeddings))
                           for (pmid, _), e in zip(chunk_idx, embeddings)]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Score values only, without the pmids.
ns = [score for _, score in negative_filters_scores]
ps = [score for _, score in positive_filters_scores]

plt.figure(figsize=(15, 4))
axes = [plt.subplot(1, 3, i + 1) for i in range(3)]

# Panels 1 and 2: score distributions for the negative and the positive filters.
for ax, scores, title in zip(axes, (ns, ps), ('Negative filters', 'Positive filters')):
    sns.histplot(scores, kde=True, ax=ax)
    ax.set_title(title)

# Panel 3: positive vs negative score of every chunk, with marginal rug marks.
ax = axes[2]
sns.scatterplot(x=ns, y=ps, ax=ax)
sns.rugplot(x=ns, y=ps, height=.1, alpha=0.01, ax=ax)
ax.set_title('Positive filters vs negative filters')
ax.set_xlabel('Negative filters')
ax.set_ylabel('Positive filters')

plt.show()

In [None]:
# Thresholds applied to the per-chunk filter scores.
max_negative_filter_score = 0.1
min_positive_filter_score = 0.05

# There is one score per chunk_idx entry, so a publication with several passing chunks
# would otherwise be listed several times. dict.fromkeys drops the repeats while keeping
# the first-seen order; a publication is kept as soon as any of its chunks passes.
filtered_ids = list(dict.fromkeys(
    pmid for (pmid, ps), (_, ns) in zip(positive_filters_scores, negative_filters_scores)
    if ps > min_positive_filter_score and ns < max_negative_filter_score
))

filtered_publications = publications_db_connector.load_publications(filtered_ids)
filtered_publications['title']

# Visualization of semantic search results

Launch the fasttext endpoint API so that the analyzer can use it:
  ```
  conda activate pubtrends
  export PYTHONPATH=$PYTHONPATH:$(pwd)
  python pysrc/fasttext/fasttext_app.py
  ```

In [None]:
from pysrc.papers.db.pm_postgres_loader import PubmedPostgresLoader
from pysrc.papers.analyzer import PapersAnalyzer

# Loader and analyzer are both built from the notebook-wide `config`.
loader = PubmedPostgresLoader(config)
analyzer = PapersAnalyzer(loader, config)

In [None]:
# Analyze the filtered publications; the second argument (5) is passed as-is
# (its meaning is defined by PapersAnalyzer.analyze_papers — TODO confirm).
try:
    analyzer.analyze_papers(filtered_ids, 5)
finally:
    # Cleanup runs whether or not the analysis succeeded.
    loader.close_connection()
    analyzer.teardown()

In [None]:
from bokeh.plotting import show
from pysrc.papers.plot.plotter import Plotter

# Set the analyzer's search_ids to the filtered ids before building the plotter.
analyzer.search_ids = filtered_ids
plotter = Plotter(config, analyzer)


In [None]:
# show(plotter.plot_top_cited_papers())

In [None]:
# Render the papers graph figure with bokeh.
show(plotter.plot_papers_graph())

In [None]:
# Render the topics hierarchy (with keywords) figure with bokeh.
show(plotter.topics_hierarchy_with_keywords())