# Semantic search with fastText embeddings and PostgreSQL

Create a PostgreSQL database with the pgvector extension:
```
docker run --rm --name pubtrends-postgres -p 5432:5432 \
        -m 32G \
        -e POSTGRES_USER=biolabs -e POSTGRES_PASSWORD=mysecretpassword \
        -e POSTGRES_DB=pubtrends \
        -v ~/postgres/:/var/lib/postgresql/data \
        -e PGDATA=/var/lib/postgresql/data/pgdata \
        -d pgvector/pgvector:pg17
```


In [None]:
import logging
import pandas as pd
from tqdm.auto import tqdm
import os
import psycopg2

# Timestamped, INFO-level logging shared by the whole notebook
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
logger = logging.getLogger('notebook')

# IPython magics: render matplotlib figures inline, at retina resolution
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Connection to the main PubTrends database

In [None]:
# Connection parameters of the main PubTrends database (source of publications).
# NOTE(review): the host is an empty placeholder and must be filled in before
# running; an unquoted empty `host=` in a libpq connection string is likely to
# be misparsed (libpq expects empty values to be written as '') - confirm.
full_db_host = ''
full_db_port = 5432
full_db_database = 'pubtrends'
full_db_username = 'biolabs'
full_db_password = 'mysecretpassword'

# libpq keyword=value connection string. The trailing backslashes inside the
# f-string join the lines, so the result is a single line of space-separated pairs.
connection_string_full_db = f"""
                    host={full_db_host} \
                    port={full_db_port} \
                    dbname={full_db_database} \
                    user={full_db_username} \
                    password={full_db_password}
                """.strip()

In [None]:
def load_publications(pids):
    """Load id, title, abstract and year of the given publications from the main DB.

    :param pids: iterable of integer PubMed ids; may be empty
    :return: pandas DataFrame with columns ``id``, ``title``, ``abstract``, ``year``
             (dtype ``object``), one row per publication found. Row order is
             whatever the database returns.
    """
    columns = ['id', 'title', 'abstract', 'year']
    pids = list(pids)
    if not pids:
        # Nothing to look up: an empty id list would leave the VALUES clause
        # empty, which PostgreSQL rejects, so skip the round trip entirely.
        return pd.DataFrame(columns=columns, dtype=object)

    # ints_to_vals is the project helper from pysrc.papers.db.postgres_utils
    # (imported in a later cell); presumably it renders the ids as a VALUES list.
    vals = ints_to_vals(pids)
    query = f'''
                SELECT P.pmid as id, title, abstract, year
                FROM PMPublications P
                WHERE P.pmid IN (VALUES {vals});
                '''
    connection = psycopg2.connect(connection_string_full_db)
    try:
        connection.set_session(readonly=True)
        # `with connection` only ends the transaction; the explicit close()
        # below is what actually releases the connection.
        with connection, connection.cursor() as cursor:
            cursor.execute(query)
            rows = cursor.fetchall()
    finally:
        connection.close()
    return pd.DataFrame(rows, columns=columns, dtype=object)


In [None]:
def load_publications_year(year):
    """Load id, title and abstract of all publications of one year from the main DB.

    :param year: publication year (anything ``int()`` accepts)
    :return: pandas DataFrame with columns ``id``, ``title``, ``abstract``
             (dtype ``object``)
    """
    # The year is passed as a bound parameter instead of being formatted into the SQL
    query = '''
                SELECT P.pmid as id, title, abstract
                FROM PMPublications P
                WHERE year = %s;
                '''
    connection = psycopg2.connect(connection_string_full_db)
    try:
        connection.set_session(readonly=True)
        # `with connection` only ends the transaction; close() releases the connection
        with connection, connection.cursor() as cursor:
            # int() also covers numpy integers, which psycopg2 cannot adapt directly
            cursor.execute(query, (int(year),))
            rows = cursor.fetchall()
    finally:
        connection.close()
    return pd.DataFrame(rows, columns=['id', 'title', 'abstract'], dtype=object)

In [None]:
# Smoke test: preview the first 10 publications loaded for 2025
load_publications_year(2025).head(10)

# Embeddings with fasttext

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

def universal_chunk(text, max_tokens=64, overlap_sentences=1):
    """Split text into chunks of whole sentences of at most ``max_tokens`` words.

    Sentences are detected with the module-level spaCy pipeline ``nlp``; tokens
    are counted as whitespace-separated words. Consecutive chunks share the last
    ``overlap_sentences`` sentences when the next sentence still fits next to
    them. A sentence longer than ``max_tokens`` is cut into pieces of
    ``max_tokens`` words each.

    :param text: text to split
    :param max_tokens: maximum number of whitespace-separated words per chunk
    :param overlap_sentences: number of trailing sentences carried over into the
                              next chunk; 0 disables the overlap
    :return: list of chunk strings (empty list for text without sentences)
    :raises ValueError: if ``max_tokens`` is smaller than 1
    """
    if max_tokens < 1:
        raise ValueError('max_tokens must be a positive integer')

    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]

    chunks = []
    current_chunk = []
    current_tokens = 0

    i = 0
    while i < len(sentences):
        sentence = sentences[i]
        words = sentence.split()
        sentence_tokens = len(words)

        if current_tokens + sentence_tokens <= max_tokens:
            current_chunk.append(sentence)
            current_tokens += sentence_tokens
            i += 1
            continue

        # The sentence does not fit: flush what has been collected so far.
        # Guarding against an empty chunk avoids emitting "" when an oversized
        # sentence arrives before anything was collected.
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        # Retain overlap
        current_chunk = current_chunk[-overlap_sentences:] if overlap_sentences else []
        current_tokens = sum(len(s.split()) for s in current_chunk)

        if current_tokens + sentence_tokens <= max_tokens:
            # Fits next to the overlap; the next iteration appends it
            continue

        # Still does not fit, so the sentence is always consumed below.
        # Advancing `i` unconditionally is what prevents the loop from spinning
        # forever when there is no overlap to drop.
        if current_chunk:
            # The retained overlap is emitted as its own chunk
            chunks.append(" ".join(current_chunk))

        if sentence_tokens > max_tokens:
            # The sentence itself is too long: cut it into max_tokens-word pieces
            current_chunk = []
            current_tokens = 0
            for j in range(0, len(words), max_tokens):
                chunks.append(" ".join(words[j:j + max_tokens]))
        else:
            # Not too long on its own: start a new chunk with just this sentence
            current_chunk = [sentence]
            current_tokens = sentence_tokens
        i += 1

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [None]:
# Sample abstract used to try out chunking and embedding in the cells below
text = "Staphylococcus aureus is a rare cause of postinfectious glomerulonephritis, and Staphylococcus-related glo-merulonephritis primarily occurs in middle-aged or elderly patients. Patients with Staphylococcus-related glomerulonephritis also present with hematuria, proteinuria of varying degrees, rising serum creatinine levels, and/or edema. The severity of renal insufficiency is proportional to the degree of proliferation and crescent formation. Here, we present a diabetic patient admitted with a history of 1 week of left elbow pain. Laboratory results revealed that erythrocyte sedimentation rate was 110 mm/hour, serum creatinine level was 1 mg/dL, C-reactive protein level was 150 mg/L, and magnetic resonance imaging showed signal changes in favor of osteomyelitis at the olecranon level, with diffuse edematous appearance in the elbow skin tissue and increased intra-articular effusion. After diagnosis of osteomyelitis, ampicillin/sulbactam and teicoplanin were administered. After day 7 of admission, the patient developed acute kidney injury requiring hemodialysis under antibiotic treatment. Kidney biopsy was performed to determine the underlying cause, which showed Staphylococcus-related glomerulonephritis. Recovery of renal func-tions was observed after antibiotic and supportive treatment."
# Show the chunks produced with the default max_tokens / overlap settings
universal_chunk(text)

In [None]:
from pysrc.fasttext.fasttext import PRETRAINED_MODEL_CACHE

# The two log calls below bracket no work: nothing is computed between them
logger.info('Compute words embeddings using pretrained fasttext model')
# Model will be loaded when needed through the lazy property
logger.info('Done')

In [None]:
import numpy as np


def tokens_embeddings_fasttext(text):
    """Embed text as the mean of the fastText vectors of its whitespace tokens.

    Tokens missing from the model vocabulary contribute a zero vector. Text
    without any token yields an all-zero vector, so the result always has the
    model's dimensionality.

    :param text: text to embed
    :return: list of floats of length ``model.vector_size``
    """
    # Access the model correctly as a lazy property
    # NOTE(review): the object looks like gensim-style keyed vectors
    # (get_vector / has_index_for / vector_size) - confirm against pysrc.fasttext.
    model_instance = PRETRAINED_MODEL_CACHE.download_and_load_model
    tokens = text.split()
    if not tokens:
        # np.mean over an empty list would return a scalar NaN instead of a vector
        return np.zeros(model_instance.vector_size).tolist()
    return np.mean([
        model_instance.get_vector(t) if model_instance.has_index_for(t)
        else np.zeros(model_instance.vector_size)  # Support out-of-dictionary missing embeddings
        for t in tokens
    ], axis=0).tolist()


In [None]:
# Embedding dimensionality for the sample text; the embeddings table created
# further down declares vector(200), so the two presumably have to agree
len(tokens_embeddings_fasttext(text))

# Prepare PostgreSQL + pgvector for embedding search

In [None]:
# Connection parameters of the PostgreSQL + pgvector database that stores embeddings.
# NOTE(review): the host is an empty placeholder and must be filled in before
# running; an unquoted empty `host=` in a libpq connection string is likely to
# be misparsed (libpq expects empty values to be written as '') - confirm.
semantics_search_host = ''
semantics_search_port = 5432
semantics_search_database = 'pubtrends'
semantics_search_username = 'biolabs'
semantics_search_password = 'mysecretpassword'

# libpq keyword=value connection string. The trailing backslashes inside the
# f-string join the lines, so the result is a single line of space-separated pairs.
semantics_search_connection_string = f"""
                    host={semantics_search_host} \
                    port={semantics_search_port} \
                    dbname={semantics_search_database} \
                    user={semantics_search_username} \
                    password={semantics_search_password}
                """.strip()

In [None]:
# (Re)create the PMPublicationsSmall table: pmid, title, abstract.
connection = psycopg2.connect(semantics_search_connection_string)
try:
    connection.set_session(readonly=False)
    query = '''
drop table if exists PMPublicationsSmall;
create table PMPublicationsSmall(
pmid    integer,
title   varchar(1023),
abstract text
);
            '''
    # `with connection` commits on success and rolls back on error; it does not
    # close the connection, hence the explicit close() below.
    with connection, connection.cursor() as cursor:
        cursor.execute(query)
finally:
    connection.close()

In [None]:
# Enable pgvector and (re)create the table holding one embedding per text chunk.
connection = psycopg2.connect(semantics_search_connection_string)
try:
    connection.set_session(readonly=False)
    # vector(200) fixes the embedding dimensionality; it presumably has to match
    # the vector size of the fastText model used to compute the embeddings.
    query = '''
            CREATE EXTENSION IF NOT EXISTS vector;
            drop table if exists PMPublicationsEmbeddings;
            create table PMPublicationsEmbeddings(
                                                pmid    integer,
                                                chunk   integer,
                                                embedding vector(200)
            );
            '''
    # `with connection` commits on success and rolls back on error; it does not
    # close the connection, hence the explicit close() below.
    with connection, connection.cursor() as cursor:
        cursor.execute(query)
finally:
    connection.close()

In [None]:
# Create an index for fast vector similarity search (cosine distance, IVFFlat).
# NOTE(review): pgvector recommends building an IVFFlat index once the table
# already holds data, because the lists are derived from the rows present at
# build time. In this notebook the index is created before any embeddings are
# inserted - consider rebuilding it (REINDEX) after the load.
connection = psycopg2.connect(semantics_search_connection_string)
try:
    connection.set_session(readonly=False)
    # IF NOT EXISTS keeps the cell re-runnable without dropping the table first
    query = '''
            CREATE INDEX IF NOT EXISTS texts_embedding_idx
                ON PMPublicationsEmbeddings
                USING ivfflat (embedding vector_cosine_ops)
                WITH (lists = 100);
            '''
    # `with connection` commits on success and rolls back on error; it does not
    # close the connection, hence the explicit close() below.
    with connection, connection.cursor() as cursor:
        cursor.execute(query)
finally:
    connection.close()

# Compute embeddings for publications

In [None]:
from psycopg2.extras import execute_values

from more_itertools import sliced
# Number of publications embedded and inserted per database round trip
CHUNK_SIZE = 1000

# Embed the title and abstract of every publication, year by year, and store one
# row per text chunk in PMPublicationsEmbeddings.
for year in range(2025, 2024, -1):
    print(f'Processing year {year}')
    df = load_publications_year(year)
    print('Storing embeddings into DB')
    # Materialize the slices so tqdm can show the total number of batches
    index_slices = list(sliced(range(len(df)), CHUNK_SIZE))
    # One connection per year, reused for all batches and closed afterwards
    connection = psycopg2.connect(semantics_search_connection_string)
    try:
        for index_slice in tqdm(index_slices):
            chunk = df.iloc[index_slice]
            chunk_embeddings = []
            for (pid, title, abstract) in zip(chunk['id'], chunk['title'], chunk['abstract']):
                # Publications without a title or an abstract are skipped
                if not title or not abstract:
                    continue
                for i, text_chunk in enumerate(universal_chunk(f'{title}. {abstract}')):
                    chunk_embeddings.append(
                        (pid, i, tokens_embeddings_fasttext(text_chunk)))
            print(f'Storing {len(chunk_embeddings)} embeddings')
            # `with connection` commits each batch (or rolls it back on error)
            with connection, connection.cursor() as cursor:
                execute_values(
                    cursor,
                    "INSERT INTO PMPublicationsEmbeddings (pmid, chunk, embedding) VALUES %s",
                    chunk_embeddings
                )
    finally:
        connection.close()

# Semantic search

In [None]:
def semantic_search(query, k=5):
    """Find the ``k`` stored text chunks closest to the query.

    The query is embedded with ``tokens_embeddings_fasttext`` and compared to the
    stored chunk embeddings with pgvector's ``<=>`` (cosine distance) operator.

    :param query: free-text query
    :param k: maximum number of rows to return
    :return: list of ``(pmid, chunk, distance)`` tuples ordered by ascending distance
    """
    embedding = tokens_embeddings_fasttext(query)
    connection = psycopg2.connect(semantics_search_connection_string)
    try:
        # `with connection` only ends the transaction; close() releases the connection
        with connection, connection.cursor() as cursor:
            cursor.execute("""
                   SELECT pmid, chunk, embedding <=> %s::vector AS distance
                   FROM PMPublicationsEmbeddings
                   ORDER BY distance ASC
                       LIMIT %s
                   """, (embedding, k))

            return cursor.fetchall()
    finally:
        connection.close()

In [None]:
# Example query: the 10 nearest chunks as (pmid, chunk, distance) rows
semantic_search("epigenetic modifications in healthy human aging", 10)

In [None]:
from pysrc.papers.db.postgres_utils import ints_to_vals

def find_papers_semantic_search(query, k):
    """Return the publications behind the ``k`` chunks closest to the query.

    :param query: free-text query
    :param k: number of nearest chunks to retrieve; several chunks may belong to
              the same publication, so fewer than ``k`` rows can come back
    :return: DataFrame produced by ``load_publications``, with rows ordered from
             the best-matching publication to the worst
    """
    search = semantic_search(query, k)
    # Keep every pmid once, in order of its best (first) hit
    pids = list(dict.fromkeys(pid for pid, _, _ in search))
    papers = load_publications(pids)
    # The database returns rows in arbitrary order; restore the relevance order.
    # Ids missing from the rank map sort last; the stable sort keeps their order.
    rank = {pid: position for position, pid in enumerate(pids)}
    return papers.sort_values(
        by='id', key=lambda ids: ids.map(rank), kind='stable'
    ).reset_index(drop=True)


In [None]:
# Example: publications behind the 10 nearest chunks for the query
find_papers_semantic_search("epigenetic modifications in healthy human aging", 10)