In [4]:
import os

import numpy as np
import psycopg2
from sentence_transformers import SentenceTransformer
# from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Instantiate the checkpoint as the cell's last expression so the notebook
# renders its module stack (the repr shown in the cell output).
SentenceTransformer(model_name_or_path="intfloat/multilingual-e5-large")


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [6]:
# Checkpoint id kept in one named constant so it can be changed in one place.
MODEL_NAME = "intfloat/multilingual-e5-large"

# Shared encoder instance; get_embedding() reads this module-level name.
model = SentenceTransformer(MODEL_NAME)

In [22]:
def get_embedding(text: str, prefix: str = "query") -> list:
    """Return the vector representation of ``text`` as a plain Python list.

    The text is handed to the module-level ``model`` as ``"<prefix>: <text>"``
    and encoded with ``normalize_embeddings=True``.

    Args:
        text: Raw text to embed.
        prefix: Role marker placed in front of the text. The default
            ``"query"`` reproduces the previous hard-coded behaviour.
            NOTE(review): the E5 model card uses ``"passage"`` for documents
            that are indexed for retrieval -- confirm before switching, since
            rows already stored were embedded with ``"query"``.

    Returns:
        The embedding converted with ``.tolist()``.
    """
    # E5-style role prefix in front of the raw text.
    formatted_text = f"{prefix}: {text}"
    embedding = model.encode(formatted_text, normalize_embeddings=True)
    return embedding.tolist()

def add_text_to_db(conn, text: str):
    """Insert ``text`` and its embedding into the ``texts`` table.

    The embedding comes from ``get_embedding(text)``. The insert is committed
    on success; on any error the transaction is rolled back before the
    exception is re-raised, so ``conn`` stays usable for further statements.

    Args:
        conn: Open DB-API connection (psycopg2 in this notebook).
        text: Text to store.

    Raises:
        Exception: Whatever the driver raised during the insert or commit.
    """
    embedding = get_embedding(text)
    try:
        with conn.cursor() as cur:
            cur.execute(
                "INSERT INTO texts (text, embedding) VALUES (%s, %s)",
                (text, embedding),
            )
        conn.commit()
    except Exception:
        # Without a rollback the connection stays in an aborted transaction
        # and every later statement on it fails.
        conn.rollback()
        raise

def search_similar_texts(conn, query: str, top_k: int = 5):
    """Return the ``top_k`` stored texts closest to ``query`` by ``<=>`` distance.

    The query is embedded with ``get_embedding`` and compared against the
    ``embedding`` column of ``texts``.

    Args:
        conn: Open DB-API connection (psycopg2 in this notebook).
        query: Free-text search query.
        top_k: Maximum number of rows to return.

    Returns:
        ``cur.fetchall()`` for the query: ``(text, similarity)`` rows where
        ``similarity`` is ``1 - (embedding <=> query_vector)``, closest first.
    """
    query_embedding = get_embedding(query)
    # pgvector text form: "[v1,v2,...]". Sent as a bound parameter instead of
    # being spliced into the SQL string.
    vector_literal = "[" + ",".join(map(str, query_embedding)) + "]"

    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT text, 1 - (embedding <=> %(vec)s::vector) AS similarity
            FROM texts
            ORDER BY embedding <=> %(vec)s::vector
            LIMIT %(top_k)s
            """,
            {"vec": vector_literal, "top_k": top_k},
        )
        # Ordering by ascending distance gives the same ranking as
        # "similarity DESC" but keeps rows with a NULL embedding last and is
        # the form pgvector can serve from an index.
        results = cur.fetchall()

    return results

def search_with_bm25(conn, query: str, top_k: int = 5, config: str = "english"):
    """Full-text search over ``texts.tsv``, ranked with PostgreSQL ``ts_rank``.

    Despite the function name, the score is ``ts_rank``, not a BM25
    implementation.

    The raw query is passed to ``plainto_tsquery``, which ANDs the words
    together (the same intent as the previous hand-built ``"a & b"`` string)
    but does not raise a syntax error when the text contains punctuation or
    tsquery operators.

    Args:
        conn: Open DB-API connection (psycopg2 in this notebook).
        query: Free-text search query.
        top_k: Maximum number of rows to return.
        config: Text search configuration name. It should match the one used
            to build ``tsv``; the default ``"english"`` is the value that was
            previously hard-coded.

    Returns:
        ``cur.fetchall()`` for the query: ``(text, rank)`` rows, highest rank
        first. A blank query returns an empty list without touching the
        database.
    """
    if not query.strip():
        # A blank query has no lexemes and cannot match any row.
        return []

    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT text,
                   ts_rank(tsv, plainto_tsquery(%(config)s::regconfig, %(query)s)) AS rank
            FROM texts
            WHERE tsv @@ plainto_tsquery(%(config)s::regconfig, %(query)s)
            ORDER BY rank DESC
            LIMIT %(top_k)s;
            """,
            {"config": config, "query": query, "top_k": top_k},
        )
        results = cur.fetchall()

    return results

def add_text_to_db_with_bm25(conn, text: str, config: str = "english"):
    """Insert ``text`` with both its embedding and its ``tsvector``.

    The embedding comes from ``get_embedding(text)``; the ``tsv`` column is
    filled with ``to_tsvector(config, text)``. The insert is committed on
    success; on any error the transaction is rolled back before the exception
    is re-raised, so ``conn`` stays usable for further statements.

    Args:
        conn: Open DB-API connection (psycopg2 in this notebook).
        text: Text to store.
        config: Text search configuration name used to build ``tsv``. The
            default ``"english"`` is the value that was previously hard-coded.

    Raises:
        Exception: Whatever the driver raised during the insert or commit.
    """
    embedding = get_embedding(text)
    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO texts (text, embedding, tsv)
                VALUES (%s, %s, to_tsvector(%s::regconfig, %s));
                """,
                (text, embedding, config, text),
            )
        conn.commit()
    except Exception:
        # Without a rollback the connection stays in an aborted transaction
        # and every later statement on it fails.
        conn.rollback()
        raise


In [9]:
# Connection settings for psycopg2.connect(). Each value is read from an
# environment variable (named after the usual PG* variables) so real
# credentials do not have to be committed with the notebook; the fallbacks
# are the local development values that were previously hard-coded.
DB_PARAMS = {
    "dbname": os.environ.get("PGDATABASE", "vector_db"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "postgres"),
    "host": os.environ.get("PGHOST", "localhost"),
    "port": os.environ.get("PGPORT", "5436"),
}

In [31]:
conn = psycopg2.connect(**DB_PARAMS)
try:
    # Optional one-off seeding step: uncomment to insert a sample row first.
    # text = "This is a multilingual test example."
    # print(f"Adding text: {text}")
    # add_text_to_db_with_bm25(conn, text)

    # Full-text search demo.
    query = "Татарникова"
    print(f"Searching for texts using BM25 for: {query}")
    results = search_with_bm25(conn, query)
    for rank, (text, score) in enumerate(results, start=1):
        print(f"{rank}. {text} (score: {score:.4f})")
finally:
    # Close the connection even if the search or printing raises, so a failed
    # run does not leave an open session on the server.
    conn.close()


Searching for texts using BM25 for: Татарникова
1. Татарникова деректер 4 института, она отвечает почти за все вопросы в универе (score: 0.0608)
