In [1]:
import pandas as pd
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from tqdm.auto import tqdm

In [2]:
# CONFIG: values a reader may want to tune
COLLECTION_NAME = "nlp2025_chunks"
VECTOR_SIZE = 1024
BATCH_SIZE = 500

# Connect to the local Qdrant instance
client = QdrantClient("localhost", port=6333)

In [3]:
# 1. Start from a clean slate: drop the collection if it already exists
collection_already_present = client.collection_exists(collection_name=COLLECTION_NAME)
if collection_already_present:
    client.delete_collection(collection_name=COLLECTION_NAME)
    print(f"Коллекция '{COLLECTION_NAME}' удалена.")

# 2. Create it again, with cosine distance over VECTOR_SIZE-dimensional vectors
vector_params = VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=vector_params,
)
print(f"Коллекция '{COLLECTION_NAME}' успешно создана! Готовы к заливке.")

Коллекция 'nlp2025_chunks' успешно создана! Готовы к заливке.


In [4]:
# Path to the Parquet file with chunk texts, source paths, metadata and embeddings
parquet_file = "../data/processed/all_chunks_with_embeddings.parquet"

print("Читаем Parquet файл...")
df = pd.read_parquet(parquet_file)
print(f"Загружено строк: {len(df)}")

total_records = len(df)

# Upload in fixed-size slices of BATCH_SIZE rows. Iterating the needed columns
# with zip avoids building a pandas Series per row, which is what df.iterrows()
# does and which is slow on a frame of this size.
print("Начинаем заливку в Qdrant...")
with tqdm(total=total_records, desc="Uploading") as progress:
    for batch_start in range(0, total_records, BATCH_SIZE):
        batch_df = df.iloc[batch_start:batch_start + BATCH_SIZE]

        points_batch = [
            PointStruct(
                id=point_id,  # the DataFrame index label is used as the point ID
                # numpy array -> list of Python floats (via float32) so the
                # vector can be serialized in the request
                vector=embedding.astype(np.float32).tolist(),
                payload={
                    "text": text,
                    "source": source_path,
                    # Unpack the remaining metadata, if any. It comes last, so
                    # its keys take precedence over "text"/"source" on a clash.
                    **(metadata if metadata else {}),
                },
            )
            for point_id, embedding, text, source_path, metadata in zip(
                batch_df.index,
                batch_df["embedding"],
                batch_df["text"],
                batch_df["source_path"],
                batch_df["metadata"],
            )
        ]

        # Intermediate batches are sent without waiting (faster). The final
        # batch always waits, including when the row count is an exact multiple
        # of BATCH_SIZE; previously that case had no waited-on upsert at all.
        # NOTE(review): this assumes, as the original did, that a confirmed last
        # write implies the earlier queued batches were applied too.
        is_last_batch = batch_start + BATCH_SIZE >= total_records
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=points_batch,
            wait=is_last_batch,
        )
        progress.update(len(points_batch))

print("Заливка завершена!")

Читаем Parquet файл...
Загружено строк: 767801
Начинаем заливку в Qdrant...


Uploading:   0%|          | 0/767801 [00:00<?, ?it/s]

Заливка завершена!


In [5]:
# Smoke-test search: use the first embedding of the dataset as the query vector
test_vector = df["embedding"].iloc[0].astype(np.float32).tolist()

hits = client.query_points(collection_name=COLLECTION_NAME, query=test_vector, limit=3)

print("\nТестовый поиск (Top-3):")
for point in hits.points:
    print(f"ID: {point.id}, Score: {point.score}")
    # Only print the payload when the point has a non-empty one
    if point.payload:
        print(f"Payload: {point.payload}")
    print("---")


Тестовый поиск (Top-3):
ID: 0, Score: 1.0
Payload: {'text': 'Labels Generated by Large Language Models Help Measure People’s Empathy in Vitro: ###### Abstract  \nLarge language models (LLMs) have revolutionised many fields, with LLM-as-a-service (LLMSaaS) offering accessible, general-purpose solutions without costly task-specific training. In contrast to the widely studied prompt engineering for directly solving tasks (in vivo), this paper explores LLMs’ potential for in-vitro applications: using LLM-generated labels to improve supervised training of mainstream models. We examine two strategies – (1) noisy label correction and (2) training data augmentation – in empathy computing, an emerging task to predict psychology-based questionnaire outcomes from inputs like textual narratives. Crowdsourced datasets in this domain often suffer from noisy labels that misrepresent underlying empathy. We show that replacing or supplementing these crowdsourced labels with LLM-generated labels, devel

In [6]:
# Synthetic query vector: VECTOR_SIZE values drawn uniformly from [-1, 1).
# A dedicated seeded generator keeps the vector (and therefore the search
# results below) identical across Restart & Run All.
SYNTHETIC_VECTOR_SEED = 42
synthetic_vector = (
    np.random.default_rng(SYNTHETIC_VECTOR_SEED)
    .uniform(low=-1.0, high=1.0, size=(VECTOR_SIZE,))
    .tolist()
)

In [7]:
# Query the collection with the synthetic vector: top 5 hits, returning
# payloads but not the stored vectors.
synthetic_query_args = {
    "collection_name": COLLECTION_NAME,
    "query": synthetic_vector,
    "limit": 5,
    "with_payload": True,
    "with_vectors": False,
}
search_result = client.query_points(**synthetic_query_args)

In [8]:
# Display the raw response of the synthetic-vector query (last expression of the cell)
search_result

QueryResponse(points=[ScoredPoint(id=502714, version=1006, score=0.14965864, payload={'text': "What makes an entity salient in discourse? > 2 Defining salience > 2.3 Graded summary-based salience: Boswijk and Coler ([2020](https://arxiv.org/html/2508.16464v1#bib.bib5); 713) point out that ``even when definitions [of salience] are given, they are often circular'' – the main issue issue responsible for this circularity in their analysis is reliance on text-internal criteria to identify salient mentions (e.g.subjecthood in Centering Theory or definiteness in the Mental Salience framework). These criteria interfere with, and in some cases preclude using corpus data to identify correlates of salience, as these may end up being the very features that define the term in that approach. Similarly, markedness or surprisal based approaches, while not tied to a specific marker such as definite articles or pronouns, also fall into the class of text-based definitions, because they are a direct funct