In [1]:
import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from pathlib import Path

In [2]:
# --- CONFIG ---
# Model identifier handed to SentenceTransformer when the model is loaded.
MODEL_ID = "Qwen/Qwen3-Embedding-0.6B"
# Input chunk table, read with Dataset.from_parquet; the path is relative to the working directory.
INPUT_PARQUET = Path("../data/processed/all_chunks.parquet")
# Destination parquet file for the dataset once embeddings have been computed.
OUTPUT_PARQUET = Path("../data/processed/all_chunks_with_embeddings.parquet")
# Rows per Dataset.map batch; compute_embeddings encodes each such batch in a single model.encode call.
BATCH_SIZE = 32

In [3]:
# Load the embedding model named by MODEL_ID.
# model_kwargs / tokenizer_kwargs are passed to SentenceTransformer as written:
# dtype bfloat16, the "flash_attention_2" attention implementation, and
# left-side padding for the tokenizer.
model = SentenceTransformer(
    MODEL_ID,
    device="cuda",  # hard-coded GPU device; this notebook has no CPU fallback
    model_kwargs={
        "dtype": torch.bfloat16,
        "attn_implementation": "flash_attention_2",
        # NOTE(review): this flag permits code shipped in the model repo to be
        # executed locally; presumably not required for this model -- confirm
        # and drop it if so.
        "trust_remote_code": True
    },
    tokenizer_kwargs={"padding_side": "left"}
)

In [4]:
def compute_embeddings(batch):
    """Embed the ``"text"`` entries of one batch with the module-level ``model``.

    Written to be used as the batched callback of ``Dataset.map``.

    Args:
        batch: Mapping with a ``"text"`` entry holding the strings to embed.

    Returns:
        A dict with the single key ``"embedding"`` whose value is whatever
        ``model.encode`` returns for those strings (NumPy output and
        normalized embeddings are requested).
    """
    texts = batch["text"]
    encode_options = dict(
        # Use the incoming batch length so the whole batch is encoded in one pass.
        batch_size=len(texts),
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    vectors = model.encode(texts, **encode_options)
    return {"embedding": vectors}

In [5]:
# Load the chunk table from INPUT_PARQUET into a `datasets.Dataset`.
ds = Dataset.from_parquet(str(INPUT_PARQUET))

In [6]:
# Run compute_embeddings over the whole dataset, BATCH_SIZE rows per call.
# The callback returns an "embedding" field for every batch, so the mapped
# dataset carries the vectors alongside the original data.
print(f"Начинаем векторизацию {len(ds)} чанков...")  # "Starting vectorization of N chunks..."
ds_with_embeddings = ds.map(
    compute_embeddings,
    batched=True,
    batch_size=BATCH_SIZE,
    desc="Embedding generation"  # label shown on the progress bar
)

Начинаем векторизацию 767801 чанков...


Embedding generation:   0%|          | 0/767801 [00:00<?, ? examples/s]

In [7]:
# Write the dataset with embeddings to OUTPUT_PARQUET.
print(f"Сохранение в {OUTPUT_PARQUET}...")  # "Saving to <path>..."
ds_with_embeddings.to_parquet(str(OUTPUT_PARQUET))
print("Готово.")  # "Done."

Сохранение в ..\data\processed\all_chunks_with_embeddings.parquet...


Creating parquet from Arrow format:   0%|          | 0/44 [00:00<?, ?ba/s]

Готово.
