In [1]:
import os

import faiss
import numpy as np
import torch
from datasets import load_dataset, load_dataset_builder
from huggingface_hub import HfApi, hf_hub_download, upload_file
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm


In [4]:
# Tunables for the whole notebook.
REPO_ID     = "GingerBled/RAG_corpus_docs"   # Hub dataset repo: source of the chunks, destination of the index
CHUNK_FILE  = "rag_build/chunks.parquet"     # NOTE(review): not referenced by any visible cell - confirm it is still needed
OUT_INDEX   = "index.faiss"                  # local filename of the serialized FAISS index
OUT_IDMAP   = "id_map.npy"                   # local filename of the row-position -> chunk-id array
BATCH       = 256                            # number of texts passed to the encoder per call

# ---- 1. Load dataset streaming ------------------------------------------
ds = load_dataset(REPO_ID, split="train", streaming=True)

# Read the row count from the split metadata instead of materializing the
# whole split a second time, which would defeat the streaming load above.
# `total` only feeds the progress bar, so fall back to None (tqdm then shows
# a plain counter) when the repo does not publish split sizes.
_splits = load_dataset_builder(REPO_ID).info.splits
total = _splits["train"].num_examples if _splits and "train" in _splits else None


train-00000-of-00001.parquet:   0%|          | 0.00/168M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/484987 [00:00<?, ? examples/s]

In [5]:
# Sanity check: display the train-split row count that is later passed to tqdm as `total`.
total

484987

In [7]:
# ---- 2. Load encoder -----------------------------------------------------
# Prefer the GPU, but fall back to CPU so the cell does not crash on a machine
# without CUDA (encoding will simply be much slower there).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("BAAI/bge-large-en-v1.5", device=DEVICE)
model.max_seq_length = 512  # cap the encoder's input length at 512 tokens

# Accumulators filled by the embedding cell: per-batch vectors and the chunk
# ids in the same order.
embeds, id_map = [], []

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [8]:
# ---- 3. Embed ------------------------------------------------------------
def encode_batch(texts):
    """Encode a list of strings with the global `model`.

    Returns the embeddings as a float16 numpy array (one row per input text)
    to halve the memory needed while the whole corpus is accumulated.
    """
    vecs = model.encode(
        texts,
        convert_to_numpy=True,
        batch_size=len(texts),
        show_progress_bar=False,  # the outer tqdm already reports progress
    )
    return vecs.astype("float16")


# Start from empty accumulators every time this cell runs, so re-running it
# (e.g. after an interrupt) cannot duplicate rows or trip over `embeds`
# having already been turned into an ndarray by a previous run.
embed_chunks, id_map = [], []
batch_txt, batch_ids = [], []
for ex in tqdm(ds, total=total, desc="encoding"):
    batch_txt.append(ex["text"])
    batch_ids.append(ex["id"])
    if len(batch_txt) == BATCH:
        embed_chunks.append(encode_batch(batch_txt))
        id_map.extend(batch_ids)
        batch_txt, batch_ids = [], []

# flush tail (the last, partially filled batch)
if batch_txt:
    embed_chunks.append(encode_batch(batch_txt))
    id_map.extend(batch_ids)

if not embed_chunks:
    raise ValueError("The dataset yielded no rows; there is nothing to embed.")

embeds = np.concatenate(embed_chunks, axis=0)
del embed_chunks  # the per-batch arrays are no longer needed once merged
assert embeds.shape[0] == len(id_map), "embedding rows and ids are out of sync"

# Store ids as fixed-width byte strings. A hard-coded "S40" silently cuts off
# anything longer than 40 bytes and cannot hold non-ASCII ids, so encode
# explicitly as UTF-8 and widen the field when an id needs more room
# (the width never drops below the original 40).
id_bytes = [i if isinstance(i, bytes) else str(i).encode("utf-8") for i in id_map]
id_width = max(40, max(len(b) for b in id_bytes))
np.save(OUT_IDMAP, np.array(id_bytes, dtype=f"S{id_width}"))  # bytes strings


encoding:   0%|          | 0/484987 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# ---- 4. Build FAISS HNSW -------------------------------------------------
# `embeds` is kept in float16 to save memory, but FAISS operates on
# C-contiguous float32 matrices. Convert explicitly instead of relying on the
# installed FAISS wrapper to do (or refuse) the conversion.
vectors = np.ascontiguousarray(embeds, dtype="float32")

d = vectors.shape[1]                 # embedding dimensionality
index = faiss.IndexHNSWFlat(d, 32)   # second argument: HNSW links per node (M)
index.hnsw.efConstruction = 200      # build-time search depth; must be set before add()
index.add(vectors)
del vectors                          # the index holds its own copy of the data

# Row i of the index must correspond to id_map[i]; refuse to write an index
# that is out of step with the saved id map.
assert index.ntotal == len(id_map), (
    f"index holds {index.ntotal} vectors but id_map has {len(id_map)} entries"
)

faiss.write_index(index, OUT_INDEX)
print(f"[DONE] Index size: {index.ntotal:,} vectors")

In [None]:
# ---- 5. Push to repo -----------------------------------------------------
api = HfApi()

# Check that both artifacts exist before uploading anything, so a missing file
# cannot leave the repo with an index but no matching id map (or vice versa).
artifacts = [OUT_INDEX, OUT_IDMAP]
missing = [f for f in artifacts if not os.path.isfile(f)]
if missing:
    raise FileNotFoundError(f"Build artifacts not found, run the earlier cells first: {missing}")

# ensure LFS pointers by uploading via HF Hub helper
for f in artifacts:
    api.upload_file(
        path_or_fileobj=f,
        path_in_repo=f"index/{f}",
        repo_id=REPO_ID,
        repo_type="dataset",
    )
print(f"✅  Uploaded index files to https://huggingface.co/datasets/{REPO_ID}/tree/main/index")