In [1]:
import os
import json
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# -------- Input / output locations --------
PROCESSED_TEXT_PATH = "../Data/processed_text/chunks_metadata.json"
EMBEDDINGS_DIR = "../Data/embeddings"
FAISS_INDEX_PATH = os.path.join(EMBEDDINGS_DIR, "faiss_index.bin")
METADATA_PATH = os.path.join(EMBEDDINGS_DIR, "metadata.pkl")

# Make sure the output folder exists before anything is written into it.
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)

# -------- Read the chunk records --------
# Each record is indexed by "content" (and optionally "captions") further down.
with open(PROCESSED_TEXT_PATH, encoding="utf-8") as chunks_file:
    chunks_data = json.load(chunks_file)

# -------- Embedding model --------
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Write a copy of the model to a local folder.
model.save('../models/all-MiniLM-L6-v2')

# -------- 1. Embed text chunks --------
# One text per chunk record, in the same order as chunks_data, so row i of
# text_embeddings corresponds to chunks_data[i].
texts = [record["content"] for record in chunks_data]
print(f"Total chunks to embed: {len(texts)}")

# A single encode() call over the whole list; the result is a NumPy array.
text_embeddings = model.encode(
    texts,
    convert_to_numpy=True,
    show_progress_bar=True,
)

# -------- 2. Embed captions --------
# Collect every caption that carries real text and encode them all in ONE
# model.encode() call. Encoding captions one at a time (a separate call per
# caption) forfeits batching and is far slower on large corpora.
# NOTE: batched and per-item encoding may differ by tiny floating-point noise.
pending_captions = []
for chunk in chunks_data:
    # `or []` also tolerates an explicit `"captions": null` in the JSON.
    for caption in chunk.get("captions") or []:
        caption_text = caption["caption_text"]
        if caption_text and caption_text.lower() != "no caption detected":
            pending_captions.append(caption)
        else:
            # Empty or placeholder captions get no embedding.
            caption["embedding"] = None

# Skip the model call entirely when there is nothing to embed.
if pending_captions:
    caption_embeddings = model.encode(
        [caption["caption_text"] for caption in pending_captions],
        convert_to_numpy=True,
        show_progress_bar=True,
    )
    for caption, caption_emb in zip(pending_captions, caption_embeddings):
        caption["embedding"] = caption_emb.tolist()  # store as list for JSON/Pickle

# -------- 3. Create FAISS index for text chunks --------
# FAISS works on a C-contiguous float32 matrix of shape (n_vectors, dimension);
# normalise the array explicitly instead of relying on the encoder's output dtype.
index_vectors = np.ascontiguousarray(text_embeddings, dtype=np.float32)

# With no chunks the array is not 2-D (or has no rows), and reading shape[1]
# would fail with an opaque IndexError. Fail early with a clear message instead.
if index_vectors.ndim != 2 or index_vectors.shape[0] == 0:
    raise ValueError(
        "No text embeddings to index "
        f"(embedding array has shape {index_vectors.shape})"
    )

dimension = index_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
# Vectors are added in chunks_data order, so FAISS id i maps to chunks_data[i].
index.add(index_vectors)
print(f"FAISS index size: {index.ntotal}")

# -------- 4. Persist the index and the chunk metadata --------
# The FAISS index goes to its own binary file.
faiss.write_index(index, FAISS_INDEX_PATH)

# chunks_data (including the per-caption "embedding" entries added above in
# this script) is pickled next to the index.
with open(METADATA_PATH, "wb") as metadata_file:
    pickle.dump(chunks_data, metadata_file)

print(f"FAISS index saved to {FAISS_INDEX_PATH}")
print(f"Metadata saved to {METADATA_PATH}")

# -------- 5. Reload check --------
# Read both artifacts back from disk and verify that they still line up.
loaded_index = faiss.read_index(FAISS_INDEX_PATH)

# SECURITY NOTE: pickle.load can execute arbitrary code from the file it reads.
# Only load metadata files produced by this pipeline, never untrusted ones.
with open(METADATA_PATH, "rb") as f:
    metadata_loaded = pickle.load(f)

print(f"Reloaded index size: {loaded_index.ntotal}")
print(f"Metadata entries: {len(metadata_loaded)}")

# The index holds one vector per chunk record, so the two counts must match;
# a mismatch means index ids can no longer be mapped back to metadata entries.
if loaded_index.ntotal != len(metadata_loaded):
    raise RuntimeError(
        f"Reload check failed: index has {loaded_index.ntotal} vectors but "
        f"metadata has {len(metadata_loaded)} entries"
    )

Total chunks to embed: 13895


Batches:   0%|          | 0/435 [00:00<?, ?it/s]

FAISS index size: 13895
FAISS index saved to ../Data/embeddings\faiss_index.bin
Metadata saved to ../Data/embeddings\metadata.pkl
Reloaded index size: 13895
Metadata entries: 13895
