# In [None]:  (notebook cell marker left over from the export)
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_core.documents import Document
import json, os

# Build a persistent Chroma vector store from a JSON file of legal articles.
# The script runs top to bottom: load the embedding model, read the JSON,
# wrap each entry in a LangChain Document, embed and store everything in
# Chroma, then report where the database lives on disk.

# === Settings ===
DATA_PATH = "laws_processed.json"  # JSON file read in step 2
CHROMA_PATH = "chroma_db"  # Persistent storage folder
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# === Step 1: Load the embedding model ===
print("🔹 Loading embedding model...")
embedding_model = SentenceTransformerEmbeddings(model_name=MODEL_NAME)
print("✅ Model loaded successfully.")

# === Step 2: Load preprocessed data ===
# The loop in step 3 iterates this value and indexes each item by key, so it
# is presumably a list of dicts -- confirm against the file that produces it.
print(f"🔹 Loading data from {DATA_PATH}...")
with open(DATA_PATH, "r", encoding="utf-8") as f:
    laws_data = json.load(f)
print(f"✅ Loaded {len(laws_data)} documents.")

# === Step 3: Convert to LangChain Documents ===
# "article_text" is required (a missing key raises KeyError); the metadata
# fields are optional and fall back to an empty string.
print("🔹 Converting to LangChain Document objects...")
docs = [
    Document(
        page_content=entry["article_text"],
        metadata={
            "law_name": entry.get("law_name", ""),
            "article_title": entry.get("article_title", ""),
            "category": entry.get("category", ""),
            "subcategory": entry.get("subcategory", ""),
        },
    )
    for entry in laws_data
]
print(f"✅ Converted {len(docs)} documents.")

# === Step 4: Embed the documents and store them in ChromaDB ===
# This always calls `from_documents`; it never loads an existing database.
# NOTE(review): if CHROMA_PATH already holds a collection from an earlier
# run, this presumably adds the same documents again -- confirm, and clear
# the folder (or load the existing store) before re-running if so.
print("🔹 Creating persistent Chroma database...")
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    persist_directory=CHROMA_PATH
)
print("✅ Embeddings created and stored successfully!")

# === Step 5: Persist data on disk ===
# NOTE(review): newer Chroma releases reportedly persist automatically and
# treat this call as deprecated -- verify against the installed version.
vectorstore.persist()
print(f"💾 Database saved at: {os.path.abspath(CHROMA_PATH)}")

# The f-prefix is required here: without it the literal text "{len(docs)}"
# was printed instead of the document count.
print(f"\n🎉 All done! Your ChromaDB now contains all {len(docs)} legal articles ready for retrieval.")
