In [1]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.schema import Document
from langchain_core.runnables import RunnableLambda
import json
import os

In [2]:

embed_model = FastEmbedEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    cache_dir="./embedding_cache"  # Custom directory for model files
)

  from .autonotebook import tqdm as notebook_tqdm


### Process metadata files

In [3]:
with open("data/vectorstore.json", "r", encoding="utf-8") as f:
    chunks_with_metadata = json.load(f)

In [4]:
doc_splits = [
    Document(
        page_content=chunk["text"],
        metadata={
            "chapter": chunk["metadata"]["chapter"],
            "title": chunk["metadata"]["title"],
            "date": chunk["metadata"]["date"],
            "article": chunk["metadata"]["article"]
        }
    ) for chunk in chunks_with_metadata
]

### save to rag

In [5]:
persist_directory = './real_estate_db/vectorstore'

In [6]:
# delete current rag if create new one
import shutil
shutil.rmtree(persist_directory, ignore_errors=True)

In [7]:
# create db
vectorstore_created = Chroma.from_documents(documents=doc_splits,
                                    embedding=embed_model,
                                    persist_directory=persist_directory,
                                    collection_name="vectorstore")
vectorstore_created.persist()

  vectorstore_created.persist()


In [8]:
# call from existed db
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embed_model, collection_name="vectorstore")
# vectorstore.get()

  vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embed_model, collection_name="vectorstore")


In [9]:
print("Number of stored documents:", vectorstore._collection.count())
# print("First document:", doc_splits[-1].page_content if doc_splits else "No documents found!")
print("Files in persistence directory:", os.listdir(persist_directory))

Number of stored documents: 2607
Files in persistence directory: ['1aead4fb-bcb3-496d-86d1-dda9b04741f6', 'chroma.sqlite3']


In [10]:
from chromadb import PersistentClient
client = PersistentClient(path=persist_directory)
collections = client.list_collections()
print("Available collections:", collections)


Available collections: ['vectorstore']


In [11]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
retriever_lambda = RunnableLambda(lambda x: retriever.get_relevant_documents(x["question"]))