In [61]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores.chroma import Chroma
import os
import shutil

In [62]:
#Set paths
CHROMA_PATH = "chroma"
DATA_PATH = "data"

In [63]:
def load_documents():
    loader = DirectoryLoader(DATA_PATH, glob="*.md")
    documents = loader.load()
    return documents

In [64]:
def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    document = chunks[10]
    print(document.page_content)
    print(document.metadata)

    return chunks

In [65]:
def save_to_chroma(chunks: list[Document]):
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, embedding_function, persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [66]:
def generate_data_store():
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)

In [67]:
generate_data_store()

Split 13 documents into 224 chunks.
For boarding houses, landlords must provide 24 hours’ notice before entering the boarding house room.
{'source': 'data\\access.md', 'start_index': 2075}
Saved 224 chunks to chroma.


In [69]:
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

db = Chroma(persist_directory="./chroma", embedding_function=embedding_function)

In [77]:
query = "What are examples of fair wear and tear?"

docs = db.similarity_search(query)

In [78]:
docs

[Document(page_content='Examples of what is usually considered fair wear and tear are:\n\nExamples of what is not normally considered fair wear and tear are:\n\nMaintenance and repair tips for landlords and tenants\n\nInsurance in case of damage', metadata={'source': 'data\\damage and repairs.md', 'start_index': 3309}),
 Document(page_content='Fair wear and tear', metadata={'source': 'data\\damage and repairs.md', 'start_index': 2712}),
 Document(page_content='Fair wear and tear refers to the gradual deterioration of things that are used regularly in a property when people live in it.', metadata={'source': 'data\\damage and repairs.md', 'start_index': 2732}),
 Document(page_content='A tenant is not responsible for normal fair wear and tear to the property or any chattels provided by the landlord when they use them normally. The tenant is responsible for any intentional or careless damage.', metadata={'source': 'data\\damage and repairs.md', 'start_index': 2860})]

In [31]:
import argparse
from dataclasses import dataclass
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.embeddings.spacy_embeddings import SpacyEmbeddings

In [25]:
CHROMA_PATH = "chroma"

In [26]:
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [28]:
query_text = "Can my landlord enter my property without my permission?"

In [32]:
# Prepare the DB.
embedding_function = SpacyEmbeddings()
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

ValidationError: 1 validation error for SpacyEmbeddings
__root__
  multiple bases have instance lay-out conflict (type=type_error)

In [None]:
results = db.similarity_search_with_relevance_scores(query_text, k=3)

if len(results) == 0 or results[0][1] < 0.7:
    print(f"Unable to find matching results.")
    return

context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query_text)
print(prompt)

In [None]:
model = ChatOpenAI()
response_text = model.predict(prompt)

sources = [doc.metadata.get("source", None) for doc, _score in results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)