In [68]:
import os
import shutil
from dotenv import load_dotenv
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.evaluation import load_evaluator
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnableSequence
from langchain_core.output_parsers import StrOutputParser

In [2]:
def load_documents(DATA_PATH):
    """Load the Markdown files of a directory as documents.

    Args:
        DATA_PATH: Directory handed to ``DirectoryLoader``; only entries
            matching the ``*.md`` glob are picked up.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    return DirectoryLoader(DATA_PATH, glob="*.md").load()

def split_text(documents, chunk_size=1000, chunk_overlap=500, verbose=True):
    """Split documents into overlapping chunks measured in characters.

    Args:
        documents: Loaded documents passed to the splitter.
        chunk_size: Chunk size forwarded to the splitter (measured with ``len``).
        chunk_overlap: Overlap between neighbouring chunks forwarded to the
            splitter (measured with ``len``).
        verbose: When true, print how many chunks were produced.

    Returns:
        The chunks returned by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,  # ask the splitter to record each chunk's start index
    )
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    if not verbose:
        return pieces
    print(f"Splitting {len(documents)} documents into {len(pieces)} chunks")
    return pieces

def save_to_chroma(chunks, chroma_path, verbose=True):
    """Rebuild the Chroma store at ``chroma_path`` from ``chunks``.

    Anything that already exists at ``chroma_path`` is deleted first, so every
    call starts from a clean location. The function returns ``None``.

    Args:
        chunks: Document chunks to embed and store.
        chroma_path: Directory passed to Chroma as ``persist_directory``.
        verbose: When true, print how many chunks were saved and where.
    """
    store_already_exists = os.path.exists(chroma_path)
    if store_already_exists:
        # Destructive: removes the previous store and all of its contents.
        shutil.rmtree(chroma_path)

    embedder = OpenAIEmbeddings()
    Chroma.from_documents(chunks, embedder, persist_directory=chroma_path)

    if not verbose:
        return
    print(f"Saved {len(chunks)} chunks to {chroma_path}")

In [3]:
# Relative folder scanned for Markdown (*.md) sources by load_documents.
DATA_PATH = "data"
documents = load_documents(DATA_PATH)

In [4]:
# Uses split_text's defaults: chunk_size=1000 and chunk_overlap=500, both measured with len().
chunks = split_text(documents)

Splitting 1 documents into 21 chunks


In [5]:
# Directory where the Chroma store is persisted (and later re-opened from).
CHROMA_PATH = "chroma"

# Load environment variables from ../.env before any OpenAI-backed object is created.
# NOTE(review): presumably this supplies OPENAI_API_KEY for OpenAIEmbeddings — confirm.
load_dotenv("../.env")
# Warning: save_to_chroma deletes whatever already exists at CHROMA_PATH before writing.
save_to_chroma(chunks, CHROMA_PATH)

Saved 21 chunks to chroma


### Vector Embeddings

In [6]:
# Load LangChain's "pairwise_embedding_distance" evaluator, used below to compare two strings.
evaluator = load_evaluator("pairwise_embedding_distance") # calculate pairwise embedding distance

In [7]:
# Compare "apple" against four words, in order: orange, car, apple, iphone.
# Each result is a dict holding a 'score'; the identical pair scores ~0.
x, y, z, k = (
    evaluator.evaluate_string_pairs(prediction="apple", prediction_b=other_word)
    for other_word in ("orange", "car", "apple", "iphone")
)
print(x, y, z, k)

{'score': 0.13560089656702645} {'score': 0.1712324076293733} {'score': 3.3306690738754696e-16} {'score': 0.09711195935740158}


### Search the vector database

In [55]:
def retrieve_information(query: str, db: "Chroma", top_k=3, min_relevance=0.7):
    """Fetch the chunks most relevant to ``query`` and merge them into one context.

    Args:
        query: Text to search for.
        db: Vector store exposing ``similarity_search_with_relevance_scores``.
        top_k: Number of results requested from the store.
        min_relevance: Minimum relevance score the first (best) result must
            reach. Defaults to 0.7, the cutoff that was previously hard-coded.

    Returns:
        ``None`` when the store returns nothing or the first result scores
        below ``min_relevance``. Callers must check for this before unpacking.
        Otherwise a ``(context_text, sources)`` tuple, where ``context_text``
        is the page contents joined by a ``=====`` separator and ``sources``
        lists each result's ``"source"`` metadata value (``None`` if absent).
    """
    results = db.similarity_search_with_relevance_scores(query, k=top_k)
    # Only the first result is checked; it is treated as the best match.
    if not results or results[0][1] < min_relevance:
        return None

    context_text = "\n\n=====\n\n".join(doc.page_content for doc, _ in results)
    sources = [doc.metadata.get("source") for doc, _ in results]
    return context_text, sources

In [56]:
# Re-open the store persisted at CHROMA_PATH, using OpenAIEmbeddings as in save_to_chroma.
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())

In [57]:
# The user question: used below as the retrieval query and as the {query} value of the prompt.
prompt = "How to add a code section in markdown?"

In [59]:
# retrieve_information returns None when nothing relevant enough is found, so
# check before unpacking instead of failing with an opaque TypeError.
retrieval = retrieve_information(prompt, db)
if retrieval is None:
    raise ValueError(f"No sufficiently relevant context found for: {prompt!r}")
context_text, sources = retrieval

### Generate and format the response

In [60]:
# Prompt skeleton with two placeholders filled at invoke time:
# {context} (the retrieved chunks) and {query} (the user question).
PROMPT_TEMPLATE = """
Answer the question based on only the following context:
{context}
=====

Answer the question based on the above context: {query}
"""

In [67]:
# Turn the raw template string into a chat prompt object that can be piped into the model.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

In [69]:
# `model` selects the OpenAI chat model. The previous `name=` keyword only set
# the runnable's display name, so the library's default model was used instead.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=256)
# The `|` operator already builds a runnable sequence (prompt -> model -> string
# parser), so no extra RunnableSequence(...) wrapper is needed.
chain = prompt_template | llm | StrOutputParser()

In [71]:
# Fill both template placeholders and run the chain (prompt -> model -> string parser).
response_text = chain.invoke({"query": prompt, "context": context_text})

In [72]:
# Combine the model's answer with the source list returned by retrieve_information.
formatted_response = f"Response: {response_text}\nSources: {sources}"

In [73]:
# Show the final answer together with its sources.
print(formatted_response)

Response: To add a code section in markdown, you can use three backticks ``` before and after the code block.
Sources: ['data/sample.md', 'data/sample.md', 'data/sample.md']
