## Load Environment Variable

In [19]:
import dotenv
dotenv.load_dotenv()

True

## Intialize embeddings and LLM

In [2]:
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
import weaviate
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.prompts import PromptTemplate

In [4]:
llm = GoogleGenerativeAI(model="gemini-pro")
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

## Department Extraction

In [5]:
def get_departments():
    with weaviate.connect_to_local() as client:
        departments =  client.collections.get("Departments")
        return [department.properties for department in departments.query.fetch_objects().objects]

def format_departments(departments):
    formatted = "\n".join([f"- {dept['name']}: {dept['description']}" for dept in departments])
    return formatted

def get_formatted_depatments():
    return format_departments(get_departments())

In [6]:
# Define prompt templates
DEPARTMENT_PROMPT = """
You are an AI that identifies which department is most related to a given query. 
Based on the descriptions of departments, suggest the department that best matches the query.

Departments:
{departments}

Query:
{query}

Department:
"""

In [7]:
department_prompt = PromptTemplate(
    input_variables=["departments", "query"],
    template=DEPARTMENT_PROMPT
)

department_chain = department_prompt | llm

In [8]:
# Department identification
def get_relevant_department(query):
    department_response = department_chain.invoke({"departments":get_formatted_depatments(), "query":query})
    return department_response.strip()

In [9]:
get_relevant_department("Why is my Windows stuck on boot screen ?")

'IT and Technology'

In [10]:
get_relevant_department("Where should distributor truck be parked in the event of warehouse being non-operational ?")

'Operations'

## Setup Chroma DB Vector store

In [13]:
from langchain_chroma import Chroma

In [49]:
vector_store = Chroma(
    collection_name="chunks",
    embedding_function=embeddings,
    persist_directory="./chroma_db",  # Where to save data locally, remove if not necessary
)

## Chunks and load data

In [42]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from uuid import uuid4

In [43]:
# Function to chunk file content
def chunk_text(text, chunk_size=500):
    text_splitter = RecursiveCharacterTextSplitter(
        # Set a really small chunk size, just to show.
        chunk_size=100,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.create_documents([text])

In [44]:
with open("book.txt", "r") as file:
    text = file.read()
    docs = chunk_text(text)

In [51]:
docs = docs[:10]

In [57]:
for doc in docs:
    doc.metadata = {"department": "operations"}

In [58]:
uuids = [str(uuid4()) for _ in range(len(docs))]

In [59]:
vector_store.add_documents(documents=docs, ids=uuids)

['39ae60bd-bff1-40bf-a4e6-4fd716d97ef7',
 '422de88a-b745-4ff9-86ee-1afc282ae883',
 'ad926f3c-1d00-46f0-a3e0-e59f5a612cb1',
 '0d5b0de0-5d3e-4bdf-b41c-2cc81f3b85e4',
 'f3c91aa4-4915-45e5-bfc3-85a48fe1bd4d',
 '85e77ea2-531b-444e-b138-8a1d45ba721a',
 '62a62f67-9972-4346-badc-d96ca02fc431',
 'd4a46f48-2c4f-493b-97e8-892a4343f021',
 'e53863bc-13dc-4126-8722-7ec257f70e2a',
 '3ce81881-553c-4cf0-bcf8-6a64eeab68a1']

In [61]:
vector_store.similarity_search("Gutenberg Project")

[Document(metadata={'department': 'operations'}, page_content='at www.gutenberg.org. If you are not located in the United States,'),
 Document(metadata={}, page_content='at www.gutenberg.org. If you are not located in the United States,'),
 Document(metadata={}, page_content='of the Project Gutenberg License included with this ebook or online'),
 Document(metadata={'department': 'operations'}, page_content='of the Project Gutenberg License included with this ebook or online')]

In [64]:
retriever = vector_store.as_retriever(
    search_type="mmr", search_kwargs = {"k": 3, "fetch_k": 5}
)

In [67]:
retriever.invoke("Who is gutenberg ?", filter={"department": "operations"})

[Document(metadata={'department': 'operations'}, page_content='at www.gutenberg.org. If you are not located in the United States,'),
 Document(metadata={}, page_content='of the Project Gutenberg License included with this ebook or online'),
 Document(metadata={}, page_content="\ufeffThe Project Gutenberg eBook of The Beggar's Purse: A Fairy Tale of Familiar Finance")]

## Simple RAG

In [75]:
from langchain_core.runnables import RunnablePassthrough

In [72]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [73]:
QUERY_PROMPT = """
Query: 
{query}

Context:
{context}

Answer:
"""

In [74]:
query_prompt = PromptTemplate(
    input_variable = ["query", "context"],
    template=QUERY_PROMPT
)

In [78]:
rag_chain = (
    {"context": retriever | format_docs, "query": RunnablePassthrough()}
    | query_prompt
    | llm
)

In [79]:
res = rag_chain.invoke("What is Gutenberg Project ?")

In [80]:
res

'The Gutenberg Project is a volunteer effort to digitize and archive cultural works, such as books, movies, and audio recordings, and to make them freely available to the public.'