In [1]:
from dotenv import load_dotenv
import os
import warnings
import logging

# Set logging level to INFO
logging.basicConfig(level=logging.INFO)

warnings.filterwarnings("ignore")

# Load variables from .env file
load_dotenv()


True

In [50]:
logging.info("Starting the application")

INFO:root:Starting the application


In [2]:
import getpass
import os

if not os.environ.get("MISTRALAI_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")

from langchain_groq import ChatGroq

model = ChatGroq(model="llama3-8b-8192")

model.invoke("Hello, how are you!")


INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


AIMessage(content="I'm just a language model, I don't have emotions or feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you may have! It's great to chat with you. How can I assist you today?", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 53, 'prompt_tokens': 16, 'total_tokens': 69, 'completion_time': 0.044166667, 'prompt_time': 0.002345226, 'queue_time': 0.019151174, 'total_time': 0.046511893}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_6a6771ae9c', 'finish_reason': 'stop', 'logprobs': None}, id='run-09d49eb3-9a19-4d08-b12f-0c791324a1ec-0', usage_metadata={'input_tokens': 16, 'output_tokens': 53, 'total_tokens': 69})

In [3]:
from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader

def load_pdf(file_path):
    return PyMuPDFLoader(file_path).load()

documents = load_pdf("input/gen_ai_langchain_2024.pdf")


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
        add_start_index=True,
    )
    
    # Split the documents into chunks
    return text_splitter.split_documents(documents)

chunks = split_documents(documents)

# add unique ids to each chunk combining source, page and start_index
for chunk in chunks:
    source = chunk.metadata.get("source")
    page = chunk.metadata.get("page")
    start_index = chunk.metadata.get("start_index")
    chunk.metadata["id"] = f"{source}_{page}_{start_index}"
print(f"Number of Documents: {len(documents)} and Number of Chunks: {len(chunks)}")

Number of Documents: 361 and Number of Chunks: 1014


In [5]:
chunks[100]

Document(metadata={'source': 'input/gen_ai_langchain_2024.pdf', 'file_path': 'input/gen_ai_langchain_2024.pdf', 'page': 38, 'total_pages': 361, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign 18.0 (Windows)', 'producer': 'Adobe PDF Library 17.0', 'creationDate': "D:20231215102723+05'30'", 'modDate': "D:20231215103312+05'30'", 'trapped': '', 'start_index': 706, 'id': 'input/gen_ai_langchain_2024.pdf_38_706'}, page_content='skipping tokens. This is a risky strategy because – depending on the threshold of the confidence \nof the oracle’s responses – the quality could deteriorate.\nThere’s also a multi-modal version of GPT-4 that incorporates a separate vision encoder, trained \non joined image and text data, giving the model the capability to read web pages and transcribe \nwhat’s in images and video.\nAs can be seen in Figure 1.5, there are quite a few models besides OpenAI’s, some of which are \nsuitable as a substitute for the O

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings

def get_embedding_function():
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# test the embedding function
embedding_function = get_embedding_function()
len(embedding_function.embed_query(chunks[100].page_content))

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


768

In [12]:
from langchain.vectorstores.chroma import Chroma
from langchain_core.documents import Document
CHROMA_PATH = "chroma_db"
def add_to_chroma(chunks: list[Document]):
    db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )

    # add or update the chunks in the chroma db
    existing_items = db.get()
    if existing_items is None:
        existing_items = {"id": []}
    existing_ids = set(existing_items["ids"])

    print(f"Number of existing items: {len(existing_ids)}")

    new_chunks = [chunk for chunk in chunks if chunk.metadata["id"] not in existing_ids]
    new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
    if not new_chunks:
        print("No new chunks to add")
        return
    db.add_documents(new_chunks, ids=new_chunk_ids)
    db.persist()
    
import time
try:
    add_to_chroma(chunks)
except Exception as e:
    print(e)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:chromadb.api.segment:Collection langchain is not created.


Number of existing items: 1014
No new chunks to add


In [45]:
PROMPT_TEMPLATE = """
Answer the question based on only on the following context:
```
{context}
```

Answer the question based on the context above: 
```
{question}
```

If the question is out of context, please answer with "I don't know".
"""



In [46]:
# import ChatPromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

def query_rag(query_text: str, model):
    db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )
    
    results = db.similarity_search_with_score(query_text, k=5)
    
    context_text = "\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate(
        [("system", "You are a helpful AI bot."),
        ("human", PROMPT_TEMPLATE),]
    )
    prompt = prompt_template.format(context=context_text, question=query_text)
    
    response = model.invoke(prompt)
    
    sources = [doc.metadata.get("id") for doc, _score in results]
    
    return response, sources

In [49]:
# query the model with a question
query_rag("What is Transformer", model)


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:chromadb.api.segment:Collection langchain is not created.
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


(AIMessage(content='According to the context, a Transformer is a Deep Learning (DL) architecture that was first introduced in 2017 by researchers at Google and the University of Toronto. It is an encoder-decoder structure that uses self-attention and feed-forward neural networks to capture long-range dependencies in a sentence.', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 60, 'prompt_tokens': 850, 'total_tokens': 910, 'completion_time': 0.05, 'prompt_time': 0.106209612, 'queue_time': 0.019885206000000002, 'total_time': 0.156209612}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_6a6771ae9c', 'finish_reason': 'stop', 'logprobs': None}, id='run-cbbe125e-fc0a-4d67-b37b-d76c30a0f8ff-0', usage_metadata={'input_tokens': 850, 'output_tokens': 60, 'total_tokens': 910}),
 ['input/gen_ai_langchain_2024.pdf_42_1500',
  'input/gen_ai_langchain_2024.pdf_43_0',
  'input/gen_ai_langchain_2024.pdf_43_738',
  'input/gen_ai_langchain_2024.pdf_347_0',
  'inpu