### RAG using Pinecone vectorDB

In [37]:
# Pipeline overview
# Document loading: PyPDFDirectoryLoader
# Document splitting and chunking: RecursiveCharacterTextSplitter
# Vector DB: Pinecone (raturihimanshu077@gmail.com)
# Embedding model (Hugging Face): sentence-transformers/all-MiniLM-L12-v2
# LLM (Groq): llama-3.1-8b-instant

In [38]:
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone.grpc import PineconeGRPC as Pinecone
from sentence_transformers import SentenceTransformer
load_dotenv()


True

In [2]:
# Reading pdf document
def read_doc(directory):
    """Load the PDF files found under ``directory``.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader.load()`` produces; the notebook output
        suggests one document per PDF page, each carrying ``metadata``.
    """
    return PyPDFDirectoryLoader(directory).load()

In [3]:
# Document splitting and chunking
def chunk_data(document, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping chunks.

    Args:
        document: Documents to split, as returned by ``read_doc``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_overlap``.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(document)

In [4]:
# Embedding using "sentence-transformers/all-MiniLM-L12-v2" : It maps sentences & paragraphs to a 384 dimensional dense vector space
def embedding_chunks(chunks):
    """Embed the text of every chunk with all-MiniLM-L12-v2.

    Args:
        chunks: Objects exposing a ``page_content`` string.

    Returns:
        The result of ``SentenceTransformer.encode`` over the chunk texts, in
        the same order as ``chunks``.
    """
    encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
    return encoder.encode([chunk.page_content for chunk in chunks])

In [5]:
# Pinecone client setup; settings are read from environment variables
# (load_dotenv() is called in the imports cell).
import os

# index_name and pinecone_env are not referenced by the cells visible in this notebook.
index_name = "test-rag"
pinecone_env = os.environ.get("PINECONE_ENVIRONMENT")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

In [6]:
# Load the research PDFs; the trailing expression displays how many documents were loaded.
documents = read_doc("../data/research_data/")
len(documents)

459

In [7]:
# Split the loaded documents using chunk_data's default size/overlap and report the chunk count.
chunks = chunk_data(documents)
print(len(chunks))

1258


In [8]:
# Embed every chunk; the printed count should equal the number of chunks.
embeddings = embedding_chunks(chunks)
print(len(embeddings))

1258


In [9]:
# Peek at the metadata of the first few documents only; printing every
# document's metadata floods the cell output without adding information.
for doc in documents[:3]:
    print(doc.metadata)

{'producer': 'GPL Ghostscript 9.05', 'creator': '', 'creationdate': '2015-01-18T06:20:31-08:00', 'moddate': '2015-01-18T06:20:31-08:00', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'source': '..\\data\\research_data\\A_New_Generation_of_the_IMAGIC_Image_Pro.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}
{'producer': 'GPL Ghostscript 9.05', 'creator': '', 'creationdate': '2015-01-18T06:20:31-08:00', 'moddate': '2015-01-18T06:20:31-08:00', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'source': '..\\data\\research_data\\A_New_Generation_of_the_IMAGIC_Image_Pro.pdf', 'total_pages': 8, 'page': 1, 'page_label': '2'}
{'producer': 'GPL Ghostscript 9.05', 'creator': '', 'creationdate': '2015-01-18T06:20:31-08:00', 'moddate': '2015-01-18T06:20:31-08:00', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'source': '..\\data\\research_data\\A_New_Generation_of_the_IMAGIC_Image_Pro.pdf', 'total_pages': 8, 'page': 2, 'page_label': '3'}
{'producer': 'GPL Ghos

In [10]:
# Open the target index through the host read from the environment.
host = os.environ.get("PINECONE_HOST")
index = pc.Index(host=host)


def _vector_record(position, embedding):
    """Build the upsert payload for the chunk stored at ``position``."""
    chunk = chunks[position]
    return {
        "id": f"chunk-{position}",
        "values": embedding.tolist(),
        "metadata": {
            "text": chunk.page_content,
            "source": chunk.metadata.get("source"),
            "page": chunk.metadata.get("page"),
        },
    }


# One record per embedding, pairing each vector with its chunk's text and provenance.
vectors = [
    _vector_record(position, embedding)
    for position, embedding in enumerate(embeddings)
]


In [11]:
def batch_upsert(index, vectors, namespace, batch_size=500):
    """Upsert ``vectors`` into ``index`` in consecutive slices of ``batch_size``.

    Args:
        index: Object exposing ``upsert(vectors=..., namespace=...)``.
        vectors: Sliceable sequence of records to upsert.
        namespace: Namespace passed through to every ``upsert`` call.
        batch_size: Maximum number of records per ``upsert`` call; must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make the range empty and silently upload nothing,
    # so reject invalid sizes up front instead of pretending to succeed.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size], namespace=namespace)


In [12]:
# Upload all prepared vectors into the "rag-docs" namespace.
batch_upsert(
    index=index,
    vectors=vectors,
    namespace="rag-docs",
    batch_size=500   # presumably chosen to stay within Pinecone's per-request limits -- TODO confirm
)


In [13]:
# Similarity search against the Pinecone index
# (NOTE(review): the metric, e.g. cosine, is set on the index itself -- confirm).
from functools import lru_cache


@lru_cache(maxsize=1)
def _query_encoder():
    """Load the query embedding model once and reuse it for later calls."""
    return SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')


def retrieve_query(query, k=3, namespace="rag-docs"):
    """Return the ``k`` nearest stored chunks for ``query``.

    Args:
        query: Text to embed and search with.
        k: Number of matches requested from the index (``top_k``).
        namespace: Pinecone namespace to search; defaults to the namespace the
            notebook upserts into.

    Returns:
        The response of ``index.query``; the notebook reads its ``matches``,
        each with ``id``, ``score`` and ``metadata``.
    """
    # Loading the model is expensive, so it is cached rather than re-created
    # on every query.
    query_embedding = _query_encoder().encode(query).tolist()
    return index.query(
        vector=query_embedding,
        top_k=k,
        namespace=namespace,
        include_metadata=True,
    )

In [35]:
# Run a sample retrieval and show the raw matches returned by the index.
answer = retrieve_query("what is Discrete Wavelet Transformation")
print(answer.matches)

[{'id': 'chunk-1141',
 'metadata': {'page': 399.0,
              'source': '..\\data\\research_data\\Image_processing_principles_and_applicat.pdf',
              'text': 'discrete wavelet transform (DWT), (2) quantization, and '
                      '(3) entropy encod- \n'
                      'ing. After preprocessing, each component is '
                      'independently analyzed by a \n'
                      "suitable discrete wavelet transform (DW'T). The DWT "
                      'essentially decomposes \n'
                      'each component into a number of subbands in different '
                      'resolution levels. Each \n'
                      'www.thietbiysinh.com.vn'},
 'score': 0.70496756,
 'sparse_values': {'indices': [], 'values': []},
 'values': []}, {'id': 'chunk-309',
 'metadata': {'page': 105.0,
              'source': '..\\data\\research_data\\Image_processing_principles_and_applicat.pdf',
              'text': 'WAVELET TRANSFORMS 83 \n'
            

In [36]:
# Inspect the first match field by field.
top_match = answer.matches[0]
top_metadata = top_match.metadata
print(top_match.id)
print(top_metadata)
print(top_metadata['page'])
print(top_metadata['source'])
print(top_metadata['text'])
print(top_match.score)

chunk-1141
{'page': 399.0, 'text': "discrete wavelet transform (DWT), (2) quantization, and (3) entropy encod- \ning. After preprocessing, each component is independently analyzed by a \nsuitable discrete wavelet transform (DW'T). The DWT essentially decomposes \neach component into a number of subbands in different resolution levels. Each \nwww.thietbiysinh.com.vn", 'source': '..\\data\\research_data\\Image_processing_principles_and_applicat.pdf'}
399.0
..\data\research_data\Image_processing_principles_and_applicat.pdf
discrete wavelet transform (DWT), (2) quantization, and (3) entropy encod- 
ing. After preprocessing, each component is independently analyzed by a 
suitable discrete wavelet transform (DW'T). The DWT essentially decomposes 
each component into a number of subbands in different resolution levels. Each 
www.thietbiysinh.com.vn
0.70496756


In [28]:
# Collect the stored text of every retrieved match; this list is the LLM context.
context = [match.metadata['text'] for match in answer.matches]
print(context)

['17.3 PARTS OF THE JPEG2000 STANDARD \nAs of writing this book, the standard has 11 parts (because Part 7 has been \nabandoned) with each part adding new features to the core standard in Part \n1. The 11 parts and their features are as follows: \n0 Part 1-Core Coding System [l, 141 specifies the basic feature set and \ncode-stream syntax for JPEG2000. \n0 Part 2-Extensions [15] to Part 1. This part adds a lot more features \nto the core coding system. \n0 Part 3-Motion JPEG2000 [16] specifies a file format (MJ2) that con- \ntains an image sequence encoded with the JPEG2000 core coding algo- \nrithm for motion video. \n0 Part 4-Conformance Testing [17] is now published as an International \nStandard (ISO/IEC 15444-4:2002). It specifies compliance-testing pro-', "372 JPEG2000 STANDARD FOR IMAGE COMPRESSION \nFig 17.1 Example of capabilities of' JPEG2000 technology.", "372 JPEG2000 STANDARD FOR IMAGE COMPRESSION \nFig 17.1 Example of capabilities of' JPEG2000 technology."]


In [32]:
# LLM integration: llama-3.1-8b-instant
from langchain_groq import ChatGroq

# Groq chat model; the API key is read from the GROQ_API_KEY environment variable.
groq_api_key = os.environ.get("GROQ_API_KEY")

llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="llama-3.1-8b-instant",
    temperature=0.1,
    max_tokens=1024,
)

## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query, llm, top_k=3):
    """Answer ``query`` with the LLM, grounded in chunks retrieved for that query.

    Args:
        query: Natural-language question; it drives both retrieval and the prompt.
        llm: Chat model exposing ``invoke(prompt)`` whose result has a
            ``content`` attribute (the notebook passes a ``ChatGroq`` instance).
        top_k: Maximum number of retrieved chunks placed in the prompt.

    Returns:
        The model's answer text, or a fixed fallback message when retrieval
        yields no chunk text.
    """
    # Retrieve for *this* query. Relying on a notebook-level result variable
    # would answer from whatever an earlier cell happened to fetch.
    results = retrieve_query(query, k=top_k)

    passages = []
    for match in results.matches[:top_k]:
        text = (match.metadata or {}).get("text")
        if text:
            passages.append(text)
    if not passages:
        return "No relevant context found to answer the question."

    # Present the chunks as readable text rather than a Python list repr.
    context = "\n\n".join(passages)

    # The f-string already interpolates context and query; running .format()
    # on the result would raise on any '{' or '}' inside the chunk text.
    prompt = f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer:"""

    response = llm.invoke(prompt)
    return response.content


In [33]:
# Keep the generated text under its own name: rebinding `answer` would overwrite
# the Pinecone query result that other cells read via `answer.matches`.
rag_answer = rag_simple("Discrete Wavelet Transformation", llm, top_k=3)
print(rag_answer)

The Discrete Wavelet Transform (DWT) is a signal processing tool that decomposes a digital signal into different subbands with varying frequency and time resolutions. It is commonly used for image compression and supports features like progressive image transmission.
