<a href="https://colab.research.google.com/github/Nareshedagotti/RAG/blob/main/Standard_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
pip install langchain groq faiss-cpu sentence-transformers PyPDF2 python-docx langchain-community

Collecting groq
  Downloading groq-0.25.0-py3-none-any.whl.metadata (15 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3

In [6]:
#text extraction
from PyPDF2 import PdfReader
from docx import Document

def extract_text(file_paths):
    """Extract plain text from PDF and DOCX files.

    Args:
        file_paths: An iterable of path strings, or a single path string.
            Paths ending in ``.pdf`` or ``.docx`` (any letter case) are read;
            every other path is skipped with a warning.

    Returns:
        A list of ``{"content": <text>, "source": <path>}`` dicts, one per
        file that was read, in input order.
    """
    import warnings  # local import keeps this cell runnable on its own

    # A bare string would otherwise be iterated character by character and
    # silently produce an empty result.
    if isinstance(file_paths, str):
        file_paths = [file_paths]

    documents = []
    for file_path in file_paths:
        lowered = file_path.lower()
        if lowered.endswith('.pdf'):
            with open(file_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                # `or ""` guards against pages that yield no text. Pages are
                # joined with a newline so the last word of one page is not
                # fused with the first word of the next.
                text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
            documents.append({"content": text, "source": file_path})
        elif lowered.endswith('.docx'):
            doc = Document(file_path)
            text = "\n".join(para.text for para in doc.paragraphs)
            documents.append({"content": text, "source": file_path})
        else:
            # Keep the best-effort behaviour (no exception), but make the
            # skip visible instead of silently dropping the file.
            warnings.warn(f"Skipping unsupported file type: {file_path!r}", stacklevel=2)
    return documents

# Input files for the pipeline; the path points into Colab's /content directory.
file_path = ["/content/How Vectors - 1_merged.pdf" ]
# `text` is the list of {"content", "source"} dicts returned by extract_text;
# the chunking cell below reads it.
text = extract_text(file_path)
print(text)

[{'content': 'HOW \nVECTORS ARE STORED,\n INDEXED, AND RETRIEVED\nINTRODUCTION\nHow text is converted into vectors, stored in a database, \nindexed efficiently, and used to retrieve similar \ninformation — with code & real-world examples.\nSTATFUSIONAI"Let\'s go deep behind the scenes."\nTEXT TO VECTOR – EMBEDDING\nCode:\nOutput Example:\nfrom sentence_transformers import SentenceTransformer\nmodel = SentenceTransformer(\'all-MiniLM-L6-v2\')\ntexts = [\n    "Apples are sweet and red.",\n    "Bananas are yellow and soft.",\n    "Kids love mangoes during summer."\n]\nembeddings = model.encode(texts)\nBehind the Scenes: \n•When model.encode() runs, it:\n1.Tokenizes text → converts to word pieces (subwords like \n"embed" + "##ding")\n2.Feeds tokens through transformer layers with attention \nmechanisms\n3.Pools token representations to create a single vector\n4.Outputs a vector (384 dimensions) representing semantic \nmeaning\nprint(embeddings[0][:5])  # First 5 values of first embedding\n

In [7]:
# Split the extracted text into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_text(documents, chunk_size=1000, chunk_overlap=50):
    """Split extracted documents into overlapping chunks.

    Args:
        documents: List of dicts, each with a "content" (text) and a
            "source" (origin path) key.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 1000.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 50.

    Returns:
        A flat list of the documents produced by the splitter, in input
        order. Each chunk carries ``{"source": ...}`` metadata copied from
        the document it came from.
    """
    # Nothing to split: skip building the splitter altogether.
    if not documents:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = []
    for doc in documents:
        # One text with one metadata dict per call, so every chunk produced
        # from this document is tagged with its source.
        split_docs = text_splitter.create_documents(
            [doc["content"]],
            metadatas=[{"source": doc["source"]}]
        )
        chunks.extend(split_docs)
    return chunks

# Chunk every extracted document, then print each chunk (numbered from 1)
# followed by a 50-dash separator for a quick visual check.
chunks = chunk_text(text)
for i, chunk in enumerate(chunks, 1):
    print(f"\n🔹 **Chunk {i}:**\n{chunk.page_content}\n{'-'*50}")


🔹 **Chunk 1:**
HOW 
VECTORS ARE STORED,
 INDEXED, AND RETRIEVED
INTRODUCTION
How text is converted into vectors, stored in a database, 
indexed efficiently, and used to retrieve similar 
information — with code & real-world examples.
STATFUSIONAI"Let's go deep behind the scenes."
TEXT TO VECTOR – EMBEDDING
Code:
Output Example:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
texts = [
    "Apples are sweet and red.",
    "Bananas are yellow and soft.",
    "Kids love mangoes during summer."
]
embeddings = model.encode(texts)
Behind the Scenes: 
•When model.encode() runs, it:
1.Tokenizes text → converts to word pieces (subwords like 
"embed" + "##ding")
2.Feeds tokens through transformer layers with attention 
mechanisms
3.Pools token representations to create a single vector
4.Outputs a vector (384 dimensions) representing semantic 
meaning
print(embeddings[0][:5])  # First 5 values of first embedding
# [0.1213, 0.3421, -0.5632, 0.

In [None]:
# Load the embedding model (the chunks themselves are embedded later, when they are stored in FAISS)
from langchain.embeddings import HuggingFaceEmbeddings

def generate_embeddings(chunks, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Build the embedding model used by the rest of the pipeline.

    Despite its name, this function does not compute any vectors: it only
    constructs the ``HuggingFaceEmbeddings`` wrapper and hands ``chunks``
    back untouched.

    Args:
        chunks: The chunk list; returned as-is.
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to "sentence-transformers/all-mpnet-base-v2".

    Returns:
        A ``(embedding_model, chunks)`` tuple.
    """
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    return embedding_model, chunks

# Creates the embedding model; `chunks` is rebound to the same list that was
# passed in, since generate_embeddings returns it unchanged.
embedding_model, chunks = generate_embeddings(chunks)

In [None]:
#storing embeddings in Faiss vectordb
from langchain.vectorstores import FAISS

def store_in_db(chunks, embedding_model):
    """Embed the chunks and index them in a FAISS vector store.

    Args:
        chunks: Non-empty list of documents to index.
        embedding_model: Embedding object handed to ``FAISS.from_documents``.

    Returns:
        The vector store returned by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``chunks`` is empty. This can happen when text
            extraction produced no text; failing here gives a clear message
            instead of an obscure error from inside index construction.
    """
    if not chunks:
        raise ValueError(
            "No chunks to index: text extraction or chunking produced no content."
        )
    vector_db = FAISS.from_documents(chunks, embedding_model)
    return vector_db

# Build the FAISS vector store from the chunks using the embedding model.
vector_db = store_in_db(chunks, embedding_model)

In [None]:
#converting query into embeddings
def query_to_embeddings(query, embedding_model):
    """Embed a query string with the given model.

    Args:
        query: The text to embed.
        embedding_model: Any object exposing ``embed_query(text)``.

    Returns:
        Whatever ``embedding_model.embed_query(query)`` returns.
    """
    return embedding_model.embed_query(query)

# The user question; it is embedded here and reused verbatim in the LLM prompt later.
query = "give me different types of vector indexing and uses?"
query_embedding = query_to_embeddings(query, embedding_model)

In [None]:
#retrieving top matched context from vectordb
def retrieve_top_matches(vector_db, query_embedding, k=3):
    """Look up the stored chunks that best match a query embedding.

    Args:
        vector_db: Vector store exposing
            ``similarity_search_by_vector(embedding, k=...)``.
        query_embedding: The embedded query.
        k: Number of matches to request. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        The result of ``vector_db.similarity_search_by_vector``.
    """
    top_matches = vector_db.similarity_search_by_vector(query_embedding, k=k)
    return top_matches

# Run the similarity search for the query vector and print the raw matches.
top_matches = retrieve_top_matches(vector_db, query_embedding)
print(top_matches)

In [None]:
#sending retrieved context to llm
from groq import Groq

def pass_to_llm(query, top_matches, api_key, model="gemma2-9b-it"):
    """Answer a question with a Groq-hosted chat model, grounded in retrieved chunks.

    Args:
        query: The user's question.
        top_matches: Retrieved documents; each must expose ``page_content``.
        api_key: Groq API key used to construct the client.
        model: Model identifier sent with the request. Defaults to
            "gemma2-9b-it", the value that was previously hard-coded.

    Returns:
        A ``(answer_text, top_matches)`` tuple, where ``answer_text`` is the
        message content of the first returned choice and ``top_matches`` is
        the input list handed back unchanged.
    """
    # Retrieved chunks are separated by blank lines and placed ahead of the
    # question in a single user message.
    context = "\n\n".join(match.page_content for match in top_matches)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    client = Groq(api_key=api_key)
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model
    )
    return chat_completion.choices[0].message.content, top_matches


import os
from getpass import getpass

# Never store a real key in the notebook: read it from the GROQ_API_KEY
# environment variable, or prompt for it (input hidden) when it is not set.
api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")
llm_response, top_matches = pass_to_llm(query, top_matches, api_key)
print(llm_response)