<a href="https://colab.research.google.com/github/KrishOberoi/RAG_MODELS/blob/main/g.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q llama-hub
!pip install -q arxiv
!pip install -q semanticscholar
!pip install -q sentence-transformers==2.3.0

In [None]:
!pip install -q ragatouille
!pip install -q llama-index-readers-file

In [None]:
from google.colab import files
from llama_index.readers.file import PDFReader

# Prompt for one or more PDFs through the Colab upload widget.
uploaded = files.upload()

# A single reader instance is reused for every uploaded file.
loader = PDFReader()

# Accumulate the documents the reader produces for each uploaded file.
documents = []
for filename in uploaded:
    print(f"Loading {filename}...")
    documents += loader.load_data(filename)

print(f"✅ Loaded {len(documents)} documents from {len(uploaded)} PDFs.")


In [None]:
# Keep only the raw text of each loaded document for the chunking step.
list_pdf_documents = [loaded_doc.text for loaded_doc in documents]


In [None]:
# Compatibility shim: make sure `transformers.AdamW` exists before RAGatouille
# is imported. If the installed transformers package does not expose AdamW,
# alias it to PyTorch's implementation.
# NOTE(review): presumably RAGatouille (or a dependency of it) looks up
# `transformers.AdamW` at import time — confirm against the installed versions.
import transformers
import torch.optim

if not hasattr(transformers, "AdamW"):
    transformers.AdamW = torch.optim.AdamW

# Import RAGatouille only after the shim above is in place.
from ragatouille import RAGPretrainedModel

# Load the pretrained `colbert-ir/colbertv2.0` checkpoint; `RAG` is the handle
# used for indexing and searching in the cells below.
RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")


In [None]:
def split_text_overlap(text: str, chunk_size: int = 400, overlap: int = 50) -> list:
    """Split *text* into fixed-size character chunks that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    the last ``overlap`` characters of one chunk are repeated at the start
    of the next. Splitting stops as soon as a chunk reaches the end of the
    text, so no trailing chunk is emitted that would consist solely of
    characters already contained in the previous chunk.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        The chunks in order of appearance; an empty list for empty text.

    Raises:
        ValueError: If ``overlap`` is negative or ``chunk_size`` is not
            greater than ``overlap``.
    """
    # Explicit raises instead of `assert`, which is stripped under `python -O`.
    if overlap < 0:
        # A negative overlap would make the stride exceed the chunk size and
        # silently drop text between chunks.
        raise ValueError("Overlap must be non-negative")
    if chunk_size <= overlap:
        raise ValueError("Chunk size must be greater than overlap")

    stride = chunk_size - overlap
    text_len = len(text)

    chunks = []
    for start in range(0, text_len, stride):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_len:
            # This chunk already covers the tail; a further window would only
            # repeat characters from the overlap region.
            break

    return chunks

# Chunk every PDF's text and pool the pieces into one flat list.
processed_documents = []
for pdf_text in list_pdf_documents:
    processed_documents += split_text_overlap(pdf_text, chunk_size=400, overlap=50)

# Index the pre-chunked passages; split_documents=False turns off RAG's own
# automatic splitting because the text is already chunked above.
RAG.index(
    index_name="constitution_index",
    collection=processed_documents,
    split_documents=False,
)


In [None]:
# Query the index built above, requesting k=15 results.
results = RAG.search(
    query="diary entries of august 1942",
    k=15,
    index_name="constitution_index",
)


In [None]:
# Bare expression: the notebook renders the raw search results as cell output.
results

In [None]:
# Print the text of every retrieved chunk under a numbered separator line.
for i, hit in enumerate(results):
    separator = f"---------------------------------- doc-{i} ------------------------------------"
    print(separator)
    print(hit["content"])


In [None]:
# prompt: feed the output chunks im getting to an gemini api LLM which gives me a response based on the chunks which are fed to it strictly no hallucinations hence making it a strict RAG implementation

from google.colab import userdata
import google.generativeai as genai


# Read the Gemini API key from Colab's Secrets store instead of hard-coding it:
# a key written into the notebook is exposed to everyone who can read the file.
# Add a secret named GOOGLE_API_KEY in the Colab "Secrets" panel and grant this
# notebook access to it; `userdata.get` raises if the secret is missing or
# access has not been granted.
# NOTE(review): the key that was previously embedded here was published with
# the notebook and should be revoked and replaced.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)


# Module-level model handle used by get_rag_response below.
model = genai.GenerativeModel('gemini-2.0-flash')

def get_rag_response(query, search_results):
    """
    Answer a query with the LLM, grounded only in the retrieved chunks.

    The 'content' of every search result is joined (blank line between
    chunks) and embedded, together with the query, in a prompt that tells
    the model to rely exclusively on that text.

    Args:
        query (str): The question to answer.
        search_results (list): Retrieved results; each item must provide its
                                chunk text under the 'content' key.

    Returns:
        str: The model's answer text. A fixed notice is returned instead when
             `search_results` is empty/falsy, and a fixed error message when
             the generation call raises.
    """
    # Nothing retrieved: skip the LLM call altogether.
    if not search_results:
        return "No relevant information found in the documents."

    context = "\n\n".join(hit['content'] for hit in search_results)

    prompt = f"""
   You are a highly detailed and comprehensive knowledge retrieval system.
Using ONLY the following information, answer the user's query.
Do NOT use any external knowledge. If the information provided does not contain the answer,
state that you cannot answer based on the provided context.

**Instructions for Detailed Responses:**
If the user's query explicitly asks for an **explanation, summary, "tell me more," "what is," "describe," or "elaborate"** on a topic, then:
1.  Provide a **thorough and detailed answer** by synthesizing **ALL relevant points from ALL provided chunks**.
2.  **Expand on each concept and clause present in the provided information**, breaking down its meaning as extensively as possible *using only the words and implications found within the given context*.
3.  **Do NOT introduce any external knowledge or interpretations not directly stated or clearly implied by the provided text.**
4.  If the provided information is very brief and does not offer sufficient detail for a comprehensive explanation, state that you can only explain it based on the limited information provided, and then proceed to expand as much as possible *from that limited text*.

    Information:
    {context}

    User Query:
    {query}

    Answer:
    """

    # Any failure while generating or reading the reply is reported on stdout
    # and turned into a fixed message rather than propagated to the caller.
    try:
        answer = model.generate_content(prompt).text
    except Exception as err:
        print(f"An error occurred during LLM generation: {err}")
        return "An error occurred while generating the response."
    return answer

# Run the pipeline for one question: hand the chunks retrieved earlier
# (`results`) to the LLM and show its answer.
query = 'diary entries of august 1942 explain in detail'
llm_response = get_rag_response(query, results)
print("\n--- LLM Response ---")
# Print the answer instead of echoing the bare expression: the echo shows the
# string's repr (surrounding quotes, literal "\n" escapes) rather than the
# formatted multi-line text that belongs under the header.
print(llm_response)

