In [1]:
# Install Required Libraries
!pip install langchain==0.1.13 langchain-community langchain-google-genai sentence-transformers chromadb pypdf




In [2]:
import os

# Create Folder
os.makedirs("/content/research_docs", exist_ok=True)

# Download Files from Google Drive
research_papers = {
    "dataset1.pdf": "1zjqznRqVio-Cwy3ERo7v-wOA_ox7SM3g",
    "dataset2.pdf": "1YWO34f2tXoUSx_wx3Jj9RhkiaNf3HJGG",
    "dataset3.pdf": "1z2cGgc-Hg8Q_AU5mtZoFXEUlGH9oKzYD"
}

for paper_name, drive_id in research_papers.items():
    file_url = f"https://drive.google.com/uc?export=download&id={drive_id}"
    destination = f"/content/research_docs/{paper_name}"
    !wget -q --show-progress "{file_url}" -O "{destination}"

print("Research papers downloaded successfully!")


Research papers downloaded successfully!


In [3]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load PDFs
loader = PyPDFDirectoryLoader("/content/research_docs")
documents = loader.load()

# Add File Metadata
for i, doc in enumerate(documents):
    source = doc.metadata.get('source', f'doc_{i}.pdf')
    doc.metadata['filename'] = source.split('/')[-1]
    doc.metadata['page'] = doc.metadata.get('page', i + 1)

# Split Documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

print(f"Loaded {len(documents)} pages and split into {len(chunks)} chunks.")


Loaded 109 pages and split into 1422 chunks.


In [4]:
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

# Create Embeddings
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Create Vector Store
vectorstore = Chroma.from_documents(chunks, embeddings)

# Setup Retriever
retriever = vectorstore.as_retriever(search_kwargs={'k': 5})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
import os

# Set your Google API Key
os.environ["GOOGLE_API_KEY"] = "AIzaSyBa4cTqZPNYUk9bKuA4ZFQWOWIY4TPID98"

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate

# Load Gemini Model
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.5)


In [6]:
def prepare_context_with_sources(documents):
    context_blocks = []
    source_citations = set()

    for doc in documents:
        filename = doc.metadata.get("filename", "unknown_file")
        page = doc.metadata.get("page", "N/A")
        content = doc.page_content.strip().replace("\n", " ")

        context_blocks.append(f"[{filename}, Page {page}]: {content}")
        source_citations.add((filename, page))

    return "\n\n".join(context_blocks), source_citations

template = """
<context>
{context}
</context>

You are an AI assistant answering questions based on academic papers.
Answer the following question truthfully and clearly using only the above context.
Do not hallucinate or make up information.

Question: {query}
"""

prompt = ChatPromptTemplate.from_template(template)

qa_history = []

def rag_with_sources(query):
    docs = retriever.get_relevant_documents(query)
    context, sources = prepare_context_with_sources(docs)

    inputs = {"context": context, "query": query}
    answer = llm.invoke(prompt.format_prompt(**inputs).to_messages())

    formatted_sources = [f"{file}, Page {page}" for file, page in sources]
    qa_entry = {
        "question": query,
        "answer": answer.content.strip(),
        "sources": formatted_sources
    }
    qa_history.append(qa_entry)

    return qa_entry


In [7]:
sample_questions = [
    "What are the main components of a RAG model, and how do they interact?",
    "What are the two sub-layers in each encoder layer of the Transformer model?",
    "Explain how positional encoding is implemented in Transformers and why it is necessary.",
    "Describe the concept of multi-head attention in the Transformer architecture. Why is it beneficial?",
    "What is few-shot learning, and how does GPT-3 implement it during inference?"
]

for question in sample_questions:
    result = rag_with_sources(question)
    print(f"\nQuestion: {result['question']}")
    print(f"Answer: {result['answer']}")
    print(f"Sources: {', '.join(result['sources'])}\n")


  warn_deprecated(



Question: What are the main components of a RAG model, and how do they interact?
Answer: Based on the provided text, RAG models use an input sequence (x) to retrieve text documents (z).  These retrieved documents (z) are then used as additional context when generating the target sequence (y).  The text mentions pη and pθ components, but does not describe their interaction within the model.
Sources: dataset2.pdf, Page 1, dataset2.pdf, Page 2, dataset2.pdf, Page 9


Question: What are the two sub-layers in each encoder layer of the Transformer model?
Answer: The first sub-layer is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed-forward network.
Sources: dataset1.pdf, Page 7, dataset1.pdf, Page 2


Question: Explain how positional encoding is implemented in Transformers and why it is necessary.
Answer: In the Transformer model, positional encodings are added to the input embeddings at the bottom of both the encoder and decoder stacks.

In [8]:
print("\nYou can now ask your own questions! Type 'exit' to quit.\n")

while True:
    user_input = input("Ask a question (or type 'exit'): ")
    if user_input.lower() == "exit":
        print("Exiting Q&A.")
        break
    if user_input.strip() == "":
        continue
    result = rag_with_sources(user_input)
    print(f"\nQuestion: {result['question']}")
    print(f"Answer: {result['answer']}")
    print(f"Sources: {', '.join(result['sources'])}\n")



You can now ask your own questions! Type 'exit' to quit.

Ask a question (or type 'exit'): What are the key differences between RAG models and traditional retrieval systems?

Question: What are the key differences between RAG models and traditional retrieval systems?
Answer: Based on the provided text, the key difference highlighted is that RAG models learn to retrieve relevant information (a trainable retrieval component), unlike traditional retrieval systems which may not have this learning aspect.  The text also shows that when the retrieval component of a RAG model fails ("collapsed"), the model performs similarly to a purely parametric model like BART, suggesting a dependence on the learned retrieval for superior performance.  The effectiveness of the learned retrieval mechanism is explicitly assessed through ablation studies.
Sources: dataset2.pdf, Page 6, dataset2.pdf, Page 18, dataset2.pdf, Page 8, dataset2.pdf, Page 17, dataset2.pdf, Page 1

Ask a question (or type 'exit'): H