<a href="https://colab.research.google.com/github/MeenakshiRajpurohit/RAG-Retrieval-Augmented-Generation-/blob/main/RAG_RESEARCH_ASSISTANT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# ============================================================================
# RAG RESEARCH ASSISTANT - FINAL FIXED VERSION
# Google Colab Compatible - All errors fixed
# ============================================================================

# ============================================================================
# CELL 1: INSTALL DEPENDENCIES
# ============================================================================

!pip install -q langchain langchain-community langchain-core
!pip install -q faiss-cpu sentence-transformers
!pip install -q torch transformers huggingface-hub
!pip install -q tqdm

print("‚úì All packages installed!")

# ============================================================================
# CELL 2: IMPORTS
# ============================================================================

import os
import json
import numpy as np
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from tqdm import tqdm

print("‚úì All imports successful!")
print(f"GPU Available: {torch.cuda.is_available()}")

# ============================================================================
# CELL 3: PAPER DATABASE
# ============================================================================

PAPERS_DATABASE = {
    "NLP": [
        {
            "title": "Attention Is All You Need",
            "authors": ["Vaswani", "Shazeer", "Parmar"],
            "published": "2017-06-12",
            "arxiv_id": "1706.03762",
            "category": "NLP",
            "summary": """Transformers have become ubiquitous in NLP. The architecture uses self-attention mechanisms instead of recurrence.
            Key features: (1) Processes sequences in parallel for fast training, (2) Multi-head attention allows focusing on different parts,
            (3) Positional encodings capture sequence order, (4) Encoder-decoder architecture enables many applications.
            Impact: Foundation for BERT, GPT, T5, and modern language models."""
        },
        {
            "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
            "authors": ["Devlin", "Chang", "Lee", "Toutanova"],
            "published": "2018-10-11",
            "arxiv_id": "1810.04805",
            "category": "NLP",
            "summary": """BERT introduces bidirectional pre-training for language models. Key innovation: Masked Language Modeling (MLM)
            where random words are masked during training. The model must predict masked tokens using context from both left and right.
            Improvements: (1) Bidirectional context, (2) Masked token prediction, (3) Next sentence prediction task.
            Impact: Significantly improved GLUE benchmark scores and showed pre-training benefits."""
        },
        {
            "title": "Language Models are Unsupervised Multitask Learners",
            "authors": ["Radford", "Wu", "Child", "Luan"],
            "published": "2019-02-14",
            "arxiv_id": "1902.10165",
            "category": "NLP",
            "summary": """GPT-2 demonstrates that large language models trained on diverse text learn multiple tasks without explicit supervision.
            Key findings: (1) Scaling improves performance, (2) Models learn tasks naturally from data, (3) No task-specific training needed.
            Applications: Machine translation, summarization, question-answering all work with same base model.
            Impact: Showed the power of scale and diversity in language modeling."""
        },
        {
            "title": "RoFormer: Enhanced Transformer with Rotary Position Embedding",
            "authors": ["Su", "Ahmed", "Lu", "Pan"],
            "published": "2021-04-20",
            "arxiv_id": "2104.09864",
            "category": "NLP",
            "summary": """Position encoding is crucial for capturing sequence order. RoFormer proposes Rotary Position Embeddings (RoPE).
            Key idea: Encode position using rotation matrices instead of additive encodings.
            Advantages: (1) Better extrapolation to longer sequences, (2) Natural representation of relative positions, (3) Improved generalization.
            Impact: Used in LLaMA and other modern language models."""
        },
        {
            "title": "LLaMA: Open and Efficient Foundation Language Models",
            "authors": ["Touvron", "Lavril", "Izacard"],
            "published": "2023-02-27",
            "arxiv_id": "2302.13971",
            "category": "NLP",
            "summary": """LLaMA shows that efficient training on public data can match proprietary models.
            Key achievements: (1) 7B-65B parameter models, (2) Trained on 1.4T public tokens, (3) LLaMA-13B outperforms GPT-3 65B.
            Techniques: (1) Careful data curation, (2) Efficient scaling, (3) Improved training recipes.
            Impact: Democratized large language models by proving public data suffices."""
        }
    ],
    "Computer Vision": [
        {
            "title": "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale",
            "authors": ["Dosovitskiy", "Beyer", "Kolesnikov"],
            "published": "2020-10-22",
            "arxiv_id": "2010.11929",
            "category": "Computer Vision",
            "summary": """Vision Transformer (ViT) applies pure transformer architecture to image classification.
            Key idea: Divide image into patches (16x16), treat patches as tokens like in NLP.
            Architecture: (1) Linear projection of patches, (2) Positional encodings, (3) Standard transformer encoder.
            Results: Competitive with CNNs when trained on large datasets, better scaling properties.
            Impact: Showed transformers work beyond NLP, inspired multimodal models."""
        },
        {
            "title": "Masked Autoencoders Are Scalable Vision Learners",
            "authors": ["He", "Chen", "Xie", "Li"],
            "published": "2021-11-11",
            "arxiv_id": "2111.06377",
            "category": "Computer Vision",
            "summary": """Masked Autoencoders (MAE) extend masked language modeling to vision.
            Key approach: (1) Randomly mask image patches, (2) Encoder processes visible patches, (3) Decoder reconstructs masked patches.
            Advantages: (1) Asymmetric encoder-decoder, (2) Scales to large models, (3) Learns powerful representations.
            Results: Self-supervised learning works well for vision similar to NLP.
            Impact: Foundation for self-supervised computer vision."""
        },
        {
            "title": "Training data-efficient image transformers & distillation through attention",
            "authors": ["Touvron", "Cord", "Douze"],
            "published": "2020-12-23",
            "arxiv_id": "2012.12556",
            "category": "Computer Vision",
            "summary": """DeiT shows how to train vision transformers efficiently on ImageNet-sized datasets.
            Key techniques: (1) Knowledge distillation from teacher model, (2) Careful augmentation, (3) Training recipes.
            Results: Competitive transformer models without requiring massive datasets like original ViT.
            Impact: Made vision transformers practical for standard datasets."""
        },
        {
            "title": "Vision Transformer Slimming",
            "authors": ["Wang", "Huang", "Song"],
            "published": "2021-06-24",
            "arxiv_id": "2106.02852",
            "category": "Computer Vision",
            "summary": """Addresses efficiency of vision transformers through architectural optimization.
            Key ideas: (1) Reduce patch dimensions, (2) Optimize attention mechanisms, (3) Careful pruning strategies.
            Results: Smaller models with competitive accuracy compared to original ViT.
            Impact: Made vision transformers more efficient and practical."""
        },
        {
            "title": "Exploring Simple Siamese Representation Learning",
            "authors": ["Chen", "He", "Fan"],
            "published": "2020-11-04",
            "arxiv_id": "2011.10566",
            "category": "Computer Vision",
            "summary": """SimSiam shows that contrastive learning works without negative pairs.
            Key finding: Simple siamese networks with stop-gradient operation enable self-supervised learning.
            Architecture: (1) Two branches with weight sharing, (2) MLP projectors, (3) Stop-gradient operation.
            Results: Competitive self-supervised learning without negative sampling or momentum encoder.
            Impact: Simplified self-supervised vision learning."""
        }
    ],
    "Multimodal": [
        {
            "title": "Learning Transferable Visual Models From Natural Language Supervision",
            "authors": ["Radford", "Kim", "Hallacy"],
            "published": "2021-02-26",
            "arxiv_id": "2103.00020",
            "category": "Multimodal",
            "summary": """CLIP learns visual representations from natural language supervision at scale.
            Key innovation: Contrastive learning on image-text pairs from internet data.
            Architecture: (1) Image encoder (CNN or ViT), (2) Text encoder (transformer), (3) Contrastive loss.
            Capability: Zero-shot transfer to new categories described in language.
            Impact: Foundation for multimodal models and zero-shot vision."""
        },
        {
            "title": "BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation",
            "authors": ["Li", "Gan", "Du"],
            "published": "2022-01-18",
            "arxiv_id": "2201.12086",
            "category": "Multimodal",
            "summary": """BLIP unifies vision-language understanding and generation in one model.
            Key contribution: (1) CapFilt module for filtering noisy captions, (2) Unified encoder-decoder architecture.
            Tasks: (1) Image-text retrieval, (2) Visual question answering, (3) Image captioning.
            Results: State-of-the-art on VQA, retrieval, and captioning benchmarks.
            Impact: Showed unified V-L models can excel at both understanding and generation."""
        },
        {
            "title": "Flamingo: a Visual Language Model for Few-Shot Learning",
            "authors": ["Alayrac", "Donahue", "Luc"],
            "published": "2022-04-29",
            "arxiv_id": "2204.14198",
            "category": "Multimodal",
            "summary": """Flamingo enables few-shot learning in vision-language models.
            Key design: (1) Interleaved vision and text tokens, (2) Gated cross-attention, (3) Few-shot demonstration capability.
            Capability: Learn new tasks from just a few examples.
            Results: Strong few-shot performance across diverse vision-language tasks.
            Impact: Brought few-shot learning to multimodal domain."""
        },
        {
            "title": "Visual Instruction Tuning",
            "authors": ["Liu", "Li", "Xu"],
            "published": "2023-04-17",
            "arxiv_id": "2304.08485",
            "category": "Multimodal",
            "summary": """LLaVA connects vision encoder with large language model via instruction tuning.
            Key idea: (1) Freeze pre-trained vision and language models, (2) Learn projection between them, (3) Instruction tune.
            Approach: (1) Use CLIP for vision, (2) Use LLaMA for language, (3) Connect with linear projection.
            Capability: Follow natural language instructions about images.
            Impact: Simplified approach to building multimodal models."""
        },
        {
            "title": "Multimodal Chain-of-Thought Reasoning in Language Models",
            "authors": ["Zhang", "Hashimoto", "Liang"],
            "published": "2023-02-07",
            "arxiv_id": "2302.00923",
            "category": "Multimodal",
            "summary": """Extends chain-of-thought reasoning to multimodal settings.
            Key idea: (1) Decompose problem into reasoning steps, (2) Show example step-by-step, (3) Improve reasoning across modalities.
            Results: Improved performance on complex vision-language tasks requiring reasoning.
            Impact: Showed explicit reasoning helps multimodal understanding."""
        }
    ]
}

print(f"Loaded {sum(len(p) for p in PAPERS_DATABASE.values())} papers")

# ============================================================================
# CELL 4: PROCESS DOCUMENTS
# ============================================================================

class DocumentProcessor:
    """Convert papers to document chunks"""

    def __init__(self, chunk_size=800, chunk_overlap=150):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

    def process_papers(self, papers: Dict) -> Tuple[List[str], List[Dict]]:
        """Process papers into chunks"""
        documents = []
        metadata = []

        for category, paper_list in papers.items():
            for paper in paper_list:
                doc_text = f"""Title: {paper['title']}
Authors: {', '.join(paper['authors'])}
Category: {category}
Published: {paper['published']}

Summary:
{paper['summary']}

ArXiv ID: {paper['arxiv_id']}
"""
                chunks = self.text_splitter.split_text(doc_text)

                for chunk in chunks:
                    documents.append(chunk)
                    metadata.append({
                        "title": paper['title'],
                        "category": category,
                        "arxiv_id": paper['arxiv_id'],
                        "authors": paper['authors'],
                    })

        return documents, metadata

processor = DocumentProcessor()
documents, metadata = processor.process_papers(PAPERS_DATABASE)
print(f"Processing documents...")
print(f"‚úì Created {len(documents)} document chunks\n")

# ============================================================================
# CELL 5: BUILD EMBEDDINGS
# ============================================================================

print("="*80)
print("BUILDING RAG SYSTEM")
print("="*80)

print("\n1Ô∏è‚É£ Loading embeddings model...")
use_gpu = torch.cuda.is_available()
device = "cuda" if use_gpu else "cpu"
print(f"   Device: {device.upper()}")

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": device}
)
print("‚úì Embeddings model ready")

# ============================================================================
# CELL 6: BUILD VECTOR STORE
# ============================================================================

print("\n2Ô∏è‚É£ Building FAISS vector store...")

vectorstore = FAISS.from_texts(
    texts=documents,
    embedding=embeddings,
    metadatas=metadata
)

print(f"‚úì Vector store created")

retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

print("‚úì Retriever configured")

# ============================================================================
# CELL 7: LOAD LLM (FIXED - NO PIPELINE)
# ============================================================================

print("\n3Ô∏è‚É£ Loading LLM (google/flan-t5-base)...")

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

if use_gpu:
    model = model.to("cuda")

print("‚úì LLM loaded")

# ============================================================================
# CELL 8: SIMPLE LLM WRAPPER (NO PIPELINE)
# ============================================================================

class SimpleLLM:
    """Simple LLM wrapper that avoids pipeline issues"""

    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.max_length = 512

    def generate(self, prompt: str) -> str:
        """Generate text from prompt"""
        try:
            # Tokenize
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=1024
            ).to(self.device)

            # Generate
            outputs = self.model.generate(
                **inputs,
                max_length=self.max_length,
                num_beams=1,
                temperature=0.7,
                do_sample=False
            )

            # Decode
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return response
        except Exception as e:
            return f"Error generating response: {str(e)[:100]}"

llm = SimpleLLM(model, tokenizer, device)

# ============================================================================
# CELL 9: BUILD RAG CHAIN (MANUAL)
# ============================================================================

print("\n4Ô∏è‚É£ Creating RAG chain...")

template = """You are an expert research assistant. Answer questions about research papers.

Papers:
{context}

Question: {question}

Answer based on the papers. Be concise and helpful."""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template
)

def format_docs(docs):
    """Format documents"""
    if not docs:
        return "No papers found."
    formatted = []
    for i, doc in enumerate(docs[:3], 1):
        title = doc.metadata.get("title", "Unknown")
        arxiv = doc.metadata.get("arxiv_id", "N/A")
        formatted.append(f"[{i}] {title} ({arxiv})\n{doc.page_content[:300]}...")
    return "\n\n".join(formatted)

def rag_chain_invoke(question: str):
    """Simple RAG chain"""
    # Retrieve
    docs = retriever.invoke(question)
    context = format_docs(docs)

    # Format prompt
    formatted_prompt = template.format(context=context, question=question)

    # Generate answer
    answer = llm.generate(formatted_prompt)

    return answer, docs

print("‚úì RAG chain ready!")
print("\n" + "="*80)
print("‚úÖ RAG SYSTEM READY!")
print("="*80)

# ============================================================================
# CELL 10: QUERY FUNCTION
# ============================================================================

def query_research(question: str, verbose=True):
    """Query the RAG system"""
    if verbose:
        print(f"\nüîç Question: {question}")
        print("-" * 80)

    try:
        # Get answer
        answer, docs = rag_chain_invoke(question)

        if verbose:
            print(f"\nüí° Answer:\n{answer}\n")

        # Show sources
        if verbose and docs:
            print(f"üìö Sources:")
            seen = set()
            for doc in docs:
                title = doc.metadata.get("title")
                if title not in seen:
                    seen.add(title)
                    arxiv = doc.metadata.get("arxiv_id")
                    category = doc.metadata.get("category")
                    print(f"  ‚Ä¢ {title}")
                    print(f"    Category: {category} | ArXiv: {arxiv}")

        return answer

    except Exception as e:
        print(f"‚ùå Error: {e}")
        return None

# ============================================================================
# CELL 11: TEST QUESTIONS
# ============================================================================

print("\n" + "="*80)
print("TESTING WITH QUESTIONS")
print("="*80)

test_questions = [
    "What are transformers and how do they work?",
    "Explain Vision Transformers and their advantages",
    "What is CLIP and how does it enable zero-shot learning?",
]

for i, question in enumerate(test_questions, 1):
    print(f"\n{'='*80}")
    print(f"QUESTION {i}/{len(test_questions)}")
    print(f"{'='*80}")
    query_research(question)

# ============================================================================
# CELL 12: SUMMARY & YOUR QUESTIONS
# ============================================================================

print("\n\n" + "="*80)
print("‚úÖ COMPLETE!")
print("="*80)

print(f"\nSystem Status:")
print(f"  ‚úì Papers: 15 (5 NLP + 5 Vision + 5 Multimodal)")
print(f"  ‚úì Document chunks: {len(documents)}")
print(f"  ‚úì Device: {device.upper()}")
print(f"  ‚úì Status: READY TO USE")

print("\n" + "="*80)
print("ASK YOUR OWN QUESTIONS!")
print("="*80)

print("\nUsage:")
print('  query_research("Your question here?")')

print("\nExamples:")
print('  query_research("What makes BERT different from GPT?")')
print('  query_research("How do multimodal models work?")')
print('  query_research("Explain self-supervised learning in vision")')

# ============================================================================
# CELL 13: YOUR CUSTOM QUESTIONS
# ============================================================================

# Uncomment and modify to ask your own questions:

# query_research("What is attention mechanism?")
# query_research("How does CLIP work for zero-shot classification?")
# query_research("What are the key innovations in Vision Transformers?")
# query_research("Explain the difference between supervised and self-supervised learning")
# query_research("What papers should I read to understand modern AI?")

‚úì All packages installed!
‚úì All imports successful!
GPU Available: True
Loaded 15 papers
Processing documents...
‚úì Created 15 document chunks

BUILDING RAG SYSTEM

1Ô∏è‚É£ Loading embeddings model...
   Device: CUDA


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


‚úì Embeddings model ready

2Ô∏è‚É£ Building FAISS vector store...
‚úì Vector store created
‚úì Retriever configured

3Ô∏è‚É£ Loading LLM (google/flan-t5-base)...


Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


‚úì LLM loaded

4Ô∏è‚É£ Creating RAG chain...
‚úì RAG chain ready!

‚úÖ RAG SYSTEM READY!

TESTING WITH QUESTIONS

QUESTION 1/3

üîç Question: What are transformers and how do they work?
--------------------------------------------------------------------------------

üí° Answer:
What are transformers and how do they work?

üìö Sources:
  ‚Ä¢ An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale
    Category: Computer Vision | ArXiv: 2010.11929
  ‚Ä¢ Attention Is All You Need
    Category: NLP | ArXiv: 1706.03762
  ‚Ä¢ Vision Transformer Slimming
    Category: Computer Vision | ArXiv: 2106.02852

QUESTION 2/3

üîç Question: Explain Vision Transformers and their advantages
--------------------------------------------------------------------------------

üí° Answer:
Summary: Vision transformers are a type of image classification system.

üìö Sources:
  ‚Ä¢ Vision Transformer Slimming
    Category: Computer Vision | ArXiv: 2106.02852
  ‚Ä¢ Training data-efficien

In [14]:
# ============================================================================
# COMPLETE RAG + EVALUATION SYSTEM
# Google Colab - Ready to Use
# ============================================================================

# ============================================================================
# PART A: RAG SYSTEM (From Previous Code)
# ============================================================================

# CELL 1: Install all packages
!pip install -q langchain langchain-community langchain-core
!pip install -q faiss-cpu sentence-transformers
!pip install -q torch transformers huggingface-hub
!pip install -q tqdm rouge-score scikit-learn pandas numpy

print("‚úì All packages installed!")

# ============================================================================
# CELL 2: Imports
# ============================================================================

import os
import json
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
import time
import warnings
warnings.filterwarnings('ignore')

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from tqdm import tqdm
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print("‚úì All imports successful!")

# ============================================================================
# CELL 3: Paper Database
# ============================================================================

PAPERS_DATABASE = {
    "NLP": [
        {
            "title": "Attention Is All You Need",
            "authors": ["Vaswani", "Shazeer"],
            "published": "2017-06-12",
            "arxiv_id": "1706.03762",
            "category": "NLP",
            "summary": """Transformers use self-attention mechanisms instead of recurrence. Key: multi-head attention, positional encodings, encoder-decoder architecture. Enables parallel processing and fast training."""
        },
        {
            "title": "BERT: Pre-training of Deep Bidirectional Transformers",
            "authors": ["Devlin", "Chang"],
            "published": "2018-10-11",
            "arxiv_id": "1810.04805",
            "category": "NLP",
            "summary": """BERT introduces bidirectional pre-training with Masked Language Modeling (MLM). Reads context from both directions. Significantly improved GLUE benchmarks."""
        },
        {
            "title": "Language Models are Unsupervised Multitask Learners",
            "authors": ["Radford", "Wu"],
            "published": "2019-02-14",
            "arxiv_id": "1902.10165",
            "category": "NLP",
            "summary": """GPT-2 shows large language models learn multiple tasks without supervision. Models learn translation, summarization, QA naturally from data."""
        },
        {
            "title": "RoFormer: Enhanced Transformer with Rotary Position Embedding",
            "authors": ["Su", "Ahmed"],
            "published": "2021-04-20",
            "arxiv_id": "2104.09864",
            "category": "NLP",
            "summary": """Rotary Position Embeddings (RoPE) encode positions using rotation matrices. Better extrapolation to longer sequences."""
        },
        {
            "title": "LLaMA: Open and Efficient Foundation Language Models",
            "authors": ["Touvron", "Lavril"],
            "published": "2023-02-27",
            "arxiv_id": "2302.13971",
            "category": "NLP",
            "summary": """LLaMA-13B outperforms GPT-3 65B on public data. Shows efficient training on public datasets."""
        }
    ],
    "Computer Vision": [
        {
            "title": "An Image is Worth 16x16 Words: Transformers for Image Recognition",
            "authors": ["Dosovitskiy", "Beyer"],
            "published": "2020-10-22",
            "arxiv_id": "2010.11929",
            "category": "Computer Vision",
            "summary": """Vision Transformer (ViT) divides images into patches as tokens. Competitive with CNNs on large datasets."""
        },
        {
            "title": "Masked Autoencoders Are Scalable Vision Learners",
            "authors": ["He", "Chen"],
            "published": "2021-11-11",
            "arxiv_id": "2111.06377",
            "category": "Computer Vision",
            "summary": """MAE extends masked language modeling to vision. Randomly mask patches and reconstruct. Self-supervised learning works for vision."""
        },
        {
            "title": "Training data-efficient image transformers",
            "authors": ["Touvron", "Cord"],
            "published": "2020-12-23",
            "arxiv_id": "2012.12556",
            "category": "Computer Vision",
            "summary": """DeiT shows vision transformers work on standard ImageNet. Uses knowledge distillation and careful training."""
        },
        {
            "title": "Vision Transformer Slimming",
            "authors": ["Wang", "Huang"],
            "published": "2021-06-24",
            "arxiv_id": "2106.02852",
            "category": "Computer Vision",
            "summary": """Reduces ViT complexity while maintaining performance. Architectural optimizations and pruning."""
        },
        {
            "title": "Exploring Simple Siamese Representation Learning",
            "authors": ["Chen", "He"],
            "published": "2020-11-04",
            "arxiv_id": "2011.10566",
            "category": "Computer Vision",
            "summary": """SimSiam shows contrastive learning works without negative pairs. Simplified self-supervised learning."""
        }
    ],
    "Multimodal": [
        {
            "title": "Learning Transferable Visual Models From Natural Language Supervision",
            "authors": ["Radford", "Kim"],
            "published": "2021-02-26",
            "arxiv_id": "2103.00020",
            "category": "Multimodal",
            "summary": """CLIP learns from image-text pairs. Enables zero-shot classification by understanding image-text relationships."""
        },
        {
            "title": "BLIP: Bootstrapping Language-Image Pre-training",
            "authors": ["Li", "Gan"],
            "published": "2022-01-18",
            "arxiv_id": "2201.12086",
            "category": "Multimodal",
            "summary": """BLIP unifies understanding and generation. Bootstraps on noisy web data."""
        },
        {
            "title": "Flamingo: a Visual Language Model for Few-Shot Learning",
            "authors": ["Alayrac", "Donahue"],
            "published": "2022-04-29",
            "arxiv_id": "2204.14198",
            "category": "Multimodal",
            "summary": """Flamingo enables few-shot learning in multimodal models. Learns from few examples."""
        },
        {
            "title": "Visual Instruction Tuning",
            "authors": ["Liu", "Li"],
            "published": "2023-04-17",
            "arxiv_id": "2304.08485",
            "category": "Multimodal",
            "summary": """LLaVA connects vision and language models. Follows natural language instructions about images."""
        },
        {
            "title": "Multimodal Chain-of-Thought Reasoning",
            "authors": ["Zhang", "Hashimoto"],
            "published": "2023-02-07",
            "arxiv_id": "2302.00923",
            "category": "Multimodal",
            "summary": """Extends chain-of-thought to multimodal. Improves reasoning on complex vision-language tasks."""
        }
    ]
}

print(f"Loaded {sum(len(p) for p in PAPERS_DATABASE.values())} papers")

# ============================================================================
# CELL 4: Process Documents
# ============================================================================

class DocumentProcessor:
    def __init__(self, chunk_size=800, chunk_overlap=150):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

    def process_papers(self, papers: Dict) -> Tuple[List[str], List[Dict]]:
        documents = []
        metadata = []

        for category, paper_list in papers.items():
            for paper in paper_list:
                doc_text = f"""Title: {paper['title']}
Authors: {', '.join(paper['authors'])}
Category: {category}
Published: {paper['published']}

{paper['summary']}

ArXiv ID: {paper['arxiv_id']}"""

                chunks = self.text_splitter.split_text(doc_text)
                for chunk in chunks:
                    documents.append(chunk)
                    metadata.append({
                        "title": paper['title'],
                        "category": category,
                        "arxiv_id": paper['arxiv_id'],
                    })

        return documents, metadata

processor = DocumentProcessor()
documents, metadata = processor.process_papers(PAPERS_DATABASE)
print(f"‚úì Created {len(documents)} document chunks\n")

# ============================================================================
# CELL 5: Build RAG System
# ============================================================================

print("="*80)
print("BUILDING RAG SYSTEM")
print("="*80)

print("\n1Ô∏è‚É£ Loading embeddings...")
use_gpu = torch.cuda.is_available()
device = "cuda" if use_gpu else "cpu"

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": device}
)
print(f"‚úì Embeddings ready (device: {device})")

print("\n2Ô∏è‚É£ Building FAISS vector store...")
vectorstore = FAISS.from_texts(texts=documents, embedding=embeddings, metadatas=metadata)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
print(f"‚úì Vector store created ({len(documents)} chunks)")

print("\n3Ô∏è‚É£ Loading LLM...")
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
if use_gpu:
    model = model.to("cuda")
print("‚úì LLM loaded")

class SimpleLLM:
    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def generate(self, prompt: str) -> str:
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(self.device)
            outputs = self.model.generate(**inputs, max_length=512, num_beams=1, temperature=0.7, do_sample=False)
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        except:
            return "Error generating response"

llm = SimpleLLM(model, tokenizer, device)

print("\n4Ô∏è‚É£ Creating RAG chain...")
template = """You are an expert research assistant. Answer questions about papers.

Papers:
{context}

Question: {question}

Answer based on papers. Be concise."""

prompt = PromptTemplate(input_variables=["context", "question"], template=template)

def format_docs(docs):
    if not docs:
        return "No papers found."
    formatted = []
    for i, doc in enumerate(docs[:3], 1):
        title = doc.metadata.get("title", "Unknown")
        arxiv = doc.metadata.get("arxiv_id", "N/A")
        formatted.append(f"[{i}] {title} ({arxiv})\n{doc.page_content[:250]}...")
    return "\n\n".join(formatted)

def rag_chain(question: str):
    docs = retriever.invoke(question)
    context = format_docs(docs)
    formatted_prompt = template.format(context=context, question=question)
    answer = llm.generate(formatted_prompt)
    return answer, docs

print("‚úì RAG chain ready!")

# ============================================================================
# PART B: EVALUATION SYSTEM
# ============================================================================

print("\n" + "="*80)
print("EVALUATION METRICS")
print("="*80)

# Simple text utilities (no NLTK!)
def simple_tokenize(text: str) -> List[str]:
    import re
    text = text.lower()
    return re.findall(r'\w+', text)

# Evaluation metrics
class EvalMetrics:
    @staticmethod
    def rouge(ref: str, gen: str) -> Dict[str, float]:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = scorer.score(ref, gen)
        return {
            'rouge1': scores['rouge1'].fmeasure,
            'rouge2': scores['rouge2'].fmeasure,
            'rougeL': scores['rougeL'].fmeasure,
        }

    @staticmethod
    def semantic_sim(text1: str, text2: str) -> float:
        try:
            emb1 = np.array(embeddings.embed_query(text1)).reshape(1, -1)
            emb2 = np.array(embeddings.embed_query(text2)).reshape(1, -1)
            return float(cosine_similarity(emb1, emb2)[0][0])
        except:
            return 0.0

    @staticmethod
    def jaccard(text1: str, text2: str) -> float:
        set1 = set(simple_tokenize(text1))
        set2 = set(simple_tokenize(text2))
        if not set1 and not set2:
            return 1.0
        intersection = len(set1 & set2)
        union = len(set1 | set2)
        return intersection / union if union > 0 else 0.0

    @staticmethod
    def mrr(retrieved: List[str], relevant: List[str]) -> float:
        for i, doc_id in enumerate(retrieved):
            if doc_id in relevant:
                return 1.0 / (i + 1)
        return 0.0

    @staticmethod
    def precision_at_k(retrieved: List[str], relevant: List[str], k: int = 3) -> float:
        if k == 0:
            return 0.0
        top_k = retrieved[:k]
        correct = sum(1 for doc in top_k if doc in relevant)
        return correct / k

print("‚úì Evaluation metrics ready\n")

# ============================================================================
# CELL 6: Evaluation Dataset
# ============================================================================

EVAL_DATA = [
    {
        "question": "What are transformers?",
        "expected": "Transformers use self-attention mechanisms for parallel processing.",
        "papers": ["1706.03762"]
    },
    {
        "question": "Explain Vision Transformers",
        "expected": "ViT divides images into patches and applies transformer architecture.",
        "papers": ["2010.11929"]
    },
    {
        "question": "What is CLIP?",
        "expected": "CLIP learns from image-text pairs enabling zero-shot classification.",
        "papers": ["2103.00020"]
    },
    {
        "question": "What is BERT?",
        "expected": "BERT uses bidirectional pre-training with masked language modeling.",
        "papers": ["1810.04805"]
    },
    {
        "question": "How do multimodal models work?",
        "expected": "They combine vision and language through connection layers.",
        "papers": ["2201.12086", "2304.08485"]
    }
]

print(f"Loaded {len(EVAL_DATA)} evaluation questions\n")

# ============================================================================
# CELL 7: Run Evaluation
# ============================================================================

print("="*80)
print("RUNNING EVALUATION")
print("="*80)

results = []
for i, test in enumerate(EVAL_DATA, 1):
    print(f"\n{i}. {test['question']}")

    start = time.time()
    answer, docs = rag_chain(test['question'])
    latency = time.time() - start

    retrieved_papers = [doc.metadata.get('arxiv_id') for doc in docs]

    # Calculate metrics
    rouge = EvalMetrics.rouge(test['expected'], answer)
    semantic = EvalMetrics.semantic_sim(test['expected'], answer)
    jaccard = EvalMetrics.jaccard(test['expected'], answer)
    mrr = EvalMetrics.mrr(retrieved_papers, test['papers'])
    precision = EvalMetrics.precision_at_k(retrieved_papers, test['papers'])

    result = {
        "question": test['question'],
        "latency": latency,
        "rouge1": rouge['rouge1'],
        "rouge2": rouge['rouge2'],
        "rougeL": rouge['rougeL'],
        "semantic": semantic,
        "jaccard": jaccard,
        "mrr": mrr,
        "precision_at_3": precision,
    }
    results.append(result)

    print(f"   Latency: {latency:.2f}s")
    print(f"   ROUGE-1: {rouge['rouge1']:.3f}")
    print(f"   Semantic: {semantic:.3f}")
    print(f"   MRR: {mrr:.3f}")

# ============================================================================
# CELL 8: Results Summary
# ============================================================================

print("\n" + "="*80)
print("EVALUATION RESULTS")
print("="*80)

df = pd.DataFrame(results)

print("\nDETAILED RESULTS:")
print(df[['question', 'latency', 'rouge1', 'semantic', 'mrr']].to_string(index=False))

print("\n" + "="*80)
print("AGGREGATE METRICS")
print("="*80)

agg = {
    "Avg Latency (sec)": df['latency'].mean(),
    "ROUGE-1": df['rouge1'].mean(),
    "ROUGE-2": df['rouge2'].mean(),
    "ROUGE-L": df['rougeL'].mean(),
    "Semantic Similarity": df['semantic'].mean(),
    "Jaccard Similarity": df['jaccard'].mean(),
    "MRR": df['mrr'].mean(),
    "Precision@3": df['precision_at_3'].mean(),
}

for name, value in agg.items():
    bar = "‚ñà" * int(value * 20) + "‚ñë" * (20 - int(value * 20))
    print(f"{name:.<35} {bar} {value:.4f}")

# ============================================================================
# CELL 9: Interpretation
# ============================================================================

print("\n" + "="*80)
print("INTERPRETATION")
print("="*80)

def status(score):
    if score >= 0.7:
        return "üü¢ Excellent"
    elif score >= 0.5:
        return "üü° Good"
    elif score >= 0.3:
        return "üü† Fair"
    else:
        return "üî¥ Poor"

print(f"\nRUGE-1: {agg['ROUGE-1']:.4f} {status(agg['ROUGE-1'])}")
print(f"Semantic Similarity: {agg['Semantic Similarity']:.4f} {status(agg['Semantic Similarity'])}")
print(f"MRR: {agg['MRR']:.4f} {status(agg['MRR'])}")
print(f"Precision@3: {agg['Precision@3']:.4f} {status(agg['Precision@3'])}")

latency = agg['Avg Latency (sec)']
if latency < 5:
    latency_status = "üü¢ Excellent"
elif latency < 10:
    latency_status = "üü° Good"
else:
    latency_status = "üü† Fair"
print(f"Latency: {latency:.2f}s {latency_status}")

# ============================================================================
# CELL 10: Done!
# ============================================================================

print("\n" + "="*80)
print("‚úÖ COMPLETE!")
print("="*80)

print(f"\n‚úì RAG System: READY")
print(f"‚úì Evaluation: COMPLETE")
print(f"‚úì Results: {len(results)} questions evaluated")
print(f"\nTo ask more questions, run:")
print('  answer, docs = rag_chain("Your question here?")')

‚úì All packages installed!
‚úì All imports successful!
Loaded 15 papers
‚úì Created 15 document chunks

BUILDING RAG SYSTEM

1Ô∏è‚É£ Loading embeddings...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


‚úì Embeddings ready (device: cuda)

2Ô∏è‚É£ Building FAISS vector store...
‚úì Vector store created (15 chunks)

3Ô∏è‚É£ Loading LLM...


Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



‚úì LLM loaded

4Ô∏è‚É£ Creating RAG chain...
‚úì RAG chain ready!

EVALUATION METRICS
‚úì Evaluation metrics ready

Loaded 5 evaluation questions

RUNNING EVALUATION

1. What are transformers?
   Latency: 0.16s
   ROUGE-1: 0.545
   Semantic: 0.545
   MRR: 1.000

2. Explain Vision Transformers
   Latency: 10.60s
   ROUGE-1: 0.010
   Semantic: 0.294
   MRR: 0.333

3. What is CLIP?
   Latency: 0.27s
   ROUGE-1: 0.222
   Semantic: 0.418
   MRR: 1.000

4. What is BERT?
   Latency: 0.23s
   ROUGE-1: 0.400
   Semantic: 0.494
   MRR: 1.000

5. How do multimodal models work?
   Latency: 0.23s
   ROUGE-1: 0.000
   Semantic: 0.371
   MRR: 0.500

EVALUATION RESULTS

DETAILED RESULTS:
                      question   latency   rouge1  semantic      mrr
        What are transformers?  0.157882 0.545455  0.544881 1.000000
   Explain Vision Transformers 10.603743 0.010309  0.293682 0.333333
                 What is CLIP?  0.273719 0.222222  0.417820 1.000000
                 What is BERT?  0.229216 0