In [38]:
!pip install -q sentence-transformers faiss-cpu PyMuPDF transformers



In [39]:
from google.colab import files

uploaded = files.upload()  # Manually upload your PDF research papers


Saving 1706.03762v7.pdf to 1706.03762v7 (1).pdf
Saving 2005.11401v4.pdf to 2005.11401v4 (1).pdf
Saving 2005.14165v4.pdf to 2005.14165v4 (1).pdf


In [40]:
from sentence_transformers import SentenceTransformer
import faiss
import fitz  # PyMuPDF
import numpy as np
from transformers import pipeline
import os


In [41]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Extract and chunk text
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    texts = []
    for i, page in enumerate(doc):
        text = page.get_text().strip()
        if text:
            texts.append((text, i + 1))
    return texts

def chunk_text(text, chunk_size=100):
    words = text.split()
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

# Process all uploaded PDFs
all_chunks, metadata = [], []

for file_name in uploaded.keys():
    pages = extract_text_from_pdf(file_name)
    for text, page in pages:
        for chunk in chunk_text(text):
            all_chunks.append(chunk)
            metadata.append({'source': file_name, 'page': page})

# Embed and index
embeddings = embedding_model.encode(all_chunks, convert_to_numpy=True)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [42]:
generator = pipeline("text-generation", model="gpt2", max_new_tokens=150)


Device set to use cpu


In [43]:
def answer_question(question, top_k=5):
    q_embed = embedding_model.encode([question])
    distances, indices = index.search(np.array(q_embed).astype('float32'), top_k)

    retrieved = [all_chunks[i] for i in indices[0]]
    cites = [metadata[i] for i in indices[0]]

    context = "\n".join([f"[{i+1}] {txt}" for i, txt in enumerate(retrieved)])
    prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"

    result = generator(prompt, max_new_tokens=150, do_sample=True, top_k=50, top_p=0.95, num_return_sequences=1)[0]['generated_text']

    references = "\nSources:\n" + "\n".join(
        [f"[{i+1}] {c['source']} (Page {c['page']})" for i, c in enumerate(cites)]
    )
    return result + references

In [44]:
# @title Ask a question about the documents
question = "What are the components of a RAG model?" # @param {type:"string"}
answer = answer_question(question)
print(answer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Context:
[1] and automated spam/phishing. Acknowledgments The authors would like to thank the reviewers for their thoughtful and constructive feedback on this paper, as well as HuggingFace for their help in open-sourcing code to run RAG models. The authors would also like to thank Kyunghyun Cho and Sewon Min for productive discussions and advice. EP thanks supports from the NSF Graduate Research Fellowship. PL is supported by the FAIR PhD program. References [1] Payal Bajaj, Daniel Campos, Nick Craswell, Li Deng, Jianfeng Gao, Xiaodong Liu, Rangan Majumder, Andrew McNamara, Bhaskar Mitra, Tri Nguyen, Mir Rosenberg, Xia Song, Alina Stoica, Saurabh Tiwary, and
[2] over text passages given a query x and (ii) a generator pθ(yi|x, z, y1:i−1) parametrized 1Code to run experiments with RAG has been open-sourced as part of the HuggingFace Transform- ers Library [66] and can be found at https://github.com/huggingface/transformers/blob/master/ examples/rag/. An interactive demo of RAG models can