In [4]:
pip install -q torch transformers sentence-transformers faiss-cpu accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import torch
print("GPU available:", torch.cuda.is_available())


GPU available: True


In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np

**Sample Documents**

In [7]:

#documents = [
#    "Machine learning is a subset of artificial intelligence that learns from data.",
#    "Retrieval-Augmented Generation combines information retrieval with text generation.",
#    "Large Language Models are trained on massive datasets and can generate human-like text.",
#    "FAISS is a library for efficient similarity search and clustering of dense vectors."
#]
documents = [
    "Reinforcement learning focuses on learning optimal actions through rewards and penalties.",
    "Deep Q-Networks combine Q-learning with deep neural networks.",
    "Policy gradient methods directly optimize the policy without using a value function.",
    "Actor-Critic methods combine value-based and policy-based approaches."
]


**Embeddings**

In [8]:


embedder = SentenceTransformer("all-MiniLM-L6-v2")

doc_embeddings = embedder.encode(documents, convert_to_numpy=True)
doc_embeddings.shape


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(4, 384)

**FAISS Index**

In [9]:
import faiss

dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

print("Total documents indexed:", index.ntotal)


Total documents indexed: 4


**Retrival Function**

In [10]:
def retrieve_documents(query, top_k=2):
    query_embedding = embedder.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    return [documents[i] for i in indices[0]]


**Loading LLM**

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small").to("cuda")


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

**Answer Generation**

In [12]:
def generate_answer(context, question):
    prompt = f"""
    Context:
    {context}

    Question:
    {question}

    Answer:
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")
    outputs = model.generate(**inputs, max_length=150)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


**RAG Pipeline**

In [13]:
#question = "What is retrieval augmented generation?"
question = [
    "What is reinforcement learning?",
    "How do actor-critic methods work?",
    "What is the difference between DQN and policy gradient?"
]

retrieved_docs = retrieve_documents(question)
context = " ".join(retrieved_docs)

answer = generate_answer(context, question)

print("Question:", question)
print("\nRetrieved Context:")
for doc in retrieved_docs:
    print("-", doc)

print("\nGenerated Answer:")
print(answer)


Question: ['What is reinforcement learning?', 'How do actor-critic methods work?', 'What is the difference between DQN and policy gradient?']

Retrieved Context:
- Actor-Critic methods combine value-based and policy-based approaches.
- Reinforcement learning focuses on learning optimal actions through rewards and penalties.

Generated Answer:
learning optimal actions through rewards and penalties


In [14]:
for q in question:
    retrieved_docs = retrieve_documents(q)
    context = " ".join(retrieved_docs)
    answer = generate_answer(context, q)

    print("="*60)
    print("Question:", q)
    print("Context:", context)
    print("Answer:", answer)


Question: What is reinforcement learning?
Context: Reinforcement learning focuses on learning optimal actions through rewards and penalties. Actor-Critic methods combine value-based and policy-based approaches.
Answer: focuses on learning optimal actions through rewards and penalties
Question: How do actor-critic methods work?
Context: Actor-Critic methods combine value-based and policy-based approaches. Policy gradient methods directly optimize the policy without using a value function.
Answer: combine value-based and policy-based approaches
Question: What is the difference between DQN and policy gradient?
Context: Policy gradient methods directly optimize the policy without using a value function. Reinforcement learning focuses on learning optimal actions through rewards and penalties.
Answer: Reinforcement learning focuses on learning optimal actions through rewards and penalties
