# 1️⃣ Install Dependencies

In [10]:
!pip install faiss-cpu sentence-transformers transformers datasets langchain openai tiktoken scikit-learn numpy pandas torch
!pip install -U langchain-community

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collectin

# 2️⃣ Import Required Libraries

In [11]:
import faiss
import numpy as np
import torch
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import load_dataset

# 3️⃣ Load Embedding Model

In [12]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# 4️⃣ Load Pre-trained LLM for Generation

In [13]:
llm_name = "google/flan-t5-base"
generation_model = AutoModelForSeq2SeqLM.from_pretrained(llm_name)
generation_tokenizer = AutoTokenizer.from_pretrained(llm_name)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

# 5️⃣ Data Ingestion: Load Dataset & Convert to Embeddings

In [14]:
dataset = load_dataset("wikipedia", "20220301.simple", split="train[:1000]")
documents = dataset["text"]

def embed_documents(docs):
    return embedding_model.encode(docs, convert_to_tensor=True)

doc_embeddings = embed_documents(documents)

print(f"✅ Loaded {len(documents)} documents and computed embeddings.")

README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

wikipedia.py:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

The repository for wikipedia contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wikipedia.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


train-00000-of-00001.parquet:   0%|          | 0.00/134M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/205328 [00:00<?, ? examples/s]

✅ Loaded 1000 documents and computed embeddings.


# 6️⃣ Build FAISS Index

In [15]:
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings.cpu().numpy())
print("✅ FAISS index created and populated.")

✅ FAISS index created and populated.


# 7️⃣ Implement TF-IDF for Hybrid Search

In [16]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print("✅ TF-IDF vectorizer initialized.")

✅ TF-IDF vectorizer initialized.


# 8️⃣ Hybrid Search Function

In [17]:
def hybrid_search(query, top_k=5):
    # Compute TF-IDF scores
    query_tfidf = tfidf_vectorizer.transform([query])
    tfidf_scores = (tfidf_matrix @ query_tfidf.T).toarray().flatten()

    # Compute dense vector similarity using FAISS
    query_embedding = embedding_model.encode([query], convert_to_tensor=True).cpu().numpy()
    _, faiss_indices = index.search(query_embedding, top_k)

    # Extract FAISS scores (cosine similarity)
    faiss_scores = np.zeros_like(tfidf_scores)
    for idx in faiss_indices[0]:
        faiss_scores[idx] = tfidf_scores[idx]  # Align FAISS scores with TF-IDF shape

    # Normalize scores to avoid scale issues
    tfidf_scores = (tfidf_scores - np.min(tfidf_scores)) / (np.max(tfidf_scores) - np.min(tfidf_scores) + 1e-8)
    faiss_scores = (faiss_scores - np.min(faiss_scores)) / (np.max(faiss_scores) - np.min(faiss_scores) + 1e-8)

    # Combine scores (weighted sum)
    combined_scores = 0.5 * tfidf_scores + 0.5 * faiss_scores
    top_indices = np.argsort(combined_scores)[-top_k:][::-1]

    return [documents[idx] for idx in top_indices]

print("✅ Hybrid retrieval system is ready.")

✅ Hybrid retrieval system is ready.


# 9️⃣ RAG Response Generation

In [22]:
def generate_response(query):
    # Retrieve documents
    retrieved_docs = hybrid_search(query, top_k=2)  # Use top 2 relevant docs

    # Check retrieved documents
    print(f"Retrieved Documents: {retrieved_docs}")

    # Build context with a refined selection of retrieved documents
    context = " ".join(retrieved_docs)
    input_text = f"Context: {context}\nQuestion: {query}\nAnswer:"

    inputs = generation_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    output = generation_model.generate(**inputs, max_new_tokens=50)

    # Decode the response
    response = generation_tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# 🔟 Test the RAG Pipeline

In [23]:
query = "What is web browser?"
response = generate_response(query)
print(f"\n🔍 Query: {query}\n📝 RAG Response: {response}")

print("✅ Advanced RAG pipeline is fully implemented!")

Retrieved Documents: ['A web browser is a computer program application for reading pages of the World Wide Web. Since the late 1990s, most personal computers and mobile phones and other mobile devices have a browser.\n\nWeb browsers are used by people to find and look at websites on the Internet. The first web browser was created in 1990. Many web browsers are available for free. All web browsers can go to websites but each browser has good things and bad things about it. For example, some browsers focus on data security and keeping computers safe from viruses. Other browsers are made so that web pages appear on-screen faster.\n\nSome popular web browsers include:\n\n Mozilla Firefox\n Google Chrome \n Opera\n Safari\n Internet Explorer included with Microsoft Windows\n Microsoft Edge, a more modernized version of Internet Explorer, included with Windows 10\n\nOther browsers are:\n Flock\n Epiphany\n\nWeb browsers and HTML \nA webpage is one page of a website.  Every web page has a web