In [7]:
# 1️⃣ Install Dependencies
!pip install faiss-cpu sentence-transformers transformers datasets langchain openai tiktoken scikit-learn numpy pandas torch
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [8]:
# 2️⃣ Import Required Libraries
import faiss
import numpy as np
import torch
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import load_dataset

In [9]:
# 3️⃣ Load Embedding Model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [24]:
# 4️⃣ Load Pre-trained LLM for Generation
llm_name = "google/flan-t5-base"
generation_model = AutoModelForSeq2SeqLM.from_pretrained(llm_name)
generation_tokenizer = AutoTokenizer.from_pretrained(llm_name)

In [25]:
# 5️⃣ Data Ingestion: Load Dataset & Convert to Embeddings
dataset = load_dataset("wikipedia", "20220301.simple", split="train[:1000]")
documents = dataset["text"]

def embed_documents(docs):
    return embedding_model.encode(docs, convert_to_tensor=True)

doc_embeddings = embed_documents(documents)

print(f"✅ Loaded {len(documents)} documents and computed embeddings.")

✅ Loaded 1000 documents and computed embeddings.


In [26]:
# 6️⃣ Build FAISS Index
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings.cpu().numpy())
print("✅ FAISS index created and populated.")

✅ FAISS index created and populated.


In [27]:
# 7️⃣ Implement TF-IDF for Hybrid Search
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print("✅ TF-IDF vectorizer initialized.")

✅ TF-IDF vectorizer initialized.


In [52]:
# 8️⃣ Hybrid Search Function
def hybrid_search(query, top_k=5):
    # Compute TF-IDF scores
    query_tfidf = tfidf_vectorizer.transform([query])
    tfidf_scores = (tfidf_matrix @ query_tfidf.T).toarray().flatten()

    # Compute dense vector similarity using FAISS
    query_embedding = embedding_model.encode([query], convert_to_tensor=True).cpu().numpy()
    _, faiss_indices = index.search(query_embedding, top_k)

    # Extract FAISS scores (cosine similarity)
    faiss_scores = np.zeros_like(tfidf_scores)
    for idx in faiss_indices[0]:
        faiss_scores[idx] = tfidf_scores[idx]  # Align FAISS scores with TF-IDF shape

    # Normalize scores to avoid scale issues
    tfidf_scores = (tfidf_scores - np.min(tfidf_scores)) / (np.max(tfidf_scores) - np.min(tfidf_scores) + 1e-8)
    faiss_scores = (faiss_scores - np.min(faiss_scores)) / (np.max(faiss_scores) - np.min(faiss_scores) + 1e-8)

    # Combine scores (weighted sum)
    combined_scores = 0.5 * tfidf_scores + 0.5 * faiss_scores
    top_indices = np.argsort(combined_scores)[-top_k:][::-1]

    return [documents[idx] for idx in top_indices]

print("✅ Hybrid retrieval system is ready.")

✅ Hybrid retrieval system is ready.


In [53]:
# 9️⃣ RAG Response Generation
def generate_response(query):
    # Retrieve documents from the hybrid search
    retrieved_docs = hybrid_search(query, top_k=5)

    # Print the retrieved documents to check relevance
    print(f"Retrieved Documents: {retrieved_docs}")

    # Build context
    context = " ".join(retrieved_docs)
    input_text = f"Context: {context}\nQuestion: {query}\nAnswer:"

    inputs = generation_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    output = generation_model.generate(**inputs, max_new_tokens=50)

    # Decode the model's output
    response = generation_tokenizer.decode(output[0], skip_special_tokens=True)
    return response

In [57]:
# 🔟 Test the RAG Pipeline
query = "What is web browser?"
response = generate_response(query)
print(f"\n🔍 Query: {query}\n📝 RAG Response: {response}")

print("✅ Advanced RAG pipeline is fully implemented!")


🔍 Query: What is web browser?
📝 RAG Response: Web browsers are used by people to find and look at websites on the Internet.
✅ Advanced RAG pipeline is fully implemented!
