# RAG System on Google Colab (Free Tier)

This notebook demonstrates how to build a Retrieval Augmented Generation (RAG) system using **free open-source models** on a standard Google Colab T4 instance.

### **Pipeline:**
1. **Unstructured Data** -> Text Chunks
2. **Text Chunks** -> Embeddings (Sentence Transformers)
3. **Embeddings** -> Vector DB (ChromaDB)
4. **Query** -> Retriever (top-k chunks from ChromaDB) -> LLM (Zephyr-7B-beta, 4-bit quantized) -> Answer

**Note**: Make sure your Runtime is set to **GPU** (T4).

In [None]:
# Install dependencies
# - transformers, accelerate, bitsandbytes: For loading the LLM
# - langchain*: For the RAG orchestration
# - chromadb: Vector database
# - sentence-transformers: For embeddings
# - unstructured, pypdf: For loading data
# - requests, opentelemetry-sdk: version-constrained below (presumably to avoid
#   conflicts with packages preinstalled on Colab -- TODO confirm the reason)
# - networkx: not referenced directly by this notebook -- TODO confirm it is
#   still needed (possibly a transitive requirement)
# NOTE(review): every other package is unpinned and upgraded with -U, so each
# run may resolve to different versions; confirm the imports in the next cell
# still resolve against the latest langchain release.
!pip install -q -U requests==2.32.4 "opentelemetry-sdk<1.39.0" transformers accelerate bitsandbytes langchain langchain-community sentence-transformers chromadb pypdf unstructured networkx

In [None]:
import os
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [None]:
# --- CONFIGURATION ---
# Using Zephyr 7B Beta (Instruction Tuned Mistral)
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

# Standard lightweight embedding model
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Choose the 4-bit compute dtype from the GPU that is actually attached.
# Native bfloat16 needs compute capability >= 8.0 (Ampere or newer); the T4
# this notebook targets is 7.5, so it falls back to float16 there.
# The capability is only queried when CUDA is available, because
# get_device_capability() raises without a GPU.
_HAS_NATIVE_BF16 = (
    torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
)
COMPUTE_DTYPE = torch.bfloat16 if _HAS_NATIVE_BF16 else torch.float16

# Quantization Config for 4-bit loading (Required for Colab Free Tier T4 GPU)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=COMPUTE_DTYPE,
)

In [None]:
# --- LOAD LLM ---
print(f"Loading {MODEL_NAME}...")

# Tokenizer and quantized model weights are both resolved from MODEL_NAME;
# the model is loaded with the 4-bit settings held in bnb_config.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

# Sampling/decoding settings forwarded unchanged to the generation pipeline.
generation_settings = dict(
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    max_new_tokens=400,
    return_full_text=False,
)
text_generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

# Wrap the transformers pipeline so the LangChain chain can use it as its LLM.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
print("LLM Loaded successfully!")

In [None]:
# --- DATA PIPELINE: AUTOMATING UNSTRUCTURED TO STRUCTURED ---

# 1. Setup Data Directory
DATA_PATH = "/content/data"
os.makedirs(DATA_PATH, exist_ok=True)

print(f"Please upload your files (.txt, .pdf) to {DATA_PATH}")

# Look for user data recursively: the loaders below use recursive "**/*"
# globs, so a top-level-only listing would miss uploads placed in subfolders
# and would add the demo file to a real corpus.
has_user_data = any(
    fname.endswith(('.txt', '.pdf'))
    for _root, _dirs, fnames in os.walk(DATA_PATH)
    for fname in fnames
)

# Create a demo file only when no .txt/.pdf exists anywhere under DATA_PATH
if not has_user_data:
    print("No Text/PDF data found in folder. Creating a demo file.")
    # Explicit encoding so the demo file does not depend on the platform default.
    with open(os.path.join(DATA_PATH, "demo_info.txt"), "w", encoding="utf-8") as f:
        f.write("The Neural RAG system was built in 2024. It uses Zephyr-7B as its core brain. "
                "Data structuring allows unstructured text to be queried efficiently. "
                "The capital of the Moon is currently unknown, but cheese is a popular theory.")

# 2. Load Documents
print("Loading documents...")
documents = []
# Loader for text files. autodetect_encoding lets TextLoader retry with a
# detected encoding when a file cannot be decoded with the default one,
# instead of aborting the whole directory load.
txt_loader = DirectoryLoader(
    DATA_PATH,
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"autodetect_encoding": True},
)
documents.extend(txt_loader.load())

# Loader for PDF files. Best-effort by design: a failure is reported and the
# notebook continues with whatever text documents were loaded.
try:
    pdf_loader = DirectoryLoader(DATA_PATH, glob="**/*.pdf", loader_cls=PyPDFLoader)
    documents.extend(pdf_loader.load())
except Exception as e:
    print(f"Could not load PDFs (maybe none exist or missing dependency): {e}")

print(f"Total Documents Loaded: {len(documents)}")

In [None]:
# 3. Split Text (Chunking)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)
print(f"Data broken down into {len(chunks)} structured chunks.")

# Fail early with a readable message; otherwise an empty chunk list only
# surfaces later as an error raised from inside the vector store.
if not chunks:
    raise ValueError(
        "No text chunks were produced. Check that the data folder contains "
        "readable .txt or .pdf files and re-run the document loading cell."
    )

# 4. Embeddings & Vector Store
print("Creating Embeddings and Vector Index...")
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs={'device': 'cpu'} # Use CPU for embeddings to save VRAM for the LLM
)

CHROMA_DIR = "/content/chroma_db"

# The index is persisted on disk, so re-running this cell would add the same
# chunks to the existing collection again and retrieval would return
# duplicates. Drop any previously persisted collection before rebuilding so
# the index always matches the current `chunks`.
Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings).delete_collection()

# Initialize ChromaDB
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=CHROMA_DIR,
)
print("Vector Database Ready.")

In [None]:
# --- RETRIEVAL SETUP ---
# Retriever over the vector store; k=3 is passed through as a search argument.
retriever = vector_store.as_retriever(search_kwargs=dict(k=3))

# Prompt laid out with Zephyr/Mistral chat markers. {context} and {question}
# are the two placeholders filled in by the chain.
prompt_template = """
<|system|>
You are a helpful AI assistant. Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Context: {context}</s>
<|user|>
{question}</s>
<|assistant|>
"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Chain options kept together in one mapping; source documents are returned
# alongside the answer so they can be shown to the user.
chain_options = {
    "chain_type": "stuff",
    "retriever": retriever,
    "return_source_documents": True,
    "chain_type_kwargs": {"prompt": PROMPT},
}
qa_chain = RetrievalQA.from_chain_type(llm=llm, **chain_options)

In [None]:
# --- TEST THE PIPELINE ---
def ask_rag(query):
    """Run ``query`` through ``qa_chain`` and print the answer with its sources.

    Args:
        query: Question text, handed to the chain under the "query" key.

    Returns:
        None. The question, the answer and one line per retrieved source
        document are written to stdout.
    """
    response = qa_chain.invoke({"query": query})

    print(f"\nQuestion: {query}")
    print("Answer:", response['result'])
    print("\n[Retrieved Sources]")

    for rank, retrieved in enumerate(response['source_documents'], start=1):
        # Documents without a 'source' metadata entry are labelled 'Unknown'.
        origin = retrieved.metadata.get('source', 'Unknown')
        # Only the first 100 characters of each chunk are shown.
        preview = retrieved.page_content[:100]
        print(f"{rank}. {origin}: {preview}...")

# Example query against the indexed data. If only the demo file created by the
# data-loading cell is present, the retrieved context comes from demo_info.txt.
ask_rag("What is this system built with?")
# To ask something else, uncomment the line below and replace the text:
# ask_rag("Your own question here")