1. Setup

In [1]:
!pip install sentence-transformers faiss-cpu huggingface_hub pydf

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting pydf
  Downloading pydf-12.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pydf
  Building wheel for pydf (setup.py) ... [?25l[?25hdone
  Created wheel for pydf: filename=pydf-12-py2.py3-none-any.whl size=8360 sha256=0e47aa98eac821fe9b425b4aabf0a65b4b48e988b9a0445ef0f12bddb6b5c81b
  Stored in directory: /root/.cache/pip/wheels/51/13/bf/eac7ba3e7e11839e613f40cb7b8483123a654478de2d18ec0e
Successfully built pydf
Installing collected packages: pydf, faiss-cpu
Successfully installed faiss-cpu-1.12.0 pydf-12


In [2]:
from huggingface_hub import InferenceClient
import getpass

# Ask for your token at runtime (not saved in notebook)
hf_token = getpass.getpass("Enter your Hugging Face token: ")

client = InferenceClient(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    token=hf_token
)


Enter your Hugging Face token: ··········


2. Load Data

In [3]:
import os

os.makedirs("data", exist_ok=True)

markdown_text = """
# My AI Notes

Retrieval-Augmented Generation (RAG) is a technique where a language model
is given extra context from a document store.

FAISS is a library for fast similarity search and clustering of dense vectors.

Embeddings are numerical representations of text that capture meaning.
"""

with open("data/notes.md", "w") as f:
    f.write(markdown_text)

print("Saved data/notes.md")


Saved data/notes.md


In [4]:
import glob

def load_text_from_data_folder():
    texts = []
    for path in glob.glob("data/*"):
        if path.endswith(".md") or path.endswith(".txt"):
            with open(path, "r", encoding="utf-8") as f:
                texts.append(f.read())
    return "\n\n".join(texts)

corpus_text = load_text_from_data_folder()
print(corpus_text)



# My AI Notes

Retrieval-Augmented Generation (RAG) is a technique where a language model
is given extra context from a document store.

FAISS is a library for fast similarity search and clustering of dense vectors.

Embeddings are numerical representations of text that capture meaning.



3. Chunking

In [5]:
def chunk_text(text, chunk_size=300, overlap=50):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk.strip())
        start = end - overlap  # small overlap for continuity
    return [c for c in chunks if c]

chunks = chunk_text(corpus_text)
print(f"Number of chunks: {len(chunks)}")
for i, ch in enumerate(chunks[:3]):
    print(f"\n--- Chunk {i} ---\n{ch}")


Number of chunks: 2

--- Chunk 0 ---
# My AI Notes

Retrieval-Augmented Generation (RAG) is a technique where a language model
is given extra context from a document store.

FAISS is a library for fast similarity search and clustering of dense vectors.

Embeddings are numerical representations of text that capture meaning.

--- Chunk 1 ---
ntations of text that capture meaning.


4. Embeddings (MiniLM)

In [6]:
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Each chunk → embedding vector
embeddings = embed_model.encode(chunks)
embeddings.shape


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(2, 384)

5. FAISS Index

In [7]:
import faiss
import numpy as np

embeddings_np = np.array(embeddings).astype("float32")
d = embeddings_np.shape[1]  # dimension

index = faiss.IndexFlatL2(d)
index.add(embeddings_np)

print("FAISS index built with", index.ntotal, "vectors.")


FAISS index built with 2 vectors.


6. RAG Q&A

In [8]:
def search_chunks(query, k=3):
    query_emb = embed_model.encode([query]).astype("float32")
    distances, indices = index.search(query_emb, k)
    results = []
    for idx in indices[0]:
        results.append(chunks[idx])
    return results

# Test it:
test_query = "What is RAG?"
top_chunks = search_chunks(test_query, k=3)
for i, c in enumerate(top_chunks):
    print(f"\n--- Retrieved Chunk {i} ---\n{c}")



--- Retrieved Chunk 0 ---
# My AI Notes

Retrieval-Augmented Generation (RAG) is a technique where a language model
is given extra context from a document store.

FAISS is a library for fast similarity search and clustering of dense vectors.

Embeddings are numerical representations of text that capture meaning.

--- Retrieved Chunk 1 ---
ntations of text that capture meaning.

--- Retrieved Chunk 2 ---
ntations of text that capture meaning.


In [9]:
def answer_with_rag(question, k=3):
    # 1. Retrieve context from FAISS
    retrieved_chunks = search_chunks(question, k=k)
    context = "\n\n".join(retrieved_chunks)

    # 2. Build prompt for LLM
    prompt = f"""
You are a helpful assistant. Answer the question ONLY using the context below.
If the answer is not in the context, say "I don't know based on the provided context."

Context:
{context}

Question:
{question}

Answer:
"""
    # 3. Call the chat LLM
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=300
    )

    answer = response.choices[0].message["content"]
    return answer, retrieved_chunks

# Try it:
question = "Explain RAG in simple words."
answer, used_chunks = answer_with_rag(question, k=3)

print("QUESTION:\n", question)
print("\nANSWER:\n", answer)
print("\nCONTEXT CHUNKS USED:")
for i, c in enumerate(used_chunks):
    print(f"\n--- Chunk {i} ---\n{c}")


QUESTION:
 Explain RAG in simple words.

ANSWER:
 RAG is a technique where a language model is given extra context from a document store to help it understand and generate better responses.

CONTEXT CHUNKS USED:

--- Chunk 0 ---
# My AI Notes

Retrieval-Augmented Generation (RAG) is a technique where a language model
is given extra context from a document store.

FAISS is a library for fast similarity search and clustering of dense vectors.

Embeddings are numerical representations of text that capture meaning.

--- Chunk 1 ---
ntations of text that capture meaning.

--- Chunk 2 ---
ntations of text that capture meaning.
