In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import faiss

In [4]:
# Load the source documents. Later cells read the `source_url` and `text` columns.
docs_df = pd.read_csv("documents.csv")

# Fail fast with a readable message instead of a KeyError deep inside the chunking loop.
assert {"source_url", "text"} <= set(docs_df.columns), (
    "documents.csv must provide 'source_url' and 'text' columns"
)

# A bare last expression gets the notebook's rich table rendering; print() would
# flatten it to wrapped plain text.
docs_df.head()

   index                                         source_url  \
0      0  https://enterthegungeon.fandom.com/wiki/Bullet...   
1      1  https://www.dropbox.com/scl/fi/ljtdg6eaucrbf1a...   
2      2  https://bytes-and-nibbles.web.app/bytes/stici-...   
3      3              https://github.com/llmware-ai/llmware   
4      4                https://docs.marimo.io/recipes.html   

                                                text  
0  Bullet Kin\nBullet Kin are one of the most com...  
1  ---The Paths through the Underground/Underdark...  
2  Semantic and Textual Inference Chatbot Interfa...  
3  llmware\n\nBuilding Enterprise RAG Pipelines w...  
4  Recipes\nThis page includes code snippets or “...  


In [5]:
# Sentence-embedding model; a later cell calls `model.encode(...)` to embed the document chunks.
# NOTE(review): the name `model` is rebound to the FLAN-T5 seq2seq model further down,
# so re-running the chunk-embedding cell after that one would call `.encode` on the
# wrong object. Consider giving the two models distinct names.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [9]:
def chunk_text(text, max_words=100):
    """Split ``text`` into consecutive chunks of at most ``max_words`` words.

    Words are whatever ``str.split()`` yields, so any run of whitespace
    (spaces, tabs, newlines) is a separator and is collapsed to a single
    space inside each chunk.

    Args:
        text: The document text. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as "no text".
        max_words: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in document order; empty when there is no text.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError(f"max_words must be a positive integer, got {max_words!r}")

    # NaN is the only float that is not equal to itself; this avoids needing `math`.
    if text is None or (isinstance(text, float) and text != text):
        return []

    words = text.split()
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]

# Break every document into word-limited chunks, tagging each chunk with the URL
# of the document it came from.
chunks = [
    {'doc_id': url, 'chunk': piece}
    for url, doc_text in zip(docs_df['source_url'], docs_df['text'])
    for piece in chunk_text(doc_text)
]
chunks_df = pd.DataFrame(chunks)

# Embed all chunk texts in one batched call (shows a progress bar).
chunk_strings = chunks_df['chunk'].tolist()
embeddings = model.encode(chunk_strings, show_progress_bar=True)

# Build a flat L2 FAISS index sized to the embedding width and load the vectors into it.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

Batches:   0%|          | 0/37 [00:00<?, ?it/s]

In [None]:
# Dedicated handle for query embedding. It loads the same 'all-MiniLM-L6-v2' checkpoint
# that embedded the chunks, so query vectors are comparable to the indexed vectors.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


def retrieve(query, k=5):
    """Return the rows of ``chunks_df`` whose embeddings are closest to ``query``.

    Args:
        query: The natural-language question to search for.
        k: Maximum number of chunks to return.

    Returns:
        A slice of ``chunks_df`` in the order FAISS reported the hits. It can
        hold fewer than ``k`` rows when the index contains fewer than ``k``
        vectors.
    """
    query_embedding = embedding_model.encode([query])
    _, indices = index.search(query_embedding, k)

    # FAISS pads the result with -1 when it cannot find k neighbours. Passing -1 to
    # `.iloc` would silently return the *last* chunk as a bogus hit, so drop the padding.
    hits = indices[0]
    return chunks_df.iloc[hits[hits >= 0]]


In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Fetch the FLAN-T5 tokenizer and seq2seq weights, then move the weights to `device`.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def generate_answer_local(query, k=5, max_input_tokens=512, max_new_tokens=100):
    """Answer ``query`` with the local seq2seq model, grounded in retrieved chunks.

    The prompt places the retrieved context *before* the question, and the
    tokenizer truncates from the end by default. If the context overflows the
    token limit, the question and the ``Answer:`` cue are the first things
    cut off. To prevent that, trailing chunks are dropped until the whole
    prompt fits.

    Args:
        query: The user's question.
        k: Number of chunks to request from ``retrieve``.
        max_input_tokens: Token limit for the encoded prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer string with special tokens removed.
    """
    # Get top-k relevant chunks
    retrieved_docs = retrieve(query, k)
    context_chunks = retrieved_docs['chunk'].tolist()

    while True:
        context = "\n".join(context_chunks)
        prompt = f"""Use the context below to answer the question.
Context:
{context}

Question: {query}
Answer:"""

        # Encode with one token of head-room: a length above the limit then means
        # "does not fit", without tokenizing an unbounded sequence.
        probe = tokenizer(prompt, truncation=True, max_length=max_input_tokens + 1)
        if len(probe["input_ids"]) <= max_input_tokens or not context_chunks:
            break

        # Drop the last chunk; assumed to be the weakest match, since FAISS
        # reports hits nearest-first.
        context_chunks.pop()

    # truncation=True stays as a safety net for a query that is too long on its own.
    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=max_input_tokens
    ).to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return answer

In [2]:
# Smoke-test the full retrieve-then-generate pipeline with a sample question.
answer = generate_answer_local("What is RAG used for?")
print(answer)


NameError: name 'generate_answer_local' is not defined