This notebook builds a Retrieval-Augmented Generation (RAG) chatbot for the Lab of Future website. It follows these steps:
1. Web scraping with BeautifulSoup and requests
2. Text chunking 
3. Embedding generation with Sentence Transformers
4. Knowledge base setup with FAISS
5. Agent creation with LangChain

## 1. Setup and Installation


In [3]:
pip install beautifulsoup4 requests pandas tqdm sentence-transformers faiss-cpu langchain langchain-openai langchain-community langchain-text-splitters


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## 2. Web Scraping - Collecting Data from Lab of Future Website



In [21]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd
import time

# Initialize
root_url = "https://www.laboffuture.com/"
visited = set()
data = []

def clean_text(soup):
    # Remove script, style, nav, footer, header tags
    for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'noscript', 'form', 'iframe']):
        tag.decompose()
    # Extract visible text
    text = soup.get_text(separator=' ', strip=True)
    # Normalize whitespace
    text = ' '.join(text.split())
    return text

def crawl(url, max_pages=50):
    if len(visited) >= max_pages:
        return
    if url in visited:
        return
    print(f"Crawling: {url}")
    visited.add(url)

    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return
        soup = BeautifulSoup(response.text, 'html.parser')

        # Clean page text
        page_text = clean_text(soup)
        data.append({'url': url, 'content': page_text})

        # Find all internal links and crawl them recursively
        domain = urlparse(root_url).netloc
        links = soup.find_all('a', href=True)
        for link in links:
            href = link['href']
            joined_link = urljoin(url, href)
            parsed_link = urlparse(joined_link)
            # Only crawl same domain, skip mailto:, tel:, javascript: etc
            if parsed_link.netloc == domain and parsed_link.scheme in ('http', 'https'):
                if joined_link not in visited:
                    crawl(joined_link, max_pages)
        # Sleep a bit to be polite
        time.sleep(0.5)
    except Exception as e:
        print(f"Failed to crawl {url}: {e}")

# Start crawling
crawl(root_url, max_pages=50)

# Save data as CSV
df = pd.DataFrame(data)
df.to_csv('laboffuture_content.csv', index=False)

print("Crawling complete. Data saved to laboffuture_content.csv")

Crawling: https://www.laboffuture.com/
Crawling: https://www.laboffuture.com/#myCarousel1721988882528
Crawling: https://www.laboffuture.com/summer-camp-2025
Crawling: https://www.laboffuture.com/summer-camp-2025-form
Crawling: https://www.laboffuture.com/summer-camp-2025#myCarousel1746438217777
Crawling: https://www.laboffuture.com/summer-camp-2025#nav_tabs_content_1746549997829_379
Crawling: https://www.laboffuture.com/summer-camp-2025#nav_tabs_content_1746549997829_380
Crawling: https://www.laboffuture.com/contactus
Crawling: https://www.laboffuture.com/#table_of_content_heading_1725885336557_77
Crawling: https://www.laboffuture.com/#table_of_content_heading_1725885336557_78
Crawling: https://www.laboffuture.com/celestial-voyages
Crawling: https://www.laboffuture.com/celestial-voyages#myCarousel1726032948445
Crawling: https://www.laboffuture.com/celestial-voyages#nav_tabs_content_1723674780460_850
Crawling: https://www.laboffuture.com/celestial-voyages#nav_tabs_content_1723674780460_

In [31]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize the chunker (adjust chunk_size and chunk_overlap as needed)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

chunk_texts = []
chunk_urls = []

for i, row in df.iterrows():
    # Split the page content into chunks
    chunks = text_splitter.split_text(row['content'])
    # Append chunks and corresponding URL
    chunk_texts.extend(chunks)
    chunk_urls.extend([row['url']] * len(chunks))

print(f"Total chunks created: {len(chunk_texts)}")

Total chunks created: 18741


In [33]:
# Step 1: Install langchain.text_splitter if needed
# !pip install langchain

import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import numpy as np

# Assuming your scraped data is in df with columns ['url', 'content']
print(f"Total pages scraped: {len(df)}")

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

chunk_texts = []
chunk_urls = []

# Chunk each page's content and keep URL mapping
for i, row in df.iterrows():
    chunks = text_splitter.split_text(row['content'])
    chunk_texts.extend(chunks)
    chunk_urls.extend([row['url']] * len(chunks))

print(f"Total chunks created: {len(chunk_texts)}")

# Step 2: Generate embeddings for chunks
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Generating embeddings for chunks...")
embeddings = model.encode(chunk_texts, show_progress_bar=True)
embeddings = np.array(embeddings).astype('float32')
print(f"Embeddings shape: {embeddings.shape}")

# Step 3: Save chunk metadata and embeddings separately
chunked_df = pd.DataFrame({
    'chunk_text': chunk_texts,
    'url': chunk_urls
})

chunked_df.to_csv('laboffuture_content_chunks_metadata.csv', index=False)
np.save('laboffuture_embeddings.npy', embeddings)

print("Chunk metadata and embeddings saved!")

Total pages scraped: 50
Total chunks created: 18741
Generating embeddings for chunks...


Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Embeddings shape: (18741, 384)
Chunk metadata and embeddings saved!


## 3. Text Chunking


In [32]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('laboffuture_content.csv')

# Parameters for chunking
chunk_size = 500  # number of characters per chunk
overlap = 100     # number of characters to overlap

def chunk_text(text, chunk_size=500, overlap=100):
    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start += chunk_size - overlap  # move forward with overlap
    return chunks

# Prepare a list to hold all chunks with their source URL
chunked_data = []

for idx, row in df.iterrows():
    url = row['url']
    content = str(row['content'])  # ensure text format
    if len(content) > 0:
        chunks = chunk_text(content, chunk_size, overlap)
        for i, chunk in enumerate(chunks):
            chunked_data.append({
                'url': url,
                'chunk_index': i,
                'chunk_text': chunk
            })

# Convert to DataFrame for further processing or saving
chunked_df = pd.DataFrame(chunked_data)

# Save chunked data to CSV
chunked_df.to_csv('laboffuture_content_chunks.csv', index=False)

print(f"Total chunks created: {len(chunked_df)}")
print("Chunks saved to laboffuture_content_chunks.csv")

Total chunks created: 21069
Chunks saved to laboffuture_content_chunks.csv



## 4. Generate Embeddings with Sentence Transformers



In [23]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load chunked data
chunked_df = pd.read_csv('laboffuture_content_chunks.csv')

# Initialize embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')  # fast and good for many tasks

# Generate embeddings for all chunks
texts = chunked_df['chunk_text'].tolist()
print(f"Generating embeddings for {len(texts)} chunks...")

embeddings = model.encode(texts, show_progress_bar=True)

# Save embeddings with chunk metadata
import numpy as np

# Convert embeddings to list for storage
embedding_list = embeddings.tolist()

chunked_df['embedding'] = embedding_list

# Save to a new CSV or preferably to a binary file (like npz) because embeddings are large
chunked_df.to_pickle('laboffuture_content_chunks_with_embeddings.pkl')

print("Embeddings generated and saved successfully!")

Generating embeddings for 21069 chunks...


Batches:   0%|          | 0/659 [00:00<?, ?it/s]

Embeddings generated and saved successfully!


## 5. Create FAISS Knowledge Base


In [24]:
import faiss
import pandas as pd
import numpy as np

# Load the chunk data with embeddings
chunked_df = pd.read_pickle('laboffuture_content_chunks_with_embeddings.pkl')

# Extract embeddings as a NumPy array
embedding_dim = len(chunked_df['embedding'][0])
embeddings = np.array(chunked_df['embedding'].tolist()).astype('float32')

# Initialize FAISS index - use IndexFlatL2 for simplicity (L2 distance)
index = faiss.IndexFlatL2(embedding_dim)

# Add embeddings to index
index.add(embeddings)

print(f"Number of vectors indexed: {index.ntotal}")

Number of vectors indexed: 21069


In [25]:
faiss.write_index(index, 'laboffuture_faiss.index')
print("FAISS index saved to laboffuture_faiss.index")

FAISS index saved to laboffuture_faiss.index


In [26]:
# Sample query text
query_text = "Learn about Lab of Future's space programs"

# Embed query using same model
query_embedding = model.encode([query_text]).astype('float32')

# Search top 5 similar chunks
k = 5
distances, indices = index.search(query_embedding, k)

print("Top matches:")
for i, idx in enumerate(indices[0]):
    print(f"Rank {i+1}, Distance: {distances[0][i]}")
    print(f"URL: {chunked_df.iloc[idx]['url']}")
    print(f"Text snippet: {chunked_df.iloc[idx]['chunk_text'][:200]}...\n")

Top matches:
Rank 1, Distance: 0.5124529600143433
URL: https://www.laboffuture.com/
Text snippet: Home | Lab Of Future Previous Next × Building a community of Future Scientists, today. Get involved Building a Community of Future Scientists, today. Get involved ​​Who are we ​​​​What we do ​​ Who ar...

Rank 2, Distance: 0.5124529600143433
URL: https://www.laboffuture.com/#myCarousel1721988882528
Text snippet: Home | Lab Of Future Previous Next × Building a community of Future Scientists, today. Get involved Building a Community of Future Scientists, today. Get involved ​​Who are we ​​​​What we do ​​ Who ar...

Rank 3, Distance: 0.5124529600143433
URL: https://www.laboffuture.com/#table_of_content_heading_1725885336557_77
Text snippet: Home | Lab Of Future Previous Next × Building a community of Future Scientists, today. Get involved Building a Community of Future Scientists, today. Get involved ​​Who are we ​​​​What we do ​​ Who ar...

Rank 4, Distance: 0.5124529600143433
URL: https://w

In [27]:
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

class KnowledgeBase:
    def __init__(self, 
                 faiss_index_path: str, 
                 chunks_pickle_path: str,
                 embedding_model_name: str = 'all-MiniLM-L6-v2'):
        # Load FAISS index
        self.index = faiss.read_index(faiss_index_path)
        # Load chunk metadata with embeddings
        self.chunks_df = pd.read_pickle(chunks_pickle_path)
        # Load embedding model
        self.model = SentenceTransformer(embedding_model_name)
        # Embedding dimension inferred from model
        self.embedding_dim = self.model.get_sentence_embedding_dimension()
        
    def query(self, query_text: str, top_k: int = 5):
        # Embed the query text
        query_embedding = self.model.encode([query_text]).astype('float32')
        # Search in FAISS
        distances, indices = self.index.search(query_embedding, top_k)
        
        # Collect results
        results = []
        for dist, idx in zip(distances[0], indices[0]):
            chunk_data = self.chunks_df.iloc[idx]
            results.append({
                'url': chunk_data['url'],
                'chunk_index': chunk_data['chunk_index'],
                'chunk_text': chunk_data['chunk_text'],
                'distance': float(dist)
            })
        return results

# Usage example
if __name__ == "__main__":
    kb = KnowledgeBase('laboffuture_faiss.index', 'laboffuture_content_chunks_with_embeddings.pkl')
    user_query = "Tell me about your educational programs"
    answers = kb.query(user_query, top_k=3)
    for i, ans in enumerate(answers, 1):
        print(f"Result {i}:")
        print(f"URL: {ans['url']}")
        print(f"Distance: {ans['distance']:.4f}")
        print(f"Text: {ans['chunk_text'][:300]}...\n")

Result 1:
URL: https://www.laboffuture.com/iot-illuminations
Distance: 1.0364
Text: transforming education. They bring a wealth of knowledge and hands-on experience in their respective fields, including: Robotics Engineers Mechanical Engineers Electronics Engineers Computer Science Engineers Aerospace Engineers Mechatronics Engineers Astro-physicists ​...

Result 2:
URL: https://www.laboffuture.com/iot-illuminations#nav_tabs_content_1723674780460_852
Distance: 1.0364
Text: transforming education. They bring a wealth of knowledge and hands-on experience in their respective fields, including: Robotics Engineers Mechanical Engineers Electronics Engineers Computer Science Engineers Aerospace Engineers Mechatronics Engineers Astro-physicists ​...

Result 3:
URL: https://www.laboffuture.com/iot-illuminations#nav_tabs_content_1723674780460_853
Distance: 1.0364
Text: transforming education. They bring a wealth of knowledge and hands-on experience in their respective fields, including: Robotics

In [34]:
import pandas as pd
import numpy as np
import faiss

# Load metadata and embeddings
chunked_df = pd.read_csv('laboffuture_content_chunks_metadata.csv')
embeddings = np.load('laboffuture_embeddings.npy')

urls = chunked_df['url'].tolist()

# Build FAISS index
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

print(f"FAISS index built with {index.ntotal} vectors")

# Example similarity search (search top 3 matches for first chunk)
D, I = index.search(np.array([embeddings[0]]), k=3)

print("Top 3 matching URLs:")
for idx in I[0]:
    print(urls[idx])

FAISS index built with 18741 vectors
Top 3 matching URLs:
https://www.laboffuture.com/
https://www.laboffuture.com/#myCarousel1721988882528
https://www.laboffuture.com/#table_of_content_heading_1725885336557_77


In [47]:
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import pandas as pd

# Load chunk metadata, embeddings, and initialize model and index (do this once)
def initialize_search_system():
    chunked_df = pd.read_csv('laboffuture_content_chunks_metadata.csv')
    embeddings = np.load('laboffuture_embeddings.npy')
    chunk_texts = chunked_df['chunk_text'].tolist()
    urls = chunked_df['url'].tolist()
    
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    return model, index, chunk_texts, urls

# Query function
def search_similar_chunks(query_text, model, index, chunk_texts, urls, top_k=3):
    query_embedding = model.encode([query_text])
    query_embedding = np.array(query_embedding).astype('float32')
    distances, indices = index.search(query_embedding, top_k)
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        results.append({
            'chunk_text': chunk_texts[idx],
            'url': urls[idx],
            'distance': dist
        })
    return results

# Initialize once
model, index, chunk_texts, urls = initialize_search_system()

# Example query
query = "solar system exploration"
results = search_similar_chunks(query, model, index, chunk_texts, urls, top_k=3)

print(f"Results for query: '{query}'")
for r in results:
    print(f"URL: {r['url']}, Distance: {r['distance']}")

Results for query: 'solar system exploration'
URL: https://www.laboffuture.com/celestial-voyages, Distance: 0.7719335556030273
URL: https://www.laboffuture.com/celestial-voyages, Distance: 0.7719335556030273
URL: https://www.laboffuture.com/celestial-voyages, Distance: 0.7719335556030273


In [66]:
def generate_answer(query, contexts):
    prompt = "Answer the question based on the context below. If not found, say 'I don't know'.\n\n"
    prompt += "Context:\n"
    for i, context in enumerate(contexts):
        prompt += f"{i+1}. {context}\n"
    prompt += f"\nQuestion: {query}\nAnswer:"

    response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are an AI assistant."},
        {"role": "user", "content": prompt},
    ],
    max_tokens=150,
    temperature=0,
    )
    print(response.choices[0].message.content)

In [67]:
# Example usage
query = "solar system exploration"
results = search_similar_chunks(query, model, index, chunk_texts, urls, top_k=3)
contexts = [res['chunk_text'] for res in results]
answer = generate_answer(query, contexts)
print("Answer:", answer)

Virtual Exploration of the Solar System with AR & VR
Answer: None


In [58]:
pip install google-cloud-aiplatform

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.93.1-py2.py3-none-any.whl.metadata (35 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0,>=1.34.1 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0,>=1.34.1->google-cloud-aiplatform)
  Downloading google_api_core-2.24.2-py3-none-any.whl.metadata (3.0 kB)
Collecting proto-plus<2.0.0,>=1.22.3 (from google-cloud-aiplatform)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting google-cloud-storage<3.0.0,>=1.32.0 (from google-cloud-aiplatform)
  Downloading google_cloud_storage-2.19.0-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting google-cloud-bigquery!=3.20.0,<4.0.0,>=1.15.0 (from google-cloud-aiplatform)
  Downloading google_cloud_bigquery-3.33.0-py3-none-any.whl.metadata (8.0 kB)
Collecting google-cloud-resource-manager<3.0.0,>=1.3.3 (from google-cloud-aiplatform)
  Downloading google_clou

In [None]:
import requests
import json

# Replace with your actual Gemini API key
API_KEY = "

url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={API_KEY}"

headers = {
    "Content-Type": "application/json",
}

# Example prompt for Gemini
data = {
    "contents": [
        {
            "parts": [
                {
                    "text": "Explain how AI works in a few words"
                }
            ]
        }
    ]
}

response = requests.post(url, headers=headers, data=json.dumps(data))

if response.status_code == 200:
    result = response.json()
    # The generated text is usually in 'candidates' inside 'generateTextResponse'
    generated_text = result.get('candidates', [{}])[0].get('content', '')
    print("Gemini response:", generated_text)
else:
    print(f"Request failed with status {response.status_code}: {response.text}")

Gemini response: {'parts': [{'text': 'AI learns from data to make predictions or decisions.\n'}], 'role': 'model'}


In [None]:
import requests
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import pandas as pd

# === Gemini API setup ===
API_KEY = ""
GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={API_KEY}"

def call_gemini_api(prompt_text):
    headers = {"Content-Type": "application/json"}
    data = {
        "contents": [
            {
                "parts": [
                    {"text": prompt_text}
                ]
            }
        ]
    }
    response = requests.post(GEMINI_API_URL, headers=headers, data=json.dumps(data))
    if response.status_code == 200:
        result = response.json()
        generated_text = result.get('candidates', [{}])[0].get('content', '')
        return generated_text
    else:
        raise Exception(f"Gemini API request failed: {response.status_code} {response.text}")

# === Initialize search system ===
def initialize_search_system():
    chunked_df = pd.read_csv('laboffuture_content_chunks_metadata.csv')
    embeddings = np.load('laboffuture_embeddings.npy')
    chunk_texts = chunked_df['chunk_text'].tolist()
    urls = chunked_df['url'].tolist()
    
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return model, index, chunk_texts, urls

# === Semantic search ===
def search_similar_chunks(query_text, model, index, chunk_texts, urls, top_k=3):
    query_embedding = model.encode([query_text])
    query_embedding = np.array(query_embedding).astype('float32')
    distances, indices = index.search(query_embedding, top_k)
    
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        results.append({
            'chunk_text': chunk_texts[idx],
            'url': urls[idx],
            'distance': dist
        })
    return results

# === Generate answer with Gemini ===
def generate_answer_with_gemini(query, contexts):
    prompt = "Answer the question based on the context below. If you don't know, say 'I don't know'.\n\n"
    prompt += "Context:\n"
    for i, context in enumerate(contexts):
        prompt += f"{i+1}. {context}\n"
    prompt += f"\nQuestion: {query}\nAnswer:"
    
    return call_gemini_api(prompt)

# === Full RAG flow example ===
if __name__ == "__main__":
    # Initialize retrieval system
    model, index, chunk_texts, urls = initialize_search_system()
    
    # User query
    query = "what is lab of future? and what do they do?"
    
    # Retrieve relevant chunks
    results = search_similar_chunks(query, model, index, chunk_texts, urls, top_k=3)
    contexts = [res['chunk_text'] for res in results]
    
    # Generate answer using Gemini
    answer = generate_answer_with_gemini(query, contexts)
    
    print("User query:", query)
    print("Retrieved contexts:")
    for i, ctx in enumerate(contexts, 1):
        print(f"{i}. {ctx[:200]}...")  # Print snippet
    
    print("\nGemini answer:")
    print(answer)

User query: what is lab of future? and what do they do?
Retrieved contexts:
1. of cutting-edge technology and the spirit of innovation here is truly remarkable. It's an honor to witness the future taking shape, as I'm reminded that the pursuit of excellence knows no bounds, whet...
2. of cutting-edge technology and the spirit of innovation here is truly remarkable. It's an honor to witness the future taking shape, as I'm reminded that the pursuit of excellence knows no bounds, whet...
3. of cutting-edge technology and the spirit of innovation here is truly remarkable. It's an honor to witness the future taking shape, as I'm reminded that the pursuit of excellence knows no bounds, whet...

Gemini answer:
{'parts': [{'text': 'Lab of Future is an outstanding facility for developing research across different fields of technology. It is an initiative to transform the scientific temperament of the students and give them global opportunities in the field of Space.\n'}], 'role': 'model'}


In [73]:
pip install nest_asyncio

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import asyncio
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from agno.agent import Agent
from agno.models.google import Gemini
from textwrap import dedent
import os

# Set your Gemini API key securely
os.environ["GOOGLE_API_KEY"] = ""

# Initialize Gemini model with instructions
gemini_model = Gemini(
    id="gemini-2.0-flash-001",
    instructions=[
        "You are an AI assistant for Lab of Future. Answer questions concisely based on given context."
    ],
)

agent = Agent(model=gemini_model, markdown=True)

# Initialize search system (same as before)
def initialize_search_system():
    chunked_df = pd.read_csv('laboffuture_content_chunks_metadata.csv')
    embeddings = np.load('laboffuture_embeddings.npy')
    chunk_texts = chunked_df['chunk_text'].tolist()
    urls = chunked_df['url'].tolist()
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return model, index, chunk_texts, urls

# Semantic search (same as before)
def search_similar_chunks(query_text, model, index, chunk_texts, urls, top_k=3):
    query_embedding = model.encode([query_text])
    query_embedding = np.array(query_embedding).astype('float32')
    distances, indices = index.search(query_embedding, top_k)
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        results.append({
            'chunk_text': chunk_texts[idx],
            'url': urls[idx],
            'distance': dist
        })
    return results

# Async function to generate answer with Gemini Agent
async def generate_answer_with_agent(query, contexts):
    prompt = dedent("""\
        Context:
    """)
    for i, context in enumerate(contexts, 1):
        prompt += f"{i}. {context}\n"
    prompt += f"\nQuestion: {query}\nAnswer:"

    run_response = await agent.arun(prompt)
    return run_response.content

# Full async chatbot flow
async def main():
    model, index, chunk_texts, urls = initialize_search_system()
    user_query = "What course they provide ?"
    results = search_similar_chunks(user_query, model, index, chunk_texts, urls, top_k=3)
    contexts = [res['chunk_text'] for res in results]

    answer = await generate_answer_with_agent(user_query, contexts)

    print("User query:", user_query)
    print("Retrieved contexts:")
    for i, ctx in enumerate(contexts, 1):
        print(f"{i}. {ctx[:200]}...")  # snippet
    print("\nGemini answer:")
    print(answer)

# Run the async main function
import nest_asyncio
nest_asyncio.apply()

await main()

User query: What course they provide ?
Retrieved contexts:
1. and tools will be provided by the Lab of Future. Personal items may be needed for specific activities. This course includes 60 hours of professional training in the lab (5 levels of 12 hours each) 365...
2. and tools will be provided by the Lab of Future. Personal items may be needed for specific activities. This course includes 60 hours of professional training in the lab (5 levels of 12 hours each) 365...
3. and tools will be provided by the Lab of Future. Personal items may be needed for specific activities. This course includes 60 hours of professional training in the lab (5 levels of 12 hours each) 365...

Gemini answer:
The course includes:

*   60 hours of professional training in the lab (5 levels of 12 hours each)
*   365 days access of learning resources in LMS
*   Gold Sealed Certificate upon completion
*   Detailed Performance & Skills Report

Skills you will learn:

1.  Hands-on engineering experience
2.  Creati

In [None]:
import asyncio
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from agno.agent import Agent
from agno.models.google import Gemini
from textwrap import dedent, fill
import os

# Set your Gemini API key securely (replace with your real key)
os.environ["GOOGLE_API_KEY"] = ""

# Initialize Gemini agent with instructions
gemini_model = Gemini(
    id="gemini-2.0-flash-001",
    instructions=[
        "You are an AI assistant for Lab of Future. Answer questions concisely based on given context."
    ],
)

agent = Agent(model=gemini_model, markdown=True)

# Initialize search system (run once)
def initialize_search_system():
    chunked_df = pd.read_csv('laboffuture_content_chunks_metadata.csv')
    embeddings = np.load('laboffuture_embeddings.npy')
    chunk_texts = chunked_df['chunk_text'].tolist()
    urls = chunked_df['url'].tolist()
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return model, index, chunk_texts, urls

# Semantic search function
def search_similar_chunks(query_text, model, index, chunk_texts, urls, top_k=3):
    query_embedding = model.encode([query_text])
    query_embedding = np.array(query_embedding).astype('float32')
    distances, indices = index.search(query_embedding, top_k)
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        results.append({
            'chunk_text': chunk_texts[idx],
            'url': urls[idx],
            'distance': dist
        })
    return results

# Async function to generate answer from Gemini agent
async def generate_answer_with_agent(query, contexts):
    prompt = dedent("""\
        Context:
    """)
    for i, context in enumerate(contexts, 1):
        prompt += f"{i}. {context}\n"
    prompt += f"\nQuestion: {query}\nAnswer:"

    run_response = await agent.arun(prompt)
    return run_response.content

# Async chat for a single query
async def chat_once(query, model, index, chunk_texts, urls):
    results = search_similar_chunks(query, model, index, chunk_texts, urls, top_k=3)
    contexts = [res['chunk_text'] for res in results]
    answer = await generate_answer_with_agent(query, contexts)
    return contexts, answer

# Main interactive loop
def main():
    print("Welcome to the Lab of Future AI Assistant!")
    print("Type your question and press Enter. Type 'exit' to quit.\n")

    model, index, chunk_texts, urls = initialize_search_system()

    while True:
        query = input("You: ").strip()
        if query.lower() == "exit":
            print("Goodbye!")
            break
        try:
            # Run async Gemini + retrieval call synchronously
            contexts, answer = asyncio.run(chat_once(query, model, index, chunk_texts, urls))
            
            print("\nRetrieved contexts:")
            for i, ctx in enumerate(contexts, 1):
                print(f"{i}. {ctx[:200]}...")  # show snippet
            
            print("\nAI answer:\n")
            print(fill(answer, width=80))
            print("-" * 80)
        except Exception as e:
            print(f"Error: {e}")
            print("Please try again.")

# Uncomment below line to run the chatbot interactively
main()

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Welcome to the Lab of Future AI Assistant!
Type your question and press Enter. Type 'exit' to quit.



  print("Please try again.")


Error: asyncio.run() cannot be called from a running event loop
Please try again.
Error: asyncio.run() cannot be called from a running event loop
Please try again.
Error: asyncio.run() cannot be called from a running event loop
Please try again.
Error: asyncio.run() cannot be called from a running event loop
Please try again.
Error: asyncio.run() cannot be called from a running event loop
Please try again.
Error: asyncio.run() cannot be called from a running event loop
Please try again.
Error: asyncio.run() cannot be called from a running event loop
Please try again.


KeyboardInterrupt: Interrupted by user

In [None]:
import asyncio
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from agno.agent import Agent
from agno.models.google import Gemini
from textwrap import dedent, fill
import os

# Set your Gemini API key securely (replace with your real key)
os.environ["GOOGLE_API_KEY"] = ""

# Initialize Gemini agent with instructions
gemini_model = Gemini(
    id="gemini-2.0-flash-001",
    instructions=[
        "You are an AI assistant for Lab of Future. Answer questions concisely based on given context."
    ],
)

agent = Agent(model=gemini_model, markdown=True)

# Initialize search system (run once)
def initialize_search_system():
    chunked_df = pd.read_csv('laboffuture_content_chunks_metadata.csv')
    embeddings = np.load('laboffuture_embeddings.npy')
    chunk_texts = chunked_df['chunk_text'].tolist()
    urls = chunked_df['url'].tolist()
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return model, index, chunk_texts, urls

# Semantic search function
def search_similar_chunks(query_text, model, index, chunk_texts, urls, top_k=3):
    query_embedding = model.encode([query_text])
    query_embedding = np.array(query_embedding).astype('float32')
    distances, indices = index.search(query_embedding, top_k)
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        results.append({
            'chunk_text': chunk_texts[idx],
            'url': urls[idx],
            'distance': dist
        })
    return results

# Async function to generate answer from Gemini agent
async def generate_answer_with_agent(query, contexts):
    prompt = dedent("""\
        Context:
    """)
    for i, context in enumerate(contexts, 1):
        prompt += f"{i}. {context}\n"
    prompt += f"\nQuestion: {query}\nAnswer:"

    run_response = await agent.arun(prompt)
    return run_response.content

# Async chat for a single query
async def chat_once(query, model, index, chunk_texts, urls):
    results = search_similar_chunks(query, model, index, chunk_texts, urls, top_k=3)
    contexts = [res['chunk_text'] for res in results]
    answer = await generate_answer_with_agent(query, contexts)
    return contexts, answer

# Main interactive loop
async def main():
    print("Welcome to the Lab of Future AI Assistant!")
    print("Type your question and press Enter. Type 'exit' to quit.\n")

    model, index, chunk_texts, urls = initialize_search_system()

    while True:
        query = input("You: ").strip()
        if query.lower() == "exit":
            print("Goodbye!")
            break
        try:
            # Run async Gemini + retrieval call synchronously using await
            contexts, answer = await chat_once(query, model, index, chunk_texts, urls)

            print("\nRetrieved contexts:")
            for i, ctx in enumerate(contexts, 1):
                print(f"{i}. {ctx[:200]}...")  # show snippet

            print("\nAI answer:\n")
            print(fill(answer, width=80))
            print("-" * 80)
        except Exception as e:
            print(f"Error: {e}")
            print("Please try again.")

# If you're running this in a notebook or interactive shell, call the main function with `await`
await main()

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Welcome to the Lab of Future AI Assistant!
Type your question and press Enter. Type 'exit' to quit.



CancelledError: 