In [1]:
import requests
from bs4 import BeautifulSoup


# Step 1: Scrape data from the URL
url = "https://www.scrapethissite.com/pages/simple/"

# A timeout keeps the cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() makes an HTTP error fail loudly here instead
# of being parsed as if it were the real page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Extract data (e.g., country names and details).
# Each country is rendered as a <div class="country"> holding one element per
# field; the values are kept as the stripped strings found in the page.
countries = []
for country_div in soup.find_all('div', class_='country'):
    # Extract country details
    countries.append({
        "name": country_div.find('h3', class_='country-name').text.strip(),
        "capital": country_div.find('span', class_='country-capital').text.strip(),
        "population": country_div.find('span', class_='country-population').text.strip(),
        "area": country_div.find('span', class_='country-area').text.strip()
    })

# An empty result means the page layout changed or the wrong page came back;
# stop here rather than letting later cells work on nothing.
if not countries:
    raise RuntimeError(f"No country entries found at {url}")

print("Scraped data:")
for country in countries:
    print(country)

Scraped data:
{'name': 'Andorra', 'capital': 'Andorra la Vella', 'population': '84000', 'area': '468.0'}
{'name': 'United Arab Emirates', 'capital': 'Abu Dhabi', 'population': '4975593', 'area': '82880.0'}
{'name': 'Afghanistan', 'capital': 'Kabul', 'population': '29121286', 'area': '647500.0'}
{'name': 'Antigua and Barbuda', 'capital': "St. John's", 'population': '86754', 'area': '443.0'}
{'name': 'Anguilla', 'capital': 'The Valley', 'population': '13254', 'area': '102.0'}
{'name': 'Albania', 'capital': 'Tirana', 'population': '2986952', 'area': '28748.0'}
{'name': 'Armenia', 'capital': 'Yerevan', 'population': '2968000', 'area': '29800.0'}
{'name': 'Angola', 'capital': 'Luanda', 'population': '13068161', 'area': '1246700.0'}
{'name': 'Antarctica', 'capital': 'None', 'population': '0', 'area': '1.4E7'}
{'name': 'Argentina', 'capital': 'Buenos Aires', 'population': '41343201', 'area': '2766890.0'}
{'name': 'American Samoa', 'capital': 'Pago Pago', 'population': '57881', 'area': '199.0'}

In [2]:
# Step 2: Store data in ChromaDB
import getpass
import os

import chromadb
from chromadb.config import Settings
import chromadb.utils.embedding_functions as embedding_functions

# Initialize ChromaDB client
client = chromadb.Client()

# Never hardcode credentials in a notebook: read the Google API key from the
# GOOGLE_API_KEY environment variable, falling back to an interactive prompt
# whose input is not echoed or saved in the notebook.
# NOTE: any key that was previously committed in this cell should be revoked.
google_api_key = os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Google API key: ")
google_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key=google_api_key)

# get_or_create_collection keeps this cell re-runnable on a live kernel;
# create_collection alone fails once the collection already exists.
collection = client.get_or_create_collection(name="name", embedding_function=google_ef)

# Fail with a clear message instead of sending an empty batch to ChromaDB.
if not countries:
    raise ValueError("No countries to store; run the scraping cell first.")

# Prepare data for insertion
documents = []
metadatas = []
ids = []

for i, country in enumerate(countries):
    # Create a text representation of the country for the document;
    # this text is what gets embedded and searched.
    country_text = f"Country: {country['name']}, Capital: {country['capital']}, Population: {country['population']}, Area: {country['area']}"
    documents.append(country_text)

    # Store the original data as metadata
    metadatas.append({
        "name": country["name"],
        "capital": country["capital"],
        "population": country["population"],
        "area": country["area"]
    })

    # Create a unique ID for each entry
    ids.append(f"country_{i}")

# Write all countries to ChromaDB at once. upsert (rather than add) makes a
# re-run overwrite the existing entries instead of colliding on their ids.
collection.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

print(f"Successfully stored {len(countries)} countries in ChromaDB!")

# Example query to verify the data is stored
result = collection.query(
    query_texts=["What is the population of France?"],
    n_results=3
)
print("\nSample query results:")
print(result)

  from .autonotebook import tqdm as notebook_tqdm


Successfully stored 250 countries in ChromaDB!

Sample query results:
{'ids': [['country_74', 'country_137', 'country_61']], 'embeddings': None, 'documents': [['Country: France, Capital: Paris, Population: 64768389, Area: 547030.0', 'Country: Monaco, Capital: Monaco, Population: 32965, Area: 1.95', 'Country: Algeria, Capital: Algiers, Population: 34586184, Area: 2381740.0']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[{'capital': 'Paris', 'population': '64768389', 'area': '547030.0', 'name': 'France'}, {'population': '32965', 'area': '1.95', 'name': 'Monaco', 'capital': 'Monaco'}, {'population': '34586184', 'area': '2381740.0', 'name': 'Algeria', 'capital': 'Algiers'}]], 'distances': [[0.3727891743183136, 0.47047048807144165, 0.47382891178131104]]}


In [10]:
# Step 3: Build a RAG chatbot with Gemini
import getpass
import os

import google.generativeai as genai
from IPython.display import display, Markdown

# Configure the Gemini API. The key is read from the GOOGLE_API_KEY
# environment variable (falling back to a non-echoing prompt) so that no
# credential is ever stored in the notebook itself.
# NOTE: any key that was previously committed in this cell should be revoked.
api_key = os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Google API key: ")
genai.configure(api_key=api_key)

# Initialize the Gemini model
model = genai.GenerativeModel('gemini-2.0-flash')

# Function to retrieve relevant information from ChromaDB
def retrieve_context(query, n_results=3):
    """Build a context string from the documents ChromaDB returns for `query`.

    Looks up the "name" collection on the module-level `client` (using the
    module-level `google_ef` embedding function), queries it with the single
    query text, and concatenates the returned documents under a fixed header.

    Args:
        query: The question text to search with.
        n_results: How many documents to request from the collection.

    Returns:
        The header line followed by each returned document, with every
        document terminated by a blank line.
    """
    country_collection = client.get_collection(name="name", embedding_function=google_ef)

    hits = country_collection.query(query_texts=[query], n_results=n_results)

    # Only one query text is sent, so its documents are in the first result list.
    header = "Here is some information that might help answer the question:\n\n"
    body = "".join(f"{document}\n\n" for document in hits["documents"][0])
    return header + body

# RAG chatbot function
def rag_chatbot(query):
    """Answer `query` with Gemini, grounded in context retrieved from ChromaDB.

    The retrieved context and the question are embedded in a prompt that
    instructs the model to answer from that context only.

    Args:
        query: The user's question.

    Returns:
        The model's answer text, or a short fallback message when the model
        returned no usable text.
    """
    # Step 1: Retrieve relevant context from vector database
    context = retrieve_context(query)
    
    # Step 2: Create a prompt that includes the retrieved context
    prompt = f"""You are a helpful assistant that answers questions about countries around the world.
    Use ONLY the following context to answer the question. If you don't know the answer based on the context, say you don't have enough information.
    
    Context:
    {context}
    
    Question: {query}
    
    Answer:"""
    
    # Step 3: Generate a response using Gemini with the augmented prompt
    response = model.generate_content(prompt)
    
    # Accessing .text raises ValueError when the response carries no usable
    # candidate (e.g. it was blocked); report that to the user instead of
    # letting the exception end the chat session. Errors raised by the API
    # call itself are deliberately not caught.
    try:
        return response.text
    except ValueError:
        return "I couldn't generate an answer for that question. Please try rephrasing it."

# Demo interface for using the chatbot
def chat_interface():
    """Run an interactive question/answer loop on top of `rag_chatbot`.

    Reads lines from the user until they type "exit" or "quit" (matched
    case-insensitively, ignoring surrounding whitespace) or the input stream
    ends / is interrupted. Blank input is ignored. Each answer is rendered as
    Markdown.

    Returns:
        None.
    """
    print("🤖 Country Information Chatbot (type 'exit' to quit)")
    print("---------------------------------------------")

    while True:
        try:
            # Strip so that stray whitespace neither hides an exit command
            # nor gets sent to the model as part of the question.
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session cleanly
            # instead of leaking a traceback.
            print("\nGoodbye!")
            break

        if user_input.lower() in ["exit", "quit"]:
            print("Goodbye!")
            break

        # Don't spend embedding/LLM calls on an empty question.
        if not user_input:
            continue

        response = rag_chatbot(user_input)
        display(Markdown(f"**Chatbot**: {response}"))
        print("---------------------------------------------")