STEP 1: INSTALL DEPENDENCIES

In [None]:
!pip install requests beautifulsoup4 numpy faiss-cpu sentence-transformers google-generativeai ipywidgets --quiet

STEP 2: IMPORTS

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
import ipywidgets as widgets
from IPython.display import display, Markdown
import os
import getpass
from urllib.parse import urljoin, urlparse
from collections import deque

STEP 3: SCRAPER WITH DEPTH AND PAGE LIMIT

In [None]:
def scrape_with_depth(start_urls, max_depth=1, max_pages=10):
    """Breadth-first crawl from ``start_urls`` and collect the text of each page.

    Only links on the same host as the page they were found on are followed.
    Text is taken from ``<p>``, ``<li>``, ``<td>`` and ``<th>`` elements.

    Args:
        start_urls: Iterable of absolute URLs to start crawling from (depth 0).
        max_depth: Maximum link depth to follow; 0 fetches only the start URLs.
        max_pages: Maximum number of successfully scraped pages.

    Returns:
        A list with one text string per successfully scraped page, in crawl
        order. Pages that fail to download (network error or HTTP error
        status) are reported with ``print`` and skipped.
    """
    def _strip_fragment(raw_url):
        # "#section" points inside a page, not at a different page; keeping
        # it would make the crawler fetch and index the same page repeatedly.
        return raw_url.partition('#')[0]

    texts = []
    seen = set()  # every URL ever queued, so nothing is fetched twice
    queue = deque()
    for start_url in start_urls:
        start_url = _strip_fragment(start_url)
        if start_url not in seen:
            seen.add(start_url)
            queue.append((start_url, 0))

    while queue and len(texts) < max_pages:
        url, depth = queue.popleft()
        if depth > max_depth:
            continue

        try:
            resp = requests.get(url, timeout=10)
            # Without this, 404/500 error pages would be indexed as content.
            resp.raise_for_status()
            soup = BeautifulSoup(resp.content, 'html.parser')

            # Extract text from multiple tags
            tags = soup.find_all(['p', 'li', 'td', 'th'])
            page_text = "\n".join(
                tag.get_text(separator=" ", strip=True) for tag in tags
            )
            texts.append(page_text)

            # Add links to queue if depth limit not reached
            if depth < max_depth:
                current_host = urlparse(url).netloc
                for link_tag in soup.find_all('a', href=True):
                    link = _strip_fragment(urljoin(url, link_tag['href']))
                    # Stay on the same domain and never queue a URL twice.
                    if link in seen or urlparse(link).netloc != current_host:
                        continue
                    seen.add(link)
                    queue.append((link, depth + 1))

        except Exception as e:
            # Deliberately best-effort: one bad page must not abort the crawl.
            # The URL stays in `seen`, so it is not retried.
            print(f"Error scraping {url}: {e}")

    return texts

STEP 4: TEXT CLEANING & CHUNKING

In [None]:
def clean_text(text):
    """Normalize scraped text before chunking.

    Removes bracketed numeric citation markers such as ``[12]`` (and empty
    ``[]``), removes parenthesized asides, then collapses every run of
    whitespace to a single space and trims both ends.

    Args:
        text: Raw text extracted from a page.

    Returns:
        The cleaned, single-line string.
    """
    text = re.sub(r'\[[0-9]*\]', '', text)
    text = re.sub(r'\([^)]*\)', '', text)
    # Collapse whitespace last: the removals above leave gaps behind
    # ("a (b) c" -> "a  c") that would otherwise survive as double spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` words.

    Words are whitespace-separated tokens; each piece is re-joined with single
    spaces. The last piece may be shorter than ``chunk_size``.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

STEP 5: EMBEDDINGS & VECTOR STORE

In [None]:
def build_vector_store(texts):
    """Clean, chunk and embed ``texts`` and index the embeddings with FAISS.

    Args:
        texts: Iterable of raw page texts.

    Returns:
        A tuple ``(all_chunks, embedding_model, index)``. ``all_chunks[i]`` is
        the text whose embedding is stored at position ``i`` of ``index``.
        When the input yields no chunks at all, ``([], None, None)`` is
        returned and no model is loaded; callers must treat a ``None`` index
        as "nothing indexed".
    """
    all_chunks = []
    for raw_text in texts:
        all_chunks.extend(chunk_text(clean_text(raw_text)))

    if not all_chunks:
        # Encoding an empty list gives an array without a second dimension,
        # so `embeddings.shape[1]` below would raise IndexError.
        return all_chunks, None, None

    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    # FAISS only accepts C-contiguous float32 matrices.
    embeddings = np.ascontiguousarray(
        embedding_model.encode(all_chunks), dtype=np.float32
    )
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return all_chunks, embedding_model, index

def retrieve(query, embedding_model, index, all_chunks, top_k=3):
    """Return the chunks most similar to ``query``, best match first.

    Args:
        query: The search text.
        embedding_model: Object whose ``encode(list_of_str)`` returns one
            embedding per input string.
        index: FAISS index built over the embeddings of ``all_chunks``.
        all_chunks: Chunk texts, positionally aligned with ``index``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk strings; fewer when the index holds fewer
        vectors, and an empty list when nothing is indexed.
    """
    # Never ask for more neighbours than exist: FAISS pads missing results
    # with id -1, and all_chunks[-1] would silently return the last chunk.
    k = min(top_k, index.ntotal, len(all_chunks))
    if k <= 0:
        return []

    query_embedding = np.asarray(embedding_model.encode([query]), dtype=np.float32)
    _distances, neighbor_ids = index.search(query_embedding, k)
    return [
        all_chunks[int(i)]
        for i in neighbor_ids[0]
        if 0 <= int(i) < len(all_chunks)
    ]

STEP 6: WIDGETS UI

In [None]:
# Shared state filled in by the "Scrape & Build Index" callback.
all_chunks = []          # chunk texts, aligned with the vectors in `index`
embedding_model = None   # stays None until an index has been built
index = None             # stays None until an index has been built

# --- Crawl settings --------------------------------------------------------
urls_input = widgets.Textarea(
    layout=widgets.Layout(width='500px', height='100px'),
    placeholder="Enter one or more URLs (each on a new line, e.g. https://www.who.int/news-room/fact-sheets/detail/vaccines-and-immunization)",
    description="Websites:",
)
depth_input = widgets.IntText(description="Depth:", value=1)
pages_input = widgets.IntText(description="Max Pages:", value=10)
scrape_button = widgets.Button(button_style='info', description='Scrape & Build Index')

# --- Question answering ----------------------------------------------------
query_input = widgets.Text(
    layout=widgets.Layout(width='500px'),
    placeholder="e.g., Does garlic cure heart attacks?",
    description="Your Claim:",
)
generate_button = widgets.Button(button_style='success', description='Myth Buster Answer')

# Single output area shared by both callbacks.
output = widgets.Output()

STEP 7: BUTTON CALLBACKS


In [None]:
def on_scrape_clicked(b):
    """Handle the "Scrape & Build Index" button.

    Reads the URL list, depth and page limit from the widgets, crawls the
    sites and rebuilds the module-level vector store (``all_chunks``,
    ``embedding_model``, ``index``). All progress and error messages go to the
    shared ``output`` widget. On any failure the previous vector store is
    left untouched.

    Args:
        b: The clicked button (unused; required by the ``on_click`` API).
    """
    global all_chunks, embedding_model, index
    output.clear_output()

    urls = [line.strip() for line in urls_input.value.splitlines() if line.strip()]

    # Everything runs inside the output context so that messages printed by
    # the scraper (e.g. per-URL errors) are shown instead of being lost.
    with output:
        if not urls:
            print("Please enter at least one website URL.")
            return

        print("Scraping websites...")
        texts = scrape_with_depth(
            urls, max_depth=depth_input.value, max_pages=pages_input.value
        )
        if not any(page.strip() for page in texts):
            print("No text could be extracted from the given URLs. "
                  "Check the URLs and settings, then try again.")
            return

        try:
            new_chunks, new_model, new_index = build_vector_store(texts)
        except Exception as e:
            # UI boundary: report the failure instead of dying silently
            # inside the widget callback.
            print(f"Error building index: {e}")
            return

        if not new_chunks or new_index is None:
            print("No usable text was left after cleaning; nothing was indexed.")
            return

        all_chunks, embedding_model, index = new_chunks, new_model, new_index
        print(f"Scraped and indexed {len(all_chunks)} chunks from {len(texts)} pages.")

def on_generate_clicked(b):
    """Handle the "Myth Buster Answer" button.

    Retrieves the chunks most relevant to the claim typed by the user, builds
    a prompt from them, asks the Gemini model (``gen_model``) for an answer
    and renders it as Markdown in the shared ``output`` widget.

    Args:
        b: The clicked button (unused; required by the ``on_click`` API).
    """
    output.clear_output()

    with output:
        if embedding_model is None or index is None:
            print("Please scrape and build index first.")
            return

        query = query_input.value.strip()
        if not query:
            print("Please enter a health claim.")
            return

        try:
            retrieved_chunks = retrieve(query, embedding_model, index, all_chunks, top_k=3)
        except Exception as e:
            # UI boundary: surface the problem rather than failing silently.
            print(f"Error retrieving context: {e}")
            return

        prompt_parts = [
            "You are a health myth-busting expert. Using the following trusted information, answer the question truthfully and clearly, citing relevant context:\n\n"
        ]
        prompt_parts.extend(
            f"Context {i}:\n{chunk}\n\n"
            for i, chunk in enumerate(retrieved_chunks, start=1)
        )
        prompt_parts.append(f"Question: {query}\nAnswer:")
        prompt = "".join(prompt_parts)

        try:
            response = gen_model.generate_content(prompt)
            # `response.text` raises ValueError (not AttributeError) when the
            # reply has no usable text part, e.g. a blocked response, so a
            # hasattr()-based fallback can never run; the except below
            # reports that case.
            answer = response.text.strip()
        except Exception as e:
            answer = f"Error calling Gemini API: {e}"

        display(Markdown(f"### Myth Buster Answer:\n\n{answer}"))

# Wire each button to its handler; the handler receives the clicked button.
scrape_button.on_click(on_scrape_clicked)
generate_button.on_click(on_generate_clicked)

STEP 8: DISPLAY UI & GEMINI KEY CONFIG






In [None]:
# Ask for the key without echoing it. Surrounding whitespace (common when the
# key is pasted) is stripped so it cannot corrupt the credential.
os.environ['GOOGLE_API_KEY'] = getpass.getpass("Enter your Gemini API Key: ").strip()
if not os.environ['GOOGLE_API_KEY']:
    # Warn now rather than letting the problem appear later as an opaque API
    # error; the UI is still shown so scraping and indexing remain usable.
    print("Warning: no API key was entered. Answer generation will fail "
          "until this cell is re-run with a valid key.")
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# Model used by on_generate_clicked() to answer questions.
gen_model = genai.GenerativeModel("gemini-2.5-flash")

# Render the whole interface: crawl settings, question box, shared output.
display(widgets.VBox([
    urls_input,
    depth_input,
    pages_input,
    scrape_button,
    query_input,
    generate_button,
    output
]))