In [2]:
import sys
!{sys.executable} -m pip install requests validators
!{sys.executable} -m pip install requests beautifulsoup4 readability-lxml tldextract



In [3]:
import requests
import validators
from urllib.parse import urlparse

class URLHandler:
    """Stateless helpers that vet a user-supplied URL before it is crawled."""

    @staticmethod
    def validate_url(url: str) -> tuple[bool, str]:
        """Check that ``url`` is a well-formed http(s) URL.

        Args:
            url: The raw string entered by the user.

        Returns:
            ``(ok, message)``. ``ok`` is True only when the string is
            non-blank, accepted by ``validators.url`` and uses the ``http`` or
            ``https`` scheme; ``message`` explains the outcome.
        """
        if not url or url.strip() == "":
            return False, "URL is empty"

        # Only the truthiness of the validators result is used here.
        if not validators.url(url):
            return False, "Invalid URL format"

        if urlparse(url).scheme not in ("http", "https"):
            return False, "Unsupported URL scheme"

        return True, "Valid URL"

    @staticmethod
    def check_reachable(url: str, timeout: int = 8) -> tuple[bool, str]:
        """Check that ``url`` answers with HTTP 200 and an HTML content type.

        Only the status line and headers are needed, so the request is made
        with ``stream=True`` (the body is not downloaded) and the response is
        used as a context manager so the connection is always released.

        Args:
            url: URL to probe.
            timeout: Timeout passed to ``requests.get``.

        Returns:
            ``(ok, message)``. Network failures are reported through
            ``message`` rather than raised.
        """
        try:
            with requests.get(
                url,
                timeout=timeout,
                headers={"User-Agent": "Mozilla/5.0"},
                stream=True,
            ) as response:

                if response.status_code != 200:
                    return False, f"Website returned status {response.status_code}"

                content_type = response.headers.get("Content-Type", "")

                # Only allow text/html for chatbot crawling
                if "text/html" not in content_type:
                    return False, f"Unsupported content type: {content_type}"

                return True, "Reachable and supported"

        except requests.exceptions.Timeout:
            return False, "Request timed out"

        except requests.exceptions.ConnectionError:
            return False, "URL unreachable"

        except Exception as e:
            # Best-effort probe: any other failure is reported, not raised.
            return False, f"Error: {str(e)}"


In [4]:
def process_url_input(url: str):
    """Run the URL through format validation, then a reachability probe.

    The checks run in order and the first failure short-circuits.

    Returns:
        ``{"success": False, "error": <message>}`` on the first failed check,
        otherwise ``{"success": True, "url": url, "message": ...}``.
    """
    checks = (URLHandler.validate_url, URLHandler.check_reachable)

    for check in checks:
        passed, detail = check(url)
        if not passed:
            return {"success": False, "error": detail}

    # Every gate passed: hand the URL back to the caller.
    return {
        "success": True,
        "url": url,
        "message": "URL accepted and ready for crawling",
    }

In [5]:
# Interactive check: ask for a URL and run it through the validation +
# reachability pipeline before any crawling happens.
url = input("Enter website URL: ")

# `result` is the dict returned by process_url_input; it always carries a
# "success" flag plus either "url" or "error".
result = process_url_input(url)

if result["success"]:
    print("✅ Proceed with crawling:", result["url"])
else:
    print("❌ Error:", result["error"])


Enter website URL: https://en.wikipedia.org/wiki/India
✅ Proceed with crawling: https://en.wikipedia.org/wiki/India


In [28]:
import requests
from bs4 import BeautifulSoup
from readability import Document as ReadabilityDocument
from urllib.parse import urljoin, urlparse
import re


class CleanCrawler:
    """Fetch a page and reduce it to de-duplicated blocks of readable text."""

    # Structural tags removed wholesale before text extraction.
    JUNK_TAGS = (
        "header", "footer", "nav", "aside",
        "script", "style", "noscript",
        "iframe", "form",
    )

    # Fragments matched against the class attribute with a CSS substring
    # selector. NOTE(review): substring matching is broad (e.g. "ads" also
    # matches a class such as "downloads") — confirm this is acceptable.
    JUNK_CLASS_FRAGMENTS = (
        "header", "footer", "nav", "menu",
        "sidebar", "ads", "advert", "promo",
        "banner", "cookie",
    )

    def __init__(self, timeout=10):
        """
        Args:
            timeout: Timeout passed to ``requests.get`` for every fetch.
        """
        self.timeout = timeout
        self.visited = set()  # URLs already handed to crawl_page

    # -------------------------
    # Fetch HTML safely
    # -------------------------
    def fetch_html(self, url):
        """Download ``url`` and return ``(html, error)``.

        Exactly one element of the pair is None: ``(text, None)`` on success,
        ``(None, message)`` on failure. A non-200 status is treated as a
        failure so that HTML error pages (404, 500, ...) are not crawled as if
        they were content, mirroring ``URLHandler.check_reachable``.
        Exceptions from the request are reported through ``error`` rather than
        raised.
        """
        try:
            r = requests.get(
                url,
                timeout=self.timeout,
                headers={"User-Agent": "Mozilla/5.0"}
            )

            if r.status_code != 200:
                return None, f"Website returned status {r.status_code}"

            if "text/html" not in r.headers.get("Content-Type", ""):
                return None, "Not an HTML page"

            return r.text, None

        except Exception as e:
            return None, str(e)

    # -------------------------
    # Remove unwanted sections
    # -------------------------
    def remove_noise(self, soup):
        """Strip boilerplate elements from ``soup`` in place."""
        for tag in self.JUNK_TAGS:
            for el in soup.find_all(tag):
                el.decompose()

        # remove common ad/menu classes
        for cls in self.JUNK_CLASS_FRAGMENTS:
            for el in soup.select(f"[class*='{cls}']"):
                el.decompose()

    # -------------------------
    # Extract main readable text
    # -------------------------
    def extract_main_text(self, html):
        """Return the page's main text as a list of de-duplicated blocks.

        Readability picks the main content block, noise elements are removed,
        and the text of every p/h1/h2/h3/li longer than 40 characters is kept.
        Empty or whitespace-only ``html`` (including None) yields ``[]``
        instead of being passed to the HTML parsers.
        """
        if not html or not html.strip():
            return []

        # readability gets main content block
        doc = ReadabilityDocument(html)
        clean_html = doc.summary()

        soup = BeautifulSoup(clean_html, "html.parser")

        self.remove_noise(soup)

        text_blocks = []

        for tag in soup.find_all(["p", "h1", "h2", "h3", "li"]):
            txt = tag.get_text(" ", strip=True)
            if len(txt) > 40:  # skip tiny fragments
                text_blocks.append(txt)

        return self.deduplicate_blocks(text_blocks)

    # -------------------------
    # Remove duplicates
    # -------------------------
    def deduplicate_blocks(self, blocks):
        """Drop repeated blocks, keeping the first occurrence and its order.

        Two blocks count as duplicates when they are equal after lower-casing
        and collapsing whitespace runs to a single space.
        """
        seen = set()
        clean = []

        for b in blocks:
            key = re.sub(r"\s+", " ", b.lower())
            if key not in seen:
                seen.add(key)
                clean.append(b)

        return clean

    # -------------------------
    # Crawl single page
    # -------------------------
    def crawl_page(self, url):
        """Fetch ``url`` once and return its text blocks.

        Returns ``[]`` when the URL was already visited or could not be
        fetched (the reason is printed). The URL is recorded as visited before
        the fetch, so a failed URL is not retried by this instance.
        """
        if url in self.visited:
            return []

        self.visited.add(url)

        html, err = self.fetch_html(url)
        if err:
            print("Skip:", url, err)
            return []

        return self.extract_main_text(html)


In [7]:
# Smoke test of the crawler on a single, hard-coded page.
crawler = CleanCrawler()

url = "https://en.wikipedia.org/wiki/India"

# crawl_page returns [] when the fetch fails, so a count of 0 below means the
# page could not be fetched or produced no usable text.
content_blocks = crawler.crawl_page(url)

print("Extracted blocks:", len(content_blocks))

# Preview only the first few blocks to keep the cell output short.
for block in content_blocks[:5]:
    print("\n", block)


Extracted blocks: 564

 India , officially the Republic of India , [ j ] [ 20 ] is a country in South Asia .  It is the seventh-largest country by area ; the most populous country since 2023; [ 21 ] and, since its independence in 1947, the world's most populous democracy. [ 22 ] [ 23 ] Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; [ k ] China , Nepal , and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean , India is near Sri Lanka and the Maldives ; its Andaman and Nicobar Islands share a maritime border with Myanmar, Thailand , and Indonesia .

 Modern humans arrived on the Indian subcontinent from Africa no later than 55,000 years ago. [ 26 ] [ 27 ] [ 28 ] Their long occupation, predominantly in isolation as hunter-gatherers, has made the region highly diverse. [ 29 ] Settled life emerged on the subcontinent in the western margins of t

In [8]:
pip install nltk



In [9]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
import re
import nltk
from typing import List, Dict


class TextChunker:

    def __init__(
        self,
        chunk_size: int = 500,
        overlap: int = 80
    ):
        self.chunk_size = chunk_size
        self.overlap = overlap


    # --------------------------
    # Text Normalization
    # --------------------------
    def normalize_text(self, text: str) -> str:

        text = re.sub(r"\s+", " ", text)          # collapse whitespace
        text = re.sub(r"\n+", " ", text)
        text = text.strip()

        # remove weird unicode artifacts
        text = text.replace("\xa0", " ")

        return text


    # --------------------------
    # Sentence Split
    # --------------------------
    def split_sentences(self, text: str) -> List[str]:
        return nltk.sent_tokenize(text)


    # --------------------------
    # Semantic Chunk Builder
    # --------------------------
    def build_chunks(self, sentences: List[str]) -> List[str]:

        chunks = []
        current = []

        current_len = 0

        for sent in sentences:

            sent_len = len(sent)

            # if adding sentence exceeds chunk size → flush
            if current_len + sent_len > self.chunk_size:

                chunks.append(" ".join(current))

                # overlap handling
                overlap_text = self._get_overlap_text(current)
                current = overlap_text.copy()
                current_len = sum(len(s) for s in current)

            current.append(sent)
            current_len += sent_len

        if current:
            chunks.append(" ".join(current))

        return chunks


    # --------------------------
    # Overlap Builder
    # --------------------------
    def _get_overlap_text(self, sentences):

        overlap_chars = 0
        overlap_sents = []

        for s in reversed(sentences):
            overlap_chars += len(s)
            overlap_sents.insert(0, s)

            if overlap_chars >= self.overlap:
                break

        return overlap_sents


    # --------------------------
    # Public API
    # --------------------------
    def chunk_document(
        self,
        text_blocks: List[str],
        source_url: str,
        title: str | None = None
    ) -> List[Dict]:

        all_chunks = []
        chunk_id = 0

        for block in text_blocks:

            clean = self.normalize_text(block)
            sentences = self.split_sentences(clean)

            chunks = self.build_chunks(sentences)

            for c in chunks:
                if len(c) < 50:   # skip tiny chunks
                    continue

                all_chunks.append({
                    "chunk_id": chunk_id,
                    "text": c,
                    "source_url": source_url,
                    "title": title
                })

                chunk_id += 1

        return all_chunks

In [11]:
def extract_title(html):
    """Return the text of the document's ``<title>`` element, or None.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and the
    title text is returned with surrounding whitespace stripped.
    """
    from bs4 import BeautifulSoup

    title_tag = BeautifulSoup(html, "html.parser").title
    return title_tag.get_text(strip=True) if title_tag else None

In [12]:
import nltk
nltk.download('punkt_tab')
crawler = CleanCrawler()
chunker = TextChunker(chunk_size=600, overlap=100)

url = "https://en.wikipedia.org/wiki/India"

# fetch_html reports failures through its second return value; stop here with
# a clear message instead of passing html=None on to the parsers below.
html, err = crawler.fetch_html(url)
if html is None:
    raise RuntimeError(f"Could not fetch {url}: {err}")

title = extract_title(html)

blocks = crawler.extract_main_text(html)

chunks = chunker.chunk_document(
    text_blocks=blocks,
    source_url=url,
    title=title
)

print("Total chunks:", len(chunks))
# Guard the preview so an empty result does not raise IndexError.
print(chunks[0] if chunks else "No chunks produced")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Total chunks: 702
{'chunk_id': 0, 'text': "India , officially the Republic of India , [ j ] [ 20 ] is a country in South Asia . It is the seventh-largest country by area ; the most populous country since 2023; [ 21 ] and, since its independence in 1947, the world's most populous democracy. [ 22 ] [ 23 ] Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; [ k ] China , Nepal , and Bhutan to the north; and Bangladesh and Myanmar to the east.", 'source_url': 'https://en.wikipedia.org/wiki/India', 'title': 'India - Wikipedia'}


In [13]:
# NOTE(review): this instance is not assigned to a name, so the cell only
# displays its repr; the `chunker` created in the earlier cell is unchanged.
TextChunker(
    chunk_size = 700,   # larger chunks
    overlap = 120       # more context carryover
)

<__main__.TextChunker at 0x7bc7a376fc80>

In [14]:
pip install sentence-transformers chromadb



In [15]:
from sentence_transformers import SentenceTransformer

class EmbeddingModel:
    """Thin wrapper around a SentenceTransformer model."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """
        Args:
            model_name: Name or path handed to ``SentenceTransformer``. The
                default keeps the model this notebook was built with.
        """
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)

    def embed_texts(self, texts):
        """Encode ``texts`` and return whatever ``model.encode`` returns.

        Embeddings are requested with ``normalize_embeddings=True`` and a
        progress bar is shown while encoding.
        """
        return self.model.encode(
            texts,
            show_progress_bar=True,
            normalize_embeddings=True
        )




In [16]:
import chromadb
from chromadb.config import Settings


class VectorStore:
    """Persistent Chroma collection holding chunk texts and their embeddings."""

    def __init__(self, path="./vector_store"):
        """
        Args:
            path: Directory used by ``chromadb.PersistentClient``. Data stored
                there survives kernel restarts and earlier runs.
        """
        self.client = chromadb.PersistentClient(path=path)

        self.collection = self.client.get_or_create_collection(
            name="website_chunks"
        )

    @staticmethod
    def _as_list(vectors):
        """Accept array-likes with ``.tolist()`` as well as plain sequences."""
        return vectors.tolist() if hasattr(vectors, "tolist") else list(vectors)

    # -------------------------
    # Add chunks with metadata
    # -------------------------
    def add_chunks(self, chunks, embeddings):
        """Store ``chunks`` and their ``embeddings`` (same order and length).

        Ids are ``"<source_url>::<chunk_id>"`` so chunks from different pages
        no longer share the ids "0", "1", ... in the persistent collection,
        and ``upsert`` is used so re-running the cell for the same page
        replaces its chunks instead of colliding with the earlier run.

        Args:
            chunks: Dicts with ``chunk_id``, ``text``, ``source_url`` and
                ``title`` keys, as produced by ``TextChunker.chunk_document``.
            embeddings: One vector per chunk (array with ``.tolist()`` or a
                sequence of sequences).

        Raises:
            ValueError: If the number of embeddings differs from the number
                of chunks.
        """
        if not chunks:
            # Nothing to store; skip the call with empty lists.
            return

        vectors = self._as_list(embeddings)
        if len(vectors) != len(chunks):
            raise ValueError(
                f"Got {len(vectors)} embeddings for {len(chunks)} chunks"
            )

        ids = [f'{c["source_url"]}::{c["chunk_id"]}' for c in chunks]
        docs = [c["text"] for c in chunks]
        # A missing title is stored as "".
        # NOTE(review): assumes None metadata values are not accepted by the
        # installed chromadb version — confirm.
        metas = [
            {"source": c["source_url"], "title": c["title"] or ""}
            for c in chunks
        ]

        self.collection.upsert(
            ids=ids,
            documents=docs,
            embeddings=vectors,
            metadatas=metas
        )

    # -------------------------
    # Semantic search
    # -------------------------
    def search(self, query_embedding, k=5):
        """Return the raw Chroma query result for the ``k`` nearest chunks."""
        return self.collection.query(
            query_embeddings=[self._as_list(query_embedding)],
            n_results=k
        )


In [17]:
# chunks = output from your TextChunker

# Relies on `chunks` from the chunking cell above being present in the kernel.
embedder = EmbeddingModel()
vector_db = VectorStore()

texts = [c["text"] for c in chunks]

# One embedding per chunk, in the same order as `chunks`.
embeddings = embedder.embed_texts(texts)

# NOTE(review): the store is persistent, so entries written by earlier runs
# remain in the collection and can surface in later searches.
vector_db.add_chunks(chunks, embeddings)

print("✅ Stored embeddings persistently")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

✅ Stored embeddings persistently


In [18]:
# Retrieval smoke test: embed one query and show the nearest stored chunks.
query = "Tell about india"

# embed_texts takes a list, so wrap the query and take the single vector back.
query_vec = embedder.embed_texts([query])[0]

results = vector_db.search(query_vec, k=4)

# Results are nested one list per query; index 0 is the only query sent.
for doc, meta in zip(
    results["documents"][0],
    results["metadatas"][0]
):
    print("\nSOURCE:", meta["source"])
    print(doc[:300])


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


SOURCE: https://en.wikipedia.org/wiki/India
India , officially the Republic of India , [ j ] [ 20 ] is a country in South Asia . It is the seventh-largest country by area ; the most populous country since 2023; [ 21 ] and, since its independence in 1947, the world's most populous democracy. [ 22 ] [ 23 ] Bounded by the Indian Ocean on the sou

SOURCE: https://en.wikipedia.org/wiki/India
"Country Profile: India" (PDF) . Library of Congress Country Studies (5th ed.). Library of Congress Federal Research Division . December 2004. Archived from the original (PDF) on 27 September 2011 . Retrieved 30 September 2011 .

SOURCE: https://en.wikipedia.org/wiki/India
India's constitution was adopted in 1950 and established a secular, democratic republic. Economic liberalisation has created a large urban middle class and transformed India into a fast growing economy . [ 155 ] [ 61 ] However, India has been hamstrung by persistent poverty, both rural and urban; [ 

SOURCE: https://en.wikipedia.org/

In [19]:
pip install langchain langchain-google-genai



In [22]:
from langchain_core.documents import Document

def chroma_to_docs(results):
    """Convert a single-query Chroma result into ``Document`` objects.

    Only the first query's hits are read (``results[...][0]``); each hit's
    text becomes ``page_content`` and its metadata dict becomes ``metadata``.
    """
    hit_texts = results["documents"][0]
    hit_metas = results["metadatas"][0]

    return [
        Document(page_content=body, metadata=info)
        for body, info in zip(hit_texts, hit_metas)
    ]

In [53]:
from langchain_google_genai import ChatGoogleGenerativeAI

import os
from getpass import getpass

# SECURITY: the API key must never be committed with the notebook. It is read
# from the GOOGLE_API_KEY environment variable, falling back to a hidden
# prompt. The key that was previously hard-coded in this cell should be
# treated as leaked and revoked.
_google_api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")

llm = ChatGoogleGenerativeAI(
    model="gemini-pro", # Changed model from gemini-1.5-flash to gemini-pro
    temperature=0.0,
    google_api_key=_google_api_key
)

In [79]:
pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.3


In [90]:
# Prompt template for the LLM fallback. It is filled with str.format, so it
# must keep exactly the two placeholders {context} and {question}. The refusal
# sentence is the same one WebsiteQA returns when nothing relevant is found.
RAG_PROMPT = """
Answer using ONLY the provided website context.

Rules:
- Use only facts from context
- Prefer definition-style sentences when available
- Keep the answer short and direct
- Do NOT invent facts
- If answer not present, reply exactly:
The answer is not available on the provided website.

Context:
{context}

Question:
{question}

Answer:
"""

import re
from rapidfuzz import process


# -------------------------
# Helpers
# -------------------------

def polish_answer(ans):
    """Reduce an answer to its first sentence, without citations or loose punctuation.

    Steps, in order: drop every ``[...]`` group, collapse whitespace runs,
    close the gap before commas, keep only the text up to the first sentence
    break (``.``, ``!`` or ``?`` followed by spaces), then trim stray
    separators from both ends.
    """
    without_citations = re.sub(r"\[[^\]]*\]", "", ans)
    single_spaced = re.sub(r"\s+", " ", without_citations)
    tight_commas = single_spaced.replace(" ,", ",")

    # re.split always yields at least one piece, so index 0 is safe.
    first_sentence = re.split(r'(?<=[.!?]) +', tight_commas)[0]

    return first_sentence.strip(" ,;-").strip()

def clean_context(text):
    """Remove numeric bracket citations such as ``[12]`` or ``[ 12 ]`` from ``text``.

    Non-numeric brackets (for example ``[ j ]``) are left untouched.
    """
    numeric_citation = re.compile(r"\[\s*\d+\s*\]")
    return numeric_citation.sub("", text)


def build_vocab_from_chunks(chunks):
    """Collect the distinct lower-cased words (longer than 3 chars) in ``chunks``.

    Surrounding punctuation (``.,;:()``) is stripped *before* the length
    check, so tokens such as ``"(the)"`` or ``"...."`` no longer slip into the
    vocabulary as ``"the"`` or ``""``.

    Args:
        chunks: Iterable of dicts with a ``"text"`` key.

    Returns:
        A sorted list of words, so the result is the same on every run.
    """
    vocab = set()
    for c in chunks:
        for raw in c["text"].lower().split():
            word = raw.strip(".,;:()")
            if len(word) > 3:
                vocab.add(word)
    return sorted(vocab)


def fuzzy_correct_query(question, vocab, score_cutoff=85, min_word_len=4):
    """Replace likely misspellings in ``question`` with words from ``vocab``.

    The question is lower-cased and split on whitespace. A word is sent to
    the fuzzy matcher only when it is at least ``min_word_len`` characters
    long and not already in ``vocab``; everything else is kept as typed.
    NOTE(review): the skip rule exists because short words can clear the
    cutoff against longer words that merely contain them with RapidFuzz's
    default scorer — confirm against the RapidFuzz documentation.

    Args:
        question: The user's question.
        vocab: Candidate words (e.g. from ``build_vocab_from_chunks``).
        score_cutoff: Minimum match score for a replacement to be accepted.
        min_word_len: Words shorter than this are never replaced.

    Returns:
        The corrected question as a single space-joined, lower-cased string.
    """
    words = question.lower().split()

    if not vocab:
        # Nothing to match against; return the normalized question unchanged.
        return " ".join(words)

    known = set(vocab)
    fixed = []

    for w in words:
        if len(w) < min_word_len or w in known:
            fixed.append(w)
            continue

        match = process.extractOne(w, vocab, score_cutoff=score_cutoff)
        fixed.append(match[0] if match else w)

    return " ".join(fixed)


# ---------- smarter reranker ----------

def rerank_chunks(docs, question, top_n=6):
    """Re-order retrieved chunk texts with simple lexical heuristics.

    Each chunk gets +2 for every question word found in it as a substring,
    +6 if it contains a definition pattern (" is a " / " is an "), +5 for a
    "where" question when it contains " in ", and +5 for a "when" question
    when it contains one of a few time-related fragments.

    Chunks are sorted by score only, using a stable sort, so chunks with equal
    scores keep the order they arrived in instead of being ordered by their
    text.

    Args:
        docs: Chunk texts, in their incoming order.
        question: The user's question.
        top_n: Number of chunks to return.

    Returns:
        Up to ``top_n`` chunk texts, highest score first.
    """
    q = question.lower()
    q_words = set(q.split())

    scored = []

    for d in docs:
        d_low = d.lower()
        score = 0

        # keyword overlap
        for w in q_words:
            if w in d_low:
                score += 2

        # strong definition boost
        if " is a " in d_low or " is an " in d_low:
            score += 6

        # where-question boost
        if q.startswith("where") and " in " in d_low:
            score += 5

        # when-question boost
        if q.startswith("when") and any(x in d_low for x in ["year", "independ", "founded", "since"]):
            score += 5

        scored.append((score, d))

    # Sort on the score alone; sorted() is stable, including with reverse=True.
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)

    return [d for _, d in ranked[:top_n]]

# Question words that say nothing about the subject being asked about.
_DEFINITION_STOP_WORDS = frozenset({
    "a", "an", "the", "is", "are", "was", "were", "be",
    "of", "in", "on", "at", "to", "for", "and", "or",
    "do", "does", "did",
    "what", "who", "whom", "which", "where", "when", "why", "how",
    "tell", "me", "us", "about", "give", "explain", "describe", "define",
    "please",
})


def extract_definition_sentence(docs, question):
    """Return the first "X is a/an ..." sentence that mentions the question's subject.

    The subject words are the question's words with surrounding punctuation
    removed and common question/function words dropped. Without that filter
    any "is a" sentence matched a question containing "is" or "a", and a word
    typed as "india?" never matched.

    Args:
        docs: Chunk texts, searched in order.
        question: The user's question.

    Returns:
        The matching sentence (split on ". ", stripped), or None when there is
        no subject word or no sentence matches.
    """
    candidates = (w.strip(".,;:!?()\"'") for w in question.lower().split())
    target_words = [
        w for w in candidates
        if w and w not in _DEFINITION_STOP_WORDS
    ]

    if not target_words:
        # No subject to look for; let the caller fall back to other strategies.
        return None

    for d in docs:
        for s in d.split(". "):
            s_low = s.lower()

            if (
                (" is a " in s_low or " is an " in s_low)
                and any(w in s_low for w in target_words)
            ):
                return s.strip()

    return None

# ---------- smarter sentence selector ----------

def select_top_sentences(context, question, n=6):
    """Pick the ``n`` sentences of ``context`` most relevant to ``question``.

    Sentences (split on ". ") get +2 per question word found as a substring,
    +5 for a definition pattern (" is a " / " is an ") and +4 for a "where"
    question when they contain " in ". Sorting is by score only and stable,
    so equally scored sentences keep their order in the context instead of
    being ordered by their text.

    Returns:
        The positively scored sentences among the top ``n``, joined with
        ". ". If none scored above zero, the first three sentences of the
        context are returned instead.
    """
    q = question.lower()
    q_words = set(q.split())
    sents = context.split(". ")

    ranked = []

    for s in sents:
        s_low = s.lower()
        score = 0

        for w in q_words:
            if w in s_low:
                score += 2

        # definition boost
        if " is a " in s_low or " is an " in s_low:
            score += 5

        # location boost
        if q.startswith("where") and " in " in s_low:
            score += 4

        ranked.append((score, s))

    # Stable sort on the score alone keeps ties in document order.
    ranked = sorted(ranked, key=lambda pair: pair[0], reverse=True)

    best = [s for sc, s in ranked[:n] if sc > 0]

    if not best:
        best = sents[:3]

    return ". ".join(best)


# -------------------------
# QA Engine
# -------------------------

class WebsiteQA:
    """Answer questions from stored website chunks.

    Retrieval is followed by a direct "definition sentence" lookup; only when
    that finds nothing is a prompt built and sent to the LLM helper.
    """

    # Sentinel returned whenever no grounded answer can be produced.
    NOT_AVAILABLE = "The answer is not available on the provided website."

    def __init__(
        self,
        embedder,
        vector_db,
        chunks,
        k=25,
        max_distance=0.95,
        max_context_chars=3200
    ):
        """
        Args:
            embedder: Object with ``embed_texts(list_of_str)``.
            vector_db: Object with ``search(query_embedding, k=...)`` returning
                a dict with ``"documents"`` and ``"distances"``.
            chunks: Chunk dicts used to build the spelling vocabulary.
            k: Number of chunks retrieved per question.
            max_distance: If even the closest chunk is farther than this, the
                question is treated as unanswerable.
            max_context_chars: Cap on the joined context before sentence
                selection.
        """
        self.embedder = embedder
        self.vector_db = vector_db
        self.vocab = build_vocab_from_chunks(chunks)
        self.k = k
        self.max_distance = max_distance
        self.max_context_chars = max_context_chars

    def answer(self, question):
        """Return a short answer to ``question`` or ``NOT_AVAILABLE``."""

        # A blank question has no subject; do not embed or search for it.
        if not question or not question.strip():
            return self.NOT_AVAILABLE

        # ---- fuzzy normalize ----
        fixed_q = fuzzy_correct_query(question, self.vocab)

        # ---- embed ----
        q_vec = self.embedder.embed_texts([fixed_q])[0]

        # ---- wide retrieval ----
        results = self.vector_db.search(q_vec, k=self.k)

        docs = results["documents"][0]
        distances = results["distances"][0]

        if not docs or min(distances) > self.max_distance:
            return self.NOT_AVAILABLE

        # ---- try direct definition extraction ----
        definition = extract_definition_sentence(docs, fixed_q)

        if definition:
            return polish_answer(clean_context(definition))

        # ---- build context for LLM ----
        context = "\n".join(docs)
        context = context[:self.max_context_chars]
        context = clean_context(context)
        context = select_top_sentences(context, fixed_q)

        if len(context.strip()) < 20:
            return self.NOT_AVAILABLE

        # ---- prompt ----
        prompt = RAG_PROMPT.format(
            context=context,
            question=fixed_q
        )

        # ---- LLM ----
        # NOTE(review): `hf_answer` is not defined in the visible part of this
        # notebook (the `llm` object created earlier is not used here) —
        # confirm it exists before relying on this fallback path.
        resp = hf_answer(prompt).strip()

        if len(resp) < 5:
            return self.NOT_AVAILABLE

        return polish_answer(resp)

In [91]:
# from crawler import CleanCrawler
# from chunker import TextChunker
# from embeddings import EmbeddingModel, VectorStore
# from qa_engine import WebsiteQA, extract_title

# -------------------------
# CONFIG
# -------------------------

URL = input("Enter website URL: ").strip()

# -------------------------
# INIT COMPONENTS
# -------------------------

crawler = CleanCrawler()
chunker = TextChunker(chunk_size=800, overlap=150)
embedder = EmbeddingModel()
vector_db = VectorStore()

# -------------------------
# CRAWL + EXTRACT
# -------------------------

print("\nCrawling website...")
html, err = crawler.fetch_html(URL)

if err:
    print("Error:", err)
    # Stop this cell without calling exit().
    # NOTE(review): exit() inside a notebook asks the kernel itself to shut
    # down, which would discard all state — confirm for your environment.
    raise SystemExit(1)

title = extract_title(html)
blocks = crawler.extract_main_text(html)

print("Text blocks:", len(blocks))

# -------------------------
# CHUNK
# -------------------------

chunks = chunker.chunk_document(
    text_blocks=blocks,
    source_url=URL,
    title=title
)

print("Chunks created:", len(chunks))

# -------------------------
# EMBED + STORE (only once)
# -------------------------

texts = [c["text"] for c in chunks]

print("\nGenerating embeddings...")
embeddings = embedder.embed_texts(texts)

vector_db.add_chunks(chunks, embeddings)

print("Stored in vector DB ✅")

# -------------------------
# QA ENGINE
# -------------------------

qa = WebsiteQA(embedder, vector_db, chunks)

print("\nAsk questions (type 'exit' to stop)\n")

while True:
    # Strip so that " exit " and accidental blank lines are handled cleanly.
    q = input("Q: ").strip()

    if q.lower() == "exit":
        break

    if not q:
        # Nothing was typed; ask again instead of querying with empty text.
        continue

    ans = qa.answer(q)
    print("\nA:", ans, "\n")

Enter website URL: https://en.wikipedia.org/wiki/India

Crawling website...
Text blocks: 564
Chunks created: 625

Generating embeddings...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Stored in vector DB ✅

Ask questions (type 'exit' to stop)

Q: where is india


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


A: India, officially the Republic of India, is a country in South Asia 

Q: tell about india


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


A: India, officially the Republic of India, is a country in South Asia 

Q: tell about ancient india


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


A: The answer is not available on the provided website. 

Q: exit
