In [3]:
import json, time, re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

PRODUCTS_BASE = "https://www.kddc.com/product/"
CAREERS_BASE  = "https://career.kddc.com/"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; RAGBot/1.0; +https://example.org)"
}

def get_soup(url):
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

def clean_text(x: str) -> str:
    return re.sub(r"\s+", " ", (x or "").strip())

def scrape_product_categories(max_categories=2):
    """Scrape up to two product categories and their items."""
    soup = get_soup(PRODUCTS_BASE)
    # NOTE: Selectors may need tweaks if site structure changes.
    # Try to find category links (adjust CSS selectors based on KDD page structure):
    cat_links = []
    for a in soup.select("a[href]"):
        href = a.get("href", "")
        if "/product/" in href and href != PRODUCTS_BASE:
            full = urljoin(PRODUCTS_BASE, href)
            cat_links.append(full)
    # Deduplicate / normalize
    cat_links = list(dict.fromkeys(cat_links))

    # Keep only first N categories (as requested)
    chosen = cat_links[:max_categories]

    all_products = []
    for c in chosen:
        try:
            csoup = get_soup(c)
            # Find product “cards” (update selectors as needed)
            product_cards = csoup.select("article, .product-item, .product, .card")
            for card in product_cards:
                title = clean_text((card.select_one(".product-title, h2, h3, .title") or {}).get_text() if card.select_one(".product-title, h2, h3, .title") else "")
                link  = card.select_one("a[href]")
                link  = urljoin(c, link["href"]) if link else c
                desc  = clean_text((card.select_one(".desc, .excerpt, p") or {}).get_text() if card.select_one(".desc, .excerpt, p") else "")
                if title:
                    all_products.append({
                        "category_url": c,
                        "name": title,
                        "url": link,
                        "description": desc
                    })
        except Exception as e:
            print(f"[warn] product category error {c}: {e}")
        time.sleep(0.5)
    return all_products

def scrape_careers():
    """Scrape careers list (title, location, department, link, summary)."""
    jobs = []
    try:
        soup = get_soup(CAREERS_BASE)
        # Adjust selectors according to real DOM:
        job_cards = soup.select("article, .job, .career, .opening, .position, .vacancy, .card")
        for card in job_cards:
            title = clean_text((card.select_one("h2, h3, .title") or {}).get_text() if card.select_one("h2, h3, .title") else "")
            link  = card.select_one("a[href]")
            link  = urljoin(CAREERS_BASE, link["href"]) if link else CAREERS_BASE
            location = clean_text((card.select_one(".location") or {}).get_text() if card.select_one(".location") else "")
            dept     = clean_text((card.select_one(".department, .dept") or {}).get_text() if card.select_one(".department, .dept") else "")
            summary  = clean_text((card.select_one("p, .summary, .desc") or {}).get_text() if card.select_one("p, .summary, .desc") else "")
            if title:
                jobs.append({
                    "title": title,
                    "url": link,
                    "location": location,
                    "department": dept,
                    "summary": summary
                })
    except Exception as e:
        print(f"[warn] careers page error: {e}")
    return jobs

def save_json(products, careers, path="kdd_data.json"):
    payload = {"products": products, "careers": careers, "meta": {"source_products": PRODUCTS_BASE, "source_careers": CAREERS_BASE}}
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved {len(products)} products and {len(careers)} careers to {path}")

# Run scraping (safe if site structure changed; you can also skip and bring your own JSON)
products = scrape_product_categories(max_categories=2)
careers  = scrape_careers()
if not products and not careers:
    # Fallback minimal sample so downstream RAG always runs
    print("[info] Using fallback sample data (scraping returned nothing).")
    products = [
        {"category_url": PRODUCTS_BASE+"sample-iot/", "name": "IoT Gateway X1", "url": PRODUCTS_BASE+"iot-gateway-x1", "description": "Edge gateway for industrial IoT."},
        {"category_url": PRODUCTS_BASE+"sample-networking/", "name": "Smart Switch S12", "url": PRODUCTS_BASE+"smart-switch-s12", "description": "Managed Layer-2 switch for SMBs."},
    ]
    careers = [
        {"title": "Data Engineer", "url": CAREERS_BASE+"jobs/data-engineer", "location": "Kuwait", "department": "IT", "summary": "Build data pipelines and analytics."},
        {"title": "Sales Executive", "url": CAREERS_BASE+"jobs/sales-executive", "location": "Kuwait", "department": "Sales", "summary": "Develop client relationships and hit targets."},
    ]

save_json(products, careers, "kdd_data.json")

[info] Using fallback sample data (scraping returned nothing).
✅ Saved 2 products and 2 careers to kdd_data.json


In [4]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [5]:
import json, numpy as np
from sentence_transformers import SentenceTransformer
import faiss

with open("kdd_data.json", "r", encoding="utf-8") as f:
    raw = json.load(f)

docs = []
for p in raw.get("products", []):
    docs.append({
        "id": f"prod::{p.get('name','').strip()}",
        "type": "product",
        "payload": p,
        "text": json.dumps(p, ensure_ascii=False)
    })
for j in raw.get("careers", []):
    docs.append({
        "id": f"job::{j.get('title','').strip()}",
        "type": "career",
        "payload": j,
        "text": json.dumps(j, ensure_ascii=False)
    })

# Embeddings model (UNGATED & strong)
embedder = SentenceTransformer("intfloat/e5-base-v2")

def embed_texts(texts):
    # e5 formatting: "passage:" for docs, "query:" for queries
    return embedder.encode(texts, normalize_embeddings=True, convert_to_numpy=True)

doc_texts_for_emb = [f"passage: {d['text']}" for d in docs]
doc_vecs = embed_texts(doc_texts_for_emb)

# Build FAISS index (inner product since normalized)
dim = doc_vecs.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(doc_vecs)
print(f"✅ Indexed {index.ntotal} documents (dim={dim})")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

✅ Indexed 4 documents (dim=768)


In [6]:
try:
    from sentence_transformers import CrossEncoder
    reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
    USE_RERANK = True
    print("✅ Re-ranker loaded.")
except Exception as e:
    print(f"[info] Reranker not available: {e}")
    reranker = None
    USE_RERANK = False

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

✅ Re-ranker loaded.


In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np

tok = AutoTokenizer.from_pretrained("google/flan-t5-base")
gen = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

def retrieve(query: str, k=10, k_final=5):
    qv = embed_texts([f"query: {query}"])
    D, I = index.search(qv, k)
    candidates = [docs[i] for i in I[0]]
    if USE_RERANK and reranker is not None:
        pairs = [(query, c["text"]) for c in candidates]
        scores = reranker.predict(pairs)
        order = np.argsort(scores)[::-1][:k_final]
        return [candidates[i] for i in order]
    return candidates[:k_final]

SYSTEM_INSTR = (
    "You are a helpful assistant that answers ONLY using the provided context. "
    "If the answer is not present in the context, say: 'I don't know based on the scraped data.'"
)

def generate_answer(query: str, contexts, max_new_tokens=256):
    context_block = "\n\n".join([c["text"] for c in contexts]) if contexts else "No context."
    prompt = (
        f"{SYSTEM_INSTR}\n\n"
        f"Context:\n{context_block}\n\n"
        f"Question: {query}\nAnswer:"
    )
    inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=2048)
    with torch.no_grad():
        out = gen.generate(**inputs, max_new_tokens=max_new_tokens)
    return tok.decode(out[0], skip_special_tokens=True)

# Smoke test
q = "List the product names we scraped."
ctx = retrieve(q)
print("🔍 Retrieved:", [c["id"] for c in ctx])
print("🤖 Answer:", generate_answer(q, ctx))

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

🔍 Retrieved: ['prod::Smart Switch S12', 'prod::IoT Gateway X1', 'job::Sales Executive', 'job::Sales Executive', 'job::Sales Executive']
🤖 Answer: Smart Switch S12, IoT Gateway X1, IoT Gateway X1, IoT Gateway X1


In [8]:
from fastapi import FastAPI, Query
from pydantic import BaseModel
from typing import List, Dict, Any

app = FastAPI(title="KDD RAG API", version="1.0")

class AskResponse(BaseModel):
    answer: str
    context_ids: List[str]
    context_snippets: List[str]

@app.get("/health")
def health():
    return {"status": "ok", "docs": len(docs)}

@app.get("/ask", response_model=AskResponse)
def ask(query: str = Query(..., description="Natural language question about KDD products or careers")):
    ctx = retrieve(query)
    answer = generate_answer(query, ctx)
    return AskResponse(
        answer=answer,
        context_ids=[c["id"] for c in ctx],
        context_snippets=[c["text"][:240] for c in ctx]
    )

In [9]:
readme = """# KDD RAG (Scrape → JSON → Dense Retrieval → FastAPI)

## Overview
End-to-end RAG system:
1) Scrape KDD products (2 categories) + careers
2) Store as JSON
3) Dense retrieval (e5-base-v2) + optional re-rank
4) Generate answers with FLAN-T5
5) Serve via FastAPI (/ask)

## Stack
- Scraping: requests + BeautifulSoup
- Storage: JSON (`kdd_data.json`)
- Embeddings: intfloat/e5-base-v2 (ungated)
- Vector DB: FAISS (inner product, normalized)
- Generator: google/flan-t5-base (ungated)
- API: FastAPI (+ Swagger UI at /docs)

## Run
pip install -r requirements.txt
python scraper.py   # or run Cell 1
uvicorn app:app --reload

## Example
GET /ask?query=List the KDD product names we scraped.

## Notes
- If the KDD structure changes, adjust CSS selectors in scraper.
- For bonus, replace JSON with PostgreSQL/Milvus/FAISS persisted index.
"""
with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme)
print("✅ Wrote README.md")

✅ Wrote README.md


In [10]:
scraper_py = r'''
# scraper.py
import json, time, re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

PRODUCTS_BASE = "https://www.kddc.com/product/"
CAREERS_BASE  = "https://career.kddc.com/"
HEADERS       = {"User-Agent": "Mozilla/5.0 (compatible; RAGBot/1.0; +https://example.org)"}

def get_soup(url):
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

def clean_text(x: str) -> str:
    import re
    return re.sub(r"\s+", " ", (x or "").strip())

def scrape_product_categories(max_categories=2):
    soup = get_soup(PRODUCTS_BASE)
    cat_links = []
    for a in soup.select("a[href]"):
        href = a.get("href", "")
        if "/product/" in href and href != PRODUCTS_BASE:
            full = urljoin(PRODUCTS_BASE, href)
            cat_links.append(full)
    cat_links = list(dict.fromkeys(cat_links))
    chosen = cat_links[:max_categories]
    all_products = []
    for c in chosen:
        try:
            csoup = get_soup(c)
            product_cards = csoup.select("article, .product-item, .product, .card")
            for card in product_cards:
                title_el = card.select_one(".product-title, h2, h3, .title")
                title = clean_text(title_el.get_text()) if title_el else ""
                link_el  = card.select_one("a[href]")
                link  = urljoin(c, link_el["href"]) if link_el else c
                desc_el  = card.select_one(".desc, .excerpt, p")
                desc  = clean_text(desc_el.get_text()) if desc_el else ""
                if title:
                    all_products.append({"category_url": c, "name": title, "url": link, "description": desc})
        except Exception as e:
            print(f"[warn] product category error {c}: {e}")
        time.sleep(0.5)
    return all_products

def scrape_careers():
    jobs = []
    try:
        soup = get_soup(CAREERS_BASE)
        job_cards = soup.select("article, .job, .career, .opening, .position, .vacancy, .card")
        for card in job_cards:
            title_el = card.select_one("h2, h3, .title")
            title = clean_text(title_el.get_text()) if title_el else ""
            link_el  = card.select_one("a[href]")
            link  = urljoin(CAREERS_BASE, link_el["href"]) if link_el else CAREERS_BASE
            loc_el = card.select_one(".location")
            location = clean_text(loc_el.get_text()) if loc_el else ""
            dept_el = card.select_one(".department, .dept")
            dept     = clean_text(dept_el.get_text()) if dept_el else ""
            sum_el = card.select_one("p, .summary, .desc")
            summary  = clean_text(sum_el.get_text()) if sum_el else ""
            if title:
                jobs.append({"title": title, "url": link, "location": location, "department": dept, "summary": summary})
    except Exception as e:
        print(f"[warn] careers page error: {e}")
    return jobs

def save_json(products, careers, path="kdd_data.json"):
    payload = {"products": products, "careers": careers, "meta": {"source_products": PRODUCTS_BASE, "source_careers": CAREERS_BASE}}
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)
    print(f"Saved {len(products)} products and {len(careers)} careers to {path}")

if __name__ == "__main__":
    products = scrape_product_categories(max_categories=2)
    careers  = scrape_careers()
    if not products and not careers:
        print("[info] Using fallback sample data.")
        products = [
            {"category_url": PRODUCTS_BASE+"sample-iot/", "name": "IoT Gateway X1", "url": PRODUCTS_BASE+"iot-gateway-x1", "description": "Edge gateway for industrial IoT."},
            {"category_url": PRODUCTS_BASE+"sample-networking/", "name": "Smart Switch S12", "url": PRODUCTS_BASE+"smart-switch-s12", "description": "Managed Layer-2 switch for SMBs."},
        ]
        careers = [
            {"title": "Data Engineer", "url": CAREERS_BASE+"jobs/data-engineer", "location": "Kuwait", "department": "IT", "summary": "Build data pipelines and analytics."},
            {"title": "Sales Executive", "url": CAREERS_BASE+"jobs/sales-executive", "location": "Kuwait", "department": "Sales", "summary": "Develop client relationships and hit targets."},
        ]
    save_json(products, careers, "kdd_data.json")
'''

app_py = r'''
# app.py
import json, numpy as np
from fastapi import FastAPI, Query
from pydantic import BaseModel
from typing import List
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load data
with open("kdd_data.json", "r", encoding="utf-8") as f:
    raw = json.load(f)

docs = []
for p in raw.get("products", []):
    docs.append({"id": f"prod::{p.get('name','').strip()}", "type": "product", "payload": p, "text": json.dumps(p, ensure_ascii=False)})
for j in raw.get("careers", []):
    docs.append({"id": f"job::{j.get('title','').strip()}", "type": "career", "payload": j, "text": json.dumps(j, ensure_ascii=False)})

# Embeddings + index
embedder = SentenceTransformer("intfloat/e5-base-v2")
def embed_texts(texts): return embedder.encode(texts, normalize_embeddings=True, convert_to_numpy=True)
doc_vecs = embed_texts([f"passage: {d['text']}" for d in docs])
index = faiss.IndexFlatIP(doc_vecs.shape[1])
index.add(doc_vecs)

# Re-ranker (optional)
try:
    reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
    USE_RERANK = True
except Exception:
    reranker = None
    USE_RERANK = False

# Generator
tok = AutoTokenizer.from_pretrained("google/flan-t5-base")
gen = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

SYSTEM_INSTR = ("You are a helpful assistant that answers ONLY using the provided context. "
                "If the answer is not present, say: 'I don't know based on the scraped data.'")

def retrieve(query: str, k=10, k_final=5):
    qv = embed_texts([f"query: {query}"])
    D, I = index.search(qv, k)
    cands = [docs[i] for i in I[0]]
    if USE_RERANK and reranker is not None:
        pairs = [(query, c["text"]) for c in cands]
        scores = reranker.predict(pairs)
        order = np.argsort(scores)[::-1][:k_final]
        return [cands[i] for i in order]
    return cands[:k_final]

def generate_answer(query: str, contexts, max_new_tokens=256):
    context_block = "\n\n".join([c["text"] for c in contexts]) if contexts else "No context."
    prompt = f"{SYSTEM_INSTR}\n\nContext:\n{context_block}\n\nQuestion: {query}\nAnswer:"
    inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=2048)
    with torch.no_grad():
        out = gen.generate(**inputs, max_new_tokens=max_new_tokens)
    return tok.decode(out[0], skip_special_tokens=True)

class AskResponse(BaseModel):
    answer: str
    context_ids: List[str]
    context_snippets: List[str]

app = FastAPI(title="KDD RAG API", version="1.0")

@app.get("/health")
def health():
    return {"status": "ok", "docs": len(docs)}

@app.get("/ask", response_model=AskResponse)
def ask(query: str = Query(..., description="NL question about KDD products or careers")):
    ctx = retrieve(query)
    answer = generate_answer(query, ctx)
    return AskResponse(
        answer=answer,
        context_ids=[c["id"] for c in ctx],
        context_snippets=[c["text"][:240] for c in ctx]
    )
'''

reqs = """transformers
sentence-transformers
faiss-cpu
fastapi
uvicorn
bs4
requests
pydantic==2.*
python-multipart
"""

with open("scraper.py", "w", encoding="utf-8") as f:
    f.write(scraper_py)
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_py)
with open("requirements.txt", "w", encoding="utf-8") as f:
    f.write(reqs)

print("✅ Wrote scraper.py, app.py, requirements.txt")

✅ Wrote scraper.py, app.py, requirements.txt
