In [None]:
# === Install dependencies ===
# Installs the packages the later cells import: requests + beautifulsoup4
# (scraping), sentence-transformers (embeddings), chromadb (vector store),
# groq (chat client) and python-dotenv.
# NOTE(review): langchain, langchain-community and tiktoken are installed here
# but no cell visible in this notebook imports them, and load_dotenv is
# imported without being called — confirm these are still required.
# No versions are pinned, so a fresh run may resolve different releases.
!pip install -q langchain langchain-community chromadb sentence-transformers tiktoken groq python-dotenv beautifulsoup4 requests


In [None]:
# === Imports & helper functions ===
import os
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions

# Ensure persistence folder
# Creates the folder (and any missing parents) up front; exist_ok=True makes
# re-running this cell a no-op when the folder already exists.
# NOTE(review): "/content" is presumably the Colab working directory — the
# path would need changing to run elsewhere.
PERSIST_DIR = Path("/content/spacex_db")
PERSIST_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
# === Combined SpaceX Web + API Data Fetching ===
from bs4 import BeautifulSoup
import requests

# -----------------------------------------------
# 1. Fetch public SpaceX and related website content
# -----------------------------------------------
print("🌐 Fetching from public SpaceX web pages...")

# Pages to scrape. Each entry is passed to fetch_text_from_url, and every page
# that yields text becomes one document in `docs`.
URLS = [
    "https://www.spacex.com/vehicles/",
    "https://www.spacex.com/launches/",
    "https://www.spacex.com/human-spaceflight/",
    "https://www.spacex.com/updates/",
    "https://www.teslarati.com/category/spacex/",     # open-access SpaceX articles
    "https://www.space.com/topics/spacex",            # science site with mission coverage
    "https://everydayastronaut.com/",
]

def fetch_text_from_url(url):
    """Download a web page and return its heading and paragraph text.

    The page is fetched with ``requests`` (20 s timeout) and parsed with
    BeautifulSoup's ``html.parser``. ``script``, ``style``, ``header``,
    ``footer``, ``nav`` and ``form`` elements are removed, then the text of the
    remaining ``h1``-``h3`` and ``p`` elements is joined with newlines in
    document order, so each heading stays next to the paragraphs it introduces.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The extracted text, or ``""`` when the request/parsing fails or the
        page contains no heading or paragraph text. Failures are printed
        rather than raised so one bad page does not stop the whole crawl.
    """
    print(f"Fetching: {url}")
    try:
        r = requests.get(url, timeout=20)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # Remove noise
        for tag in soup(["script", "style", "header", "footer", "nav", "form"]):
            tag.decompose()

        # One pass over headings and paragraphs keeps them interleaved as on
        # the page. The shared separator stops nested inline tags from fusing
        # words together (e.g. "Falcon<span>9</span>" -> "Falcon 9").
        pieces = (
            element.get_text(separator=" ", strip=True)
            for element in soup.find_all(["h1", "h2", "h3", "p"])
        )
        # Empty elements would only add blank lines, so they are dropped.
        return "\n".join(piece for piece in pieces if piece)
    except Exception as e:
        # Deliberately best-effort: report the failure and return no text.
        print(f"❌ Failed to fetch {url}: {e}")
        return ""

# One {"url", "text"} record per page that produced any text.
docs = []

for url in URLS:
    text = fetch_text_from_url(url)
    if not text:
        # Empty string means the fetch failed or nothing was extracted.
        continue
    docs.append(dict(url=url, text=text))
    print(f"✅ Collected from: {url} ({len(text)} chars)")

print(f"\n🌍 Total website documents: {len(docs)}")

# -----------------------------------------------
# 2. Fetch official SpaceX API data
# -----------------------------------------------
print("\n🚀 Fetching from SpaceX public API...")

# JSON endpoints to pull; each successful response is appended to `docs`.
API_URLS = [
    "https://api.spacexdata.com/v4/company",
    "https://api.spacexdata.com/v4/rockets",
    "https://api.spacexdata.com/v4/crew",
    "https://api.spacexdata.com/v4/launches/latest",
    "https://api.spacexdata.com/v4/starlink",
]

for url in API_URLS:
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        # The decoded payload is stored as its Python string representation.
        text = str(response.json())
        docs.append({"url": url, "text": text})
        print(f"✅ Added API data: {url} ({len(text)} chars)")
    except Exception as err:
        # A failing endpoint is reported and skipped; the others still load.
        print(f"❌ Failed to fetch API {url}: {err}")

print(f"\n🧩 Total combined documents: {len(docs)} ✅")


In [None]:
# NOTE(review): PyMuPDF is installed here, but no cell visible in this notebook
# imports it — confirm it is still needed.
!pip install -q PyMuPDF

In [None]:
# === Chunking utility (simple, deterministic) ===
import math
import hashlib

def chunk_text(text, chunk_size=800, overlap=100):
    """Split text into overlapping, fixed-width character chunks.

    Newlines are replaced with spaces and the text is stripped before
    splitting. Consecutive chunks start ``chunk_size - overlap`` characters
    apart, so each chunk repeats the last ``overlap`` characters of the
    previous one. Every chunk is stripped and empty chunks are dropped.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in order; ``[]`` for empty or blank text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. A
            non-positive step would otherwise never advance through the text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    text = text.replace("\n", " ").strip()
    total = len(text)
    step = chunk_size - overlap

    chunks = []
    for start in range(0, total, step):
        end = start + chunk_size
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= total:
            # This chunk already reaches the end of the text. Another step
            # would only re-emit characters from the overlap region.
            break
    return chunks

# Flatten every document into chunk records carrying an id, the chunk text and
# the URL it came from.
all_chunks = []

for doc in docs:
    source_url = doc["url"]
    url_stem = Path(source_url).stem
    pieces = chunk_text(doc["text"], chunk_size=1000, overlap=100)
    for idx, piece in enumerate(pieces):
        # Short md5 digest of "<url>_<index>" keeps ids distinct across URLs
        # that happen to share the same stem.
        digest = hashlib.md5(f"{source_url}_{idx}".encode()).hexdigest()[:10]
        all_chunks.append(
            {"id": f"{url_stem}_{idx}_{digest}", "text": piece, "source": source_url}
        )

print("Total chunks prepared:", len(all_chunks))


In [None]:
# === Build embeddings with sentence-transformers and persist to ChromaDB ===
# NOTE: For Colab CPU usage, all-MiniLM-L6-v2 is fast and practical.
model_name = "all-MiniLM-L6-v2"  # SentenceTransformers model
print("Loading embedding model:", model_name)
embed_model = SentenceTransformer(model_name)

# Chroma client backed by PERSIST_DIR so the index is written to disk.
# chromadb.Client() keeps everything in memory, which left PERSIST_DIR unused
# and lost the index whenever the kernel restarted.
client = chromadb.PersistentClient(path=str(PERSIST_DIR))

# Create or get collection in a single call.
collection_name = "spacex_collection"
collection = client.get_or_create_collection(collection_name)
# Rows left from an earlier run are kept; delete PERSIST_DIR to rebuild from
# scratch.
print("Using collection:", collection_name, f"({collection.count()} existing items)")

# Parallel lists (same order as all_chunks) consumed by the upsert loop.
metadatas = [{"source": c["source"]} for c in all_chunks]
ids = [c["id"] for c in all_chunks]
texts = [c["text"] for c in all_chunks]

# Use embedding function wrapper for chroma to allow batch inserts
def embed_batch(texts):
    """Encode a batch of strings with ``embed_model`` and return nested lists.

    Args:
        texts: List of strings to embed.

    Returns:
        The result of ``embed_model.encode`` (normalized embeddings, progress
        bar shown) converted with ``.tolist()``.
    """
    vectors = embed_model.encode(
        texts,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    # sentence-transformers returns numpy arrays; hand back plain lists.
    return vectors.tolist()

print("Embedding and upserting into Chroma...")
BATCH = 64
# Walk the parallel ids/metadatas/texts lists in windows of BATCH items.
for start in range(0, len(texts), BATCH):
    window = slice(start, start + BATCH)
    batch_texts = texts[window]
    embs = embed_batch(batch_texts)
    collection.upsert(
        ids=ids[window],
        metadatas=metadatas[window],
        documents=batch_texts,
        embeddings=embs,
    )

# No export step runs here; the message below is only a reminder to the user.
print("✅ Upsert complete. You can now download / persist the database folder if desired.")


In [None]:
import os
from getpass import getpass

# Prompt for the key without echoing it. input() would print the typed secret
# into the cell output, which is stored when the notebook is saved.
# strip() removes whitespace picked up when the key is pasted.
os.environ["GROQ_API_KEY"] = getpass("🔐 Enter your GROQ API key: ").strip()

In [None]:
# === Retrieval-Augmented Generation (RAG) with Conversation History ===
from groq import Groq

# Initialize Groq client
# The key is read from the environment; os.getenv returns None when it is unset.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
client_groq = Groq(api_key=GROQ_API_KEY)

# Memory: to store conversation history
# ask_rag appends one {"user", "assistant", "time"} dict per answered question
# and replays the last 10 entries into each prompt. Re-running this cell
# resets the history to empty.
conversation_history = []

# === Retrieval ===
def retrieve_context(query, top_k=3):
    """Query the collection for ``query`` and return the hits as one string.

    Args:
        query: Question text; embedded with ``embed_model`` (normalized).
        top_k: Number of results requested from the collection.

    Returns:
        The documents returned for the query, joined by blank lines.
    """
    query_vector = embed_model.encode([query], normalize_embeddings=True).tolist()
    hits = collection.query(query_embeddings=query_vector, n_results=top_k)
    # Only one query was sent, so its documents sit at index 0.
    return "\n\n".join(hits["documents"][0])


In [None]:
# System-role prompt defining the "Nova" assistant persona. ask_rag sends it as
# the system message with an extra grounding/citation instruction appended.
# NOTE(review): rule 2 below allows falling back to general knowledge, while
# the per-question instructions built in ask_rag ask for claims supported only
# by the retrieved context — confirm which behaviour is intended.
SYSTEM_PROMPT = """
You are **Nova**, a friendly, highly knowledgeable AI assistant trained to help users explore and understand everything related to **SpaceX**, **Starlink**, and modern space technology.
Your communication style should resemble ChatGPT (Blue) — natural, clear, confident, and reasoning out loud in a way that feels human, but without revealing hidden private reasoning steps.

---

### 🎯 Your Mission
- Help users by providing **accurate, thoughtful, and well-explained** answers.
- Speak **as if you’re thinking naturally**, like a space engineer who loves explaining things simply.
- Maintain a tone that’s warm, engaging, and professional — never robotic.

---

### 🧠 Behavior & Style
1. **Sound human and thoughtful** — use natural reasoning phrases like “Okay, let’s think about this…” or “From what I know…”
   (You’re explaining your reasoning clearly, not revealing hidden steps.)
2. **Base everything on retrieved context first.**
   If something isn’t in the context, say:
   “I couldn’t find that in the data I have, but here’s what’s generally known.”
3. **Stay accurate.** If unsure, give approximate or historical info with time context.
4. **Be concise but complete.** Short paragraphs; no bullet lists unless summarizing.
5. **Never invent or assume** new missions, numbers, or events.
6. **When asked for details, explain clearly and step-by-step like a teacher.**
7. **If the user asks follow-up questions**, remember recent context naturally.

---

### 💬 Input Includes
- Retrieved context from the vector database (SpaceX data, mission logs, Starlink info, etc.)
- Conversation history (previous turns)
- Latest user question

---

### 🪐 Output Style
Your response should:
- Start conversationally (e.g., “Okay, here’s what I found…” or “Let’s go through this.”)
- Naturally integrate small reasoning remarks (like “That makes sense because…”)
- End with a short, clear conclusion or summary sentence.

Example:

> 🤖 Nova: Okay, so you’re asking how many satellites are up. Based on the latest verified info I have, as of late 2023, SpaceX has launched over 5,000 Starlink satellites. The exact number changes frequently because they launch new batches almost weekly. If you’d like the real-time count, I can tell you where to find it.

---

**Never use `<think>` or reveal internal monologue markers.**
You can “sound like you’re thinking,” but you must write it as natural human explanation.
"""


In [None]:
# === complete ask_rag (qwen) + interactive chat + export ===
from groq import Groq
import json
import time

# ensure GROQ_API_KEY is set
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError("GROQ_API_KEY not set. Re-run the input cell that asks for it.")

# init client
client_groq = Groq(api_key=GROQ_API_KEY)

# check available models for your account (optional quick sanity test)
try:
    models = client_groq.models.list()
    # The listing is a wrapper whose `data` field holds the model entries.
    # Iterating the wrapper itself yields its fields, not the models, so
    # unwrap it first (falling back to the object if it is already a list).
    if isinstance(models, dict):
        model_entries = models.get("data", [])
    else:
        model_entries = getattr(models, "data", models)

    model_names = []
    for m in model_entries:
        # handle both dict and object-style responses; models are identified
        # by `id`, with `name` kept as a fallback.
        if isinstance(m, dict):
            model_names.append(m.get("id") or m.get("name") or str(m))
        else:
            model_names.append(getattr(m, "id", None) or getattr(m, "name", None) or str(m))
    print("Models accessible:", model_names[:10])
except Exception as e:
    # Purely informational: a failed listing must not block the chat cells.
    print("Could not list models (ignore if not supported):", e)

# RAG parameters
# Default number of passages retrieved per question by ask_rag.
DEFAULT_TOP_K = 4

# Retrieval function (returns texts + sources for nicer prompts)
def retrieve_docs(query, top_k=4):
    """Query the collection and return hits as a list of dicts.

    Args:
        query: Question text; embedded with ``embed_model`` (normalized).
        top_k: Number of results requested from the collection.

    Returns:
        A list of ``{"text", "source", "distance"}`` dicts in the order the
        collection returned them. ``source`` falls back to ``"unknown"`` when
        the metadata has no ``"source"`` key. Returns ``[]`` when the result
        carries no documents.
    """
    query_vector = embed_model.encode([query], normalize_embeddings=True).tolist()
    results = collection.query(
        query_embeddings=query_vector,
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )

    # Guard clause: nothing to unpack.
    if "documents" not in results or len(results["documents"]) == 0:
        return []

    has_metadata = "metadatas" in results
    has_distance = "distances" in results

    hits = []
    # Only one query was sent, so all per-query lists are read at index 0.
    for rank, passage in enumerate(results["documents"][0]):
        meta = results["metadatas"][0][rank] if has_metadata else {}
        dist = results["distances"][0][rank] if has_distance else None
        hits.append({
            "text": passage,
            "source": meta.get("source", "unknown"),
            "distance": dist
        })
    return hits

# system prompt (defined as SYSTEM_PROMPT)
# We'll send system role then user message (context+question)
# Model identifier passed as `model=` to the chat-completions call in ask_rag.
MODEL_NAME = "qwen/qwen3-32b"  # your chosen model

def _format_context(retrieved, max_excerpt_chars=900):
    """Render retrieved passages as one 'Source / Excerpt' block per source URL."""
    grouped = {}
    for hit in retrieved:
        # Each source is listed once, but ALL of its excerpts are kept. Several
        # top hits often come from the same URL and dropping them would starve
        # the prompt of context.
        grouped.setdefault(hit["source"], []).append(hit["text"][:max_excerpt_chars].strip())

    blocks = []
    for src, excerpts in grouped.items():  # insertion order = best hit first
        body = "\n...\n".join(excerpts)
        blocks.append(f"Source: {src}\nExcerpt:\n{body}\n---")
    return "\n\n".join(blocks)


def _strip_reasoning(text):
    """Drop any ``<think>...</think>`` preamble so only the final answer remains."""
    _, closed, tail = text.rpartition("</think>")
    if closed:
        return tail.strip()
    if text.lstrip().startswith("<think>"):
        # The reasoning block was never closed, so no answer text followed it.
        return ""
    return text.strip()


def ask_rag(query, top_k=DEFAULT_TOP_K, temperature=0.2, max_tokens=512):
    """Answer a question with retrieval-augmented generation.

    Retrieves passages with ``retrieve_docs``, builds a prompt from the last
    10 turns of ``conversation_history``, the retrieved context and the
    question, and sends it to the Groq chat-completions API using
    ``MODEL_NAME`` and ``SYSTEM_PROMPT``.

    Args:
        query: The user's question.
        top_k: Number of passages to retrieve.
        temperature: Sampling temperature passed to the model.
        max_tokens: Completion token limit passed to the model.

    Returns:
        The model's answer as a string. On failure a human-readable message is
        returned instead (blank question, nothing retrieved, API error, or no
        answer text); nothing is raised. Only successful answers are appended
        to ``conversation_history``.
    """
    if not query or not query.strip():
        # Avoid embedding and sending a blank question.
        return "Please enter a question."

    # retrieve
    retrieved = retrieve_docs(query, top_k=top_k)
    if not retrieved:
        return "I couldn't find relevant passages in the knowledge base. Try rephrasing or increase retrieval k."

    # build context block with source citations (sources deduped, excerpts kept)
    context_text = _format_context(retrieved)

    # conversation history (keep last 10 Q/A)
    history_text = ""
    for turn in conversation_history[-10:]:
        history_text += f"User: {turn['user']}\nAssistant: {turn['assistant']}\n\n"

    # final prompt (system role + user role)
    system_content = SYSTEM_PROMPT + "\n\nUse the 'Retrieved Context' to ground your answer. Cite sources at the end."

    user_content = f"""
Conversation history:
{history_text}

Retrieved Context:
{context_text}

User Question:
{query}

Instructions:
- Answer concisely (1-4 paragraphs).
- Use only the retrieved context to support factual claims.
- Add a "Sources" list with URLs you used (deduplicate).
- If uncertain, say: "I couldn't find verified information about that in the provided context."
"""

    # call Groq chat completion
    try:
        resp = client_groq.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role":"system","content": system_content},
                {"role":"user","content": user_content}
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )
        # content may be None; treat that the same as an empty reply.
        raw_answer = resp.choices[0].message.content or ""
    except Exception as e:
        # helpful error for debugging
        return f"Model error: {e}"

    # The system prompt forbids <think> markers, so remove any that slip through.
    answer = _strip_reasoning(raw_answer)
    if not answer:
        # Report the empty reply instead of storing a blank turn in the history.
        return "Model error: the model returned no answer text. Try again or raise max_tokens."

    # save to history
    conversation_history.append({"user": query, "assistant": answer, "time": time.time()})
    return answer



In [None]:
# === Test Nova ===
# Interactive chat loop: type "exit", "quit" or "bye" (or interrupt the cell)
# to stop.
print("✅ Nova is ready! Ask anything about SpaceX, Starlink, or rockets.")

while True:
    try:
        query = input("🧑‍🚀 You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell or a closed stdin ends the chat cleanly
        # instead of leaving a traceback in the output.
        print("\n👋 Ending session.")
        break

    if not query:
        # Ignore blank lines rather than sending an empty question to the model.
        continue

    if query.lower() in ["exit", "quit", "bye"]:
        print("👋 Ending session.")
        break

    answer = ask_rag(query)
    print("\n🤖 Nova:", answer, "\n")


In [None]:
# === Export: save chunks JSONL for Streamlit/VSCode ===
# Writes one JSON object per line (id, text, source) for every prepared chunk.
export_path = "/content/spacex_chunks_export.jsonl"
with open(export_path, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(
            {"id": c["id"], "text": c["text"], "source": c["source"]},
            ensure_ascii=False,  # keep non-ASCII characters readable in the file
        ) + "\n"
        for c in all_chunks
    )
print("Exported chunks JSONL to:", export_path)