In [None]:
!pip install -U google-genai faiss-cpu beautifulsoup4 requests numpy

In [None]:
import getpass
import os

import faiss
import numpy as np
import requests
from bs4 import BeautifulSoup
from google import genai

# Supply the Gemini API key without hardcoding it in the notebook: reuse
# GOOGLE_API_KEY when it is already set (the previous code overwrote it with a
# placeholder), otherwise prompt for it with hidden input.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Gemini API key: ")

# The client is built without explicit credentials, so it depends on the
# environment variable prepared above.
client = genai.Client()

In [None]:
def load_website(url, timeout=30):
    """Download a web page and return the text of its paragraphs.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of every ``<p>`` element that remains after script, style,
        nav and footer elements are removed, joined with single spaces and
        with all runs of whitespace collapsed.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    # Without a timeout, requests may wait indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly rather than chunking and indexing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Remove unwanted elements
    for tag in soup(["script", "style", "nav", "footer"]):
        tag.decompose()

    paragraphs = soup.find_all("p")
    text = " ".join(p.get_text() for p in paragraphs)
    # Normalise newlines, tabs and repeated spaces to single spaces.
    text = " ".join(text.split())

    return text

In [None]:
def chunk_text(text, chunk_size=400, overlap=80):
    """Split text into overlapping chunks measured in whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap that is not smaller than the chunk size would keep the
            window from ever advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a chunk reaches the last word, any further window would only
        # repeat words already contained in this chunk.
        if end >= len(words):
            break

    return chunks

In [None]:
# Page whose content is indexed.
url = "https://en.wikipedia.org/wiki/Artificial_intelligence"

# Fetch the page text, then split it into chunks.
website_text = load_website(url)
chunks = chunk_text(website_text)

# Pair every chunk with a source label; chunk numbers in the label start at 1.
documents = [
    {"text": piece, "source": f"{url} | Chunk {number}"}
    for number, piece in enumerate(chunks, start=1)
]

print("Total Chunks:", len(documents))

In [None]:
def embed_text(text):
    """Embed ``text`` with the Gemini embedding model.

    Args:
        text: The content passed to the embedding request.

    Returns:
        The values of the first embedding in the response as a float32 NumPy
        array.
    """
    result = client.models.embed_content(
        model="models/gemini-embedding-001",
        contents=text,
    )
    first_embedding = result.embeddings[0]
    return np.array(first_embedding.values, dtype="float32")

In [None]:
# Embed every chunk; embed_text is called once per document.
embeddings = list(map(embed_text, (doc["text"] for doc in documents)))
# The first vector's length sets the index dimensionality
# (assumes every embedding has the same length).
dimension = len(embeddings[0])

# Flat L2 index holding one row per chunk embedding, in document order.
index = faiss.IndexFlatL2(dimension)
index.add(np.stack(embeddings))

print("FAISS index built successfully.")

In [None]:
def retrieve(query, top_k=8):
    """Return the indexed chunks closest to ``query``.

    Args:
        query: The question to embed and search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of dicts with keys ``"text"``, ``"source"`` and ``"score"``,
        in the order returned by the index search. ``"score"`` is the distance
        reported by the index, so smaller means closer. The list is shorter
        than ``top_k`` when the index holds fewer vectors and empty when there
        is nothing to search.
    """
    # Never request more neighbours than the index holds: FAISS pads the
    # missing slots with label -1.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_vector = embed_text(query)
    distances, indices = index.search(np.array([query_vector]), k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        if idx < 0:
            # -1 means "no neighbour"; using it as a list index would
            # silently return the last document.
            continue
        doc = documents[int(idx)]
        results.append({
            "text": doc["text"],
            "source": doc["source"],
            "score": float(dist)
        })
    return results

In [None]:
# Cut-off applied to each result's "score". Results are kept only when their
# score is strictly below this value.
SIMILARITY_THRESHOLD = 1.2


def guardrail_filter(results):
    """Keep only the results whose ``"score"`` is below ``SIMILARITY_THRESHOLD``.

    Args:
        results: Iterable of dicts that each carry a numeric ``"score"``.

    Returns:
        A new list with the passing results in their original order.
    """
    kept = []
    for hit in results:
        if hit["score"] < SIMILARITY_THRESHOLD:
            kept.append(hit)
    return kept

In [None]:
def rerank(query, retrieved_chunks, top_n=3):
    """Ask the Gemini model to pick the chunks most relevant to ``query``.

    Args:
        query: The user's question.
        retrieved_chunks: List of chunk dicts, each with a ``"text"`` key.
        top_n: Maximum number of chunks to return.

    Returns:
        Up to ``top_n`` of the given chunks in the order named by the model.
        If the reply contains no usable chunk number, the first ``top_n``
        chunks are returned in their incoming order so the caller still has
        context. An empty input returns an empty list without calling the
        model.
    """
    if not retrieved_chunks:
        return []

    chunk_texts = "\n\n".join(
        f"Chunk {i}: {chunk['text']}"
        for i, chunk in enumerate(retrieved_chunks, start=1)
    )

    prompt = f"""
    Query: {query}

    Rank the top {top_n} most relevant chunks by number.

    {chunk_texts}

    Return only numbers separated by commas.
    """

    response = client.models.generate_content(
        model="models/gemini-2.5-flash",
        contents=prompt
    )

    # NOTE(review): response.text is presumably None when the model returns no
    # text part; treat that like an empty reply instead of raising.
    ranked_numbers = (response.text or "").strip()

    top_indices = []
    for token in ranked_numbers.split(","):
        token = token.strip()
        # isdecimal() only accepts characters that int() can parse.
        if not token.isdecimal():
            continue
        position = int(token) - 1
        # Reject 0 (it would become index -1 and select the last chunk),
        # out-of-range numbers and repeated numbers.
        if 0 <= position < len(retrieved_chunks) and position not in top_indices:
            top_indices.append(position)

    if not top_indices:
        return retrieved_chunks[:top_n]

    return [retrieved_chunks[i] for i in top_indices[:top_n]]

In [None]:
def generate_answer(query):
    """Answer ``query`` from the indexed website content.

    Pipeline: retrieve candidate chunks, drop those that fail the guardrail
    filter, let the model rerank the rest, then ask the model to answer using
    only the selected chunks.

    Args:
        query: The user's question.

    Returns:
        A tuple ``(answer_text, sources)``. ``sources`` lists the source label
        of each chunk used, without duplicates, in rerank order. When no chunk
        survives filtering or reranking, the answer is a fixed "not available"
        message and ``sources`` is empty.
    """
    not_available = "The answer is not available in the provided website content."

    retrieved = retrieve(query)
    filtered = guardrail_filter(retrieved)

    if not filtered:
        return not_available, []

    reranked = rerank(query, filtered)

    # Reranking can select nothing; do not ask the model to answer from an
    # empty context.
    if not reranked:
        return not_available, []

    context = "\n\n".join([chunk["text"] for chunk in reranked])
    # dict.fromkeys removes duplicates while keeping rank order; a set would
    # return the labels in arbitrary order.
    sources = list(dict.fromkeys(chunk["source"] for chunk in reranked))

    final_prompt = f"""
    Answer the question strictly using the context below.
    If the answer is not found, say it is not available.

    Context:
    {context}

    Question:
    {query}
    """

    response = client.models.generate_content(
        model="models/gemini-2.5-flash",
        contents=final_prompt
    )

    return response.text, sources

In [None]:
# Example question for the pipeline.
query = "What is the difference between AI and Machine Learning?"

# generate_answer returns the answer text together with its source labels.
answer, sources = generate_answer(query)

# The heading is followed by a blank line, then the answer itself.
print("Answer:\n", answer, sep="\n")

print("\nSources:")
for source in sources:
    print("-", source)

In [None]:
#  Cosine Similarity Re-ranking (Upgrade 1)

def cosine_rerank(query, retrieved_chunks, top_k=3):
    """Rank chunks by cosine similarity between their embedding and the query's.

    Args:
        query: The user's question.
        retrieved_chunks: Iterable of chunk dicts, each with a ``"text"`` key.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk dicts sorted from most to least similar. Each is
        a copy of the input dict with an added ``"cosine_score"`` float; the
        input dicts are left unmodified.
    """
    query_vec = embed_text(query)
    # The query norm is the same for every chunk, so compute it once.
    query_norm = np.linalg.norm(query_vec)

    scored = []
    for chunk in retrieved_chunks:
        # Each chunk is embedded again here, one embed_text call per chunk.
        chunk_vec = embed_text(chunk["text"])
        denominator = query_norm * np.linalg.norm(chunk_vec)

        if denominator > 0:
            cosine_sim = float(np.dot(query_vec, chunk_vec) / denominator)
        else:
            # A zero-length vector has no direction; score it 0 rather than
            # producing NaN from a division by zero.
            cosine_sim = 0.0

        # Build a new dict so the caller's chunk objects are not mutated.
        scored.append({**chunk, "cosine_score": cosine_sim})

    scored.sort(key=lambda item: item["cosine_score"], reverse=True)

    return scored[:top_k]

In [None]:
def extract_sources(chunks):
    """Return the distinct ``"source"`` labels of ``chunks`` in first-seen order.

    Args:
        chunks: Iterable of dicts that each carry a ``"source"`` key.

    Returns:
        A list of unique source labels. Order follows the first appearance in
        ``chunks``, so the result is deterministic and keeps the ranking of
        the input (a set would return the labels in arbitrary order).
    """
    return list(dict.fromkeys(chunk["source"] for chunk in chunks))

In [None]:
#  Confidence Estimation (Upgrade 3)

def calculate_confidence(reranked_chunks):
    """Turn the mean cosine score of the chunks into a confidence in [0, 1].

    Args:
        reranked_chunks: Sequence of dicts that each carry a numeric
            ``"cosine_score"``.

    Returns:
        A built-in ``float`` rounded to three decimals. The mean score is
        mapped from the cosine range [-1, 1] onto [0, 1] and clamped. Returns
        0.0 when there are no chunks or when the mean is not a finite number.
    """
    if not reranked_chunks:
        return 0.0

    avg_score = float(np.mean([chunk["cosine_score"] for chunk in reranked_chunks]))

    # NaN would pass through min()/max() clamping and be reported as full
    # confidence, so handle non-finite means explicitly.
    if not np.isfinite(avg_score):
        return 0.0

    # Normalize between 0 and 1 (cosine is already between -1 and 1)
    confidence = min(1.0, max(0.0, (avg_score + 1) / 2))

    return round(confidence, 3)

In [None]:
def generate_answer(query):
    """Answer ``query`` from the indexed content and report sources and confidence.

    Steps: retrieve candidates, apply the guardrail filter, rerank by cosine
    similarity, generate the answer from the selected chunks, and score the
    confidence from the reranked chunks.

    Args:
        query: The user's question.

    Returns:
        A dict with keys ``"answer"`` (model text, or a fixed "not available"
        message), ``"sources"`` (list of source labels) and ``"confidence"``
        (0.0 when nothing passed the guardrail filter).
    """
    # Retrieve candidates and keep only those that pass the guardrail.
    candidates = guardrail_filter(retrieve(query))

    # Nothing relevant enough: answer without calling the generation model.
    if not candidates:
        fallback = {
            "answer": "The answer is not available in the provided website content.",
            "sources": [],
            "confidence": 0.0
        }
        return fallback

    # Reorder the survivors by cosine similarity and keep the best ones.
    top_chunks = cosine_rerank(query, candidates)

    # Source labels and prompt context both come from the reranked chunks.
    sources = extract_sources(top_chunks)
    context = "\n\n".join(chunk["text"] for chunk in top_chunks)

    final_prompt = f"""
    Answer the question strictly using the context below.
    If the answer is not found, say it is not available.

    Context:
    {context}

    Question:
    {query}
    """

    response = client.models.generate_content(
        model="models/gemini-2.5-flash",
        contents=final_prompt
    )

    # Confidence is derived from the cosine scores attached during reranking.
    confidence = calculate_confidence(top_chunks)

    result = {
        "answer": response.text,
        "sources": sources,
        "confidence": confidence
    }
    return result

In [None]:
# Example question for the upgraded pipeline.
query = "What is the difference between AI and Machine Learning?"

# This generate_answer returns a dict with answer, sources and confidence.
result = generate_answer(query)

# The heading is followed by a blank line, then the answer itself.
print("Answer:\n", result["answer"], sep="\n")

print("\nSources:")
for source in result["sources"]:
    print("-", source)

print("\nConfidence Score:", result["confidence"])