# 🚀 Pipeline Completo: Generación de Preguntas AWS CLF-C02

Este notebook ejecuta todo el pipeline de generación de preguntas:
1. Fetch de documentación AWS oficial
2. Construcción del sistema RAG con ChromaDB
3. Estimación de costos
4. Generación de preguntas con GPT-4o-mini
5. Evaluación con Arize Phoenix

---

**Requisitos:**
- Archivo `.env` con tu `OPENAI_API_KEY`
- Dependencias instaladas: `pip install -r requirements.txt`

## 🔧 Setup Inicial

In [None]:
import os
import sys
from pathlib import Path
import json
from datetime import datetime
from IPython.display import display, HTML, Markdown
import pandas as pd

# Make the notebook's working directory importable so project-local modules resolve.
PROJECT_ROOT = Path.cwd()
sys.path.insert(0, str(PROJECT_ROOT))

# Load variables from a .env file (if present) into the process environment.
from dotenv import load_dotenv
load_dotenv()

# Verify that the API key is available before any paid call is attempted.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    display(HTML('<div style="background-color: #ffcccc; padding: 10px; border-radius: 5px;">❌ <b>Error:</b> OPENAI_API_KEY no encontrada en .env<br>Por favor, crea un archivo .env con tu API key</div>'))
else:
    # Cell output is stored inside the notebook file, which is routinely shared
    # or committed. Never echo a usable prefix of the secret: show only the
    # last four characters, and nothing at all for implausibly short values.
    masked_key = "********" + (api_key[-4:] if len(api_key) > 8 else "")
    display(HTML('<div style="background-color: #ccffcc; padding: 10px; border-radius: 5px;">✅ <b>API Key detectada:</b> ' + masked_key + '</div>'))

print(f"\n📂 Directorio de trabajo: {PROJECT_ROOT}")
print(f"🕐 Inicio del pipeline: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 📥 Paso 1: Fetch Documentación AWS

Descarga documentación oficial de AWS para usar en el sistema RAG.

In [None]:
import requests
import time

# Directory that holds the downloaded AWS documentation; created if missing.
DOCS_DIR = PROJECT_ROOT / "data" / "aws_docs"
DOCS_DIR.mkdir(parents=True, exist_ok=True)

# Documentation URLs: display name -> PDF location.
AWS_DOCS = {
    "CLF-C02 Exam Guide": "https://d1.awsstatic.com/training-and-certification/docs-cloud-practitioner/AWS-Certified-Cloud-Practitioner_Exam-Guide.pdf",
    "Well-Architected Framework": "https://docs.aws.amazon.com/wellarchitected/latest/framework/wellarchitected-framework.pdf",
}

# Service FAQ pages: service name -> FAQ URL.
SERVICE_FAQS = {
    "EC2": "https://aws.amazon.com/ec2/faqs/",
    "S3": "https://aws.amazon.com/s3/faqs/",
    "Lambda": "https://aws.amazon.com/lambda/faqs/",
    "RDS": "https://aws.amazon.com/rds/faqs/",
    "DynamoDB": "https://aws.amazon.com/dynamodb/faqs/",
    "IAM": "https://aws.amazon.com/iam/faqs/",
    "VPC": "https://aws.amazon.com/vpc/faqs/",
}

def download_file(url, filename, desc=""):
    """Download ``url`` into ``DOCS_DIR / filename`` unless it already exists.

    Args:
        url: Address fetched with an HTTP GET (30 second timeout).
        filename: Name of the target file inside ``DOCS_DIR``.
        desc: Human-readable label used in the progress messages.

    Returns:
        True if the file was already present or was downloaded successfully,
        False if the request or the write failed.
    """
    output_path = DOCS_DIR / filename

    if output_path.exists():
        print(f"⏭️  {desc}: Ya existe")
        return True

    # Write to a sibling ".part" file and rename it once complete. Because an
    # existing target is treated as "already downloaded", writing directly to
    # the final path would let an interrupted write leave a truncated file
    # that every later run silently skips.
    partial_path = output_path.with_name(output_path.name + ".part")

    try:
        print(f"📥 Descargando {desc}...", end=" ")
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        with open(partial_path, 'wb') as f:
            f.write(response.content)
        partial_path.replace(output_path)

        print(f"✅ ({len(response.content)/1024:.1f} KB)")
        return True
    except (requests.RequestException, OSError) as e:
        # Network/HTTP failures and filesystem failures are expected here and
        # reported as a failed download; anything else is a bug and propagates.
        print(f"❌ Error: {str(e)}")
        try:
            partial_path.unlink()
        except OSError:
            # Nothing to clean up (or it cannot be removed); the final path
            # was never created, so the next run retries the download.
            pass
        return False

# Fetch the official PDF guides, pausing one second after each request.
print("\n📚 DESCARGANDO DOCUMENTACIÓN OFICIAL AWS")
print("="*60)

success_count = 0
for doc_name, doc_url in AWS_DOCS.items():
    pdf_name = doc_name.replace(" ", "-") + ".pdf"
    # download_file returns a bool, so int() adds 1 on success and 0 on failure.
    success_count += int(download_file(doc_url, pdf_name, doc_name))
    time.sleep(1)

# Fetch the per-service FAQ pages the same way.
print("\n📚 DESCARGANDO FAQs DE SERVICIOS")
print("="*60)

for svc, faq_url in SERVICE_FAQS.items():
    success_count += int(download_file(faq_url, f"{svc.lower()}_faq.html", f"{svc} FAQ"))
    time.sleep(1)

# Summarise everything currently in DOCS_DIR (not only this run's downloads).
downloaded = list(DOCS_DIR.glob("*"))
total_files = len(downloaded)
total_size = sum(entry.stat().st_size for entry in downloaded) / (1024 * 1024)

metric_rows = [
    ('Total archivos', total_files),
    ('PDFs', sum(1 for _ in DOCS_DIR.glob("*.pdf"))),
    ('HTMLs', sum(1 for _ in DOCS_DIR.glob("*.html"))),
    ('Tamaño total', f"{total_size:.2f} MB"),
]
summary_df = pd.DataFrame(metric_rows, columns=['Métrica', 'Valor'])

display(HTML("<h3>📊 Resumen de Descarga</h3>"))
display(summary_df)

## 🔮 Paso 2: Construir Sistema RAG

Procesa los documentos y crea una base de datos vectorial con ChromaDB.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, UnstructuredHTMLLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
import tiktoken

# Directory where the Chroma vector database is persisted; created if missing.
CHROMA_DIR = PROJECT_ROOT / "data" / "chroma_db"
CHROMA_DIR.mkdir(parents=True, exist_ok=True)

# The splitter below receives these values multiplied by 4 and measures length
# in characters (presumably ~4 characters per token — confirm if tuning).
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
EMBEDDING_MODEL = "text-embedding-3-small"

print("\n📖 CARGANDO DOCUMENTOS")
print("="*60)

documents = []
errors = []

# Load PDFs: PyPDFLoader yields one document per page.
pdf_files = list(DOCS_DIR.glob("*.pdf"))
print(f"\n📄 Procesando {len(pdf_files)} PDFs...")

for pdf_path in pdf_files:
    try:
        loader = PyPDFLoader(str(pdf_path))
        docs = loader.load()
        for doc in docs:
            # Keep only the file name so retrieval results show a short source.
            doc.metadata["source"] = pdf_path.name
        documents.extend(docs)
        print(f"   ✅ {pdf_path.name}: {len(docs)} páginas")
    except Exception as e:
        # Best-effort: one unreadable file must not abort the whole load, so
        # the failure is recorded and reported, and the loop continues.
        errors.append(f"{pdf_path.name}: {str(e)}")
        print(f"   ⚠️  {pdf_path.name}: Error ({e})")

# Load HTML pages.
html_files = list(DOCS_DIR.glob("*.html"))
print(f"\n🌐 Procesando {len(html_files)} HTMLs...")

for html_path in html_files:
    try:
        loader = UnstructuredHTMLLoader(str(html_path))
        docs = loader.load()
        for doc in docs:
            doc.metadata["source"] = html_path.name
        documents.extend(docs)
        print(f"   ✅ {html_path.name}")
    except Exception as e:
        errors.append(f"{html_path.name}: {str(e)}")
        print(f"   ⚠️  {html_path.name}: Error ({e})")

print(f"\n✅ Total documentos cargados: {len(documents)}")

# Surface the collected failures; previously they were stored but never shown,
# so a partially loaded corpus looked identical to a complete one.
if errors:
    print(f"⚠️  Archivos con errores: {len(errors)}")
    for error_line in errors:
        print(f"   - {error_line}")

# Split the loaded documents into overlapping chunks.
print("\n✂️  DIVIDIENDO EN CHUNKS")
print("="*60)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE * 4,
    chunk_overlap=CHUNK_OVERLAP * 4,
    length_function=len,
)

chunks = text_splitter.split_documents(documents)
# Drop fragments of 50 characters or fewer after trimming whitespace.
chunks = [c for c in chunks if len(c.page_content.strip()) > 50]

# Without chunks the average below divides by zero and there is nothing to
# embed; stop here with an actionable message instead.
if not chunks:
    raise RuntimeError(
        "No se generaron chunks: verifica que el Paso 1 descargó documentos "
        "y que se cargaron sin errores."
    )

# Statistics: token counts are sampled from the first 100 chunks only.
encoding = tiktoken.get_encoding("cl100k_base")
sample_tokens = [len(encoding.encode(c.page_content)) for c in chunks[:100]]
avg_tokens = sum(sample_tokens) / len(sample_tokens)

print(f"   📊 Chunks creados: {len(chunks)}")
print(f"   📊 Tokens promedio: ~{avg_tokens:.0f}")
# 0.02 is the USD price per million tokens assumed for the embedding model.
print(f"   💰 Costo estimado embeddings: ${(len(chunks) * avg_tokens / 1_000_000 * 0.02):.4f} USD")

# Build the vector store, embedding the chunks in fixed-size batches.
# NOTE(review): the collection is persisted under CHROMA_DIR; re-running this
# cell presumably adds the same chunks again — confirm, and clear the
# directory before re-running if so.
print("\n🔮 CREANDO VECTOR DATABASE")
print("="*60)
print(f"   Modelo embeddings: {EMBEDDING_MODEL}")
print("   Procesando en batches de 100...")

embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

batch_size = 100
total_chunks = len(chunks)

# The first batch creates the collection; later batches are appended to it.
vectorstore = Chroma.from_documents(
    documents=chunks[:batch_size],
    embedding=embeddings,
    persist_directory=str(CHROMA_DIR),
    collection_name="aws_docs"
)

for start in range(batch_size, total_chunks, batch_size):
    stop = start + batch_size
    vectorstore.add_documents(chunks[start:stop])
    print(f"   Procesado: {min(stop, total_chunks)}/{total_chunks} chunks")

print(f"\n   ✅ Vector store creado en: {CHROMA_DIR}")

# Smoke-test retrieval with a sample query and preview the top three hits.
print("\n🧪 TEST DE RETRIEVAL")
print("="*60)

test_query = "¿Qué es AWS Shield y cómo protege contra DDoS?"
print(f"\n🔍 Query: {test_query}")
results = vectorstore.similarity_search(test_query, k=3)

for rank, hit in enumerate(results, 1):
    source_name = hit.metadata.get('source', 'Unknown')
    preview = hit.page_content[:150].replace("\n", " ")
    print(f"\n   {rank}. Fuente: {source_name}")
    print(f"      {preview}...")

## 💰 Paso 3: Estimación de Costos

Calcula el costo de generar preguntas según tus necesidades.

In [None]:
# Configuration
NUM_QUESTIONS = 50  # 👈 CHANGE THIS: 50, 100, 200, 300, etc.

# OpenAI prices in USD per single token (listed per million tokens).
PRICES = dict(input=0.15 / 1_000_000, output=0.60 / 1_000_000)

# CLF-C02 domains mapped to their percentage weight.
DOMAINS = {
    "Domain 1: Cloud Concepts": 24,
    "Domain 2: Security and Compliance": 30,
    "Domain 3: Cloud Technology and Services": 34,
    "Domain 4: Billing, Pricing, and Support": 12
}

# Estimated tokens per generated question:
# input = RAG context + prompt, output = question + options + explanation.
tokens_per_question = {"input": 1400, "output": 400}

total_input, total_output = (
    NUM_QUESTIONS * tokens_per_question[kind] for kind in ("input", "output")
)

cost_input = total_input * PRICES["input"]
cost_output = total_output * PRICES["output"]
cost_generation = cost_input + cost_output

# Evaluations: three per question.
eval_tokens_per_question = {"input": 300, "output": 100}
total_evals = NUM_QUESTIONS * 3
eval_cost_in = total_evals * eval_tokens_per_question["input"] * PRICES["input"]
eval_cost_out = total_evals * eval_tokens_per_question["output"] * PRICES["output"]
cost_eval = eval_cost_in + eval_cost_out

# A 30% safety margin is added on top of the estimate.
subtotal = cost_generation + cost_eval
buffer = subtotal * 0.3
total = subtotal + buffer

# Questions per domain, rounded from the percentage weights.
distribution = {
    domain: round((pct / 100) * NUM_QUESTIONS) for domain, pct in DOMAINS.items()
}

# Rounding can leave the sum off target; the heaviest domain absorbs the gap.
diff = NUM_QUESTIONS - sum(distribution.values())
if diff:
    max_domain = max(DOMAINS, key=DOMAINS.get)
    distribution[max_domain] += diff

# Render the estimate: heading, per-domain table, cost table, total banner.
display(HTML(f"<h2>💰 Estimación de Costos para {NUM_QUESTIONS} Preguntas</h2>"))

# Per-domain table; the "Domain N: " prefix is stripped from the label.
dist_df = pd.DataFrame({
    "Dominio": [name.split(": ")[1] for name in distribution],
    "% Oficial": [f"{DOMAINS[name]}%" for name in distribution],
    "Preguntas": list(distribution.values()),
})
display(HTML("<h3>📋 Distribución por Dominio CLF-C02</h3>"))
display(dist_df)

# Cost table, one row per concept: (label, tokens, cost in USD).
eval_tokens_total = total_evals * (
    eval_tokens_per_question['input'] + eval_tokens_per_question['output']
)
cost_rows = [
    ('Generación (input)', f"{total_input:,}", f"${cost_input:.4f}"),
    ('Generación (output)', f"{total_output:,}", f"${cost_output:.4f}"),
    ('Evaluaciones Phoenix', f"{eval_tokens_total:,}", f"${cost_eval:.4f}"),
    ('Subtotal', '-', f"${subtotal:.4f}"),
    ('Buffer (30%)', '-', f"${buffer:.4f}"),
    ('TOTAL', '-', f"${total:.4f}"),
]
cost_df = pd.DataFrame(cost_rows, columns=['Concepto', 'Tokens', 'Costo USD'])

display(HTML("<h3>💵 Desglose de Costos</h3>"))
display(cost_df)

display(HTML(f'<div style="background-color: #e6f3ff; padding: 15px; border-radius: 5px; margin-top: 20px;"><b>💡 Costo total estimado:</b> ${total:.2f} USD para {NUM_QUESTIONS} preguntas evaluadas</div>'))

## 🤖 Paso 4: Generar Preguntas con RAG

Usa GPT-4o-mini para generar preguntas basadas en documentación real de AWS.

In [None]:
from openai import OpenAI
from tqdm.notebook import tqdm
import time

MODEL = "gpt-4o-mini"

# Load the prompt assets from the project's prompts/ directory.
prompts_dir = PROJECT_ROOT / "prompts"
system_prompt_template = (prompts_dir / "system.txt").read_text(encoding='utf-8')
examples = json.loads((prompts_dir / "examples.json").read_text(encoding='utf-8'))

# Topics to draw questions from, grouped by exam domain.
DOMAIN_TOPICS = {
    "Domain 1: Cloud Concepts": [
        "Beneficios de la nube AWS",
        "Alta disponibilidad y tolerancia a fallos",
        "Economía de la nube",
        "Estrategias de migración",
    ],
    "Domain 2: Security and Compliance": [
        "Modelo de responsabilidad compartida",
        "AWS IAM",
        "Servicios de seguridad",
        "Encriptación",
    ],
    "Domain 3: Cloud Technology and Services": [
        "Amazon EC2",
        "Amazon S3",
        "Bases de datos AWS",
        "Servicios de red",
        "Lambda y serverless",
    ],
    "Domain 4: Billing, Pricing, and Support": [
        "Modelos de pricing",
        "AWS Cost Explorer",
        "Planes de soporte",
    ],
}

# OpenAI client, constructed with no explicit arguments.
client = OpenAI()

def get_example_for_domain(domain):
    """Return the few-shot example registered for ``domain``.

    Looks through the module-level ``examples`` list for the first entry whose
    ``"domain"`` equals ``domain``; when none matches, the first entry in the
    list is used instead.
    """
    matching = (entry for entry in examples if entry["domain"] == domain)
    chosen = next(matching, None)
    if chosen is None:
        chosen = examples[0]
    return chosen["example"]

def generate_question(domain, topic, vectorstore):
    """Generate one exam question for ``topic`` using retrieved AWS context.

    Args:
        domain: Exam domain label; used in the retrieval query, the system
            prompt and to pick the few-shot example.
        topic: Topic the question must cover.
        vectorstore: Object exposing ``similarity_search(query, k=...)`` that
            returns documents with ``metadata`` and ``page_content``.

    Returns:
        On success, the model's JSON object extended with ``domain``,
        ``topic``, ``retrieved_context`` and ``tokens_used``. On any failure,
        ``{"error": <message>}``.
    """
    # Retrieval: top three chunks, each trimmed to 500 characters and tagged
    # with its source file.
    query = f"{domain}: {topic}"
    docs = vectorstore.similarity_search(query, k=3)
    context = "\n\n".join([f"[{doc.metadata.get('source', 'Unknown')}]\n{doc.page_content[:500]}" for doc in docs])

    # Prompt
    system_msg = system_prompt_template.format(context=context, domain=domain)
    example = get_example_for_domain(domain)

    user_msg = f"""Genera UNA pregunta de examen sobre: {topic}

Sigue el formato del siguiente ejemplo:
{json.dumps(example, indent=2, ensure_ascii=False)}

Responde SOLO con JSON válido."""

    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": user_msg}
            ],
            temperature=0.8,
            response_format={"type": "json_object"}
        )

        question_data = json.loads(response.choices[0].message.content)
        if not isinstance(question_data, dict):
            # The enrichment below needs a JSON object; report anything else
            # as a failed generation rather than raising a TypeError.
            return {"error": f"Respuesta JSON inesperada: {type(question_data).__name__}"}

        question_data["domain"] = domain
        question_data["topic"] = topic
        # Store the full context the model was given. Keeping only the first
        # 500 characters made the later hallucination check compare the
        # explanation against a fraction of what the model actually saw.
        question_data["retrieved_context"] = context
        question_data["tokens_used"] = {
            "input": response.usage.prompt_tokens,
            "output": response.usage.completion_tokens
        }

        return question_data
    except Exception as e:
        # Deliberate catch-all: one failed API call or unparsable reply must
        # not abort the batch; the caller counts error results.
        return {"error": str(e)}

# Expand the per-domain quotas into one (domain, topic) pair per question,
# spreading each domain's quota as evenly as possible across its topics
# (the first `remainder` topics get one extra question).
distribution_list = []
for domain, count in distribution.items():
    topics = DOMAIN_TOPICS[domain]
    questions_per_topic, remainder = divmod(count, len(topics))

    for i, topic in enumerate(topics):
        topic_count = questions_per_topic + (1 if i < remainder else 0)
        distribution_list.extend([(domain, topic)] * topic_count)

# Generate the questions
display(HTML(f"<h3>🤖 Generando {len(distribution_list)} preguntas con GPT-4o-mini...</h3>"))

questions = []
total_cost = 0.0
failed = 0
failure_details = []  # (domain, topic, error message) for each failed call

progress_bar = tqdm(distribution_list, desc="Generando")
for i, (domain, topic) in enumerate(progress_bar, 1):
    q = generate_question(domain, topic, vectorstore)

    if "error" not in q:
        questions.append(q)
        # Reuse the PRICES table from the estimation step so the estimate and
        # the real cost cannot drift apart.
        cost = (q["tokens_used"]["input"] * PRICES["input"] +
                q["tokens_used"]["output"] * PRICES["output"])
        total_cost += cost
        progress_bar.set_postfix({"Costo": f"${total_cost:.4f}"})
    else:
        failed += 1
        failure_details.append((domain, topic, q["error"]))

    # Rate limiting: pause one second after every tenth request.
    if i % 10 == 0:
        time.sleep(1)

# Show why generations failed instead of reporting only a count.
if failure_details:
    print(f"⚠️  {failed} preguntas fallaron:")
    for failed_domain, failed_topic, message in failure_details:
        print(f"   - {failed_domain} / {failed_topic}: {message}")

# Save the raw (not yet evaluated) questions.
output_path = PROJECT_ROOT / "data" / "questions_raw.json"
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(questions, f, indent=2, ensure_ascii=False)

# Summary
result_df = pd.DataFrame({
    'Métrica': ['Generadas', 'Fallidas', 'Costo Real'],
    'Valor': [len(questions), failed, f"${total_cost:.4f} USD"]
})

display(HTML("<h3>✅ Generación Completada</h3>"))
display(result_df)
display(HTML(f'<div style="background-color: #ccffcc; padding: 10px; border-radius: 5px; margin-top: 10px;">📁 Guardado en: {output_path}</div>'))

# Preview of the first question. The model's JSON is not schema-validated at
# this point, so model-provided fields are read defensively; `domain` and
# `topic` are always set by generate_question.
if questions:
    display(HTML("<h3>👀 Preview de Primera Pregunta</h3>"))
    q = questions[0]
    display(Markdown(f"**Dominio:** {q['domain']}"))
    display(Markdown(f"**Topic:** {q['topic']}"))
    display(Markdown(f"**Pregunta:** {q.get('question', '(sin pregunta)')}"))
    display(Markdown(f"**Opciones:**"))
    options = q.get('options')
    if isinstance(options, dict):
        for letter, text in options.items():
            marker = "✅" if letter == q.get('correct_answer') else "  "
            display(Markdown(f"{marker} **{letter})** {text}"))
    else:
        display(Markdown("(opciones no disponibles en el formato esperado)"))

## 🔍 Paso 5: Evaluación con Phoenix

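Evalúa la calidad de las preguntas generadas usando Arize Phoenix. Cada pregunta pasa por tres evaluaciones (alucinación, corrección de la respuesta y cumplimiento del formato CLF-C02) y se aprueba solo si obtiene alucinación < 0.3, corrección > 0.7 y cumplimiento >= 0.9.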

In [None]:
import phoenix as px
from phoenix.evals import HallucinationEvaluator, QAEvaluator, OpenAIModel

# Start Phoenix
display(HTML("<h3>🚀 Iniciando Arize Phoenix...</h3>"))
session = px.launch_app()
# NOTE(review): the link below is hard-coded to localhost:6006 rather than
# taken from `session` — confirm it matches where the app is actually served.
display(HTML('<div style="background-color: #e6f3ff; padding: 10px; border-radius: 5px;">📊 Dashboard disponible en: <a href="http://localhost:6006" target="_blank">http://localhost:6006</a></div>'))

# Setup evaluators
# Both evaluators share one judge model, built from the same MODEL name that
# is used for question generation.
eval_model = OpenAIModel(model=MODEL)
hallucination_eval = HallucinationEvaluator(eval_model)
qa_eval = QAEvaluator(eval_model)

def evaluate_clf_compliance(q):
    """Score how well a question dict follows the expected CLF-C02 structure.

    Six structural checks are run; missing keys, ``None`` values and values of
    an unexpected type simply fail the corresponding check instead of raising.

    Args:
        q: Question dict (typically parsed from the model's JSON output).

    Returns:
        A ``(score, checks)`` tuple where ``checks`` maps each check name to a
        bool and ``score`` is the fraction of checks that passed (0.0 to 1.0).
    """
    options = q.get('options')
    if not isinstance(options, (dict, list, tuple)):
        options = {}
    answer = q.get('correct_answer')
    domain = q.get('domain')
    explanation = q.get('explanation')
    question = q.get('question')

    checks = {
        "has_4_options": len(options) == 4,
        "has_correct_answer": answer in ['A', 'B', 'C', 'D'],
        "has_domain": isinstance(domain, str) and domain.startswith('Domain'),
        "has_explanation": isinstance(explanation, str) and len(explanation) > 100,
        # The isinstance guard keeps an unhashable answer (e.g. a list) from
        # raising TypeError on the dict membership test.
        "answer_in_options": isinstance(answer, str) and answer in options,
        "question_not_empty": isinstance(question, str) and len(question) > 20,
    }
    score = sum(checks.values()) / len(checks)
    return score, checks

# Evaluate every generated question with three checks and sort it into
# `approved` or `rejected`.
display(HTML(f"<h3>🔍 Evaluando {len(questions)} preguntas...</h3>"))

approved = []
rejected = []
eval_results = []

progress = tqdm(questions, desc="Evaluando")
for q in progress:
    # The model's JSON is not schema-validated, so read fields defensively;
    # structural problems are scored by evaluate_clf_compliance below.
    question_text = q.get('question') or ''
    explanation = q.get('explanation') or ''
    options = q.get('options')
    if not isinstance(options, dict):
        options = {}
    correct = q.get('correct_answer')
    eval_errors = []

    # NOTE(review): confirm that `evaluate` accepts these keyword arguments
    # and returns an object with `.score` in the installed Phoenix version.
    # If it does not, every call lands in the fallback below, each score is
    # 0.5 and all questions are rejected.

    # Eval 1: Hallucination
    input_text = f"{question_text}\n\n" + "\n".join([f"{k}) {v}" for k, v in options.items()])
    try:
        hall_result = hallucination_eval.evaluate(
            input=input_text,
            output=explanation,
            context=q.get('retrieved_context', '')
        )
        hall_score = hall_result.score
    except Exception as e:
        # `except Exception` (not a bare except) so a kernel interrupt still
        # stops the loop; the neutral 0.5 fallback is kept but recorded.
        hall_score = 0.5
        eval_errors.append(f"hallucination: {e}")

    # Eval 2: QA Correctness
    try:
        qa_result = qa_eval.evaluate(
            input=question_text,
            output=f"{correct}) {options[correct]}",
            reference=explanation
        )
        qa_score = qa_result.score
    except Exception as e:
        qa_score = 0.5
        eval_errors.append(f"qa_correctness: {e}")

    # Eval 3: Compliance
    clf_score, checks = evaluate_clf_compliance(q)

    # Decide: all three thresholds must hold.
    passed = (hall_score < 0.3 and qa_score > 0.7 and clf_score >= 0.9)

    result = {
        "hallucination": hall_score,
        "qa_correctness": qa_score,
        "clf_compliance": clf_score,
        "passed": passed
    }
    eval_results.append(result)

    q_with_eval = {**q, "phoenix_evals": result}
    if eval_errors:
        # Distinguishes "the evaluator failed" from "the evaluator scored it low".
        q_with_eval["eval_errors"] = eval_errors

    if passed:
        approved.append(q_with_eval)
    else:
        reasons = []
        if hall_score >= 0.3:
            reasons.append(f"Hallucination: {hall_score:.2f}")
        if qa_score <= 0.7:
            reasons.append(f"QA: {qa_score:.2f}")
        if clf_score < 0.9:
            reasons.append(f"Compliance: {clf_score:.2f}")
        q_with_eval["rejection_reasons"] = reasons
        rejected.append(q_with_eval)

    progress.set_postfix({"Aprobadas": len(approved), "Rechazadas": len(rejected)})

# Save results: approved and rejected questions go to separate files.
approved_path = PROJECT_ROOT / "data" / "questions_evaluated.json"
rejected_path = PROJECT_ROOT / "data" / "questions_rejected.json"

with open(approved_path, 'w', encoding='utf-8') as f:
    json.dump(approved, f, indent=2, ensure_ascii=False)

with open(rejected_path, 'w', encoding='utf-8') as f:
    json.dump(rejected, f, indent=2, ensure_ascii=False)

# Report
total = len(questions)
approval_rate = (len(approved) / total * 100) if total > 0 else 0

# Guard the averages like approval_rate above: with no evaluated questions
# (e.g. every generation failed) they previously raised ZeroDivisionError.
num_evals = len(eval_results)
avg_hall = sum(r["hallucination"] for r in eval_results) / num_evals if num_evals else 0.0
avg_qa = sum(r["qa_correctness"] for r in eval_results) / num_evals if num_evals else 0.0
avg_clf = sum(r["clf_compliance"] for r in eval_results) / num_evals if num_evals else 0.0

report_df = pd.DataFrame({
    'Métrica': [
        'Total Evaluadas',
        '✅ Aprobadas',
        '❌ Rechazadas',
        'Tasa de Aprobación',
        '',
        'Hallucination (avg)',
        'QA Correctness (avg)',
        'CLF Compliance (avg)'
    ],
    'Valor': [
        total,
        len(approved),
        len(rejected),
        f"{approval_rate:.1f}%",
        '',
        f"{avg_hall:.3f}",
        f"{avg_qa:.3f}",
        f"{avg_clf:.3f}"
    ]
})

display(HTML("<h2>📊 REPORTE FINAL DE EVALUACIONES</h2>"))
display(report_df)

display(HTML(f'<div style="background-color: #ccffcc; padding: 15px; border-radius: 5px; margin-top: 20px;">'
             f'<b>✅ Preguntas aprobadas:</b> {len(approved)}<br>'
             f'<b>📁 Guardadas en:</b> {approved_path}<br><br>'
             f'<b>❌ Preguntas rechazadas:</b> {len(rejected)}<br>'
             f'<b>📁 Guardadas en:</b> {rejected_path}<br><br>'
             f'<b>📊 Phoenix Dashboard:</b> <a href="http://localhost:6006" target="_blank">http://localhost:6006</a>'
             f'</div>'))

# Preview of one approved question, with the scores that let it pass.
if approved:
    display(HTML("<h3>✅ Ejemplo de Pregunta Aprobada</h3>"))
    q = approved[0]
    display(Markdown(f"**Dominio:** {q['domain']}"))
    display(Markdown(f"**Pregunta:** {q['question']}"))
    for letter, text in q['options'].items():
        marker = "✅" if letter == q['correct_answer'] else "  "
        display(Markdown(f"{marker} **{letter})** {text}"))
    display(Markdown(f"**Scores Phoenix:**"))
    display(Markdown(f"- Hallucination: {q['phoenix_evals']['hallucination']:.3f} (< 0.3 ✅)"))
    display(Markdown(f"- QA Correctness: {q['phoenix_evals']['qa_correctness']:.3f} (> 0.7 ✅)"))
    display(Markdown(f"- CLF Compliance: {q['phoenix_evals']['clf_compliance']:.3f} (>= 0.9 ✅)"))

## 🎉 Pipeline Completado

¡Listo! Has generado y evaluado preguntas de calidad para el examen AWS CLF-C02.

### Próximos pasos:

1. **Revisar preguntas aprobadas** en `data/questions_evaluated.json`
2. **Explorar Phoenix Dashboard** en http://localhost:6006 para ver análisis detallado
3. **Integrar al simulador** copiando las preguntas aprobadas a `data/questions.json`

### Para integrar con el simulador web:

```python
# Copiar preguntas al simulador
import shutil
shutil.copy(
    PROJECT_ROOT / "data" / "questions_evaluated.json",
    PROJECT_ROOT / "data" / "questions.json"
)
```