In [0]:
%pip install databricks-vectorsearch

In [0]:
# Restart the Python process via the Databricks utility. This runs right after
# the %pip install cell, presumably so the freshly installed package becomes
# importable -- NOTE(review): confirm against the cluster/runtime in use.
dbutils.library.restartPython()

In [0]:
import json
import re
from databricks.vector_search.client import VectorSearchClient
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import ChatMessage, ChatMessageRole

In [0]:
# --- Dynamic configuration (parameters received from the Job) ---

# Declare the widgets with their defaults; a Job run can override each value.
dbutils.widgets.text("catalog", "workspace")
dbutils.widgets.text("schema", "default")
dbutils.widgets.text("endpoint_name", "vector-search-demo")

# Strip stray whitespace so a padded job parameter does not end up inside the
# fully qualified index name.
CATALOG = dbutils.widgets.get("catalog").strip()
SCHEMA = dbutils.widgets.get("schema").strip()
ENDPOINT_NAME = dbutils.widgets.get("endpoint_name").strip()

# Fail fast on empty parameters: otherwise INDEX_NAME would look like
# "..pdf_hierarchical_index" and the problem would only surface later as a
# swallowed error inside every single test question.
_missing = [
    name
    for name, value in (
        ("catalog", CATALOG),
        ("schema", SCHEMA),
        ("endpoint_name", ENDPOINT_NAME),
    )
    if not value
]
if _missing:
    raise ValueError(f"Faltan valores para los widgets requeridos: {', '.join(_missing)}")

# Three-level name (catalog.schema.index) of the vector search index under test.
INDEX_NAME = f"{CATALOG}.{SCHEMA}.pdf_hierarchical_index"

print(f"üß™ Iniciando pruebas de validaci√≥n contra:")
print(f"   üîé √çndice: {INDEX_NAME}")

In [0]:
# Default validation questions, serialized as JSON because a text widget can
# only hold a string. ensure_ascii=False keeps accented characters as-is so the
# widget shows readable, editable text instead of \uXXXX escapes; json.loads
# parses both forms identically.
default_questions = json.dumps([
    "¬øCu√°les son los componentes clave de Real-Time Intelligence?",
    "En la matriz de riesgos, ¬øqu√© problema tiene el ID 'AME001'?",
    "Diferencia entre Coalesce y Repartition en Spark"
], ensure_ascii=False)

# Widget through which a Job (or an interactive user) can supply its own list.
dbutils.widgets.text("validation_questions_json", default_questions, "Lista de Preguntas (JSON)")

# Raw JSON text; it is parsed and validated in the cell that runs the tests.
questions_str = dbutils.widgets.get("validation_questions_json")

In [0]:
def _seleccionar_top_docs(contenido_juez, raw_docs, top_k=3):
    """Return the documents picked by the LLM judge, or the first `top_k` as fallback.

    Args:
        contenido_juez: Raw text returned by the re-ranking model (may be None).
        raw_docs: Candidate rows, in the order returned by the vector search.
        top_k: Maximum number of documents to keep.

    Returns:
        A list with at most `top_k` rows taken from `raw_docs`. When the judge's
        answer yields no usable index, the first `top_k` rows are returned.
    """
    indices = None
    match = re.search(r'\[.*?\]', contenido_juez or "", re.DOTALL)
    if match:
        try:
            indices = json.loads(match.group(0))
        except json.JSONDecodeError:
            # Bracketed text that is not valid JSON (e.g. a trailing comma):
            # treat it like a missing answer instead of aborting the question.
            indices = None

    elegidos = []
    vistos = set()
    for idx in indices or []:
        # bool is a subclass of int; skip it so a JSON `true` is not read as index 1.
        if isinstance(idx, bool) or not isinstance(idx, int):
            continue
        # Ignore out-of-range and repeated indices.
        if 0 <= idx < len(raw_docs) and idx not in vistos:
            vistos.add(idx)
            elegidos.append(raw_docs[idx])

    if not elegidos:
        print("‚ö†Ô∏è Fallo parsing de Juez. Usando top 3 vectoriales.")
        return raw_docs[:top_k]
    # The prompt asks for 3 documents; trim in case the model returned more.
    return elegidos[:top_k]


def consultar_y_generar(pregunta):
    """Run one retrieve -> re-rank -> generate round trip and print the outcome.

    Steps:
        1. Hybrid similarity search (15 candidates) against INDEX_NAME on
           ENDPOINT_NAME.
        2. An LLM "judge" is asked for the indices of the 3 most relevant
           candidates; if its answer cannot be used, the first 3 search results
           are taken instead.
        3. The selected chunks are sent as context to the same model to produce
           the final answer, which is printed together with the source files.

    Args:
        pregunta: Question text to validate against the index.

    Returns:
        None. All results are reported through print(). Errors raised after
        the clients are created are caught and printed so that a failing
        question does not stop the remaining tests.
    """
    print(f"\nüîé Buscando: '{pregunta}'")
    vsc = VectorSearchClient()
    w = WorkspaceClient()

    try:
        # 1. Retrieval
        index = vsc.get_index(ENDPOINT_NAME, INDEX_NAME)
        results = index.similarity_search(
            query_text=pregunta,
            columns=["chunk_full_text", "section_abstract", "section_title", "source_path"],
            num_results=15,
            query_type="HYBRID"
        )

        # Rows are indexed positionally below; this assumes each row follows the
        # order of `columns` above (0=full text, 1=abstract, 2=title, 3=path)
        # -- TODO confirm against the vector search response format.
        raw_docs = (results.get('result') or {}).get('data_array') or []
        if not raw_docs:
            print("‚ùå No se encontraron documentos.")
            return

        # 2. Re-ranking (Llama 3): show the judge title + abstract of each candidate.
        candidates_text = "".join(
            f"Doc [{i}] ({doc[2]}): {doc[1]}\n" for i, doc in enumerate(raw_docs)
        )

        rerank_prompt = f"""
        Act√∫a como experto. Selecciona los 3 documentos m√°s relevantes.
        Pregunta: "{pregunta}"
        Candidatos: {candidates_text}
        Devuelve SOLO una lista JSON de √≠ndices, ej: [0, 4, 2].
        """

        ranking_response = w.serving_endpoints.query(
            name="databricks-meta-llama-3-3-70b-instruct",
            messages=[ChatMessage(role=ChatMessageRole.USER, content=rerank_prompt)],
            temperature=0.0
        )

        content = ranking_response.choices[0].message.content
        top_docs = _seleccionar_top_docs(content, raw_docs)

        # 3. Generation
        contexto = "\n\n".join([f"--- Secci√≥n: {d[2]} ---\n{d[0]}" for d in top_docs])
        # File names only; rows without a path are skipped instead of crashing,
        # and sorting keeps the printed order stable between runs.
        sources = sorted({str(d[3]).split('/')[-1] for d in top_docs if d[3]})

        final_answer = w.serving_endpoints.query(
            name="databricks-meta-llama-3-3-70b-instruct",
            messages=[ChatMessage(role=ChatMessageRole.USER, content=f"Responde usando: {contexto}. Pregunta: {pregunta}")],
            max_tokens=500
        )

        print("="*40)
        print(f"üìù {final_answer.choices[0].message.content}")
        print(f"üìö Fuentes: {', '.join(sources)}")
        print("="*40)

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller move on
        # to the next question.
        print(f"‚ùå Error en inferencia: {e}")

In [0]:
# Parse the questions supplied through the widget; on any problem fall back to
# the built-in default list so the validation run still exercises the index.
try:
    questions_list = json.loads(questions_str)

    # Safety check: the payload must be a non-empty list of non-blank strings.
    # An empty list would silently run zero tests, and non-string items would
    # only fail later inside the inference call.
    if (
        not isinstance(questions_list, list)
        or not questions_list
        or not all(isinstance(q, str) and q.strip() for q in questions_list)
    ):
        raise ValueError("El par√°metro no es una lista v√°lida. Formato esperado: ['Pregunta A', 'Pregunta B']")

    print(f"üìã Se recibieron {len(questions_list)} preguntas para validar.")

except Exception as e:
    # Best-effort by design: report what was wrong with the parameter and
    # continue with the default questions.
    print(f"‚ö†Ô∏è Error al leer el par√°metro de preguntas: {e}")
    print("üîÑ Usando la lista por defecto de emergencia.")
    questions_list = json.loads(default_questions)

# --- 3. Run the dynamic validation loop ---
print("\n" + "="*50)
print("üöÄ INICIANDO TEST DE INFERENCIA")
print("="*50)

for i, pregunta in enumerate(questions_list):
    print(f"\nüß™ Test #{i+1}: {pregunta}")
    consultar_y_generar(pregunta)  # run the retrieve/re-rank/generate check for this question