In [None]:
import json
import os
from openai import AzureOpenAI
from typing import List, Optional, Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FutureTimeoutError
import threading
from functools import partial
import time
from datetime import datetime

# Azure OpenAI configuration.
endpoint = "https://invuniandesai-2.openai.azure.com/"
model_name = "gpt-5.1"
deployment = "gpt-5.1-grande"
# The key is read from the environment so it never has to be written into the
# notebook. The fallback keeps the previous value (an empty string).
subscription_key = os.environ.get("AZURE_OPENAI_API_KEY", "")
api_version = "2024-12-01-preview"

if not subscription_key:
    # Surface the problem here instead of after a long run of failed requests.
    print("[ADVERTENCIA] AZURE_OPENAI_API_KEY no está definida; la API key está vacía")

# Shared Azure OpenAI client used by every request in this notebook.
client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

# Parallelisation setting (adjust to the Azure limits of the deployment).
MAX_WORKERS = 4

# Incremental save setting: persist progress every N processed items.
SAVE_EVERY_N_ITEMS = 100

# Global lock that serialises file writes across threads.
_write_lock = threading.Lock()

# Custom exception for API limit
class APILimitReached(Exception):
    """Signals that the API reported a rate-limit or quota condition."""

In [2]:
def send_azure_prompt(message: str, system_message: str = "You are a helpful assistant specialized in Spanish idioms."):
    """Send a single-turn chat prompt to the Azure OpenAI deployment.

    Args:
        message: The user message to send.
        system_message: The system message that sets the assistant context.

    Returns:
        The content of the first choice on success, or a dict of the form
        ``{"error": "<message>"}`` when the request fails for any reason
        other than an API limit.

    Raises:
        APILimitReached: When the failure looks like a rate-limit or quota
            error: the exception carries ``status_code == 429``, or its
            message mentions 429, a rate limit, a quota or "too many requests".
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import re

    try:
        response = client.chat.completions.create(
            model=deployment,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": message},
            ],
        )
        return response.choices[0].message.content

    except Exception as e:
        error_msg = str(e).lower()

        # Prefer the structured status code when the exception exposes one.
        # The text fallback is deliberately narrower than a bare "rate"
        # substring, which also matches unrelated words such as "generate".
        status_code = getattr(e, "status_code", None)
        mentions_limit = re.search(
            r"\b429\b|rate[\s_-]?limit|quota|too many requests", error_msg
        ) is not None

        if status_code == 429 or mentions_limit:
            # Chain the original exception so the traceback keeps the cause.
            raise APILimitReached(f"Azure API limit reached: {e}") from e

        return {"error": str(e)}

### Configuración General

In [3]:
def _import_prompts() -> Dict[str, str]:
    """Load prompt_1..prompt_3 from prompts.py and return them as a dict.

    The directory ``../../Straico`` (relative to this file when ``__file__``
    is defined, otherwise relative to the current working directory) is put at
    the front of ``sys.path`` before importing the ``prompts`` module.

    Returns:
        A dict mapping each of "prompt_1", "prompt_2", "prompt_3" that exists
        as an attribute of the module to its value. An empty dict is returned
        (after printing a warning) when the module cannot be imported.
    """
    import sys

    # Make the sibling "Straico" directory importable.
    parent_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd(), '../..', 'Straico')
    if parent_dir not in sys.path:
        sys.path.insert(0, parent_dir)

    try:
        import prompts as p
    except Exception as e:
        # Still best-effort, but say why: otherwise the only symptom is a
        # later "prompt not found" message with no hint about the cause.
        print(f"[ADVERTENCIA] No se pudo importar prompts.py desde {parent_dir}: {e}")
        return {}

    return {
        name: getattr(p, name)
        for name in ("prompt_1", "prompt_2", "prompt_3")
        if hasattr(p, name)
    }

#### Configuración de Rutas

In [4]:
DATASET_PATH = '../../DataSet/DataSet_PrimeraOcurrencia.json'
RESPONSES_DIR = '../Results'

# Read the dataset once to report how many rows it contains.
with open(DATASET_PATH, encoding='utf-8') as f:
    dataset_json = json.load(f)
TOTAL_DATASET_ROWS = len(dataset_json)

print(f"Dataset: {DATASET_PATH}")
print(f"Total de modismos disponibles: {TOTAL_DATASET_ROWS:,}")

# Load the prompt templates and list which ones were found.
PROMPTS = _import_prompts()
print("Prompts cargados: " + ', '.join(PROMPTS))

Dataset: ../../DataSet/DataSet_PrimeraOcurrencia.json
Total de modismos disponibles: 6,533
Prompts cargados: prompt_1, prompt_2, prompt_3


In [5]:
# Number of dataset rows to process.
# N_ROWS = 1  # For quick tests
N_ROWS = None  # None processes the whole dataset

In [6]:
def cargar_dataset(n_rows=None):
    """Load the dataset at DATASET_PATH as a list of dicts.

    Rows whose 'modismo' is missing or empty are skipped, and so are rows
    whose 'modismo' repeats an earlier one (compared case-insensitively via
    ``casefold``).

    Args:
        n_rows: Maximum number of rows to return. A falsy value (None or 0)
            returns every row.

    Returns:
        A list of ``{'modismo': str, 'significado': str}`` dicts with
        surrounding whitespace stripped from both values.
    """
    def _clean(value):
        # JSON null arrives as None; treat it like a missing key instead of
        # failing on ``None.strip()``.
        return '' if value is None else str(value).strip()

    with open(DATASET_PATH, encoding='utf-8') as f:
        dataset_json = json.load(f)

    rows = []
    seen_modismos = set()

    for r in dataset_json:
        modismo = _clean(r.get('modismo'))
        dedup_key = modismo.casefold()
        if not modismo or dedup_key in seen_modismos:
            continue
        seen_modismos.add(dedup_key)

        rows.append({
            'modismo': modismo,
            'significado': _clean(r.get('significado')),
        })

        if n_rows and len(rows) >= n_rows:
            break

    return rows


def save_json(filepath, data):
    """Write data to filepath as JSON while holding the global write lock.

    The data is first written to ``<filepath>.tmp`` and then moved over the
    target with ``os.replace``. The parent directory is created when missing.

    Args:
        filepath: Destination path of the JSON file.
        data: JSON-serialisable object to store.

    Returns:
        True when the file was written, False when any step failed (the error
        is printed and the temporary file is removed on a best-effort basis).
    """
    # Computed before the try block so the error handler can always refer to
    # it, even when the failure happens while creating the directory.
    temp_filepath = f"{filepath}.tmp"

    with _write_lock:
        try:
            dir_path = os.path.dirname(filepath)
            if dir_path:
                os.makedirs(dir_path, exist_ok=True)

            with open(temp_filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)

            os.replace(temp_filepath, filepath)
            return True
        except Exception as e:
            print(f"[ERROR] No se pudo guardar {filepath}: {e}")
            try:
                os.remove(temp_filepath)
            except OSError:
                # No temporary file to remove, or it could not be removed;
                # cleanup is best-effort only.
                pass
            return False


def load_checkpoint(base_dir, prompt_name):
    """Load ``<base_dir>/<prompt_name>/checkpoint.json`` if it exists.

    Args:
        base_dir: Directory that holds one sub-directory per prompt.
        prompt_name: Name of the prompt sub-directory.

    Returns:
        The parsed JSON content, or None when the file does not exist or
        cannot be read or parsed (a warning is printed in the latter case).
    """
    checkpoint_path = os.path.join(base_dir, prompt_name, "checkpoint.json")

    if not os.path.exists(checkpoint_path):
        return None

    try:
        with open(checkpoint_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers JSON and text-decoding errors. A bare except here
        # would also swallow KeyboardInterrupt.
        print(f"[ADVERTENCIA] Checkpoint ilegible en {checkpoint_path}: {e}")
        return None


def save_checkpoint(base_dir, prompt_name, checkpoint_data):
    """Store checkpoint_data as ``<base_dir>/<prompt_name>/checkpoint.json``.

    Args:
        base_dir: Directory that holds one sub-directory per prompt.
        prompt_name: Name of the prompt sub-directory.
        checkpoint_data: JSON-serialisable checkpoint payload.

    Returns:
        Whatever ``save_json`` returns, passed through unchanged so callers
        can inspect the outcome of the write.
    """
    checkpoint_path = os.path.join(base_dir, prompt_name, "checkpoint.json")
    return save_json(checkpoint_path, checkpoint_data)


def save_responses(base_dir, prompt_name, data):
    """Store data as ``<base_dir>/<prompt_name>/<model_name>_responses.json``.

    Args:
        base_dir: Directory that holds one sub-directory per prompt.
        prompt_name: Name of the prompt sub-directory.
        data: JSON-serialisable responses payload.

    Returns:
        Whatever ``save_json`` returns, passed through unchanged so callers
        can inspect the outcome of the write.
    """
    filepath = os.path.join(base_dir, prompt_name, f"{model_name}_responses.json")
    return save_json(filepath, data)


def format_time(seconds):
    """Render a duration given in seconds as a short human-readable string.

    Values below 60 are shown in seconds ("12.3s"), values below 3600 in
    minutes ("4.5m") and everything else in hours ("2.1h"), always with one
    decimal place.
    """
    if seconds < 60:
        return f"{seconds:.1f}s"

    # Pick the scaled value and its unit together, then format once.
    value, unit = (seconds / 60, "m") if seconds < 3600 else (seconds / 3600, "h")
    return f"{value:.1f}{unit}"


def print_progress_bar(current, total, prefix='', suffix='', length=40, fill='█'):
    """Print a single-line progress bar, overwriting the current line.

    Args:
        current: Number of items done so far.
        total: Number of items expected. A value of 0 or less is drawn as a
            full bar at 100% instead of raising ZeroDivisionError.
        prefix: Text printed before the bar.
        suffix: Text printed after the percentage.
        length: Width of the bar in characters.
        fill: Character used for the completed part of the bar.

    A newline is printed once ``current == total`` so later output starts on
    a fresh line.
    """
    if total > 0:
        percent = 100 * (current / float(total))
        filled_length = int(length * current // total)
    else:
        # Nothing to process: show a complete bar rather than dividing by zero.
        percent = 100.0
        filled_length = length

    # Keep the bar inside its frame when current falls outside [0, total].
    filled_length = max(0, min(length, filled_length))

    bar = fill * filled_length + '─' * (length - filled_length)
    print(f'\r{prefix} |{bar}| {percent:.1f}% {suffix}', end='', flush=True)
    if current == total:
        print()

### Utilidades de Gestión

In [7]:
def check_checkpoint(prompt_name):
    """Report whether a checkpoint exists for prompt_name under RESPONSES_DIR.

    Prints a short summary (completed items, error count, last update) when a
    checkpoint is found, or a notice when there is none.

    Returns:
        True when a non-empty checkpoint was loaded, False otherwise.
    """
    checkpoint = load_checkpoint(RESPONSES_DIR, prompt_name)

    # Guard clause: nothing stored (or an empty checkpoint) for this prompt.
    if not checkpoint:
        print(f"No hay checkpoint pendiente para {prompt_name}")
        return False

    summary_lines = [
        f"Checkpoint encontrado para {prompt_name}:",
        f"  Items completados: {checkpoint.get('items_completed', 0)}",
        f"  Errores: {checkpoint.get('errors_count', 0)}",
        f"  Última actualización: {checkpoint.get('last_updated', 'desconocido')}",
    ]
    for line in summary_lines:
        print(line)
    return True


def clear_checkpoint(prompt_name, confirm=True):
    """Delete the checkpoint file of prompt_name under RESPONSES_DIR.

    Args:
        prompt_name: Name of the prompt sub-directory.
        confirm: When True, ask for confirmation on stdin first and cancel
            unless the answer is one of si / sí / s / yes / y
            (case-insensitive).
    """
    if confirm:
        answer = input(f"¿Estás seguro de eliminar el checkpoint de {prompt_name}? (si/no): ")
        if answer.lower() not in ('si', 'sí', 's', 'yes', 'y'):
            print("[CANCELADO]")
            return

    checkpoint_path = os.path.join(RESPONSES_DIR, prompt_name, "checkpoint.json")

    # Guard clause: nothing to delete.
    if not os.path.exists(checkpoint_path):
        print("[INFO] No hay checkpoint para eliminar")
        return

    try:
        os.remove(checkpoint_path)
        print("[OK] Checkpoint eliminado")
    except Exception as e:
        print(f"[ERROR] {e}")


def get_processing_stats(prompt_name):
    """Print totals for the saved responses of prompt_name.

    Reads ``<RESPONSES_DIR>/<prompt_name>/<model_name>_responses.json`` and
    prints the number of entries, how many of them hold an error (their
    'response' is a dict containing an 'error' key) and how many do not.
    Prints a notice and returns when the file does not exist.
    """
    filepath = os.path.join(RESPONSES_DIR, prompt_name, f"{model_name}_responses.json")

    if not os.path.exists(filepath):
        print(f"[INFO] No hay datos procesados para {prompt_name}")
        return

    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Count the entries whose response is an error dict.
    errors = 0
    for record in data:
        payload = record.get('response')
        if isinstance(payload, dict) and 'error' in payload:
            errors += 1

    total_items = len(data)
    print(f"Estadísticas de {prompt_name}:")
    print(f"  Total items: {total_items}")
    print(f"  Errores: {errors}")
    print(f"  Exitosos: {total_items - errors}")

In [8]:
# Usage examples (uncomment the call you need):
# check_checkpoint("Prompt 1")
# get_processing_stats("Prompt 1")
# clear_checkpoint("Prompt 1")

print("[INFO] Descomenta las líneas arriba para usar las utilidades")

[INFO] Descomenta las líneas arriba para usar las utilidades


---
## PROMPT 2 - Azure GPT-5.1

In [9]:
def run_prompt_2_azure(n_rows=N_ROWS):
    """Run prompt 2 over the dataset with Azure OpenAI, saving progress as it goes.

    For every idiom in the dataset the ``{{modismo}}`` placeholder of the
    ``prompt_2`` template is replaced, the prompt is sent (up to 3 attempts
    with exponential backoff) and the answer is stored, parsed as JSON when
    possible. Progress is written to the "Prompt 2" folder under RESPONSES_DIR
    every SAVE_EVERY_N_ITEMS items, and also when the API limit is reached,
    the user interrupts the run, or an unexpected error occurs. A later call
    resumes from the checkpoint, skipping idioms it lists as processed.

    Args:
        n_rows: Maximum number of dataset rows to process (falsy = all rows).
            The default is captured from N_ROWS when this cell is executed, so
            re-run the cell after changing N_ROWS.

    Returns:
        None. Results are written to disk and a summary is printed.
    """
    prompt_dir = "Prompt 2"
    max_retries = 3

    print("=" * 80)
    print("EJECUTANDO PROMPT 2 con Azure GPT-5.1")
    print("=" * 80)

    # Load the dataset
    dataset = cargar_dataset(n_rows)
    if not dataset:
        print("[ERROR] No se pudo cargar el dataset")
        return

    print(f"\nConfiguración:")
    print(f"  Dataset: {len(dataset)} modismos")
    print(f"  Modelo: {model_name} ({deployment})")
    print(f"  Guardado incremental: cada {SAVE_EVERY_N_ITEMS} items")

    # Fetch the prompt template
    template = PROMPTS.get('prompt_2')
    if not template:
        print("[ERROR] prompt_2 no encontrado")
        return

    # Resume from a checkpoint when one exists
    checkpoint = load_checkpoint(RESPONSES_DIR, prompt_dir)
    if checkpoint:
        responses = checkpoint.get('responses', [])
        processed_modismos = set(checkpoint.get('processed_modismos', []))
        # Restore the error tally too, so the saved and printed counts keep
        # covering the whole run and not only this session.
        errors_count = checkpoint.get('errors_count', 0)
        print(f"[CHECKPOINT] Reanudando desde item {len(responses)}")
    else:
        responses = []
        processed_modismos = set()
        errors_count = 0

    # Items that were already done before this session; excluded from the
    # throughput figures below so ETA and speed reflect real work.
    resumed_items = len(responses)
    total = len(dataset)
    start_time = time.time()

    def persist_progress():
        """Write the checkpoint and the responses collected so far."""
        checkpoint_data = {
            'responses': responses,
            'processed_modismos': list(processed_modismos),
            'last_updated': datetime.now().isoformat(),
            'items_completed': len(responses),
            'errors_count': errors_count
        }
        save_checkpoint(RESPONSES_DIR, prompt_dir, checkpoint_data)
        save_responses(RESPONSES_DIR, prompt_dir, responses)

    print("\n" + "─" * 80)
    print("Iniciando procesamiento...")
    print("─" * 80)

    try:
        for row in dataset:
            modismo = row.get('modismo', '').strip()
            if not modismo or modismo in processed_modismos:
                continue

            # Build the prompt
            prompt_text = template.replace('{{modismo}}', modismo)

            # Request the answer, retrying on errors
            resp = None
            for attempt in range(max_retries):
                try:
                    resp = send_azure_prompt(prompt_text)
                except APILimitReached:
                    raise
                except Exception as e:
                    resp = {"error": str(e)}

                if not (isinstance(resp, dict) and 'error' in resp):
                    break

                # Exponential backoff plus sub-second jitter taken from the
                # clock; skipped after the last attempt since nothing follows.
                if attempt < max_retries - 1:
                    wait_time = (2 ** attempt) + (time.time() % 1)
                    time.sleep(wait_time)

            # Normalise the answer into the stored payload
            if isinstance(resp, str):
                try:
                    response_data = json.loads(resp)
                except ValueError:
                    # Not valid JSON: keep the raw text.
                    response_data = {"raw_response": resp}
            elif isinstance(resp, dict):
                if 'error' in resp:
                    errors_count += 1
                response_data = resp
            else:
                response_data = {"raw_response": str(resp)}

            # Record the entry
            entry = {
                "modismo": modismo,
                "model": model_name,
                "response": response_data,
                "timestamp": datetime.now().isoformat()
            }

            responses.append(entry)
            processed_modismos.add(modismo)

            # Progress bar
            elapsed = time.time() - start_time
            session_items = len(responses) - resumed_items
            items_per_sec = session_items / elapsed if elapsed > 0 else 0
            remaining = max(0, total - len(responses))
            eta = remaining / items_per_sec if items_per_sec > 0 else 0

            print_progress_bar(
                len(responses),
                total,
                prefix='Progreso',
                suffix=f'Items: {len(responses)}/{total} | Errores: {errors_count} | ETA: {format_time(eta)}'
            )

            # Incremental save
            if len(responses) % SAVE_EVERY_N_ITEMS == 0:
                persist_progress()

    except APILimitReached as e:
        print(f"\n\n[ADVERTENCIA] {str(e)}")
        print(f"Progreso guardado: {len(responses)}/{total} items")
        persist_progress()
        return

    except KeyboardInterrupt:
        print("\n\n[ADVERTENCIA] Interrumpido por el usuario")
        print(f"Progreso guardado: {len(responses)}/{total} items")
        persist_progress()
        return

    except Exception:
        # Unexpected failure: keep what was collected since the last
        # incremental save, then let the error surface.
        print("\n\n[ERROR] Fallo inesperado; guardando progreso antes de propagar el error")
        persist_progress()
        raise

    # Save the final responses
    elapsed_time = time.time() - start_time

    print("\n\n" + "─" * 80)
    print("Guardando resultados...")
    try:
        saved = save_responses(RESPONSES_DIR, prompt_dir, responses)

        if saved is False:
            # The save helper reported a failure: keep (and refresh) the
            # checkpoint so the collected responses are not lost.
            save_checkpoint(RESPONSES_DIR, prompt_dir, {
                'responses': responses,
                'processed_modismos': list(processed_modismos),
                'last_updated': datetime.now().isoformat(),
                'items_completed': len(responses),
                'errors_count': errors_count
            })
            print("[ERROR] No se pudieron guardar las respuestas; se conserva el checkpoint")
        else:
            # Remove the checkpoint now that the final file is in place
            checkpoint_path = os.path.join(RESPONSES_DIR, prompt_dir, "checkpoint.json")
            if os.path.exists(checkpoint_path):
                os.remove(checkpoint_path)

            print("[OK] Guardado completo")
    except Exception as e:
        print(f"[ERROR] {e}")

    # Final summary
    print("\n" + "=" * 80)
    print("PROMPT 2 - COMPLETADO")
    print("=" * 80)
    print(f"Estadísticas:")
    print(f"  Items procesados: {len(responses)}/{total}")
    print(f"  Errores: {errors_count}")
    print(f"  Exitosos: {len(responses) - errors_count}")
    print(f"  Tiempo total: {format_time(elapsed_time)}")

    # Speed is computed over the items handled in this session only.
    session_items = len(responses) - resumed_items
    if session_items > 0:
        avg_time = elapsed_time / session_items
        print(f"  Velocidad: {format_time(avg_time)}/item")

    print("=" * 80)

In [10]:
# Run PROMPT 2 with Azure GPT-5.1.
# Processes the dataset with checkpointing and a progress bar.
# If interrupted (Ctrl+C), a later run can resume from the checkpoint.

run_prompt_2_azure()

EJECUTANDO PROMPT 2 con Azure GPT-5.1

Configuración:
  Dataset: 6533 modismos
  Modelo: gpt-5.1 (gpt-5.1-grande)
  Guardado incremental: cada 100 items

────────────────────────────────────────────────────────────────────────────────
Iniciando procesamiento...
────────────────────────────────────────────────────────────────────────────────
Progreso |████████████████████████████████████████| 100.0% Items: 6533/6533 | Errores: 0 | ETA: 0.0s


────────────────────────────────────────────────────────────────────────────────
Guardando resultados...
[OK] Guardado completo

PROMPT 2 - COMPLETADO
Estadísticas:
  Items procesados: 6533/6533
  Errores: 0
  Exitosos: 6533
  Tiempo total: 2.1h
  Velocidad: 1.2s/item
