#  Parte B ‚Äî Embeddings con OpenAI

## Celda 1 ‚Äî Configuraci√≥n inicial

In [1]:
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
from openai import OpenAI

# Asegurar directorio ra√≠z
os.chdir(Path(__file__).resolve().parents[1] if "__file__" in locals() else Path.cwd().parents[0])
print("üìÇ Directorio actual:", Path.cwd())

# Carpetas base
summaries_dir = Path("data/summaries")
embeddings_dir = Path("data/embeddings")
embeddings_dir.mkdir(parents=True, exist_ok=True)

# Modelos a usar
models = ["gpt-4o", "gpt-4o-mini"]

# Cliente OpenAI
client = OpenAI(api_key="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")


üìÇ Directorio actual: /home/brunoz/Documents/AgenticAI_Horoscopes


## Celda 2 ‚Äî Cargar res√∫menes

In [2]:
from pathlib import Path
import json
import pandas as pd

def load_summaries(path: Path):
    """Carga todos los res√∫menes JSON y los organiza en un DataFrame."""
    data = []
    for file in path.glob("*.json"):
        try:
            with open(file, "r", encoding="utf-8") as f:
                js = json.load(f)

            # 1. Texto a vectorizar
            text = js.get("final_summary") or js.get("raw", "")
            if not text:
                continue

            # 2. Primero intentamos tomar los metadatos DEL JSON
            sign = js.get("sign")
            date = js.get("date")
            interpreter = js.get("interpreter")

            # 3. Si falta algo, lo sacamos del nombre del archivo
            #    formato esperado: hola_2025-10-31_aries.json
            if not sign or not date or not interpreter:
                name = file.stem.split("_")
                interpreter = name[0]
                date = name[1]
                # por si acaso el signo tuviera m√°s de una palabra
                sign = "_".join(name[2:])

            data.append({
                "sign": str(sign).capitalize(),
                "date": date,
                "interpreter": interpreter,
                "text": text,
            })
        except Exception as e:
            print(f"‚ö†Ô∏è Error al leer {file.name}: {e}")
    return pd.DataFrame(data)

# usarla as√≠:
df = load_summaries(summaries_dir)
print(f"‚úÖ {len(df)} res√∫menes cargados.")
df.head(5)


‚úÖ 108 res√∫menes cargados.


Unnamed: 0,sign,date,interpreter,text
0,Escorpio,2025-10-31,lecturas,"On October 31, 2025, Scorpios are encouraged t..."
1,Leo,2025-10-30,lecturas,"```json\n{\n ""tone"": ""optimistic"",\n ""facets..."
2,Cancer,2025-10-31,hola,"```json\n{\n ""tone"": ""optimistic"",\n ""facets..."
3,Escorpio,2025-10-29,hola,"```json\n{\n ""tone"": ""introspective and intui..."
4,Tauro,2025-10-31,20minutos,"On this day, Taurus, you'll find joy in your p..."


## Celda 3 ‚Äî Generar embeddings con OpenAI

In [3]:
def generate_embeddings(texts, model):
    """Genera embeddings usando el modelo especificado."""
    print(f"üöÄ Generando embeddings con {model} ({len(texts)} textos)...")
    vectors = []
    batch_size = 50  # L√≠mite seguro para evitar rate limit

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        response = client.embeddings.create(model=f"text-embedding-3-large", input=batch)
        vectors.extend([d.embedding for d in response.data])

    print(f"‚úÖ Embeddings creados ({len(vectors)})")
    return np.array(vectors)


## CELDA 4 ‚Äî Generar y guardar embeddings (dos modelos)

In [4]:
embeddings_all = []

for model in models:
    vectors = generate_embeddings(df["text"].tolist(), model)

    npy_path = embeddings_dir / f"embeddings_{model.replace('-', '_')}.npy"
    np.save(npy_path, vectors)
    print(f"üíæ Guardado: {npy_path}")

    # Guardar tambi√©n en lista JSON
    for i, row in df.iterrows():
        embeddings_all.append({
            "sign": row["sign"],
            "date": row["date"],
            "interpreter": row["interpreter"],
            "model": model,
            "embedding_vector": vectors[i].tolist()
        })

# Guardado combinado
with open(embeddings_dir / "embeddings_all.json", "w", encoding="utf-8") as f:
    json.dump(embeddings_all, f, ensure_ascii=False, indent=2)

print("‚úÖ Archivo combinado guardado en data/embeddings/embeddings_all.json")


üöÄ Generando embeddings con gpt-4o (108 textos)...
‚úÖ Embeddings creados (108)
üíæ Guardado: data/embeddings/embeddings_gpt_4o.npy
üöÄ Generando embeddings con gpt-4o-mini (108 textos)...
‚úÖ Embeddings creados (108)
üíæ Guardado: data/embeddings/embeddings_gpt_4o_mini.npy
‚úÖ Archivo combinado guardado en data/embeddings/embeddings_all.json


## Celda 5 ‚Äî Verificar embeddings guardados

In [5]:
# Cargar y verificar
check = pd.read_json("data/embeddings/embeddings_all.json")
print(check.head(3))
print(f"Total embeddings: {len(check)}")


       sign       date interpreter   model  \
0  Escorpio 2025-10-31    lecturas  gpt-4o   
1       Leo 2025-10-30    lecturas  gpt-4o   
2    Cancer 2025-10-31        hola  gpt-4o   

                                    embedding_vector  
0  [-0.043936226516962, -0.051031034439802, -0.01...  
1  [-0.026828752830624, -0.010034312494099001, -0...  
2  [-0.007350745610892001, 0.0041490341536700006,...  
Total embeddings: 216


In [6]:
# python
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

emb_dir = Path("data/embeddings")
mini_path = emb_dir / "embeddings_gpt_4o_mini.npy"
large_path = emb_dir / "embeddings_gpt_4o.npy"
json_path = emb_dir / "embeddings_all.json"

if mini_path.exists() and large_path.exists():
    emb_mini = np.load(mini_path)
    emb_large = np.load(large_path)
elif json_path.exists():
    df_emb = pd.read_json(json_path)
    emb_mini = np.vstack(df_emb[df_emb["model"] == "gpt-4o-mini"]["embedding_vector"].to_list())
    emb_large = np.vstack(df_emb[df_emb["model"] == "gpt-4o"]["embedding_vector"].to_list())
else:
    raise FileNotFoundError(f"Missing embeddings in {emb_dir!s} (expected .npy files or {json_path.name}).")

# Ensure 2D arrays
if emb_mini.ndim == 1:
    emb_mini = emb_mini.reshape(1, -1)
if emb_large.ndim == 1:
    emb_large = emb_large.reshape(1, -1)

if emb_mini.size == 0 or emb_large.size == 0:
    raise ValueError("One of the embedding arrays is empty.")

sim_mini = cosine_similarity(emb_mini)
sim_large = cosine_similarity(emb_large)

print("üìà Promedio de similitud interna:")
print("GPT-4o-mini:", np.mean(sim_mini))
print("GPT-4o:", np.mean(sim_large))

üìà Promedio de similitud interna:
GPT-4o-mini: 0.6189009149285587
GPT-4o: 0.6189923510680192
