## preparar cosas

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Lista de modelos a comparar
models_info = {
    "e5-large-v2": "intfloat/e5-large-v2",
    "gte-base": "thenlper/gte-base",
    "miniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
    "bge-base-en-v1.5": "BAAI/bge-base-en-v1.5"
}

In [9]:
# actualizar models_info con los modelos cargados
for model_name, model_path in models_info.items():
    print(f"Cargando modelo: {model_name}")
    models_info[model_name] = SentenceTransformer(model_path)

Cargando modelo: e5-large-v2
Cargando modelo: gte-base
Cargando modelo: miniLM-L6-v2


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Cargando modelo: mpnet-base-v2


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Cargando modelo: bge-base-en-v1.5


In [10]:
df = pd.read_csv("dataset_genius.csv")
sample_texts = df['lyrics'].sample(50000, random_state=42).tolist()

## armar samples (no correr)

In [None]:
# from sentence_transformers import SentenceTransformer
# import numpy as np
# import os
# import torch

# # Carpeta para guardar los resultados
# EMBEDDINGS_DIR = "model_embeddings"
# os.makedirs(EMBEDDINGS_DIR, exist_ok=True)

# print(f"Son {len(sample_texts)} muestras de prueba")

# # Elegir dispositivo automáticamente
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# # Recomendaciones de batch size por modelo
# batch_sizes = {
#     "miniLM-L6-v2": 256,
#     "gte-base": 128,
#     "e5-large-v2": 64
# }

# # Generar y guardar embeddings
# for model_name, model in models_info.items():
#     print(f"\n🔄 Generando embeddings para el modelo: {model_name}")

#     # Batch size dinámico
#     batch_size = batch_sizes.get(model_name, 64)

#     # Generar embeddings del sample
#     embeddings = model.encode(
#         sample_texts,
#         batch_size=batch_size,
#         show_progress_bar=True,
#         convert_to_numpy=True,
#         device=DEVICE
#     )

#     np.save(f"{EMBEDDINGS_DIR}/embeddings_{model_name}.npy", embeddings)


Son 50000 muestras de prueba

🔄 Generando embeddings para el modelo: e5-large-v2


Batches: 100%|██████████| 782/782 [30:10<00:00,  2.32s/it]



🔄 Generando embeddings para el modelo: gte-base


Batches: 100%|██████████| 391/391 [10:36<00:00,  1.63s/it]



🔄 Generando embeddings para el modelo: miniLM-L6-v2


Batches: 100%|██████████| 196/196 [01:18<00:00,  2.51it/s]



🔄 Generando embeddings para el modelo: mpnet-base-v2


Batches: 100%|██████████| 782/782 [09:23<00:00,  1.39it/s]



🔄 Generando embeddings para el modelo: bge-base-en-v1.5


Batches: 100%|██████████| 782/782 [09:29<00:00,  1.37it/s]


## evaluar samples

In [None]:

# Textos de prueba representativos
test_texts = [
    "I feel so bad, my dog just died",
    "Feeling hopeful after a tough day",
    "I just got bad news from a friend",
    "Starting to feel excited for the weekend"
]

In [12]:
# para cada texto de prueba, calcular la similitud coseno con todos los textos de cada modelo
# para cada modelo, devolver los 3 textos más similares a cada texto de prueba

k_top = 3
for model_name, model in models_info.items():
    print(f"Calculando similitud coseno para el modelo: {model_name}")
    embeddings = np.load(f"model_embeddings/embeddings_{model_name}.npy")

    for test_text in test_texts:
        test_embedding = model.encode([test_text], convert_to_numpy=True)
        similarities = cosine_similarity(test_embedding, embeddings)[0]

        # Obtener los índices de los 3 textos más similares
        top_indices = np.argsort(similarities)[-k_top:][::-1]
        top_similarities = similarities[top_indices]
        top_texts = [sample_texts[i] for i in top_indices]

        print(f"Texto de prueba: {test_text}")
        for i, (text, sim) in enumerate(zip(top_texts, top_similarities)):
            print(f"  Top {i+1}: {text} (Similitud: {sim:.4f})")
    print("\n")  # Separar resultados por modelo

Calculando similitud coseno para el modelo: e5-large-v2
Texto de prueba: I feel so bad, my dog just died
  Top 1: well i had an old dog and his name was blue yes i had an old dog and his name was blue well i had an old dog and his name was blue bet ya five dollars he s a good dog too old blue chased a possum up a holler limb blue chased a possum up a holler limb blue chased a possum up a holler limb the possum growled blue whined at him bye bye blue you good dog you bye bye blue you good dog you when old blue died he died so hard he shook the ground in my back yard we lowered him down with a golden chain and every link we called his name bye bye blue you good dog you bye bye blue you good dog you my old blue he was a good old hound you d hear him hollering miles around when i get to heaven first thing i ll do i ll grab my horn and call for blue bye bye blue you good dog you bye bye blue you good dog you (Similitud: 0.8134)
  Top 2: i had a dog he was a mix he loved me like a god but i 

### EVALUACIÓN

Objetivo: medir qué modelo devuelve canciones más relevantes para distintos tipos de prompts
(amor, ruptura, muerte de una mascota, celebración…).  Métrica: *precision@k* y *MRR* sobre una
pequeña colección de prompts evaluados manual o semiautomáticamente.
