# Preparación del dataset "Wikipedia Movie Plots"

## Nombre: Michael Pillaga


## PARTE 1: RECUPERACION CON TF-IDF

### PASO 1: CARGAR LOS DATOS


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Cargar el dataset desde un archivo CSV
# Asegúrate de que el archivo esté en la misma carpeta que tu notebook o ajusta la ruta
file_path = "./wiki_movie_plots_deduped.csv"  
df = pd.read_csv(file_path)

# Verifica las columnas del dataset
print("Columnas del dataset:", df.columns)

# Opcional: muestra algunas filas para asegurarte de que los datos están cargados correctamente
print(df.head())


Columnas del dataset: Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot'],
      dtype='object')
   Release Year                             Title Origin/Ethnicity  \
0          1901            Kansas Saloon Smashers         American   
1          1901     Love by the Light of the Moon         American   
2          1901           The Martyred Presidents         American   
3          1901  Terrible Teddy, the Grizzly King         American   
4          1902            Jack and the Beanstalk         American   

                             Director Cast    Genre  \
0                             Unknown  NaN  unknown   
1                             Unknown  NaN  unknown   
2                             Unknown  NaN  unknown   
3                             Unknown  NaN  unknown   
4  George S. Fleming, Edwin S. Porter  NaN  unknown   

                                           Wiki Page  \
0  https://en.wikipedia.org/wiki/Kansa

### PASO 2: NORMALIZACION DEL TEXTO

In [15]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Descargar recursos necesarios de NLTK
nltk.download('stopwords')
nltk.download('wordnet')

# Inicializar lematizador y stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Función para normalizar el texto
def normalize_text(text):
    # Convertir a minúsculas y tokenizar
    tokens = text.lower().split()
    # Eliminar stopwords y lematizar
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Reconstruir el texto normalizado
    return " ".join(tokens)

# Aplicar normalización al texto de las tramas
df['Normalized_Plot'] = df['Plot'].apply(normalize_text)

# Verificar los primeros resultados normalizados
print(df[['Title', 'Normalized_Plot']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Saitama\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Saitama\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                              Title  \
0            Kansas Saloon Smashers   
1     Love by the Light of the Moon   
2           The Martyred Presidents   
3  Terrible Teddy, the Grizzly King   
4            Jack and the Beanstalk   

                                     Normalized_Plot  
0  bartender working saloon, serving drink custom...  
1  moon, painted smiling face hang park night. yo...  
2  film, minute long, composed two shots. first, ...  
3  lasting 61 second consisting two shots, first ...  
4  earliest known adaptation classic fairytale, f...  


### PASO 3: CONFIGURAR TF IDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configurar el vectorizador TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Generar la matriz TF-IDF
tfidf_matrix = vectorizer.fit_transform(df['Normalized_Plot'])

# Verificar las dimensiones de la matriz
print(f"Matriz TF-IDF generada con dimensiones: {tfidf_matrix.shape}")


Matriz TF-IDF generada con dimensiones: (34886, 5000)


### PASO 4: REALIZAR CONSULTAS

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Función para normalizar consultas
def normalize_query(query):
    tokens = query.lower().split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Función para evaluar los resultados con TF-IDF
def evaluate_results(query, vectorizer, tfidf_matrix, top_n=5):
    # Normalizar la consulta antes de vectorizarla
    normalized_query = normalize_query(query)
    # Vectorizar la consulta normalizada
    query_vec = vectorizer.transform([normalized_query])
    # Calcular similitud del coseno
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # Obtener los índices de los documentos más similares
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    # Recuperar los documentos con sus puntuaciones
    results = df.iloc[top_indices].copy()
    results['Similarity_Score'] = cosine_similarities[top_indices]
    return results[['Title', 'Plot', 'Similarity_Score']]

# Realizar una consulta
query = "space adventure with dinosaurs"  # Cambia esto por tu consulta
evaluated_results = evaluate_results(query, vectorizer, tfidf_matrix)

# Mostrar los resultados evaluados
print("Documentos recuperados con puntuaciones de similitud:")
print(evaluated_results)



Documentos recuperados con puntuaciones de similitud:
                                Title  \
22892    The Thousand Faces of Dunjia   
12568                    Theodore Rex   
34882              Çalgı Çengi İkimiz   
33715  Space Sheriff Gavan: The Movie   
12103  We're Back! A Dinosaur's Story   

                                                    Plot  Similarity_Score  
22892  The film follows a group of swordsmen's advent...          0.382771  
12568  In an alternate futuristic society where human...          0.373772  
34882  Two musicians, Salih and Gürkan, described the...          0.353538  
33715  Fulfilling their fifteen-year-old childhood dr...          0.337162  
12103  In present-day New York City, an Eastern blueb...          0.329330  


## PARTE 2: RECUPERACION CON BM25

### PASO 1: Verificar conexion con elasticsearch

In [20]:
from elasticsearch import Elasticsearch

# Conexión al cliente Elasticsearch
es = Elasticsearch("http://localhost:9200")

# Verificar si está conectado
if es.ping():
    print("Conexión exitosa a Elasticsearch")
else:
    print("Error al conectar con Elasticsearch")


Conexión exitosa a Elasticsearch


### Verificar que el indice existe

In [10]:
# Listar los índices existentes con argumentos nombrados
indices = es.indices.get_alias(index="*")
print("Índices existentes:")
for index_name in indices:
    print(index_name)


Índices existentes:
movies


### PASO 2: Realizar consultas

In [23]:
# Función para realizar consultas con BM25
def search_query_bm25(query, index_name, size=5):
    body = {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["title", "plot"]  # Campos donde buscar
            }
        }
    }
    response = es.search(index=index_name, body=body, size=size)
    
    # Mostrar resultados
    results = []
    for result in response["hits"]["hits"]:
        title = result['_source']['title']
        plot = " ".join(result['_source']['plot'].split()[:10])  # Limitar a 10 palabras del plot
        score = result['_score']
        results.append({"Title": title, "Plot": plot, "BM25_Score": score})
    return results

# Realizar una consulta con BM25
query = "space adventure with dinosaurs"  # Cambia esto por tu consulta
index_name = "movies"  # Nombre del índice en Elasticsearch
bm25_results = search_query_bm25(query, index_name)

# Mostrar los resultados
print("Documentos recuperados con BM25:")
for result in bm25_results:
    print(f"Title: {result['Title']}")
    print(f"Plot: {result['Plot']}...")
    print(f"BM25 Score: {result['BM25_Score']}")
    print("-" * 50)


Documentos recuperados con BM25:
Title:  Blinky Bill the Movie
Plot: In the town of Greenpatch in Australia, a courageous young...
BM25 Score: 16.69516
--------------------------------------------------
Title: Magic Tree House
Plot: Jack is a shy but confident bookworm and his sister...
BM25 Score: 14.60886
--------------------------------------------------
Title: Unknown Island
Plot: Adventure-seeker Ted Osborne (Phillip Reed) and his fiancee Carole (Virginia...
BM25 Score: 14.467151
--------------------------------------------------
Title: Robot Monster
Plot: Evil Moon robot Ro-Man Extension XJ-2 (Barrows), referred to as...
BM25 Score: 13.367545
--------------------------------------------------
Title: We're Back! A Dinosaur's Story
Plot: In present-day New York City, an Eastern bluebird named Buster...
BM25 Score: 13.172708
--------------------------------------------------


  response = es.search(index=index_name, body=body, size=size)


### PASO 3: Evaluar los resultados

In [24]:
# Comparar los resultados de BM25 y TF-IDF
def compare_bm25_tfidf(tfidf_results, bm25_results):
    print("\n--- Comparación de Resultados ---")
    for i, (tfidf, bm25) in enumerate(zip(tfidf_results.to_dict('records'), bm25_results)):
        print(f"\nComparación {i + 1}:")
        print(f"TF-IDF - Title: {tfidf['Title']}, Similarity Score: {tfidf['Similarity_Score']}")
        print(f"TF-IDF - Plot: {' '.join(tfidf['Plot'].split()[:10])}...")
        print(f"BM25  - Title: {bm25['Title']}, BM25 Score: {bm25['BM25_Score']}")
        print(f"BM25  - Plot: {bm25['Plot']}...")
        print("-" * 50)

# Llamar la función de comparación
compare_bm25_tfidf(evaluated_results, bm25_results)



--- Comparación de Resultados ---

Comparación 1:
TF-IDF - Title: The Thousand Faces of Dunjia, Similarity Score: 0.3827709009592723
TF-IDF - Plot: The film follows a group of swordsmen's adventures to secretly...
BM25  - Title:  Blinky Bill the Movie, BM25 Score: 16.69516
BM25  - Plot: In the town of Greenpatch in Australia, a courageous young...
--------------------------------------------------

Comparación 2:
TF-IDF - Title: Theodore Rex, Similarity Score: 0.3737716024557386
TF-IDF - Plot: In an alternate futuristic society where humans and anthropomorphic dinosaurs...
BM25  - Title: Magic Tree House, BM25 Score: 14.60886
BM25  - Plot: Jack is a shy but confident bookworm and his sister...
--------------------------------------------------

Comparación 3:
TF-IDF - Title: Çalgı Çengi İkimiz, Similarity Score: 0.3535381884884966
TF-IDF - Plot: Two musicians, Salih and Gürkan, described the adventures of their...
BM25  - Title: Unknown Island, BM25 Score: 14.467151
BM25  - Plot: Adve

## PARTE 3: RECUPERACION CON FAISS

### PASO 1: IMPORTAMOS LIBRERIAS Y CARGAMOS EL DATASET

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss

# Cargar el dataset
file_path = "./wiki_movie_plots_deduped.csv"  # Cambia esto por la ruta del archivo
df = pd.read_csv(file_path)

# Verificar las primeras filas
print("Dataset cargado con las siguientes columnas:")
print(df.head())


Dataset cargado con las siguientes columnas:
   Release Year                             Title Origin/Ethnicity  \
0          1901            Kansas Saloon Smashers         American   
1          1901     Love by the Light of the Moon         American   
2          1901           The Martyred Presidents         American   
3          1901  Terrible Teddy, the Grizzly King         American   
4          1902            Jack and the Beanstalk         American   

                             Director Cast    Genre  \
0                             Unknown  NaN  unknown   
1                             Unknown  NaN  unknown   
2                             Unknown  NaN  unknown   
3                             Unknown  NaN  unknown   
4  George S. Fleming, Edwin S. Porter  NaN  unknown   

                                           Wiki Page  \
0  https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...   
1  https://en.wikipedia.org/wiki/Love_by_the_Ligh...   
2  https://en.wikipedia.org/wiki/Th

### Verificamos dependencias

In [31]:
import torch
import torchvision
from sentence_transformers import SentenceTransformer

print(f"PyTorch version: {torch.__version__}")
print(f"Torchvision version: {torchvision.__version__}")
print("Sentence-Transformers importado correctamente")


PyTorch version: 2.2.2
Torchvision version: 0.17.2
Sentence-Transformers importado correctamente


### Cargamos dataset

In [32]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss

# Cargar el dataset
file_path = "./wiki_movie_plots_deduped.csv"  # Cambia esto por la ruta de tu archivo
df = pd.read_csv(file_path)

# Verificar las primeras filas del dataset
print("Dataset cargado con las siguientes columnas:")
print(df.head())


Dataset cargado con las siguientes columnas:
   Release Year                             Title Origin/Ethnicity  \
0          1901            Kansas Saloon Smashers         American   
1          1901     Love by the Light of the Moon         American   
2          1901           The Martyred Presidents         American   
3          1901  Terrible Teddy, the Grizzly King         American   
4          1902            Jack and the Beanstalk         American   

                             Director Cast    Genre  \
0                             Unknown  NaN  unknown   
1                             Unknown  NaN  unknown   
2                             Unknown  NaN  unknown   
3                             Unknown  NaN  unknown   
4  George S. Fleming, Edwin S. Porter  NaN  unknown   

                                           Wiki Page  \
0  https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...   
1  https://en.wikipedia.org/wiki/Love_by_the_Ligh...   
2  https://en.wikipedia.org/wiki/Th

### PASO 1: NORMALIZAR TEXTO

In [33]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Descargar recursos necesarios de NLTK
nltk.download('stopwords')
nltk.download('wordnet')

# Inicializar lematizador y stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Función para normalizar el texto
def normalize_text(text):
    tokens = text.lower().split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Aplicar normalización al texto de las tramas
df['Normalized_Plot'] = df['Plot'].apply(normalize_text)

# Verificar los primeros resultados normalizados
print(df[['Title', 'Normalized_Plot']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Saitama\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Saitama\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                              Title  \
0            Kansas Saloon Smashers   
1     Love by the Light of the Moon   
2           The Martyred Presidents   
3  Terrible Teddy, the Grizzly King   
4            Jack and the Beanstalk   

                                     Normalized_Plot  
0  bartender working saloon, serving drink custom...  
1  moon, painted smiling face hang park night. yo...  
2  film, minute long, composed two shots. first, ...  
3  lasting 61 second consisting two shots, first ...  
4  earliest known adaptation classic fairytale, f...  


### PASO 2: GENERAR EMBEDDINGS CON SENTENCETRANSFORMER

In [34]:
from sentence_transformers import SentenceTransformer

# Cargar el modelo preentrenado
model = SentenceTransformer('all-MiniLM-L6-v2')  # Puedes cambiar el modelo si prefieres

# Generar embeddings para las tramas normalizadas
embeddings = model.encode(df['Normalized_Plot'].tolist(), show_progress_bar=True)

# Verificar las dimensiones de los embeddings
print(f"Embeddings generados con dimensiones: {embeddings.shape}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1091 [00:00<?, ?it/s]

Embeddings generados con dimensiones: (34886, 384)


### PASO 3: CREAR EL INDICE FAISS

In [35]:
# Crear el índice FAISS
import faiss

dimension = embeddings.shape[1]  # Dimensión de los embeddings
index = faiss.IndexFlatL2(dimension)  # L2 = Distancia Euclidiana

# Agregar los embeddings al índice
index.add(embeddings)

# Verificar cuántos vectores hay en el índice
print(f"Cantidad de vectores en el índice: {index.ntotal}")


Cantidad de vectores en el índice: 34886


### PASO 4: REALIZAR CONSULTAS CON FAISS

In [36]:
# Función para realizar consultas con FAISS
def search_faiss(query, model, index, top_n=5):
    # Generar embedding de la consulta
    query_embedding = model.encode([query])
    
    # Buscar en el índice FAISS
    distances, indices = index.search(query_embedding, top_n)
    
    # Mostrar resultados
    results = []
    for idx, distance in zip(indices[0], distances[0]):
        title = df.iloc[idx]['Title']
        plot = " ".join(df.iloc[idx]['Plot'].split()[:10])  # Limitar a 10 palabras
        results.append({"Title": title, "Plot": plot, "Distance": distance})
    return results

# Realizar una consulta
query = "space adventure with dinosaurs"  # Cambia esto por tu consulta
faiss_results = search_faiss(query, model, index)

# Mostrar los resultados
print("Documentos recuperados con FAISS:")
for result in faiss_results:
    print(f"Title: {result['Title']}")
    print(f"Plot: {result['Plot']}...")
    print(f"Distance: {result['Distance']}")
    print("-" * 50)


Documentos recuperados con FAISS:
Title: Dinosaurus!
Plot: The movie is about American men building a harbour on...
Distance: 0.9749438166618347
--------------------------------------------------
Title: Dinosaurs! – A Fun-Filled Trip Back in Time!
Plot: The video—with beginning scenes filmed in 1987—begins with a young...
Distance: 0.9765295386314392
--------------------------------------------------
Title: We're Back! A Dinosaur's Story
Plot: In present-day New York City, an Eastern bluebird named Buster...
Distance: 0.9784306287765503
--------------------------------------------------
Title: Yona Yona Penguin
Plot: An animated adventure about three children who travel to a...
Distance: 1.007900595664978
--------------------------------------------------
Title: Jurassic Park III
Plot: Ben Hildebrand and 12-year-old Eric Kirby go parasailing around the...
Distance: 1.0250296592712402
--------------------------------------------------


### PASO 5: COMPARAMOS RESULTADOS CON TF-IDF Y BM25

In [2]:
# Comparar resultados de FAISS con TF-IDF y BM25
def compare_all_results(tfidf_results, bm25_results, faiss_results):
    print("\n--- Comparación de Resultados ---")
    for i, (tfidf, bm25, faiss) in enumerate(zip(tfidf_results.to_dict('records'), bm25_results, faiss_results)):
        print(f"\nComparación {i + 1}:")
        print(f"TF-IDF - Title: {tfidf['Title']}, Similarity Score: {tfidf['Similarity_Score']}")
        print(f"BM25  - Title: {bm25['Title']}, BM25 Score: {bm25['BM25_Score']}")
        print(f"FAISS - Title: {faiss['Title']}, Distance: {faiss['Distance']}")
        print("-" * 50)

# Comparar resultados
compare_all_results(evaluated_results, bm25_results, faiss_results)


NameError: name 'evaluated_results' is not defined

## PARTE 4: RECUPERACION CON CHROMADB

### PASO 1: CONFIGURAR CHROMADB

In [2]:
import chromadb
from chromadb.config import Settings

# Configuración de la base de datos ChromaDB con la nueva arquitectura
print("Configurando ChromaDB...")
chroma_client = chromadb.Client(Settings(
    persist_directory="./chroma_db",  # Directorio donde se guardará la base de datos
    anonymized_telemetry=False  # Desactiva telemetría si lo prefieres
))

# Listar colecciones existentes
print("Listando colecciones existentes...")
existing_collections = [col.name for col in chroma_client.list_collections()]
print("Colecciones existentes:", existing_collections)

# Verificar si la colección 'movies' ya existe
collection_name = "movies"
if collection_name in existing_collections:
    # Cargar colección existente
    collection = chroma_client.get_collection(name=collection_name)
    print(f"La colección '{collection_name}' ya existe con {collection.count()} documentos.")
else:
    # Crear nueva colección
    collection = chroma_client.create_collection(name=collection_name)
    print(f"Se creó una nueva colección: {collection_name}")


Configurando ChromaDB...
Listando colecciones existentes...
Colecciones existentes: []
Se creó una nueva colección: movies


### PASO 2: INSERTAR DOCUMENTOS Y EMBEDDINGS

### Cargamos dataset

In [4]:
import pandas as pd

# Cargar el dataset
file_path = "./wiki_movie_plots_deduped.csv"  # Cambia esto por la ruta de tu archivo
df = pd.read_csv(file_path)

print(f"Dataset cargado con {df.shape[0]} registros y columnas: {df.columns.tolist()}")


Dataset cargado con 34886 registros y columnas: ['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Wiki Page', 'Plot']


In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle

# Ruta para guardar los embeddings y metadatos
EMBEDDINGS_FILE = "embeddings.npy"
METADATA_FILE = "metadata.pkl"

try:
    # Intentar cargar los embeddings y metadatos si ya existen
    print("Intentando cargar embeddings y metadatos guardados...")
    embeddings = np.load(EMBEDDINGS_FILE)
    with open(METADATA_FILE, "rb") as f:
        metadata = pickle.load(f)
    titles = metadata["titles"]
    plots = metadata["plots"]
    print("Embeddings y metadatos cargados correctamente.")
except FileNotFoundError:
    # Si los archivos no existen, generar los embeddings
    print("No se encontraron archivos guardados. Generando embeddings nuevos...")
    # Cargar modelo más pequeño
    model = SentenceTransformer('paraphrase-MiniLM-L3-v2')

    # Generar embeddings
    print("Generando embeddings...")
    embeddings = model.encode(df['Plot'].tolist(), show_progress_bar=True)

    # Guardar embeddings y metadatos
    titles = df['Title'].tolist()
    plots = df['Plot'].tolist()
    np.save(EMBEDDINGS_FILE, embeddings)
    with open(METADATA_FILE, "wb") as f:
        pickle.dump({"titles": titles, "plots": plots}, f)

    print(f"Embeddings generados y guardados correctamente. Dimensiones: {embeddings.shape}")

# Embeddings y metadatos están listos para usarse
print(f"Listos para usar. Dimensiones de los embeddings: {embeddings.shape}")


Intentando cargar embeddings y metadatos guardados...
No se encontraron archivos guardados. Generando embeddings nuevos...
Generando embeddings...


Batches:   0%|          | 0/1091 [00:00<?, ?it/s]

Embeddings generados y guardados correctamente. Dimensiones: (34886, 384)
Listos para usar. Dimensiones de los embeddings: (34886, 384)


In [7]:
# Definir las rutas para los archivos
EMBEDDINGS_FILE = "embeddings.npy"
METADATA_FILE = "metadata.pkl"

# Importar las librerías necesarias
import numpy as np
import pickle

try:
    # Intentar cargar los embeddings y metadatos
    embeddings = np.load(EMBEDDINGS_FILE)
    with open(METADATA_FILE, "rb") as f:
        metadata = pickle.load(f)
    titles = metadata["titles"]
    plots = metadata["plots"]
    print("Embeddings y metadatos cargados correctamente.")
    print(f"Dimensiones de los embeddings: {embeddings.shape}")
except FileNotFoundError:
    print("Los archivos de embeddings o metadatos no están disponibles. Necesitas generarlos nuevamente.")


Embeddings y metadatos cargados correctamente.
Dimensiones de los embeddings: (34886, 384)


### Paso 3: Insertar Documentos y Embeddings en ChromaDB python



In [None]:
# Configurar el tamaño del lote
batch_size = 1000  # Ajusta este valor según los recursos disponibles

try:
    # Dividir los documentos y embeddings en lotes para la inserción
    for i in range(0, len(plots), batch_size):
        # Crear el lote actual
        batch_documents = plots[i:i + batch_size]
        batch_embeddings = embeddings[i:i + batch_size]
        batch_ids = [str(j) for j in range(i, i + len(batch_documents))]
        batch_metadatas = [{"title": titles[j]} for j in range(i, i + len(batch_documents))]

        # Insertar el lote en la colección
        collection.add(
            embeddings=batch_embeddings,
            documents=batch_documents,
            metadatas=batch_metadatas,
            ids=batch_ids
        )
        print(f"Se insertó el lote {i // batch_size + 1} con {len(batch_documents)} documentos.")

    # Verificar el número total de documentos en la colección
    print(f"Cantidad total de documentos en la colección '{collection_name}': {collection.count()}")

except Exception as e:
    print(f"Ocurrió un error durante la inserción: {e}")


### Paso 4: Realizar Consultas con ChromaDB


In [None]:
# Función para realizar consultas
def query_chromadb(query_text, model, collection, top_n=5):
    # Generar el embedding de la consulta
    query_embedding = model.encode([query_text])

    # Realizar la consulta en la colección
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=top_n
    )

    # Mostrar resultados
    print(f"Resultados para la consulta: '{query_text}'")
    for i, (doc, metadata) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
        print(f"\nResultado {i+1}:")
        print(f"Título: {metadata['title']}")
        print(f"Trama: {doc}")

# Ejemplo de consulta
query_text = "space adventure with dinosaurs"
query_chromadb(query_text, model, collection)


## Parte 5: Comparacion de Resultados

In [1]:
def compare_results(faiss_results, tfidf_results, bm25_results, chromadb_results):
    print("\n--- Comparación de Resultados ---")
    max_results = max(len(faiss_results), len(tfidf_results), len(bm25_results), len(chromadb_results))
    
    for i in range(max_results):
        print(f"\nComparación {i + 1}:")
        if i < len(faiss_results):
            print(f"FAISS    - Title: {faiss_results[i]['Title']}, Distance: {faiss_results[i]['Distance']:.4f}")
        else:
            print("FAISS    - Sin resultado")

        if i < len(tfidf_results):
            print(f"TF-IDF   - Title: {tfidf_results[i]['Title']}, Similarity: {tfidf_results[i]['Similarity_Score']:.4f}")
        else:
            print("TF-IDF   - Sin resultado")

        if i < len(bm25_results):
            print(f"BM25     - Title: {bm25_results[i]['Title']}, Score: {bm25_results[i]['BM25_Score']:.4f}")
        else:
            print("BM25     - Sin resultado")

        if i < len(chromadb_results):
            print(f"ChromaDB - Title: {chromadb_results[i]['Title']}, Distance: {chromadb_results[i]['Distance']:.4f}")
        else:
            print("ChromaDB - Sin resultado")

        print("-" * 50)
