# Búsquedas semánticas con películas

## Entorno:

```shellscript
docker run -p 6333:6333 -p 6334:6334 -v $(pwd)/q_storage:/qdrant/storage:z qdrant/qdrant
```

```shellscript
python -m venv .venv
source .venv/bin/activate
```


In [None]:
# Install the notebook's dependencies (listed in requirements.txt) using the
# IPython %pip magic.
%pip install -r requirements.txt
# Disabled: enables the ipywidgets notebook extension. Presumably only needed
# on the classic Notebook UI for the tqdm.notebook progress bar - TODO confirm.
#!jupyter nbextension enable --py widgetsnbextension


In [None]:
# Basic Chroma DB example: insert a handful of vectors and query them
import chromadb

COLLECTION_NAME = "basic_example"
INCLUDE_FIELDS = ["metadatas", "embeddings", "distances"]


def _print_query_results(results, header):
    """Print the hits of the first (and only) query contained in *results*.

    *results* is the dict returned by ``collection.query``; every field is a
    list with one entry per query embedding, hence the ``[0]`` indexing.
    """
    print(header)
    hits = zip(
        results['ids'][0],
        results['distances'][0],
        results['metadatas'][0],
        results['embeddings'][0],
    )
    for doc_id, distance, metadata, vector in hits:
        print(f' - ID: {doc_id}, Distance: {distance:.5f}, Metadata: {metadata}, Vector: [{vector[0]:.1f}, ...]')


# Chroma client initialisation
client = chromadb.Client()

# Start from a clean collection so that the cell can be re-run safely.
try:
    client.delete_collection(COLLECTION_NAME)
except Exception:
    # Best effort: the collection does not exist on the first run. The concrete
    # exception type differs between Chroma releases, so it is not narrowed
    # further, but KeyboardInterrupt/SystemExit are no longer swallowed.
    pass
# Chroma distance metric: https://docs.trychroma.com/usage-guide#changing-the-distance-function
collection = client.create_collection(name=COLLECTION_NAME, metadata={"hnsw:space": "l2"})

# Four vectors of dimension 3, each tagged with a colour
vectores = [
    [0.1, 0.1, 0.1],
    [0.2, 0.2, 0.2],
    [0.3, 0.3, 0.3],
    [0.4, 0.4, 0.4],
]
metadatas = [
    {"color": "blue"},
    {"color": "blue"},
    {"color": "red"},
    {"color": "red"},
]

collection.upsert(ids=["1", "2", "3", "4"], embeddings=vectores, metadatas=metadatas)

q_embedding = [0.12, 0.12, 0.12]

# Plain nearest-neighbour query (the embedding is passed as a batch of one)
results = collection.query(query_embeddings=[q_embedding], n_results=3, include=INCLUDE_FIELDS)
_print_query_results(results, f'Resultados para embedding: {q_embedding}')

# Same query, restricted to the vectors whose metadata has color == "red"
results = collection.query(
    query_embeddings=[q_embedding],
    where={"color": "red"},
    n_results=3,
    include=INCLUDE_FIELDS,
)
_print_query_results(results, f'Resultados para embedding: {q_embedding} con filtro color=red')



In [None]:
import pandas as pd
import numpy as np

# Load the dataset; later cells read the 'title' and 'plot' fields of each row.
df = pd.read_json('datasets/star_wars_plots.json')
startwars_movies = df  # name used by the rest of the notebook
print('Columns:', df.columns, ', Records:', df.shape[0])
# Keep the preview as the LAST expression of the cell: a notebook only renders
# the value of the final expression, so a mid-cell df.head() was discarded.
startwars_movies.head()

### Creamos embeddings del argumento de las películas

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model. No device is forced: when `device` is
# omitted SentenceTransformer uses a GPU if one is available and falls back to
# the CPU otherwise, so the notebook also runs on machines without CUDA.
model = SentenceTransformer('all-minilm-l6-v2')
# Alternative model: SentenceTransformer('all-mpnet-base-v2')
print(model)

### Preparamos los datos para almacenarlos en la BBDD Vectorial

In [None]:
import time
from tqdm.notebook import tqdm

# Default chunk length, in words.
CHUNK_WORDS = 200


# With the defaults, chunks hold up to 200 words and consecutive chunks share
# 20 words (CHUNK_WORDS // 10).
def chunk_text(text, chunk_size=CHUNK_WORDS, overlap=CHUNK_WORDS//10):
    """Split *text* into chunks of words that overlap with their neighbours.

    Every model can only process a limited number of tokens, so long texts
    have to be split before being embedded. As a rule of thumb, 1000 tokens
    are roughly 750 words (https://openai.com/pricing#language-models).

    Args:
        text: The text to split; words are separated by any whitespace.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by two consecutive chunks.

    Returns:
        A list of strings, each with the words of one chunk joined by single
        spaces. An empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0 or overlap >= chunk_size:
        raise ValueError('chunk_size must be positive and overlap must be smaller than chunk_size')

    step = chunk_size - overlap
    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the end of the text; a further window
        # would contain only words already present in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

# Build, for every movie, the list of its plot chunks with their embeddings.
# The result is kept in a separate list (the DataFrame is not modified):
# plot_embeddings holds one entry per row, in the iteration order of
# startwars_movies, and each entry is a list of
# {'chunk': <text>, 'embedding': <list of floats>} dicts.
plot_embeddings = []
# iterrows() is a generator without a length, so the total is passed explicitly
# to let tqdm display real progress.
for i, row in tqdm(startwars_movies.iterrows(), total=len(startwars_movies)):
    plot_chunks = chunk_text(row['plot'])
    # Encode all the chunks of the movie in a single batch
    embeddings = model.encode(plot_chunks).tolist()
    plot_embeddings.append([{'chunk': chunk, 'embedding': embedding} for chunk, embedding in zip(plot_chunks, embeddings)])
    print(i, row['title'], 'chunks:', len(plot_chunks))
    


In [None]:
# Flatten the per-movie chunks into one document per chunk, each carrying its
# embedding and a metadata dict with the movie title and the chunk text.

movie_titles = startwars_movies['title'].tolist()

# (title, chunk record) pairs, in movie order and then chunk order
titled_chunks = [
    (movie_title, chunk_record)
    for movie_pos, movie_title in enumerate(movie_titles)
    for chunk_record in plot_embeddings[movie_pos]
]

# Ids are consecutive integers starting at 1
vector_data = [
    {
        "id": doc_id,
        "embedding": chunk_record['embedding'],
        "metadata": {'title': movie_title, 'plot_chunk': chunk_record['chunk']},
    }
    for doc_id, (movie_title, chunk_record) in enumerate(titled_chunks, start=1)
]

# Quick sanity check: show the first documents and the totals
for v in vector_data[:10]:
    doc_meta = v['metadata']
    print(v['id'], doc_meta['title'], '-', doc_meta['plot_chunk'][:80])
print('Vectors:', len(vector_data), 'Movies:', len(startwars_movies))


### Conectamos con Qdrant para crear una colección y alimentarla con los embeddings


In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

MOVIES_COLLECTION = "movies_sw"

# Client for the Qdrant instance started with the docker command shown above
qd = QdrantClient(url="http://localhost:6333")

# Drop any collection left by a previous run, then create it again
qd.delete_collection(collection_name=MOVIES_COLLECTION)

# "size" is the dimension of the vectors (taken from the embedding model);
# "distance" is the metric used to compare vectors
embedding_dim = model.get_sentence_embedding_dimension()
qd.create_collection(
    collection_name=MOVIES_COLLECTION,
    vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
)

movies = qd.get_collection(collection_name=MOVIES_COLLECTION)
print(movies)

In [None]:
from qdrant_client.models import PointStruct

# One Qdrant point per chunk document: integer id, embedding vector and the
# metadata dict as payload
points = [
    PointStruct(id=int(doc['id']), vector=doc['embedding'], payload=doc['metadata'])
    for doc in vector_data
]
print('Points:', len(points), 'Movies:', len(startwars_movies))

# wait=True is passed so the collection info printed below is read after the
# upsert call has completed
op = qd.upsert(collection_name=MOVIES_COLLECTION, points=points, wait=True)

print(qd.get_collection(MOVIES_COLLECTION))



### Realizamos búsquedas semánticas sobre la saga Star Wars (¡aviso de spoilers!)

In [None]:


# Natural-language questions to run against the movie plot chunks
queries = [
    'Anakin gana una carrera cuando era un niño',
    'Los clones reciben la orden de ejecutar a los Jedi',
    'Han Solo gana el Halcón Milenario en una partida de cartas',
    # Disabled queries:
    #'Kylo Ren mata a su padre, Han Solo',
    #'Palpatine es derrotado definitivamente por Rey',
    # Less accurate queries
    'Luke descubre que Darth Vader es su padre',
    'Luke encuentra a Yoda y es entrenado como Jedi',
]


def _print_hit(hit):
    """Print the score, movie title, point id and chunk text of one search hit."""
    print('     ', hit.score, ' => ', hit.payload['title'])
    print('     ', f'(id: {hit.id})', hit.payload['plot_chunk'])


for question in queries:
    # Embed the question with the same model used for the chunks and fetch
    # the 3 closest points
    question_vector = model.encode(question)
    hits = qd.search(MOVIES_COLLECTION, query_vector=question_vector, limit=3)
    print('Q:', question)
    for hit in hits:
        _print_hit(hit)

