# Embedding CRUD Notebook

Este codigo incluye la clase EmbeddingCRUD la cual contiene las funcionalidades CRUD y las importaciones necesarias para el funcionamiento de la misma.

In [77]:
import pandas as pd
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer
from qdrant_client.models import VectorParams, Distance
import numpy as np
import random

class EmbeddingCRUD:
    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.client = QdrantClient(
            url="https://4715095d-53aa-4f96-8cfe-e542e53f8dd8.eu-west-2-0.aws.cloud.qdrant.io:6333", 
            api_key="2Md9iPHFRxTCSdr6a_tgQrk1BJaf2SAfzx6e9qvqaH7aDc-wogWwtw",
            https=True
        )
        self.index_name = "embeddings_index"

        self._initialize_collection()

    def _initialize_collection(self):
        try:
            # Verifica si la colección ya existe
            self.client.get_collection(self.index_name)
            print(f"La colección '{self.index_name}' ya existe.")
        except:
            print(f"Colección '{self.index_name}' no encontrada, creando...")

            self.client.create_collection(
                collection_name=self.index_name,
                vectors_config=VectorParams(
                    size=self.model.get_sentence_embedding_dimension(),
                    distance=Distance.COSINE  # Se usa distancia de coseno para la similitud
                )
            )
            print(f"Colección '{self.index_name}' creada exitosamente.")

    def id_generation(self):
        return random.randint(1000, 9999)

    def create(self, text):
        embedding = self.model.encode([text])[0].tolist()
        new_id = self.id_generation()
        self.client.upsert(
            collection_name=self.index_name,
            points=[{
                'id': new_id,
                'vector': embedding,
                'payload': {'text': text}
            }]
        )
        return new_id, embedding

    def read(self):
        embeddings = []
        scroll_response = self.client.scroll(collection_name=self.index_name, limit=1000)

        while scroll_response and scroll_response[0]:
            for point in scroll_response[0]:
                embeddings.append({
                    'id': point.id,
                    'vector': point.vector if point.vector else [],
                    'payload': point.payload.get('text', '')
                })

            scroll_token = scroll_response[1]
            if not scroll_token:
                break
            
            scroll_response = self.client.scroll(collection_name=self.index_name, scroll_token=scroll_token)

        return embeddings

    def read_similarity(self, text_query, top_k=5):
        embedding_query = self.model.encode([text_query])[0]
        resultados = self.client.search(
            collection_name=self.index_name,
            query_vector=embedding_query.tolist(),
            limit=top_k
        )
    
        return [{
            "id": res.id,
            "text": res.payload["text"],
            "score": res.score
        } for res in resultados]
    
    def update(self, id, new_text):
        updated_embedding = self.model.encode([new_text])[0]
        self.client.upsert(
            collection_name=self.index_name,
            points=[{
                'id': id,
                'vector': updated_embedding.tolist(),
                'payload': {'text': new_text}
            }]
        )
        print(f"El texto en el id {id} ha sido actualizado a '{new_text}'")

    def delete(self, id):
        self.client.delete(
            collection_name=self.index_name,
            points_selector=[id]
        )
        print(f"Se ha eliminado el índice {id}")

## Example execution file

Instancia de la clase con los metodos CRUD

In [78]:
crud = EmbeddingCRUD()


La colección 'embeddings_index' ya existe.


##### Crear algunos embeddings

In [66]:
id1, embedding1 = crud.create("This is a sample sentence")
id2, embedding2 = crud.create("Another sentence for testing")

[-0.006979598663747311, 0.0739775225520134, -0.000775974418502301, 0.0502782016992569, 0.005629922263324261, -0.010968574322760105, 0.08738940209150314, -0.006780859082937241, -0.06438897550106049, 0.032161545008420944, 0.13903601467609406, -0.05940850079059601, 0.026748960837721825, 0.0030251152347773314, -0.008497769013047218, -0.02316739782691002, 0.056991662830114365, 0.018725097179412842, -0.047967761754989624, 0.027849968522787094, -8.675764547660947e-05, -0.04626139625906944, 0.0025781297590583563, 0.049383871257305145, -0.04872032254934311, 0.020609918981790543, -0.07404062151908875, -0.004054099787026644, 0.05895817652344704, -0.002789926715195179, 0.0173474308103323, 0.05741634964942932, -0.033127401024103165, 0.05332649499177933, 0.0833962932229042, 0.0012227274710312486, 0.024846797809004784, -0.001949883415363729, 0.012833165004849434, -0.024153605103492737, -0.001943204435519874, -0.11962920427322388, 0.04737550765275955, 0.02345479652285576, -0.0031391808297485113, -0.00

#### Leer los embeddings

In [91]:
results = crud.read()
print(results)

[]


##### Buscar embeddings similares a una query

In [86]:
resultados = crud.read_similarity("sample")
print("\nEmbeddings más similares a la consulta:")
for resultado in resultados:
    print(resultado)

  resultados = self.client.search(



Embeddings más similares a la consulta:
{'id': 7605, 'text': 'Another sentence for testing', 'score': 0.41199708}
{'id': 9165, 'text': 'Another sentence for testing', 'score': 0.41199708}
{'id': 9496, 'text': 'Another sentence for testing', 'score': 0.41199708}
{'id': 1149, 'text': 'Updated sentence', 'score': 0.23405258}


##### Actualizar un embedding

In [68]:
crud.update(1149, "Updated sentence")

El texto en el id 1149 ha sido actualizado a 'Updated sentence'


##### Eliminar un embedding

In [90]:
crud.delete(1149)

Se ha eliminado el índice 1149
