In [1]:
import os
from dotenv import load_dotenv

In [2]:
load_dotenv()
# Check if variables are correctly loaded from .env   
EMBEDDING_KEY = os.getenv('EMBEDDING_KEY')
if not EMBEDDING_KEY:
    raise ValueError("EMBEDDING_KEY not found in environment variables")

DEPLOYMENT_NAME_EMBEDDING = os.getenv('DEPLOYMENT_NAME_EMBEDDING')
if not DEPLOYMENT_NAME_EMBEDDING:
    raise ValueError("DEPLOYMENT_NAME_EMBEDDING not found in environment variables")
    
AZURE_ENDPOINT_EMBEDDING = os.getenv('AZURE_ENDPOINT_EMBEDDING')
if not AZURE_ENDPOINT_EMBEDDING:
    raise ValueError("AZURE_ENDPOINT_EMBEDDING not found in environment variables")

API_BASE_EMBEDDING = os.getenv('API_BASE_EMBEDDING')
if not API_BASE_EMBEDDING:
    raise ValueError("API_BASE_EMBEDDING not found in environment variables")

ELASTICSEARCH_USER = os.getenv('ELASTICSEARCH_USER')
if not API_BASE_EMBEDDING:
    raise ValueError("ELASTICSEARCH_USER not found in environment variables")

ELASTICSEARCH_PASSWORD = os.getenv('ELASTICSEARCH_PASSWORD')
if not API_BASE_EMBEDDING:
    raise ValueError("ELASTICSEARCH_PASSWORD not found in environment variables")

ELASTICSEARCH_ENDPOINT = os.getenv('ELASTICSEARCH_ENDPOINT')
if not ELASTICSEARCH_ENDPOINT:
    raise ValueError("ELASTICSEARCH_ENDPOINT not found in environment variables")

In [3]:
def return_vectordb_full_text_and_questions():
    '''
    collection_full_text: Connection to the full-text index.
    collection_questions_text: Connection to the question-based index.
    es: Direct Elasticsearch client instance.
    '''
    from langchain_openai import AzureOpenAIEmbeddings
    from langchain_elasticsearch import ElasticsearchStore
    from elasticsearch import Elasticsearch

    # Embedding for LangChain
    embedding_function_to_langchain = AzureOpenAIEmbeddings(
        model = 'text-embedding-3-small',
        api_key = EMBEDDING_KEY,
        deployment = DEPLOYMENT_NAME_EMBEDDING,
        azure_endpoint = AZURE_ENDPOINT_EMBEDDING
    ) 

    dict_return = {}
    # Conect to elastic search
    dict_return['collection_full_text'] = ElasticsearchStore(
        es_url = ELASTICSEARCH_ENDPOINT,
        index_name = "collection_full_text",
        embedding = embedding_function_to_langchain,
        es_user = ELASTICSEARCH_USER,
        es_password = ELASTICSEARCH_PASSWORD,
    )
    
    dict_return['collection_questions_text'] = ElasticsearchStore(
        es_url = ELASTICSEARCH_ENDPOINT,
        index_name = "collection_questions_text",
        embedding = embedding_function_to_langchain,
        es_user = ELASTICSEARCH_USER,
        es_password = ELASTICSEARCH_PASSWORD,
    )

    dict_return['es'] = Elasticsearch(
        ELASTICSEARCH_ENDPOINT,
        basic_auth=(ELASTICSEARCH_USER, ELASTICSEARCH_PASSWORD)
    )
    
    return dict_return

# Get necessary information about the collection in the vector db
index_name_collection_full_text = 'collection_full_text'
index_name_collection_questions_text = 'collection_questions_text'
# Connect with elastic seach and langchain
dict_vectordb = return_vectordb_full_text_and_questions()

## Deleta documentos por metadados document_name

In [4]:
# search_document_by_document_name
def search_document_by_document_name(dict_vectordb_fc, index_name, document_name_to_find):
    """
    Search Elasticsearch index for a document with a matching content query and return its metadata.
    
    :param index_name: Elasticsearch index name
    :param query: Query to search in the document content
    :param size: Number of documents to retrieve
    :return: Document id, including the metadata
    """
    search_result = dict_vectordb_fc['es'].search(
        index = index_name,
        body = {
            "query": {
                "term": {
                    "metadata.document_name.keyword": document_name_to_find  # Search in the 'text' field
                }
            },
            "size": 10000  # Move 'size' inside the 'body'
        }
    )
    
    if search_result["hits"]["total"]["value"] > 0:  # represents the total number of documents that match the query.
        document_ids = [hit['_id'] for hit in search_result['hits']['hits']]
        return document_ids
    else:
        return None

In [5]:
doc_to_delete = 'DeepSeek-R1 Incentivizing Reasoning Capability in LLMs via.pdf'
ids_to_delete_full_text = search_document_by_document_name(dict_vectordb_fc = dict_vectordb, 
    index_name = "collection_full_text", 
    document_name_to_find = doc_to_delete,
)
ids_to_delete_questions_text = search_document_by_document_name(dict_vectordb_fc = dict_vectordb, 
    index_name = 'collection_questions_text', 
    document_name_to_find = doc_to_delete,
)
print(f'Length of ids_to_delete: {len(ids_to_delete_full_text)}')
display(ids_to_delete_full_text)

Length of ids_to_delete: 72


['c10d282f-24b3-4d10-99af-72b8c3e4401b',
 '5a0dcece-13fd-4fe2-83f8-a1abb97a2368',
 'bd58cfa3-668a-41de-b54a-b62a063fed1d',
 '01225d80-d6e8-459a-bc42-fdc9184c2766',
 '53d954c3-95de-456b-b23d-27ce9cbc0118',
 'a0598d11-e923-4f14-9a98-2bdab0bac451',
 '5f57f8dc-8a0c-48f8-903d-0423dde7f9c7',
 '88c6b1fa-0087-4f6d-8e84-f09667a826bf',
 'e6355a00-bfa6-4c26-8ef4-4d031c1981c8',
 'a518c894-f374-4d84-859a-072ed7f436bb',
 'd801c1d7-94d9-4034-b4a6-774fabad24ca',
 'de7eff27-d9fc-4a52-aa02-ad000389e942',
 '0ebd032c-65f7-4ccb-84a6-f79e094cd81c',
 '79e6c556-c3ba-4046-8941-18cea876d674',
 '3287694f-5143-4b28-88fd-64d7b5594375',
 '907b7096-570d-4993-ad37-b0353e6eea42',
 '14d89d59-f4a7-4568-acbc-b8cb7b4e6d03',
 '9f2314ef-766a-414c-a857-4be0bd41a670',
 '12fff5b1-a8fc-43e3-9783-54dac5359061',
 '35e7e1ab-63d7-4342-8a03-97cd7a445e44',
 '5df1e21b-347c-48e6-9db0-43ec876567cb',
 '010ff2ad-93e9-4b05-8cb4-fe29bfef024f',
 '6e04abec-292f-4af7-908e-e59617ab551d',
 '26e36824-e51c-48dd-af76-152bc8e9db4a',
 '69b903ff-20bd-

In [6]:
# Delete from Elasticsearch
dict_vectordb['collection_full_text'].delete(ids = ids_to_delete_full_text)
dict_vectordb['collection_questions_text'].delete(ids = ids_to_delete_questions_text)

True

## Verify if it was deleted

In [10]:
doc_to_delete = 'DeepSeek-R1 Incentivizing Reasoning Capability in LLMs via.pdf'
ids_to_delete_full_text = search_document_by_document_name(dict_vectordb_fc = dict_vectordb, 
    index_name = "collection_full_text", 
    document_name_to_find = doc_to_delete,
)
display(ids_to_delete_full_text)

None