In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np

import chromadb.utils.embedding_functions as embedding_functions
import chromadb
from langchain_openai import AzureOpenAIEmbeddings
from langchain_chroma import Chroma

In [2]:
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        ValueError: if the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise ValueError(f"{name} not found in environment variables")
    return value


# Every setting below is mandatory: fail fast if the .env file did not provide it.
AZURE_OPENAI_API_KEY_2 = _require_env('AZURE_OPENAI_API_KEY')
DEPLOYMENT_NAME_LLM = _require_env('DEPLOYMENT_NAME_LLM')
API_VERSION = _require_env('API_VERSION')
AZURE_ENDPOINT_LLM = _require_env('AZURE_ENDPOINT_LLM')
EMBEDDING_KEY = _require_env('EMBEDDING_KEY')
DEPLOYMENT_NAME_EMBEDDING = _require_env('DEPLOYMENT_NAME_EMBEDDING')
AZURE_ENDPOINT_EMBEDDING = _require_env('AZURE_ENDPOINT_EMBEDDING')
API_BASE_EMBEDDING = _require_env('API_BASE_EMBEDDING')

In [3]:
# Embedding function used by the native ChromaDB collections
embedding_function = embedding_functions.OpenAIEmbeddingFunction(
    api_key = EMBEDDING_KEY,
    api_base = API_BASE_EMBEDDING,
    api_type = "azure",
    api_version = "2023-05-15",
    model_name = DEPLOYMENT_NAME_EMBEDDING,
)
# Embedding function used by the LangChain wrappers.
# The key is passed explicitly so this client uses the same credential as the
# ChromaDB embedding function above instead of whatever key LangChain would
# otherwise pick up from the process environment.
embedding_function_to_langchain = AzureOpenAIEmbeddings(
    model = 'text-embedding-3-small',
    deployment = DEPLOYMENT_NAME_EMBEDDING,
    azure_endpoint = AZURE_ENDPOINT_EMBEDDING,
    api_key = EMBEDDING_KEY,
)

# Create/load a local Chroma database.
persist_directory = 'embedding_OpenAI/chroma/'
client = chromadb.PersistentClient(path = persist_directory)

# Create or get the two collections
collection_full_text = client.get_or_create_collection(
    name = "full_text",
    embedding_function = embedding_function,
)
collection_questions = client.get_or_create_collection(
    name = "questions_text",
    embedding_function = embedding_function,
)

# LangChain Chroma instances bound to the same client and collection names
chroma_instance_full_text = Chroma(
    client = client,
    collection_name = "full_text",
    embedding_function = embedding_function_to_langchain,
)
chroma_instance_questions = Chroma(
    client = client,
    collection_name = "questions_text",
    embedding_function = embedding_function_to_langchain,
)

In [6]:
# Collect the ids of every entry in "full_text" that belongs to the document to delete.
# Reset first so a failed lookup cannot leave ids from a previous run behind.
ids_to_delete = []
documents = collection_full_text.get(include=["documents", "metadatas"])

# Transform metadata into a pandas data frame to filter ids
df_metadata = pd.DataFrame.from_dict(documents['metadatas'])

# Keep only the rows of the document of interest
doc_to_delete = 'DeepSeek-R1 Incentivizing Reasoning Capability in LLMs via.pdf'
if df_metadata.empty:
    # An empty collection yields a frame without any columns; selecting
    # 'document_name' would raise KeyError, so build the empty result directly.
    df_to_delete = pd.DataFrame(columns=['document_name', 'vector_db_id'])
else:
    df_to_delete = df_metadata[df_metadata['document_name'] == doc_to_delete]

# Get only the ids, as strings
ids_to_delete = [str(x) for x in df_to_delete['vector_db_id'].to_list()]

print(f'Length of ids_to_delete: {len(ids_to_delete)}')
display(ids_to_delete)

Length of ids_to_delete: 36


['112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138',
 '139',
 '140',
 '141',
 '142',
 '143',
 '144',
 '145',
 '146',
 '147']

In [7]:
# Delete from Chroma.
# An empty id list is never a meaningful delete request, so skip the calls
# instead of relying on how the installed chromadb version treats ids=[].
# This also keeps the cell safe to re-run once the document is already gone.
if ids_to_delete:
    chroma_instance_full_text.delete(ids = ids_to_delete)
    # NOTE(review): the ids were looked up in "full_text" only; this assumes
    # "questions_text" uses the same ids for the same document - confirm.
    chroma_instance_questions.delete(ids = ids_to_delete)
else:
    print('Nothing to delete: ids_to_delete is empty')

## Verify that the document was deleted

In [8]:
# We want an empty list!
# Repeat the lookup against "full_text": after the delete, no entry of the
# document should remain.
ids_to_delete = []
documents = collection_full_text.get(include=["documents", "metadatas"])

# Transform metadata into a pandas data frame to filter ids
df_metadata = pd.DataFrame.from_dict(documents['metadatas'])

# Keep only the rows of the document of interest
doc_to_delete = 'DeepSeek-R1 Incentivizing Reasoning Capability in LLMs via.pdf'
if df_metadata.empty:
    # If the deleted document was the only one, the collection is now empty and
    # the frame has no columns; selecting 'document_name' would raise KeyError.
    df_to_delete = pd.DataFrame(columns=['document_name', 'vector_db_id'])
else:
    df_to_delete = df_metadata[df_metadata['document_name'] == doc_to_delete]

# Get only the ids, as strings
ids_to_delete = [str(x) for x in df_to_delete['vector_db_id'].to_list()]

print(f'Length of ids_to_delete: {len(ids_to_delete)}')
display(ids_to_delete)

Length of ids_to_delete: 0


[]