# Exploration de la base de données vectorielle

## Récupération de la base de données vectorielle

In [None]:
from config import DB_DIR_S3, DB_DIR_LOCAL, EMB_MODEL_NAME
from utils import loading_utilities

# Load the Chroma database located under the project's S3 prefix, using
# DB_DIR_LOCAL as the local persistence directory.
# NOTE(review): presumably this copies the S3 content to DB_DIR_LOCAL -- confirm
# against loading_utilities.load_chroma_db.
loading_utilities.load_chroma_db(
    persist_directory=DB_DIR_LOCAL,
    s3_path=f"s3/projet-llm-insee-open-data/{DB_DIR_S3}",
)

In [None]:
import chromadb

# Open a Chroma client rooted at DB_DIR_LOCAL and grab the "insee_data" collection.
client = chromadb.PersistentClient(path=DB_DIR_LOCAL)
collection = client.get_collection("insee_data")

# Fetch one record with metadata, document text and embeddings. The result is
# not bound to a name, and only the last expression of a cell is rendered, so
# this first call is effectively a smoke test.
collection.get(limit=1, include=["metadatas", "documents", "embeddings"])

# Fetch one record's metadata only; this is the value shown as the cell output.
collection.get(limit=1, include=["metadatas"])

In [None]:
# pandas is needed for json_normalize below; it was not imported anywhere in
# the notebook, so import it locally in this cell.
import pandas as pd

# NOTE: `vectordb` is the LangChain Chroma wrapper created in the LangChain
# section further down; run that cell before this one.
docs = vectordb.similarity_search("Quelle est la définition du PIB ?", k=5)

# Flatten each returned document into a plain dict so that nested metadata
# keys become individual DataFrame columns.
docs_dict = [{"page_content": doc.page_content, "metadata": doc.metadata} for doc in docs]
docs_data = pd.json_normalize(docs_dict)
docs_data

## Avec LangChain

In [None]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from config import EMB_DEVICE

# Embedding model used by the vector store to encode queries.
embedding_model = HuggingFaceEmbeddings(  # load from sentence transformers
    model_name=EMB_MODEL_NAME,
    model_kwargs={"device": EMB_DEVICE},
    encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
    show_progress=False,
)

# LangChain wrapper around the "insee_data" collection stored under DB_DIR_LOCAL.
vectordb = Chroma(
    collection_name="insee_data",
    persist_directory=DB_DIR_LOCAL,
    embedding_function=embedding_model,
)
# No explicit persist() call: langchain_chroma.Chroma does not expose that
# method (Chroma >= 0.4 persists automatically when persist_directory is set),
# so calling it raised AttributeError.

In [None]:
import os

from config import EMB_MODEL_NAME, DB_DIR_S3
from db_building import build_database_from_csv

# Parameters for the test build of the Chroma database.
EXPERIMENT_NAME = "BUILD_CHROMA_TEST"
MAX_NUMBER_PAGES = 100
CHROMA_DB_LOCAL_DIRECTORY = "data/chroma_database/chroma_test/"

# S3 location derived from the configured database directory; displayed as the
# cell output for a quick visual check.
path_s3 = f"s3/projet-llm-insee-open-data/{DB_DIR_S3}"
path_s3

In [None]:
# Global parameters
EXPERIMENT_NAME = "BUILD_CHROMA_TEST"
MAX_NUMBER_PAGES = 100
CHROMA_DB_LOCAL_DIRECTORY = "data/chroma_database/chroma_test/"

# Temporary workaround: force the MLflow tracking URI for this session.
os.environ["MLFLOW_TRACKING_URI"] = "https://projet-llm-insee-open-data-mlflow.user.lab.sspcloud.fr"

# Check mlflow URL is defined (always true while the workaround above is in place).
assert "MLFLOW_TRACKING_URI" in os.environ, "Please set the MLFLOW_TRACKING_URI environment variable."

In [None]:
# Build the test Chroma database from the CSV export.
# max_pages now uses the MAX_NUMBER_PAGES constant defined in the parameter
# cells instead of repeating the literal 100, so the limit is set in one place.
# NOTE(review): max_pages presumably caps the number of pages ingested -- confirm
# against db_building.build_database_from_csv.
db = build_database_from_csv(
    "../data_complete.csv",
    persist_directory=CHROMA_DB_LOCAL_DIRECTORY,
    max_pages=MAX_NUMBER_PAGES,
)

In [None]:
# Count the entries under the "documents" key of what `db.get()` returns.
_db_dump = db.get()
len(_db_dump["documents"])

In [None]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from config import EMB_MODEL_NAME, EMB_DEVICE

# Directory of the test database built earlier in this notebook.
CHROMA_DB_LOCAL_DIRECTORY = "data/chroma_database/chroma_test/"

# Embedding model used by the vector store to encode queries.
embedding_model = HuggingFaceEmbeddings(  # load from sentence transformers
    model_name=EMB_MODEL_NAME,
    model_kwargs={"device": EMB_DEVICE},
    encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
    show_progress=False,
)

# LangChain wrapper around the "insee_data" collection of the test database.
vectordb = Chroma(
    collection_name="insee_data",
    persist_directory=CHROMA_DB_LOCAL_DIRECTORY,
    embedding_function=embedding_model,
)
# No explicit persist() call: langchain_chroma.Chroma does not expose that
# method (Chroma >= 0.4 persists automatically when persist_directory is set),
# so calling it raised AttributeError.

In [None]:
# Count the entries under the "documents" key of what `vectordb.get()` returns.
_vectordb_dump = vectordb.get()
len(_vectordb_dump["documents"])