In [1]:
import logging
import os
import s3fs

from langchain.schema.runnable.config import RunnableConfig
from langchain_core.prompts import PromptTemplate

from src.chain_building.build_chain import build_chain
from src.chain_building.build_chain_validator import build_chain_validator
from src.config import CHATBOT_TEMPLATE, EMB_MODEL_NAME
from src.db_building import (
    load_retriever,
    load_vector_database
)
from src.model_building import build_llm_model
from src.utils.formatting_utilities import add_sources_to_messages, str_to_bool

# Logging configuration
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %I:%M:%S %p",
    level=logging.DEBUG,
)

# Remote file configuration
os.environ['MLFLOW_TRACKING_URI'] = "https://projet-llm-insee-open-data-mlflow.user.lab.sspcloud.fr/"
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": f"""https://{os.environ["AWS_S3_ENDPOINT"]}"""})

# PARAMETERS --------------------------------------

os.environ['UVICORN_TIMEOUT_KEEP_ALIVE'] = "0"

model = os.getenv("LLM_MODEL_NAME")
CHROMA_DB_LOCAL_DIRECTORY = "./data/chroma_db"
CLI_MESSAGE_SEPARATOR = f"{80*'-'} \n"
quantization = True
DEFAULT_MAX_NEW_TOKENS = 10
DEFAULT_MODEL_TEMPERATURE = 1
embedding = os.getenv("EMB_MODEL_NAME", EMB_MODEL_NAME)

model_id = "meta-llama/Llama-3.2-3B-Instruct"
LLM_MODEL = os.getenv("LLM_MODEL_NAME", "meta-llama/Llama-3.2-3B-Instruct")
LLM_MODEL = "microsoft/Phi-3.5-mini-instruct"
QUANTIZATION = os.getenv("QUANTIZATION", True)
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", DEFAULT_MAX_NEW_TOKENS))
MODEL_TEMPERATURE = int(os.getenv("MODEL_TEMPERATURE", DEFAULT_MODEL_TEMPERATURE))
RETURN_FULL_TEXT = os.getenv("RETURN_FULL_TEXT", True)
DO_SAMPLE = os.getenv("DO_SAMPLE", True)
DATABASE_RUN_ID = "32d4150a14fa40d49b9512e1f3ff9e8c"


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os

from langchain_huggingface import HuggingFaceEmbeddings
from src.db_building.corpus_building import (
    build_or_use_from_cache, DEFAULT_LOCATIONS,
    _preprocess_data
)
from src.model_building.fetch_llm_model import cache_model_from_hf_hub
from langchain_chroma import Chroma
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %I:%M:%S %p",
    level=logging.DEBUG,
)

def create_vector_db(
    embedding_model: str,
    fs: s3fs.S3FileSystem,
    collection_name: str = "insee_data",
    persist_directory: str = "./data/chroma_db",
    embed: bool = True,
    **kwargs
):

    embedding_device = kwargs.get("embedding_device","cuda")
    dataset_location = kwargs.get("dataset_location", DEFAULT_LOCATIONS)
    max_pages = kwargs.get("max_pages", None)
    s3_bucket = kwargs.get("s3_bucket", "projet-llm-insee-open-data")
    hf_token = kwargs.get("hf_token", os.environ["HF_TOKEN"])
    s3_token = kwargs.get("s3_token", os.environ["AWS_SESSION_TOKEN"])

    cache_model_from_hf_hub(
        embedding_model, hf_token = hf_token, s3_token = s3_token
    )

    logging.info(f"{50*'-'}\nStructuring documents in a dataframe")

    df, all_splits = _preprocess_data(
            filesystem=fs,
            s3_bucket=s3_bucket,
            location_dataset=dataset_location,
            embedding_model=embedding_model,
            max_pages=max_pages
        )

    if embed is False:
        return df, all_splits, None

    emb_model = HuggingFaceEmbeddings(  # load from sentence transformers
            model_name=embedding_model,
            model_kwargs={"device": embedding_device},
            encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
            show_progress=False
        )

    ids = [str(idx) for idx, _ in enumerate(all_splits)]

    logging.info(f"{50*'-'}\nStarting to embed documents in a vector database")

    vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=emb_model,
        persist_directory=persist_directory,
    )

    vector_store.add_documents(documents=all_splits)

    return df, all_splits, vector_store

In [3]:
persist_directory = "./test_chroma"
embedding_model = "OrdalieTech/Solon-embeddings-large-0.1"

df, all_splits, _ = create_vector_db(
    embedding_model=embedding_model, fs=fs,
    embed = False
)

Fetching model OrdalieTech/Solon-embeddings-large-0.1 from S3.


2024-11-19 08:26:25,660 - INFO - --------------------------------------------------
Structuring documents in a dataframe
2024-11-19 08:26:25,662 - INFO - Input data extracted from s3://projet-llm-insee-open-data
2024-11-19 08:26:25,711 - INFO - Found credentials in environment variables.
2024-11-19 08:26:36,141 - INFO - Processing page 1280888 -- 0/43226
2024-11-19 08:26:36,186 - INFO - Processing page 1280890 -- 1/43226
2024-11-19 08:26:36,263 - INFO - Processing page 1280892 -- 2/43226
2024-11-19 08:26:36,307 - INFO - Processing page 1280894 -- 3/43226
2024-11-19 08:26:36,339 - INFO - Processing page 1280896 -- 4/43226
2024-11-19 08:26:36,421 - INFO - Processing page 1280898 -- 5/43226
2024-11-19 08:26:36,455 - INFO - Processing page 1280904 -- 6/43226
2024-11-19 08:26:36,527 - INFO - Processing page 1280900 -- 7/43226
2024-11-19 08:26:36,588 - INFO - Processing page 1280902 -- 8/43226
2024-11-19 08:26:36,621 - INFO - Processing page 1280906 -- 9/43226
2024-11-19 08:26:36,658 - INFO 

In [None]:
from src.db_building.utils_db import split_list

max_batch_size = 41600
split_docs_chunked = split_list(all_splits, max_batch_size)

<generator object split_list at 0x7faca0eafc40>

In [None]:
embeddings = HuggingFaceEmbeddings(
    model_name = embedding_model,
    model_kwargs = {'device': 'cuda'},
    encode_kwargs = {'normalize_embeddings': False}
)

    # Loop through the chunks and build the Chroma database
    try:
        for split_docs_chunk in split_docs_chunked:
            db = Chroma.from_documents(
                collection_name=collection_name,
                documents=split_docs_chunk,
                persist_directory=persist_directory,
                embedding=emb_model,
                client_settings=Settings(anonymized_telemetry=False, is_persistent=True),
            )
    except Exception as e:
        logging.error(f"An error occurred while building the Chroma database: {e}")


2024-11-19 10:48:18,989 - INFO - Load pretrained SentenceTransformer: OrdalieTech/Solon-embeddings-large-0.1


AttributeError: 'list' object has no attribute 'page_content'

In [None]:
import chromadb

client = chromadb.PersistentClient(path="./chroma_langchain_db2")  # or HttpClient()
col = client.get_or_create_collection("example_collection")  # create a new collection

col.count()

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

collection_name = "insee_data",
persist_directory = "./data/chroma_db",

ids = [str(idx) for idx, _ in enumerate(all_splits)]


emb_model = HuggingFaceEmbeddings(  # load from sentence transformers
            model_name=embedding_model,
            model_kwargs={"device": "cuda"},
            encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
            show_progress=False
)

vector_store = Chroma(
    collection_name=collection_name,
    embedding_function=emb_model,
    persist_directory=persist_directory,
)

vector_store.add_documents(documents=all_splits[:5], ids = ids[:5])

In [None]:
import chromadb

persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("collection_name")
collection.add(ids=["1", "2", "3"], documents=["a", "b", "c"])

vector_store_from_client = Chroma(
    client=persistent_client,
    collection_name="collection_name",
    embedding_function=embeddings,
)

In [None]:
import chromadb

collection_name = "insee_data"

client = chromadb.PersistentClient(path=persist_directory)  # or HttpClient()
col = client.get_or_create_collection("insee_data")  # create a new collection

col.count()

In [None]:
import langchain_chroma as lc
lc.version

In [None]:
vector_store.similarity_search("définition PIB", k=2)

In [None]:
from langchain_chroma import Chroma
collection_name = "insee_data"
persist_directory = "./test_chroma_emb"


vector_store = Chroma(
    collection_name=collection_name,
    embedding_function=emb_model,
    persist_directory=persist_directory,  # Where to save data locally, remove if not necessary
)

In [None]:
vector_store.similarity_search(
    "Définition PIB",
    k=2
)

In [None]:
retriever = vector_store.as_retriever(
    search_type="similarity", search_kwargs={"k": 1}
)
retriever.invoke("Quelle est la définition du PIB")

In [None]:


for split_docs_chunk in split_docs_chunked:
    db = Chroma.from_documents(
        collection_name=collection_name,
        documents=split_docs_chunk,
        persist_directory=persist_directory,
        embedding=emb_model,
        client_settings=Settings(anonymized_telemetry=False, is_persistent=True),
    )

In [None]:
df2 = df.head(100)

In [None]:
model_id = 'meta-llama/Llama-3.2-3B-Instruct'
kwargs = {"embedding_model": model_id, "token": os.environ["HF_token"]}

from src.db_building.document_chunker import chunk_documents

chunk_documents(data=df2, **kwargs)

In [None]:
    from transformers import AutoTokenizer
    hf_token = kwargs.get("HF_token", os.environ.get("HF_token", None))

    # Load the tokenizer
    autokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

In [None]:
    # Get the maximum token length the tokenizer can handle
    chunk_size = autokenizer.model_max_length

    # Compute chunk overlap as 10% of the chunk size
    chunk_overlap = int(chunk_size * 0.1)


In [None]:
chunk_size, chunk_overlap

In [None]:
autokenizer, chunk_size, chunk_overlap = compute_autokenizer_chunk_size(
            kwargs.get("embedding_model")
        )