In [1]:
import logging
import os
import s3fs

from langchain.schema.runnable.config import RunnableConfig
from langchain_core.prompts import PromptTemplate

from src.chain_building.build_chain import build_chain
from src.chain_building.build_chain_validator import build_chain_validator
from src.config import CHATBOT_TEMPLATE, EMB_MODEL_NAME
from src.db_building import (
    load_retriever,
    load_vector_database
)
from src.model_building import build_llm_model
from src.utils.formatting_utilities import add_sources_to_messages, str_to_bool

# --- Logging ---------------------------------------------------------------
# Module-level logger, plus root logging configured for timestamped DEBUG
# records. `basicConfig` does nothing if the root logger already has handlers.
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.DEBUG,
    datefmt="%Y-%m-%d %I:%M:%S %p",
    format="%(asctime)s %(message)s",
)

# --- Remote storage --------------------------------------------------------
os.environ["MLFLOW_TRACKING_URI"] = (
    "https://projet-llm-insee-open-data-mlflow.user.lab.sspcloud.fr/"
)
# The S3 endpoint host is read from the environment (KeyError when unset).
s3_endpoint_url = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_endpoint_url})

# PARAMETERS --------------------------------------


def _env_flag(name, default):
    """Read a boolean setting from an environment variable.

    ``os.getenv`` returns a string whenever the variable is set, and every
    non-empty string (including ``"False"`` or ``"0"``) is truthy, so the raw
    value cannot be used as a boolean directly.

    Args:
        name: Name of the environment variable.
        default: Boolean returned when the variable is not set.

    Returns:
        bool: ``default`` when the variable is unset; ``False`` when it holds
        an empty string or one of "0", "false", "no", "off", "n", "f"
        (case-insensitive, surrounding whitespace ignored); ``True`` otherwise.
    """
    raw_value = os.getenv(name)
    if raw_value is None:
        return default
    return raw_value.strip().lower() not in {"", "0", "false", "no", "off", "n", "f"}


os.environ['UVICORN_TIMEOUT_KEEP_ALIVE'] = "0"

model = os.getenv("LLM_MODEL_NAME")
CHROMA_DB_LOCAL_DIRECTORY = "./data/chroma_db"
CLI_MESSAGE_SEPARATOR = f"{80*'-'} \n"
quantization = True
DEFAULT_MAX_NEW_TOKENS = 10
DEFAULT_MODEL_TEMPERATURE = 1
embedding = os.getenv("EMB_MODEL_NAME", EMB_MODEL_NAME)

model_id = "meta-llama/Llama-3.2-3B-Instruct"
# NOTE(review): the LLM is pinned here. An earlier
# `os.getenv("LLM_MODEL_NAME", "meta-llama/Llama-3.2-3B-Instruct")` lookup was
# immediately overwritten by this assignment, so LLM_MODEL_NAME never affected
# LLM_MODEL. Restore the lookup if the environment override is wanted.
LLM_MODEL = "microsoft/Phi-3.5-mini-instruct"

# Boolean settings go through _env_flag so that "False"/"0" actually disable them.
QUANTIZATION = _env_flag("QUANTIZATION", True)
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", DEFAULT_MAX_NEW_TOKENS))
# Temperature is parsed as a float: int("0.7") would raise ValueError.
MODEL_TEMPERATURE = float(os.getenv("MODEL_TEMPERATURE", DEFAULT_MODEL_TEMPERATURE))
RETURN_FULL_TEXT = _env_flag("RETURN_FULL_TEXT", True)
DO_SAMPLE = _env_flag("DO_SAMPLE", True)
DATABASE_RUN_ID = "32d4150a14fa40d49b9512e1f3ff9e8c"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from src.db_building.corpus_building import (
    build_or_use_from_cache, DEFAULT_LOCATIONS,
    _preprocess_data
)

In [None]:
import os

In [4]:
# Model name forwarded to the preprocessing step.
embedding_model = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"

# Keyword arguments for the corpus preprocessing helper; `fs` is the S3
# filesystem created in the setup cell.
preprocess_options = dict(
    filesystem=fs,
    s3_bucket="projet-llm-insee-open-data",
    location_dataset=DEFAULT_LOCATIONS,
    embedding_model=embedding_model,
)

# The helper returns a pair, unpacked here as (df, all_splits).
# NOTE(review): `_preprocess_data` is a private helper of corpus_building, and
# the recorded run over the whole corpus was interrupted by hand.
df, all_splits = _preprocess_data(**preprocess_options)

2024-11-15 08:14:37,964 - INFO - Input data extracted from s3://projet-llm-insee-open-data
2024-11-15 08:14:38,002 - INFO - Found credentials in environment variables.
2024-11-15 08:14:46,670 - INFO - Processing page 1280888 -- 0/43226
2024-11-15 08:14:46,714 - INFO - Processing page 1280890 -- 1/43226
2024-11-15 08:14:46,793 - INFO - Processing page 1280892 -- 2/43226
2024-11-15 08:14:46,838 - INFO - Processing page 1280894 -- 3/43226
2024-11-15 08:14:46,871 - INFO - Processing page 1280896 -- 4/43226
2024-11-15 08:14:46,966 - INFO - Processing page 1280898 -- 5/43226
2024-11-15 08:14:47,008 - INFO - Processing page 1280904 -- 6/43226
2024-11-15 08:14:47,120 - INFO - Processing page 1280900 -- 7/43226
2024-11-15 08:14:47,185 - INFO - Processing page 1280902 -- 8/43226
2024-11-15 08:14:47,219 - INFO - Processing page 1280906 -- 9/43226
2024-11-15 08:14:47,256 - INFO - Processing page 1280908 -- 10/43226
2024-11-15 08:14:47,297 - INFO - Processing page 1280910 -- 11/43226
2024-11-15 08:

KeyboardInterrupt: 

In [None]:
import os

from langchain_huggingface import HuggingFaceEmbeddings
from src.model_building.fetch_llm_model import cache_model_from_hf_hub

embedding_model = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
embedding_device = "cuda"

# Both tokens are read from the environment; a missing variable raises KeyError.
cache_model_from_hf_hub(
    embedding_model,
    hf_token=os.environ["HF_token"],
    s3_token=os.environ["AWS_SESSION_TOKEN"],
)

# Embedding wrapper (loaded through sentence-transformers) running on
# `embedding_device`. Normalized embeddings are used for cosine similarity.
emb_model = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs=dict(device=embedding_device),
    encode_kwargs=dict(normalize_embeddings=True),
    show_progress=False,
)

2024-11-14 14:11:33,620 - INFO - Load pretrained SentenceTransformer: Alibaba-NLP/gte-Qwen2-1.5B-instruct


Model Alibaba-NLP/gte-Qwen2-1.5B-instruct found in local cache. 


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.08it/s]
2024-11-14 14:11:37,403 - INFO - 1 prompts are loaded, with the keys: ['query']


In [7]:
# Keep only the first 100 entries of `all_splits` as a small sample.
# NOTE(review): the Chroma cell that follows passes `all_splits`, not this
# subset, so `documents_to_embed` is never used in the visible cells.
documents_to_embed = all_splits[:100]

In [2]:
from langchain_chroma import Chroma

persist_directory = "./test_chroma"
collection_name = "insee_data"

# One string id per split, in order: "0", "1", "2", ...
ids = list(map(str, range(len(all_splits))))

# Chroma collection stored under `persist_directory`, embedding with `emb_model`.
vector_store = Chroma(
    embedding_function=emb_model,
    collection_name=collection_name,
    persist_directory=persist_directory,
)

# NOTE(review): this embeds every entry of `all_splits`; the 100-document
# sample `documents_to_embed` built in an earlier cell is not used here.
vector_store.add_documents(ids=ids, documents=all_splits)

ModuleNotFoundError: No module named 'langchain_chroma'

In [9]:
# Display the persistence directory currently bound in the kernel.
# NOTE(review): the recorded output ('./test_chroma_emb') differs from the value
# assigned in the preceding cell ('./test_chroma'), so the cells ran out of order.
persist_directory

'./test_chroma_emb'

In [3]:
import chromadb

persist_directory = "./test_chroma"
collection_name = "insee_data"

# On-disk client; chromadb.HttpClient() is the alternative noted by the author.
client = chromadb.PersistentClient(path=persist_directory)
# Fetch the collection if it already exists, otherwise create it.
col = client.get_or_create_collection(collection_name)

# Number of items in the collection (the recorded run returned 0).
col.count()

0

In [54]:
# Sanity check: ask the vector store for the 2 most similar documents to the
# French query "définition PIB".
vector_store.similarity_search("définition PIB", k=2)

[Document(metadata={'categorie': 'Publications grand public', 'collection': 'Insee Première', 'dateDiffusion': '2012-10-23 22:00', 'id': '', 'libelleAffichageGeo': 'France', 'sousTitre': '', 'theme': 'Mondialisation, compétitivité et innovation', 'titre': 'Innover pour résister à la crise ou se développer à l’export', 'url': 'https://www.insee.fr/fr/statistiques/1280888', 'xml_auteurs': '', 'xml_content': '', 'xml_intertitre': ''}, page_content='# Innover pour résister à la crise ou se développer à l’export\n\nAnthony Bouvier, division Enquêtes thématiques et études transversales, Insee\n\n## Résumé :\n\n### Entre 2008 et 2010, la moitié des sociétés de 10 salariés ou plus ont innové\n\nAu cours des années 2008 à 2010, 49\xa0% des sociétés marchandes de 10 salariés ou plus implantées en France ont innové (*sources*; *graphique 1*). L’innovation est entendue ici au sens large\xa0: elle peut concerner la création ou l’amélioration de produits (biens ou prestations de services), porter su

In [5]:
from langchain_chroma import Chroma
persist_directory = "./test_chroma_emb"
collection_name = "insee_data"

# Open the "insee_data" collection stored under `persist_directory`.
# The directory argument is where data is saved locally; remove it if not needed.
vector_store = Chroma(
    persist_directory=persist_directory,
    embedding_function=emb_model,
    collection_name=collection_name,
)

2024-11-14 08:46:06 AM Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [14]:
# Query the store for the 2 closest documents (the recorded run returned []).
vector_store.similarity_search("Définition PIB", k=2)

[]

In [11]:
# Similarity retriever that returns a single document per query.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 1},
    search_type="similarity",
)

# The recorded run returned an empty list for this question.
retriever.invoke("Quelle est la définition du PIB")

[]

In [None]:


# Add the documents to the persisted collection one chunk of documents at a
# time; `db` is rebound on each iteration and ends up referring to the store
# built from the last chunk.
# NOTE(review): neither `split_docs_chunked` nor `Settings` is defined or
# imported in the visible cells. `Settings` presumably comes from
# `chromadb.config`; confirm and import it before running on a fresh kernel.
# NOTE(review): the recorded NameError mentions `db`, which this code only
# assigns, so the output looks stale.
for split_docs_chunk in split_docs_chunked:
    db = Chroma.from_documents(
        collection_name=collection_name,
        documents=split_docs_chunk,
        persist_directory=persist_directory,
        embedding=emb_model,
        client_settings=Settings(anonymized_telemetry=False, is_persistent=True),
    )

NameError: name 'db' is not defined

In [3]:
# Small sample for quick experiments: the first 100 rows of `df`
# (presumably a pandas DataFrame; confirm).
df2 = df.head(100)

In [5]:
from src.db_building.document_chunker import chunk_documents

# NOTE(review): an instruct LLM id is passed under the "embedding_model" key;
# the recorded log shows it being used to tokenize text. Confirm this is intended.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
kwargs = dict(embedding_model=model_id, token=os.environ["HF_token"])

# Chunk the 100-row sample; the returned value is shown as the cell output.
chunk_documents(data=df2, **kwargs)

  from .autonotebook import tqdm as notebook_tqdm
2024-11-13 13:58:31,797 - INFO - Building the list of Document objects
2024-11-13 13:58:31,806 - INFO - Applying markdown spliter
2024-11-13 13:58:31,807 - INFO - Initializing token splitter
2024-11-13 13:58:31,809 - INFO - Using model meta-llama/Llama-3.2-3B-Instruct to tokenize text
2024-11-13 13:58:34,565 - INFO - Number of created chunks: 100 in the Vector Database


[Document(metadata={'titre': 'Innover pour résister à la crise ou se développer à l’export', 'categorie': 'Publications grand public', 'url': 'https://www.insee.fr/fr/statistiques/1280888', 'dateDiffusion': '2012-10-23 22:00', 'theme': 'Mondialisation, compétitivité et innovation', 'collection': 'Insee Première', 'libelleAffichageGeo': 'France', 'id': '', 'xml_intertitre': '', 'xml_auteurs': '', 'sousTitre': '', 'xml_content': ''}, page_content='# Innover pour résister à la crise ou se développer à l’export\n\nAnthony Bouvier, division Enquêtes thématiques et études transversales, Insee\n\n## Résumé :\n\n### Entre 2008 et 2010, la moitié des sociétés de 10 salariés ou plus ont innové\n\nAu cours des années 2008 à 2010, 49\xa0% des sociétés marchandes de 10 salariés ou plus implantées en France ont innové (*sources*; *graphique 1*). L’innovation est entendue ici au sens large\xa0: elle peut concerner la création ou l’amélioration de produits (biens ou prestations de services), porter su

In [11]:
    from transformers import AutoTokenizer

    # Resolve the Hugging Face token. The notebook's `kwargs` stores it under
    # "token", so that key is checked first; "HF_token" is kept for backward
    # compatibility, and the HF_token environment variable is the last resort.
    # The result is None when no token is available anywhere.
    hf_token = (
        kwargs.get("token")
        or kwargs.get("HF_token")
        or os.environ.get("HF_token")
    )

    # Load the tokenizer
    autokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

In [12]:
    # Get the maximum token length the tokenizer can handle
    # NOTE(review): with the tokenizer loaded from `model_id`, the recorded run
    # shows 131072 here. That is presumably the LLM context window rather than
    # a chunk size suited to an embedding model; confirm.
    chunk_size = autokenizer.model_max_length

    # Compute chunk overlap as 10% of the chunk size
    # (int() truncates: 131072 gives 13107 in the recorded run)
    chunk_overlap = int(chunk_size * 0.1)


In [13]:
# Display the chunking parameters computed above as a (size, overlap) tuple.
chunk_size, chunk_overlap

(131072, 13107)

In [14]:
# Obtain the tokenizer, chunk size and chunk overlap from a helper in one call.
# NOTE(review): `compute_autokenizer_chunk_size` is not defined or imported in
# the visible cells, and the recorded output is a NameError. Import it from the
# module that defines it before running this cell.
autokenizer, chunk_size, chunk_overlap = compute_autokenizer_chunk_size(
            kwargs.get("embedding_model")
        )

NameError: name 'compute_autokenizer_chunk_size' is not defined