### Generate a Complete Database 

- date : 24/05/2024
- New features : generate a complete database using the "OrdalieTech/Solon-embeddings-large-0.1" embedding model (config files have been improved)
- Expected improvement : better retrieval capabilities thanks to a much stronger embedding model.

In [None]:
import os
import sys

### Building New Complete Dataset based on config files 

In [None]:
# db = build_database_from_csv('/home/onyxia/work/llm-open-data-insee/data_complete.csv')
# db.similarity_search("Quels sont les chiffres du chômages en 2023")

### Loading Dataset based on config files

In [None]:
from db_building import reload_database_from_local_dir

# Load the database through the project helper, pointing it at the local
# Chroma persist directory (nothing is rebuilt in this cell).
db = reload_database_from_local_dir(
    persist_directory="/home/onyxia/work/llm-open-data-insee/data/chroma_db"
)

In [None]:
# Sanity check: print the number of ids stored in the vector store
# (there should be at least one encoded document).
print(len(db.get()["ids"]))

In [None]:
# Ask the vector store for the 5 closest documents to a sample question
# and display the first one returned.
result = db.similarity_search(
    "Quels résultats au BAC les étudiants de classes préparatoires ont ils généralement?", k=5
)
print(result[0])

In [None]:
from chain_building.build_chain import load_retriever
from config import EMB_MODEL_NAME, MODEL_NAME, RAG_PROMPT_TEMPLATE
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from model_building import build_llm_model


def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)


def build_chain_test(retriever, prompt, llm):
    """
    Assemble a retrieval-augmented generation chain that also exposes its sources.

    The returned runnable takes the question as input and produces a mapping with
    three keys: "context" (what the retriever returned), "question" (the input,
    passed through untouched) and "answer" (the parsed LLM output).
    """

    def _join_context(inputs):
        # Flatten the retrieved documents into one text block for the prompt.
        return format_docs(inputs["context"])

    # Answering branch: rewrite "context" as text, then prompt -> llm -> plain string.
    answer_chain = (
        RunnablePassthrough.assign(context=_join_context) | prompt | llm | StrOutputParser()
    )

    # Retrieval branch: fetch the documents while forwarding the question as-is.
    retrieval = RunnableParallel({"context": retriever, "question": RunnablePassthrough()})

    return retrieval.assign(answer=answer_chain)

In [None]:
# Wrap the RAG template from the project config in a PromptTemplate that
# expects the "context" and "question" variables, then display it.
prompt = PromptTemplate(input_variables=["context", "question"], template=RAG_PROMPT_TEMPLATE)
print(prompt)

In [None]:
# Expose the vector store as a retriever using MMR search, asking for 5 documents.
# NOTE(review): "score_threshold" is normally paired with
# search_type="similarity_score_threshold"; it is probably ignored with "mmr" — confirm.
retriever = db.as_retriever(search_type="mmr", search_kwargs={"score_threshold": 0.5, "k": 5})

# retriever = load_retriever(emb_model_name=EMB_MODEL_NAME,
# persist_directory="/home/onyxia/work/llm-open-data-insee/data/chroma_db")

In [None]:
# Never commit an access token: the value previously hard-coded here must be
# considered leaked and should be revoked. Reuse HF_TOKEN when it is already
# exported, otherwise ask for it interactively (input is not echoed).
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass

    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

In [None]:
# Build the LLM through the project helper, enabling its quantization and
# config options and authenticating with the token stored in HF_TOKEN.
llm = build_llm_model(
    model_name=MODEL_NAME, quantization_config=True, config=True, token=os.environ["HF_TOKEN"]
)

In [None]:
# Wire the retriever, the prompt and the LLM into the RAG chain defined above.
chain = build_chain_test(retriever, prompt, llm)

In [None]:
# Display the chain object to inspect how it is composed.
chain

In [None]:
question = "Quel est le but initial derrière la création du système de retraites français après la Seconde Guerre mondiale?"
# question = "Quelle est la cause principale de l'augmentation de l'indice des prix à la consommation (IPC)?"
results = retriever.invoke(question)

# Show the source of each retrieved document followed by its content.
for i, doc in enumerate(results):
    # Single quotes inside the f-string: reusing the enclosing double quotes
    # is a syntax error before Python 3.12.
    print(f"Doc {i} : {doc.metadata['source']}")
    print(doc.page_content)

In [None]:
# Stream the chain's output for the question and print each chunk as it arrives.
for chunk in chain.stream(question):
    print(chunk)

In [None]:
# Run the chain once more, this time collecting the full result in one call.
answer = chain.invoke(question)

In [None]:
# Print only the generated answer from the chain's result.
print(answer["answer"])

### Adding a Reranker 

The goal of this part is to build a LangChain pipeline that includes a reranker. The candidates are a BM25 retriever, a ColBERT model, a French cross-encoder and a multilingual cross-encoder, together with several hyperparameters.

Reranker model list : 
- multilingual cross encoder : BAAI/bge-reranker-large (multilingual),
- French cross encoder : antoinelouis/crossencoder-electra-base-french-mmarcoFR OR dangvantuan/CrossEncoder-camembert-large
- BM25 : `from langchain_community.retrievers import BM25Retriever`
- ColBERT : antoinelouis/colbertv2-camembert-L4-mmarcoFR


In [None]:
# Shell command: recursively copy the persisted chroma_db folder from the S3
# bucket into the local data directory (`mc` is presumably the MinIO client).
!mc cp s3/projet-llm-insee-open-data/data/chroma_database/chroma_db /home/onyxia/work/llm-open-data-insee/data --recursive

In [None]:
# Add the project's src directory to the import path.
# NOTE(review): presumably what makes chain_building / config importable below — confirm.
sys.path.append("/home/onyxia/work/llm-open-data-insee/src")

In [None]:
from chain_building import load_retriever
# EMB_MODEL_NAME is used below, so import it here as well: this cell must not
# depend on an earlier cell having been executed in the same kernel session.
from config import EMB_MODEL_NAME, MODEL_NAME

# Build the retriever through the project helper, pointing it at the local
# Chroma directory and the "insee_data" collection, with "cuda" as the device.
retriever = load_retriever(
    emb_model_name=EMB_MODEL_NAME,
    persist_directory="/home/onyxia/work/llm-open-data-insee/data/chroma_db",
    device="cuda",
    collection_name="insee_data",
)

In [None]:
def pretty_print_docs(docs):
    """Print each document's content under a numbered header, split by a dashed rule."""
    # Rule placed between consecutive documents: 100 dashes on their own line.
    rule = "\n" + "-" * 100 + "\n"

    sections = []
    for position, document in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + document.page_content)

    print(rule.join(sections))

In [None]:
# Smoke test of the embedding-based retriever: run one question through it
# and print the documents it returns.
question = "Comment est calculé le pouvoir d'achat ?"
# question = "Quelle est la cause principale de l'augmentation de l'indice des prix à la consommation (IPC)?"
results = retriever.invoke(question)
pretty_print_docs(results)  # OK

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder  # CrossEncoder
from langchain_community.retrievers import BM25Retriever  # BM25
from ragatouille import RAGPretrainedModel  # ColBERT

# Load the ColBERT checkpoint and plug it in as a document compressor (k=5)
# on top of the base retriever.
colBERT = RAGPretrainedModel.from_pretrained("antoinelouis/colbertv2-camembert-L4-mmarcoFR")
colBERT_retriever = ContextualCompressionRetriever(
    base_compressor=colBERT.as_langchain_document_compressor(k=5), base_retriever=retriever
)

# Run the ColBERT-backed retriever on the current question and display the result.
compressed_docs = colBERT_retriever.invoke(question)
pretty_print_docs(compressed_docs)

In [None]:
# French cross-encoder reranker: keep the 5 best-scored documents among those
# returned by the base retriever.
model = HuggingFaceCrossEncoder(
    model_name="dangvantuan/CrossEncoder-camembert-large"
)  # "antoinelouis/crossencoder-electra-base-french-mmarcoFR")
compressor_1 = CrossEncoderReranker(model=model, top_n=5)
compression_retriever_1 = ContextualCompressionRetriever(
    base_compressor=compressor_1, base_retriever=retriever
)

# Invoke the retriever built just above (the bare name `compression_retriever`
# is never defined in this notebook and raised a NameError).
compressed_docs = compression_retriever_1.invoke(question)
pretty_print_docs(compressed_docs)

In [None]:
# Multilingual cross-encoder reranker: keep the 5 best-scored documents among
# those returned by the base retriever.
model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-large")
compressor_2 = CrossEncoderReranker(model=model, top_n=5)
compression_retriever_2 = ContextualCompressionRetriever(
    base_compressor=compressor_2, base_retriever=retriever
)

# Invoke the retriever built just above (the bare name `compression_retriever`
# is never defined in this notebook and raised a NameError).
compressed_docs = compression_retriever_2.invoke(question)
pretty_print_docs(compressed_docs)

In [None]:
from langchain.retrievers import EnsembleRetriever

# Combine the three rerankers with equal weights. The keyword is `weights`;
# the previous spelling `weigths` did not match the EnsembleRetriever field.
ensemble_retriever = EnsembleRetriever(
    retrievers=[compression_retriever_1, compression_retriever_2, colBERT_retriever],
    weights=[1 / 3, 1 / 3, 1 / 3],
)

# Run the ensemble on the current question and display the result.
compressed_docs = ensemble_retriever.invoke(question)
pretty_print_docs(compressed_docs)

In [None]:
from typing import Any, Dict, Sequence

from langchain.schema import Document
from langchain_core.runnables import RunnableLambda


# Define the compression function
def compress_documents_lambda(
    documents: Sequence[Document], query: str, k: int = 5, **kwargs: Any
) -> Sequence[Document]:
    """Re-rank already retrieved documents with BM25 and keep the best ``k``.

    Args:
        documents: Candidate documents, used as the BM25 corpus.
        query: Text the candidates are ranked against.
        k: Number of documents the BM25 retriever is configured to return.
        **kwargs: Extra keyword arguments forwarded to
            ``BM25Retriever.from_documents``.

    Returns:
        The documents selected by BM25 for ``query``; an empty list when
        ``documents`` is empty.
    """
    # A BM25 index cannot be built on an empty corpus (the average document
    # length is undefined), so there is nothing to rank in that case.
    if not documents:
        return []

    # Dedicated local name so the notebook-level `retriever` is not shadowed.
    bm25 = BM25Retriever.from_documents(documents, k=k, **kwargs)
    # `invoke` is the current retriever entry point, consistent with the rest
    # of this notebook; `get_relevant_documents` is deprecated.
    return bm25.invoke(query)


# Define the complete chain: run the base retriever and forward the raw query
# in parallel, then hand both to the BM25 compression function (default k).
bm25_retriever = RunnableParallel(
    {"documents": retriever, "query": RunnablePassthrough()}
) | RunnableLambda(lambda r: compress_documents_lambda(documents=r["documents"], query=r["query"]))

# Try the BM25 chain on the current question.
bm25_retriever.invoke(question)

In [None]:
from langchain.retrievers import ContextualCompressionRetriever, EnsembleRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder  # CrossEncoder

# French cross-encoder reranker keeping the 5 best-scored documents.
model = HuggingFaceCrossEncoder(
    model_name="dangvantuan/CrossEncoder-camembert-large"
)  # "antoinelouis/crossencoder-electra-base-french-mmarcoFR")
compressor_1 = CrossEncoderReranker(model=model, top_n=5)

compression_retriever_cross_encoder = ContextualCompressionRetriever(
    base_compressor=compressor_1, base_retriever=retriever
)

# Blend the cross-encoder reranker with the BM25 chain, equally weighted.
# The keyword is `weights`; the previous spelling `weigths` did not match the
# EnsembleRetriever field. The variable name is left unchanged because a later
# cell refers to it.
emsemble_reranking = EnsembleRetriever(
    retrievers=[compression_retriever_cross_encoder, bm25_retriever], weights=[0.5, 0.5]
)

In [None]:
# Run the cross-encoder + BM25 ensemble on the current question.
emsemble_reranking.invoke(question)

## Test New LLM model 

In [None]:
import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

import os

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit NF4 quantization settings.
# NOTE(review): this object is built but not handed to `from_pretrained` below,
# so the model is loaded in bfloat16 without quantization — confirm the intent.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

# Never commit an access token: the value previously hard-coded here must be
# considered leaked and should be revoked. Read it from the environment; when
# it is absent, None lets the Hugging Face libraries fall back to their own
# stored credentials.
hf_token = os.environ.get("HF_TOKEN")
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True, token=hf_token)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=hf_token,
)

# The task must be given explicitly: it cannot be inferred from an already
# instantiated model object.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]
# Render the chat with the model's template and tokenize it into a tensor
# placed on the model's device.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# Stop on either the regular EOS token or the end-of-turn token.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# Generate from the token ids with the model itself: the text-generation
# pipeline expects text or chat messages, not a tensor, and returns dicts that
# could not be sliced the way the decoding step below requires.
outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
# Drop the prompt tokens so that only the newly generated reply is decoded.
response = outputs[0][input_ids.shape[-1] :]
print(tokenizer.decode(response, skip_special_tokens=True))