In [25]:
import os
from dotenv import load_dotenv
from src.utils.utils_vllm import get_model_from_env

load_dotenv(override=True)

URL_EMBEDDING = os.getenv('URL_EMBEDDING_MODEL').rstrip('/')
URL_GENERATION = os.getenv('URL_GENERATIVE_MODEL')
MODEL_COMPLETION = get_model_from_env('URL_GENERATIVE_MODEL')
MODEL_EMBEDDING = get_model_from_env('URL_EMBEDDING_MODEL')

In [26]:
from openai import OpenAI

client = OpenAI(
    base_url=URL_GENERATION,
    api_key="EMPTY",
)

completion = client.chat.completions.create(
    model=MODEL_COMPLETION, messages=[{"role": "user", "content": "Hello!"}]
)

print(completion.choices[0].message)

ChatCompletionMessage(content="Hello! How can I assist you today? If you're up for it, let's share a interesting fact to start. Did you know that a day on Venus is longer than a year on Venus? It takes Venus about 243 Earth days to rotate once on its axis, but it only takes around 225 Earth days for Venus to orbit the Sun. Isn't that amazing? Now, what would you like to talk about or ask me?", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[])


In [None]:
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = URL_EMBEDDING

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

responses = client.embeddings.create(
    input=["Hello my name is", "The best thing about vLLM is that it supports many different models"],
    model=model,
)

for data in responses.data:
    print(data.embedding)  # List of float of len 4096

[0.0013580322265625, -0.003936767578125, -0.000370025634765625, 0.00130462646484375, 0.003082275390625, -0.004974365234375, -0.0004520416259765625, -0.01031494140625, 0.00185394287109375, 0.006622314453125, 0.0035247802734375, 0.000213623046875, 0.003814697265625, -0.00439453125, -0.001678466796875, -0.0072021484375, 0.00360107421875, -0.0048828125, 0.004547119140625, -0.004913330078125, 0.01190185546875, 0.0018157958984375, -0.004302978515625, -0.0013275146484375, 0.0012054443359375, 0.00592041015625, -0.0037841796875, 0.0019989013671875, -0.0002841949462890625, -0.00408935546875, -0.0025177001953125, 0.0118408203125, -0.004486083984375, 0.004302978515625, -0.0004749298095703125, -0.0103759765625, 0.0036468505859375, -0.01300048828125, -0.00074005126953125, -0.0020751953125, -0.00159454345703125, -0.0027008056640625, -0.00494384765625, -0.000850677490234375, -0.1357421875, 0.000316619873046875, -0.00081634521484375, 0.00077056884765625, -0.0004825592041015625, -0.0023345947265625, -0.

In [30]:
from litellm import embedding

response = embedding(
    model=f"hosted_vllm/{MODEL_EMBEDDING}",
    api_base="https://user-lgaliana-vllm.user.lab.sspcloud.fr/v1",
    input=["good morning from litellm"],
)

response.data[0]["embedding"][:3]

[0.0113525390625, -0.005340576171875, -0.0126953125]

In [31]:
from litellm import completion

response = completion(
    model=f"hosted_vllm/{MODEL_COMPLETION}",
    api_base=URL_GENERATION,
    messages=[{"role": "user", "content": "what llm are you"}],
)

response.choices[0]["message"]["content"]

"I am a Large Language Model (LLM) developed by Mistral AI. I'm designed to understand and generate human-like text based on the input I receive. I can provide information, answer questions, explain concepts, and even engage in creative writing, among other tasks. My capabilities are based on the data I've been trained on, which includes a wide range of texts from the internet up until 2023.\n\nHere are a few things I can do:\n\n1. **Answer Questions**: I can provide information based on the data I've been trained on.\n2. **Explain Concepts**: I can help explain complex ideas in a simple way.\n3. **Engage in Dialogue**: I can participate in conversations on a wide range of topics.\n4. **Creative Writing**: I can generate creative content like stories, poems, etc.\n\nHowever, please keep in mind that I don't have real-time information, personal experiences, or feelings, and my knowledge cutoff is 2023. Also, I strive to generate helpful, honest, and harmless responses, but I don't have 

In [32]:
import litellm
from litellm import CustomLLM, completion

class MyCustomLLM(CustomLLM):
    def completion(self, *args, **kwargs) -> litellm.ModelResponse:
        return litellm.completion(
            model=f"hosted_vllm/{MODEL_COMPLETION}",
            api_base= URL_GENERATION,
            messages=[{"role": "user", "content": "Hello world"}],
        )  # type: ignore

    async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse:
        return litellm.completion(
            model=f"hosted_vllm/{MODEL_COMPLETION}",
            api_base= URL_GENERATION,
            messages=[{"role": "user", "content": "Hello world"}],
        )  # type: ignore

    def embedding(self, *args, **kwargs) -> litellm.ModelResponse:
        return litellm.embedding(
            model=f"hosted_vllm/{MODEL_EMBEDDING}",
            api_base= URL_EMBEDDING,
            input=["good morning from litellm"],
        )  # type: ignore

    async def aembedding(self, *args, **kwargs) -> litellm.ModelResponse:
        return litellm.aembedding(
            model=f"hosted_vllm/{MODEL_EMBEDDING}",
            api_base= URL_EMBEDDING,
            # set API Base of your Custom OpenAI Endpoint
            input=["good morning from litellm"],
        )  # type: ignore


my_custom_llm = MyCustomLLM()

litellm.custom_provider_map = [  # 👈 KEY STEP - REGISTER HANDLER
    {"provider": "my-custom-llm", "custom_handler": my_custom_llm}
]

resp = completion(
    model="my-custom-llm/my-fake-model",
    messages=[{"role": "user", "content": "fais le chien"}],
)

#emb = embedding(
#    model="my-custom-llm/my-fake-model",
#    input=["good morning from litellm"],
#)

resp.choices[0].message.content

"Hello! How can I assist you today? If you're up for it, I can tell a joke to lighten the mood:\n\nWhat do you call fake spaghetti?\n\nAn impasta!"

In [33]:
import giskard
giskard.llm.set_llm_model(f"hosted_vllm/{MODEL_COMPLETION}", api_base = URL_GENERATION)
giskard.llm.set_embedding_model(f"hosted_vllm/{MODEL_EMBEDDING}", api_base = URL_EMBEDDING)

In [34]:
import pandas as pd
import s3fs

s3_path = "s3://projet-llm-insee-open-data/data/raw_data/applishare_solr_joined.parquet"

filesystem = s3fs.S3FileSystem(endpoint_url="https://minio.lab.sspcloud.fr")
df = pd.read_parquet(s3_path, engine="pyarrow", filesystem=filesystem)

In [None]:
from src.db_building.document_chunker import parse_transform_documents
documents = parse_transform_documents(data=df.sample(5), max_document_size=None, engine_output="langchain")

[32m2025-03-15 16:25:04.331[0m | [1mINFO    [0m | [36msrc.db_building.utils_db[0m:[36mparse_xmls[0m:[36m169[0m - [1mParsing XML from page 3356461 -- 24609/5 (492180.00%)[0m
[32m2025-03-15 16:25:04.335[0m | [1mINFO    [0m | [36msrc.db_building.utils_db[0m:[36mparse_xmls[0m:[36m169[0m - [1mParsing XML from page 6525240 -- 38234/5 (764680.00%)[0m
[32m2025-03-15 16:25:04.338[0m | [1mINFO    [0m | [36msrc.db_building.utils_db[0m:[36mparse_xmls[0m:[36m169[0m - [1mParsing XML from page 4996825 -- 33252/5 (665040.00%)[0m
[32m2025-03-15 16:25:04.342[0m | [1mINFO    [0m | [36msrc.db_building.utils_db[0m:[36mparse_xmls[0m:[36m169[0m - [1mParsing XML from page 1293164 -- 5148/5 (102960.00%)[0m
[32m2025-03-15 16:25:04.344[0m | [1mINFO    [0m | [36msrc.db_building.utils_db[0m:[36mparse_xmls[0m:[36m169[0m - [1mParsing XML from page 1908449 -- 14294/5 (285880.00%)[0m


In [None]:
# convert documents preprocessed to pandas
documents_to_pandas = []
for doc in documents:
    row = {"page_content": doc.page_content}
    row.update(doc.metadata)
    documents_to_pandas.append(row)

result = pd.DataFrame(documents_to_pandas)

In [49]:
from giskard.rag import generate_testset, KnowledgeBase
knowledge_base = KnowledgeBase.from_pandas(result, columns=["titre", "url", "theme", "page_content"])

In [50]:
# Generate a testset with 10 questions & answers for each question types (this will take a while)
testset = generate_testset(
    knowledge_base,
    num_questions=10,
    language='fr',  # optional, we'll auto detect if not provided
    agent_description="Un assistant à la recherche d'information sur le site insee.fr", # helps generating better questions
)

2025-03-15 16:29:10,131 pid:555096 MainThread giskard.rag  INFO     Finding topics in the knowledge base.


  warn(


2025-03-15 16:29:23,907 pid:555096 MainThread giskard.rag  INFO     Found 1 topics in the knowledge base.


Generating questions: 100%|██████████| 10/10 [00:19<00:00,  1.94s/it]


In [59]:
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

def _embedding_client_api():
    emb_model = OpenAIEmbeddings(
        model=MODEL_EMBEDDING,
        base_url=URL_EMBEDDING,
        api_key="EMPTY",
    )

    return emb_model

emb_model = _embedding_client_api()
client = QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"), port="443", https="true")

vectorstore = QdrantVectorStore(
        client=client,
        collection_name="web4g_mistrall_small_023521a406e44fef8840788aecb8d13b",
        embedding=emb_model,
        vector_name=MODEL_EMBEDDING,
)

retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [67]:
# Create the RAG chain
from langchain.chains import RetrievalQA

llm = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base
)

rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

ValidationError: 2 validation errors for LLMChain
llm.is-instance[Runnable]
  Input should be an instance of Runnable [type=is_instance_of, input_value=<openai.OpenAI object at 0x7f8615a83a10>, input_type=OpenAI]
    For further information visit https://errors.pydantic.dev/2.10/v/is_instance_of
llm.is-instance[Runnable]
  Input should be an instance of Runnable [type=is_instance_of, input_value=<openai.OpenAI object at 0x7f8615a83a10>, input_type=OpenAI]
    For further information visit https://errors.pydantic.dev/2.10/v/is_instance_of

In [None]:
def answer_fn(question, history=None):
    if history:
        answer = chat_engine.chat(
            question,
            chat_history=[
                ChatMessage(
                    role=MessageRole.USER if msg["role"] == "user" else MessageRole.ASSISTANT,
                    content=msg["content"]
                ) for msg in history
            ]
        )
    else:
        answer = chat_engine.chat(question, chat_history=[])

    return AgentAnswer(
        message=answer.response,
        documents=[source.content for source in answer.sources]
    )

report = evaluate(answer_fn,
                testset=testset,
                knowledge_base=knowledge_base,
                metrics=[ragas_context_recall, ragas_context_precision])

In [None]:
from giskard.rag import evaluate, RAGReport, AgentAnswer
from giskard.rag.metrics.ragas_metrics import ragas_context_recall, ragas_context_precision


# Wrap your RAG model
def get_answer_fn(question: str, history=None) -> str:
    """A function representing your RAG agent."""
    # Format appropriately the history for your RAG agent
    messages = history if history else []
    messages.append({"role": "user", "content": question})

    # Get the answer
    answer = retriever.invoke(question)  # could be langchain, llama_index, etc.

    return answer


# Run the evaluation and get a report
report = evaluate(get_answer_fn, testset=testset, knowledge_base=knowledge_base)

Asking questions to the agent:   0%|          | 0/10 [00:00<?, ?it/s]


ValueError: The answer function must return a string or an AgentAnswer object. Got <class 'list'> instead.

In [7]:
import os
from langchain_community.llms import VLLMOpenAI
from langchain_openai import OpenAIEmbeddings

os.environ["OPENAI_API_KEY"] = "EMPTY"

llm_completion = VLLMOpenAI(
    openai_api_key="EMPTY",
    openai_api_base=URL_GENERATION,
    model_name=MODEL_COMPLETION,
    model_kwargs={"stop": ["."]},
)

emb_model = OpenAIEmbeddings(
    model=MODEL_EMBEDDING, openai_api_base=URL_EMBEDDING, openai_api_key="EMPTY"
)

print(emb_model.embed_query("A sentence to encode.")[:5])
print(llm_completion.invoke("Rome is"))

[0.0061492919921875, 0.026947021484375, 0.017791748046875, -0.004749298095703125, -0.001697540283203125]
 the capital of Italy


In [9]:
from langchain_openai import OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper

generator_llm = LangchainLLMWrapper(llm_completion)
generator_embeddings = LangchainEmbeddingsWrapper(emb_model)

In [10]:
from langchain_community.document_loaders import DataFrameLoader

loader = DataFrameLoader(df.head(2), page_content_column="xml_content")
docs = loader.load()

In [None]:
docs

In [12]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs[:3], testset_size=1)

Applying HeadlinesExtractor:   0%|          | 0/2 [00:00<?, ?it/s]

2025-03-11 17:27:26,561 pid:69042 MainThread ragas.testset.transforms.engine ERROR    unable to apply transformation: Error code: 400 - {'object': 'error', 'message': "This model's maximum context length is 8200 tokens. However, you requested 22293 tokens (22037 in the messages, 256 in the completion). Please reduce the length of the messages or completion.", 'type': 'BadRequestError', 'param': None, 'code': 400}


Applying HeadlinesExtractor:  50%|█████     | 1/2 [01:37<01:37, 97.75s/it]

2025-03-11 17:28:14,838 pid:69042 MainThread ragas.testset.transforms.engine ERROR    unable to apply transformation: Error code: 400 - {'object': 'error', 'message': "This model's maximum context length is 8200 tokens. However, you requested 11139 tokens (10883 in the messages, 256 in the completion). Please reduce the length of the messages or completion.", 'type': 'BadRequestError', 'param': None, 'code': 400}


Applying HeadlineSplitter:   0%|          | 0/2 [00:00<?, ?it/s]          

2025-03-11 17:28:14,872 pid:69042 MainThread ragas.testset.transforms.engine ERROR    unable to apply transformation: 'headlines' property not found in this node
2025-03-11 17:28:14,876 pid:69042 MainThread ragas.testset.transforms.engine ERROR    unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/2 [00:00<?, ?it/s]

2025-03-11 17:29:51,197 pid:69042 MainThread ragas.testset.transforms.engine ERROR    unable to apply transformation: Error code: 400 - {'object': 'error', 'message': "This model's maximum context length is 8200 tokens. However, you requested 22189 tokens (21933 in the messages, 256 in the completion). Please reduce the length of the messages or completion.", 'type': 'BadRequestError', 'param': None, 'code': 400}


Applying SummaryExtractor:  50%|█████     | 1/2 [01:36<01:36, 96.24s/it]

2025-03-11 17:30:36,135 pid:69042 MainThread ragas.testset.transforms.engine ERROR    unable to apply transformation: Error code: 400 - {'object': 'error', 'message': "This model's maximum context length is 8200 tokens. However, you requested 11035 tokens (10779 in the messages, 256 in the completion). Please reduce the length of the messages or completion.", 'type': 'BadRequestError', 'param': None, 'code': 400}


Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/2 [00:00<?, ?it/s]

2025-03-11 17:30:36,242 pid:69042 MainThread ragas.testset.transforms.engine ERROR    unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
2025-03-11 17:30:36,243 pid:69042 MainThread ragas.testset.transforms.engine ERROR    unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'


Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]     

2025-03-11 17:30:36,309 pid:69042 MainThread ragas.testset.transforms.engine ERROR    unable to apply transformation: Node a2cc9315-69ec-4e42-b36b-24835d041994 has no summary_embedding


                                                                                              

ValueError: No nodes that satisfied the given filer. Try changing the filter.