In [0]:
pip install ragas

In [0]:
pip install sacrebleu

In [0]:
#test
from ragas import SingleTurnSample
from ragas.metrics import BleuScore

# Build the sample directly from keyword arguments: the prompt, the
# summary being judged, and the reference summary.
test_data = SingleTurnSample(
    user_input="summarise given text\nThe company reported an 8% rise in Q3 2024, driven by strong performance in the Asian market. Sales in this region have significantly contributed to the overall growth. Analysts attribute this success to strategic marketing and product localization. The positive trend in the Asian market is expected to continue into the next quarter.",
    response="The company experienced an 8% increase in Q3 2024, largely due to effective marketing strategies and product adaptation, with expectations of continued growth in the coming quarter.",
    reference="The company reported an 8% growth in Q3 2024, primarily driven by strong sales in the Asian market, attributed to strategic marketing and localized products, with continued growth anticipated in the next quarter.",
)

# Score the sample with the BLEU metric; the score is the cell's output.
metric = BleuScore()
metric.single_turn_score(test_data)

In [0]:
# Second BLEU example: same flow as the previous cell, different text.
test_data = SingleTurnSample(
    user_input="summarise given text\nVirat Kohli achieved a historic milestone during the India vs Australia ODI series, scoring his 50th century. His exceptional performance helped India secure a decisive victory in the final match. Cricket analysts praised his consistency and unmatched batting prowess, calling him one of the greatest players in the modern era. Kohli's form remains crucial for India's success in upcoming tournaments.",
    response="Virat Kohli scored his 50th ODI century during the India-Australia series, aiding India's victory and solidifying his reputation as a modern cricket legend. His performance is key for upcoming tournaments.",
    reference="Virat Kohli reached his 50th century in the India-Australia ODI series, earning widespread acclaim for his consistency and batting skill, which were instrumental in India's win and remain vital for future tournaments.",
)

# Score the sample with the BLEU metric; the score is the cell's output.
metric = BleuScore()
metric.single_turn_score(test_data)

Evaluate with an LLM

In [0]:
%pip install -U langgraph langsmith langchain transformers langchain_community mlflow torch

In [0]:
import mlflow

# Registered model name and version, combined into a
# ``models:/<name>/<version>`` URI for MLflow.
model_name = "system.ai.llama_v3_2_1b_instruct"
model_version = "2"
model_uri = "models:/{}/{}".format(model_name, model_version)

# Load the model through the generic pyfunc interface so later cells can
# call ``loaded_model.predict(...)``.
print(f"Loading model: {model_name}...")
loaded_model = mlflow.pyfunc.load_model(model_uri)
print("Model successfully loaded!")

In [0]:
import mlflow
from langchain.llms.base import LLM
from typing import List, Optional, Any
from ragas.llms import LangchainLLMWrapper

def _extract_chat_content(response: Any) -> Optional[str]:
    """Return the first choice's message content from a chat-style response.

    The expected shape is ``[{"choices": [{"message": {"content": ...}}]}]``.
    A single completion dict (not wrapped in a list) is treated as a
    one-element list. Returns ``None`` when the response does not match
    this shape or the content is empty.
    """
    if isinstance(response, dict):
        response = [response]
    if not isinstance(response, list) or not response:
        return None

    choices = response[0].get("choices") or []
    if not choices:
        return None

    message = choices[0].get("message") or {}
    return message.get("content") or None


class MLflowLangchainLLM(LLM):
    """LangChain ``LLM`` adapter around a loaded MLflow model.

    Each prompt is sent to ``model.predict`` as a single ``user`` chat
    message, and the text of the first returned choice is handed back.
    """

    # Loaded MLflow model; only its ``predict`` method is used here.
    model: Any

    def __init__(self, model):
        super().__init__(model=model)

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        *args,  # Accept additional arguments
        **kwargs  # Capture extra keyword arguments
    ) -> str:
        """Run one prompt through the MLflow model and return its reply text.

        Args:
            prompt: Text sent as the content of a single ``user`` message.
            stop: Stop sequences. They are only logged; they are NOT
                forwarded to the model or applied to the output.
            *args: Extra positional arguments; logged and otherwise ignored.
            **kwargs: Extra keyword arguments; logged and otherwise ignored.

        Returns:
            The ``content`` string of the first choice in the model response.

        Raises:
            MlflowException: If prediction fails or no content can be
                extracted from the response. The original error is chained
                as ``__cause__``.
        """
        # Imported locally so the class does not depend on a separate
        # import line elsewhere in the notebook.
        from mlflow.exceptions import MlflowException

        print(f"[DEBUG] Prompt: {prompt}")
        print(f"[DEBUG] Stop: {stop}")
        print(f"[DEBUG] Additional args: {args}")
        print(f"[DEBUG] Additional kwargs: {kwargs}")

        # Transform the input to match the required schema
        formatted_input = {
            "messages": [
                {
                    "role": "user",       # Role field required by schema
                    "content": prompt     # Prompt as 'content'
                }
            ]
        }

        print(f"[DEBUG] Formatted Input: {formatted_input}")

        # Make prediction using MLflow model
        try:
            response = self.model.predict(formatted_input)
            print(f"[DEBUG] Model Response: {response}")

            content = _extract_chat_content(response)
            if not content:
                # If response format is unexpected
                raise ValueError("Unable to extract content from model response.")
            return content
        except Exception as e:
            print(f"[DEBUG] Error: {e}")
            # Keep the original traceback attached for easier debugging.
            raise MlflowException(f"Failed model prediction: {str(e)}") from e

    @property
    def _identifying_params(self) -> dict:
        """Static parameters identifying this LLM."""
        return {"model_name": "mlflow_loaded_llm"}

    @property
    def _llm_type(self) -> str:
        """Short type tag for this LLM implementation."""
        return "custom_mlflow_llm"

# Wrap the LangChain adapter for use as the LLM in Ragas metrics.
evaluator_llm = LangchainLLMWrapper(
    MLflowLangchainLLM(loaded_model)
)

In [0]:
# Question used for both the raw model call and the adapter call
question = "Who is the inventor of Telephone?"

# Test input formatted for the model
test_input = {
    "messages": [
        {"role": "user", "content": question}
    ]
}

# Test the model prediction directly
response = loaded_model.predict(test_input)
print(f"[DEBUG RAW RESPONSE] {response}")

# Test the MLflowLangchainLLM adapter under its own name. Rebinding
# `evaluator_llm` here would replace the LangchainLLMWrapper instance that
# the metric cells pass as `llm=` with an unwrapped LangChain LLM.
raw_llm = MLflowLangchainLLM(model=loaded_model)
output = raw_llm._call(prompt=question)

# Print the extracted content
print(f"Extracted Content: {output}")

In [0]:
from ragas import SingleTurnSample
from ragas.metrics import AspectCritic

test_data = {
    "user_input": "summarise given text\nThe company reported an 8% rise in Q3 2024, driven by strong performance in the Asian market. Sales in this region have significantly contributed to the overall growth. Analysts attribute this success to strategic marketing and product localization. The positive trend in the Asian market is expected to continue into the next quarter.",
    "response": "The company experienced an 8% increase in Q3 2024, largely due to effective marketing strategies and product adaptation, with expectations of continued growth in the coming quarter.",
}

metric = AspectCritic(name="summary_accuracy",llm=evaluator_llm, definition="Verify if the summary is accurate.")
test_data = SingleTurnSample(**test_data)
await metric.single_turn_ascore(test_data)