# MLflow 09: Custom Metrics and Evaluation for Generative Tasks

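Welcome to Notebook 9! This notebook demonstrates how to evaluate generative AI models with custom MLflow metrics: it defines two summarization metrics (summary length ratio and keyword presence), wraps local Ollama chat models as summarizers, and compares them with `mlflow.evaluate`.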

In [None]:
# Windows path handling workaround
import mlflow
from mlflow.utils.file_utils import local_file_uri_to_path
mlflow.utils.file_uri_to_path = local_file_uri_to_path

!pip install --quiet mlflow langchain langchain_community langchain_core langchain_ollama pydantic tiktoken rouge_score bert_score

In [None]:
import mlflow
import pandas as pd
import numpy as np
import re
import json
from datasets import load_dataset
from langchain_ollama.chat_models import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage
from mlflow.metrics import make_metric, MetricValue

# Configure MLflow: store tracking data under the relative 'mlruns' location
# and make the summarization experiment the active one for subsequent runs.
mlflow.set_tracking_uri(uri='mlruns')
mlflow.set_experiment(experiment_name="LLM_Custom_Metrics_Summarization_Ollama")

In [None]:
# Model setup

# Judge LLM: a ChatOllama client with a low sampling temperature and
# JSON-formatted output.
ollama_judge_model = "qwen3:1.7b"
judge_llm = ChatOllama(
    model=ollama_judge_model,
    temperature=0.1,
    format="json",
)

# Models under evaluation, as {display name: Ollama model tag}.
# Name and tag are identical for every entry.
models_to_evaluate = {
    tag: tag
    for tag in ("gemma3:1b", "deepseek-r1:1.5b")
}

In [None]:
# Custom metrics definition
def summary_length_ratio(predictions, targets, **kwargs):
    """Compare the word count of each prediction with that of its target.

    Args:
        predictions: Iterable of generated summaries; each item is converted
            with ``str()`` before being split on whitespace.
        targets: Iterable of reference summaries, paired with ``predictions``
            by position.
        **kwargs: Accepted and ignored.

    Returns:
        MetricValue whose ``scores`` holds one ``prediction_words /
        target_words`` ratio per pair, and whose ``aggregate_results`` holds
        the ``"mean"`` and ``"std"`` of those ratios.
    """
    def _word_count(text):
        # Whitespace-delimited token count of the stringified value.
        return len(str(text).split())

    per_row = [
        # An empty target counts as one word so the division is always defined.
        _word_count(summary) / (_word_count(reference) or 1)
        for summary, reference in zip(predictions, targets)
    ]
    aggregates = {"mean": np.mean(per_row), "std": np.std(per_row)}
    return MetricValue(scores=per_row, aggregate_results=aggregates)

def keyword_presence(predictions, targets, custom_expected, **kwargs):
    """Score each prediction by the fraction of required keywords it contains.

    Args:
        predictions: Iterable of generated texts. Each item is converted with
            ``str()`` before matching, so non-string values do not raise.
        targets: Unused; present so the signature lines up with the other
            metric function.
        custom_expected: Iterable, aligned by position with ``predictions``,
            of dicts that may carry a ``"required_keywords"`` list.
        **kwargs: Accepted and ignored.

    Returns:
        MetricValue whose ``scores`` holds one value in ``[0, 1]`` per
        prediction (``1.0`` when no keywords are required) and whose
        ``aggregate_results`` holds their ``"mean"``.

    Raises:
        IndexError: If ``custom_expected`` has fewer entries than
            ``predictions``.
    """
    # Materialize to a list so rows are paired by position. Indexing a pandas
    # Series with ``[i]`` is label-based and fails on a non-default index.
    expected_rows = list(custom_expected)

    scores = []
    for i, pred in enumerate(predictions):
        keywords = expected_rows[i].get("required_keywords") or []
        text = str(pred)
        # Lookarounds instead of ``\b``: same whole-word behavior for ordinary
        # keywords, but also correct when a keyword starts or ends with a
        # non-word character (e.g. "C++"), where ``\b`` would never match.
        found = sum(
            1
            for kw in keywords
            if re.search(rf'(?<!\w){re.escape(kw)}(?!\w)', text, re.I)
        )
        scores.append(found / len(keywords) if keywords else 1.0)
    return MetricValue(
        scores=scores,
        aggregate_results={"mean": np.mean(scores)}
    )

# Wrap the metric functions for use as `extra_metrics` in `mlflow.evaluate`.
# `make_metric` accepts keyword arguments only and requires `greater_is_better`,
# so the evaluation function must be passed as `eval_fn=...`.
custom_metrics = [
    make_metric(
        eval_fn=summary_length_ratio,
        # Required flag. The ratio has no strictly better direction (a value
        # near 1.0 means the summary is about as long as the target), so this
        # setting is nominal.
        greater_is_better=False,
        name="summary_length_ratio",
    ),
    make_metric(
        eval_fn=keyword_presence,
        # A higher fraction of required keywords found is better.
        greater_is_better=True,
        name="keyword_presence",
    ),
]

In [None]:
# Model wrapper class
class OllamaSummarizer:
    """Thin summarization wrapper around a ChatOllama chat model.

    Each input text is sent to the model as the prompt ``"Summarize: <text>"``
    and the ``content`` of the reply is used as the summary.
    """

    def __init__(self, model_name):
        """Create the ChatOllama client for ``model_name``."""
        self.llm = ChatOllama(model=model_name)

    def predict(self, data):
        """Summarize every input text, one model call per text.

        Args:
            data: One of
                * a ``pandas.DataFrame`` with an ``'inputs'`` column,
                * a single string (treated as one text), or
                * any other iterable of texts (list, ``pandas.Series``, ...).

        Returns:
            ``pandas.Series`` of summaries with a default integer index, in
            the same order as the inputs.

        Raises:
            KeyError: If ``data`` is a DataFrame without an ``'inputs'`` column.
            TypeError: If ``data`` is neither a DataFrame, a string, nor iterable.
        """
        if isinstance(data, pd.DataFrame):
            inputs = data['inputs'].tolist()
        elif isinstance(data, str):
            # A bare string is a single text, not an iterable of characters.
            inputs = [data]
        else:
            # Previously `inputs` was left unbound here, so any non-DataFrame
            # input failed with UnboundLocalError.
            inputs = list(data)
        return pd.Series([self.llm.invoke(f"Summarize: {text}").content for text in inputs])

In [None]:
# Evaluation execution
# Placeholder evaluation set, one record per example. Replace with actual data.
#   inputs          - text to summarize
#   targets         - reference summary
#   custom_expected - per-row dict read by the keyword_presence metric
eval_data = pd.DataFrame(
    [
        {
            "inputs": "Sample text 1",
            "targets": "Sample summary 1",
            "custom_expected": {"required_keywords": ["key1"]},
        },
        {
            "inputs": "Sample text 2",
            "targets": "Sample summary 2",
            "custom_expected": {"required_keywords": ["key2"]},
        },
    ],
    columns=["inputs", "targets", "custom_expected"],
)

# Evaluate every configured model in its own MLflow run and print its metrics.
for model_name, ollama_tag in models_to_evaluate.items():
    with mlflow.start_run():
        # Record which model this run belongs to.
        mlflow.log_param("model", model_name)

        # Create and evaluate model
        model = OllamaSummarizer(ollama_tag)

        # Identity column mapping for the columns used by the model and the
        # custom metrics.
        column_mapping = {
            "inputs": "inputs",
            "custom_expected": "custom_expected",
        }
        results = mlflow.evaluate(
            model=model.predict,
            data=eval_data,
            targets="targets",
            evaluator_config={"col_mapping": column_mapping},
            extra_metrics=custom_metrics,
        )

        print(f"Results for {model_name}:", results.metrics, sep="\n")