Add to the LLM as a judge example a section which shows using judge with… #981

Merged · 5 commits · Jul 2, 2024
7 changes: 4 additions & 3 deletions docs/docs/examples.rst
@@ -49,8 +49,8 @@ Each example is a self contained python file that you can run and later modify.
- Link to code
- Related documentation
* - Evaluate an existing question-answering dataset from the Unitxt catalog
- Demonstrates how to evaluate an existing QA dataset (squad) using Huggingface
datasets and evaluate APIs, with no installation required. By using predefined LLM as a judge metric.
- Demonstrates how to evaluate an existing QA dataset (squad) using the Huggingface
datasets and evaluate APIs, leveraging a predefined LLM as a judge metric.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluate_dataset_by_llm_as_judge_no_install.py>`_
- | :ref:`Evaluating datasets <evaluating_datasets>`.
| :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
@@ -59,7 +59,8 @@ Each example is a self contained python file that you can run and later modify.
- `code <https://github.com/IBM/unitxt/blob/main/examples/standalone_evaluation_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
* - Evaluate an existing summarization dataset from the catalog with LLM as judge
- Demonstrates how to evaluate a document summarization dataset by define an LLM as a judge metric, specify the template it uses to produce the input to the judge, and select the judge model and platform.
- Demonstrates how to evaluate a document summarization dataset by defining an LLM as a judge metric, specifying the template it uses to produce the input to the judge, and selecting the judge model and platform.
The example adds two LLM judges: one that uses the ground truth (references) from the dataset and one that does not.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluation_summarization_dataset_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.

67 changes: 67 additions & 0 deletions examples/evaluation_summarization_dataset_llm_as_judge.py
@@ -78,3 +78,70 @@
"score",
],
)


logger.info(
    "Now we will repeat the example, except this time we will use the reference for the judgment."
)

judge_summary_rating_with_reference_template = InputOutputTemplate(
    instruction="Please act as an impartial judge and evaluate whether the assistant's summary summarizes the given text well.\n"
    "You will be given a reference summary and the assistant's summary."
    " Begin your evaluation by comparing the assistant's summary with the reference summary."
    " Identify and correct any mistakes.\n"
    'You must respond according to the following format: "[[rate]] - explanation".\n'
    "Where the rate is a score between 0 and 10 (10 for a great summary, 0 for a very poor one).\n"
    "The explanation should briefly describe why you decided to give the rating you chose.\n"
    "Please make sure to start with your rating ([[rate]]) before anything else.\n"
    "For example: [[9]] The summary captures the main ideas of the text.\n\n",
    input_format="[Text]\n{question}\n\n"
    "[The Start of Reference Summary]\n{reference_answer}\n[The End of Reference Summary]\n\n"
    "[The Start of Assistant's Summary]\n{answer}\n[The End of Assistant's Summary]",
    output_format="[[{rating}]]",
    postprocessors=[
        r"processors.extract_mt_bench_rating_judgment",
    ],
)

llm_judge_with_summary_metric = LLMAsJudge(
    inference_model=inference_model,
    template=judge_summary_rating_with_reference_template,
    task="rating.single_turn_with_reference",
    main_score=f"llm_judge_{model_name.split('/')[1].replace('-', '_')}_{platform}",
    single_reference_per_prediction=True,
    strip_system_prompt_and_format_from_inputs=False,
)

# Load the XSUM dataset with the above metric.
dataset = load_dataset(
    card="cards.xsum",
    template="templates.summarization.abstractive.formal",
    metrics=[llm_judge_with_summary_metric],
    loader_limit=20,
)

test_dataset = dataset["test"]

# Run inference with a model to get predictions.
model_name = "google/flan-t5-base"
inference_model = HFPipelineBasedInferenceEngine(
    model_name=model_name, max_new_tokens=32
)
predictions = inference_model.infer(test_dataset)

# Evaluate the predictions using the defined metric.
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print results
for instance in evaluated_dataset:
    print_dict(
        instance,
        keys_to_print=[
            "source",
            "prediction",
            "processed_prediction",
            "references",
            "score",
        ],
    )
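
As a minimal follow-up sketch (not taken from the diff), the scores attached by evaluate() could be read back as shown below; the key layout is assumed from unitxt's usual evaluation output, where each instance carries a "score" dict with "global" and "instance" entries:

# Hedged sketch: inspect the aggregate (global) and per-instance judge scores.
# The key names below are assumed from unitxt's evaluation output conventions.
global_scores = evaluated_dataset[0]["score"]["global"]
print("main score:", global_scores["score"], "name:", global_scores["score_name"])

for instance in evaluated_dataset[:3]:
    instance_scores = instance["score"]["instance"]
    print(instance_scores["score_name"], "=", instance_scores["score"])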
@@ -8,7 +8,6 @@
model = "meta-llama/llama-3-70b-instruct"
format = "formats.llama3_instruct"
template = "templates.response_assessment.rating.generic_single_turn"
task = "rating.single_turn"

gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=252)
inference_model = IbmGenAiInferenceEngine(model_name=model, parameters=gen_params)
@@ -19,7 +18,7 @@
metric = LLMAsJudge(
inference_model=inference_model,
template=template,
task=task,
task="rating.single_turn",
format=format,
main_score=metric_label,
)
@@ -29,3 +28,22 @@
f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}",
overwrite=True,
)


template = "templates.response_assessment.rating.generic_single_turn_with_reference"
template_label = template.split(".")[-1]
metric_label = f"{model_label}_template_{template_label}"
metric = LLMAsJudge(
    inference_model=inference_model,
    template=template,
    task="rating.single_turn_with_reference",
    format=format,
    single_reference_per_prediction=True,
    main_score=metric_label,
)

add_to_catalog(
    metric,
    f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}",
    overwrite=True,
)
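
A minimal usage sketch (not part of the diff): once registered, the reference-based judge could be pulled in by its catalog name. The card and template below are borrowed from the summarization example above, and the metric identifier mirrors the catalog JSON added further down; passing metrics as catalog name strings is assumed to work as in the unitxt docs:

from unitxt import load_dataset

# Reference the judge metric by catalog name instead of constructing LLMAsJudge inline.
dataset = load_dataset(
    card="cards.xsum",
    template="templates.summarization.abstractive.formal",
    metrics=[
        "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference"
    ],
    loader_limit=20,
)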
@@ -0,0 +1,24 @@
from unitxt import add_to_catalog
from unitxt.templates import InputOutputTemplate

add_to_catalog(
    InputOutputTemplate(
        instruction="Please act as an impartial judge and evaluate the quality of the response provided"
        " by an AI assistant to the user input displayed below. Your evaluation should consider"
        " factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of"
        " detail of the response. You will be given a reference answer and the assistant's answer."
        " Begin your evaluation by comparing the assistant's answer with the reference answer."
        " Identify and correct any mistakes. Be as objective as possible. After providing your explanation,"
        " you must rate the response on a scale of 1 to 10 by strictly following this format:"
        ' "[[rating]]", for example: "Rating: [[5]]".\n\n',
        input_format="[User input]\n{question}\n\n"
        "[The Start of Reference Answer]\n{reference_answer}\n[The End of Reference Answer]\n\n"
        "[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]",
        output_format="[[{rating}]]",
        postprocessors=[
            r"processors.extract_mt_bench_rating_judgment",
        ],
    ),
    "templates.response_assessment.rating.generic_single_turn_with_reference",
    overwrite=True,
)
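
For illustration only (not produced by this code), filling the template's input_format with Python's str.format approximates the text the judge receives for one instance; the question and answers below are made up, and the real rendering is done by unitxt's InputOutputTemplate:

# Hypothetical sample values purely for illustration.
input_format = (
    "[User input]\n{question}\n\n"
    "[The Start of Reference Answer]\n{reference_answer}\n[The End of Reference Answer]\n\n"
    "[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]"
)
print(
    input_format.format(
        question="What is the capital of France?",
        reference_answer="Paris.",
        answer="The capital of France is Paris.",
    )
)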
@@ -0,0 +1,16 @@
{
    "__type__": "llm_as_judge",
    "inference_model": {
        "__type__": "ibm_gen_ai_inference_engine",
        "model_name": "meta-llama/llama-3-70b-instruct",
        "parameters": {
            "__type__": "ibm_gen_ai_inference_engine_params",
            "max_new_tokens": 252
        }
    },
    "template": "templates.response_assessment.rating.generic_single_turn_with_reference",
    "task": "rating.single_turn_with_reference",
    "format": "formats.llama3_instruct",
    "single_reference_per_prediction": true,
    "main_score": "llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference"
}
@@ -0,0 +1,9 @@
{
    "__type__": "input_output_template",
    "instruction": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user input displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n",
    "input_format": "[User input]\n{question}\n\n[The Start of Reference Answer]\n{reference_answer}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]",
    "output_format": "[[{rating}]]",
    "postprocessors": [
        "processors.extract_mt_bench_rating_judgment"
    ]
}
2 changes: 1 addition & 1 deletion src/unitxt/llm_as_judge.py
@@ -71,7 +71,7 @@ def _get_instance_for_judge_model(
{
"question": input_instance,
"answer": prediction,
"reference_answer": reference,
"reference_answer": reference[0],
"rating": 5.0, # This is a dummy value that is not used in practice
}
for input_instance, prediction, reference in zip(