Add to the LLM as a judge example a section which shows using judge with… #981

Merged · 5 commits · Jul 2, 2024
7 changes: 4 additions & 3 deletions docs/docs/examples.rst
@@ -49,8 +49,8 @@ Each example is a self contained python file that you can run and later modify.
- Link to code
- Related documentation
* - Evaluate an existing question-answering dataset from the Unitxt catalog
- Demonstrates how to evaluate an existing QA dataset (squad) using Huggingface
datasets and evaluate APIs, with no installation required. By using predefined LLM as a judge metric.
- Demonstrates how to evaluate an existing QA dataset (squad) using the Huggingface
datasets and evaluate APIs, leveraging a predefined LLM as a judge metric.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluate_dataset_by_llm_as_judge_no_install.py>`_
- | :ref:`Evaluating datasets <evaluating_datasets>`.
| :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
@@ -59,7 +59,8 @@ Each example is a self contained python file that you can run and later modify.
- `code <https://github.com/IBM/unitxt/blob/main/examples/standalone_evaluation_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
* - Evaluate an existing summarization dataset from the catalog with LLM as judge
- Demonstrates how to evaluate a document summarization dataset by define an LLM as a judge metric, specify the template it uses to produce the input to the judge, and select the judge model and platform.
- Demonstrates how to evaluate a document summarization dataset by defining an LLM as a judge metric, specifying the template it uses to produce the input to the judge, and selecting the judge model and platform.
The example adds two LLM judges: one that uses the ground truth (references) from the dataset and one that does not.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluation_summarization_dataset_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.

67 changes: 67 additions & 0 deletions examples/evaluation_summarization_dataset_llm_as_judge.py
@@ -78,3 +78,70 @@
"score",
],
)


logger.info(
    "Now we will repeat the example, except this time we will use the reference for the judgment."
)

judge_summary_rating_with_reference_template = InputOutputTemplate(
    instruction="Please act as an impartial judge and evaluate whether the assistant's summary summarizes the given text well.\n"
    "You will be given a reference summary and the assistant's summary."
    " Begin your evaluation by comparing the assistant's summary with the reference summary."
    " Identify and correct any mistakes.\n"
    'You must respond according to the following format: "[[rate]] - explanation".\n'
    "Where the rate is a score between 0 and 10 (10 for a great summary, 0 for a very poor one).\n"
    "The explanation should briefly describe why you decided to give the rating you chose.\n"
    "Please make sure to start with your rating ([[rate]]) before anything else.\n"
    "For example: [[9]] The summary captures the main ideas of the text.\n\n",
    input_format="[Text]\n{question}\n\n"
    "[The Start of Reference Summary]\n{reference_answer}\n[The End of Reference Summary]\n\n"
    "[The Start of Assistant's Summary]\n{answer}\n[The End of Assistant's Summary]",
    output_format="[[{rating}]]",
    postprocessors=[
        r"processors.extract_mt_bench_rating_judgment",
    ],
)

llm_judge_with_summary_metric = LLMAsJudge(
    inference_model=inference_model,
    template=judge_summary_rating_with_reference_template,
    task="rating.single_turn_with_reference",
    main_score=f"llm_judge_{model_name.split('/')[1].replace('-', '_')}_{platform}",
    single_reference_per_prediction=True,
    strip_system_prompt_and_format_from_inputs=False,
)

# Load the XSUM dataset with the above metric.
dataset = load_dataset(
    card="cards.xsum",
    template="templates.summarization.abstractive.formal",
    metrics=[llm_judge_with_summary_metric],
    loader_limit=20,
)

test_dataset = dataset["test"]

# Run inference with a model to get predictions.
model_name = "google/flan-t5-base"
inference_model = HFPipelineBasedInferenceEngine(
    model_name=model_name, max_new_tokens=32
)
predictions = inference_model.infer(test_dataset)

# Evaluate the predictions using the defined metric.
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print results
for instance in evaluated_dataset:
    print_dict(
        instance,
        keys_to_print=[
            "source",
            "prediction",
            "processed_prediction",
            "references",
            "score",
        ],
    )
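
As a minimal follow-up sketch (not taken from the diff), the scores attached by evaluate() could be read back as shown below; the key layout is assumed from unitxt's usual evaluation output, where each instance carries a "score" dict with "global" and "instance" entries:

# Hedged sketch: inspect the aggregate (global) and per-instance judge scores.
# The key names below are assumed from unitxt's evaluation output conventions.
global_scores = evaluated_dataset[0]["score"]["global"]
print("main score:", global_scores["score"], "name:", global_scores["score_name"])

for instance in evaluated_dataset[:3]:
    instance_scores = instance["score"]["instance"]
    print(instance_scores["score_name"], "=", instance_scores["score"])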
@@ -8,7 +8,6 @@
model = "meta-llama/llama-3-70b-instruct"
format = "formats.llama3_instruct"
template = "templates.response_assessment.rating.generic_single_turn"
task = "rating.single_turn"

gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=252)
inference_model = IbmGenAiInferenceEngine(model_name=model, parameters=gen_params)
@@ -19,7 +18,7 @@
metric = LLMAsJudge(
inference_model=inference_model,
template=template,
task=task,
task="rating.single_turn",
format=format,
main_score=metric_label,
)
@@ -29,3 +28,22 @@
f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}",
overwrite=True,
)


template = "templates.response_assessment.rating.generic_single_turn_with_reference"
template_label = template.split(".")[-1]
metric_label = f"{model_label}_template_{template_label}"
metric = LLMAsJudge(
    inference_model=inference_model,
    template=template,
    task="rating.single_turn_with_reference",
    format=format,
    single_reference_per_prediction=True,
    main_score=metric_label,
)

add_to_catalog(
    metric,
    f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}",
    overwrite=True,
)
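
A minimal usage sketch (not part of the diff): once registered, the reference-based judge could be pulled in by its catalog name. The card and template below are borrowed from the summarization example above, and the metric identifier mirrors the catalog JSON added further down; passing metrics as catalog name strings is assumed to work as in the unitxt docs:

from unitxt import load_dataset

# Reference the judge metric by catalog name instead of constructing LLMAsJudge inline.
dataset = load_dataset(
    card="cards.xsum",
    template="templates.summarization.abstractive.formal",
    metrics=[
        "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference"
    ],
    loader_limit=20,
)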
@@ -0,0 +1,24 @@
from unitxt import add_to_catalog
from unitxt.templates import InputOutputTemplate

add_to_catalog(
    InputOutputTemplate(
        instruction="Please act as an impartial judge and evaluate the quality of the response provided"
        " by an AI assistant to the user input displayed below. Your evaluation should consider"
        " factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of"
        " detail of the response. You will be given a reference answer and the assistant's answer."
        " Begin your evaluation by comparing the assistant's answer with the reference answer."
        " Identify and correct any mistakes. Be as objective as possible. After providing your explanation,"
        " you must rate the response on a scale of 1 to 10 by strictly following this format:"
        ' "[[rating]]", for example: "Rating: [[5]]".\n\n',
        input_format="[User input]\n{question}\n\n"
        "[The Start of Reference Answer]\n{reference_answer}\n[The End of Reference Answer]\n\n"
        "[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]",
        output_format="[[{rating}]]",
        postprocessors=[
            r"processors.extract_mt_bench_rating_judgment",
        ],
    ),
    "templates.response_assessment.rating.generic_single_turn_with_reference",
    overwrite=True,
)
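
For illustration only (not produced by this code), filling the template's input_format with Python's str.format approximates the text the judge receives for one instance; the question and answers below are made up, and the real rendering is done by unitxt's InputOutputTemplate:

# Hypothetical sample values purely for illustration.
input_format = (
    "[User input]\n{question}\n\n"
    "[The Start of Reference Answer]\n{reference_answer}\n[The End of Reference Answer]\n\n"
    "[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]"
)
print(
    input_format.format(
        question="What is the capital of France?",
        reference_answer="Paris.",
        answer="The capital of France is Paris.",
    )
)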
@@ -0,0 +1,16 @@
{
    "__type__": "llm_as_judge",
    "inference_model": {
        "__type__": "ibm_gen_ai_inference_engine",
        "model_name": "meta-llama/llama-3-70b-instruct",
        "parameters": {
            "__type__": "ibm_gen_ai_inference_engine_params",
            "max_new_tokens": 252
        }
    },
    "template": "templates.response_assessment.rating.generic_single_turn_with_reference",
    "task": "rating.single_turn_with_reference",
    "format": "formats.llama3_instruct",
    "single_reference_per_prediction": true,
    "main_score": "llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference"
}
@@ -0,0 +1,9 @@
{
    "__type__": "input_output_template",
    "instruction": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user input displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n",
    "input_format": "[User input]\n{question}\n\n[The Start of Reference Answer]\n{reference_answer}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]",
    "output_format": "[[{rating}]]",
    "postprocessors": [
        "processors.extract_mt_bench_rating_judgment"
    ]
}
2 changes: 1 addition & 1 deletion src/unitxt/llm_as_judge.py
@@ -71,7 +71,7 @@ def _get_instance_for_judge_model(
{
"question": input_instance,
"answer": prediction,
"reference_answer": reference,
"reference_answer": reference[0],
"rating": 5.0, # This is a dummy value that is not used in practice
}
for input_instance, prediction, reference in zip(