
Commit

rebase updates
Yuan325 committed Jun 24, 2024
1 parent 0c061f8 commit 740b1b4
Showing 3 changed files with 10 additions and 8 deletions.
4 changes: 2 additions & 2 deletions llm_demo/evaluation/__init__.py
@@ -13,6 +13,6 @@
 # limitations under the License.
 
 from .eval_golden import goldens
-from .evaluation import run_llm_for_eval
+from .evaluation import evaluate_retrieval_phase, run_llm_for_eval
 
-__ALL__ = ["run_llm_for_eval", "goldens"]
+__ALL__ = ["run_llm_for_eval", "goldens", "evaluate_retrieval_phase"]
1 change: 1 addition & 0 deletions llm_demo/evaluation/eval_golden.py
@@ -35,6 +35,7 @@ class EvalData(BaseModel):
         default=True, description="determine to reset the chat after invoke"
     )
 
+
 goldens = [
     EvalData(
         category="Search Airport Tool",
13 changes: 7 additions & 6 deletions llm_demo/evaluation/evaluation.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import asyncio
 import json
 from typing import Dict, List
 
@@ -35,6 +36,7 @@ async def run_llm_for_eval(
     agent = orc.get_user_session(session_id)
     for eval_data in eval_list:
         query_response = await agent.invoke(eval_data.query)
+
         # Retrieve prediction_tool_calls from query response
         prediction_tool_calls = []
         for step in query_response.get("intermediate_steps"):
@@ -52,6 +54,7 @@
         orc.user_session_reset(session, session_id)
     return eval_list
 
+
 def evaluate_task(
     eval_dataset: "pd.DataFrame", metrics: List[str], experiment_name: str
 ) -> evaluation_base.EvalResult:
@@ -64,22 +67,20 @@ def evaluate_task(
         metrics=metrics,
         experiment=experiment_name,
     )
+
     eval_result = eval_task.evaluate()
     return eval_result
 
+
 def evaluate_retrieval_phase(eval_datas: List[EvalData]) -> evaluation_base.EvalResult:
     RETRIEVAL_EXPERIMENT_NAME = "retrieval-phase-eval"
     metrics = ["tool_call_quality"]
     responses = []
     references = []
     for e in eval_datas:
-        responses.append(
-            json.dumps({"content": e.content, "tool_calls": e.tool_calls})
-        )
+        responses.append(json.dumps({"content": e.content, "tool_calls": e.tool_calls}))
         references.append(
-            json.dumps(
-                {"content": e.content, "tool_calls": e.prediction_tool_calls}
-            )
+            json.dumps({"content": e.content, "tool_calls": e.prediction_tool_calls})
         )
     eval_dataset = pd.DataFrame(
         {
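Taken together, the changes re-export evaluate_retrieval_phase and keep run_llm_for_eval as the step that fills in prediction_tool_calls on the golden EvalData records. A minimal driver sketch of how the two might be chained; the import path, the run_llm_for_eval argument order, the orchestrator/session placeholders, and the summary_metrics attribute on the returned EvalResult are assumptions not shown in this diff:

    import asyncio

    from evaluation import evaluate_retrieval_phase, goldens, run_llm_for_eval


    async def main() -> None:
        # The orchestrator and session objects are app-specific; this diff only
        # shows that run_llm_for_eval resolves an agent via
        # orc.get_user_session(session_id) and resets the session afterwards,
        # so the argument order below is assumed.
        orc = ...          # assumed: the app's orchestration object
        session = {}       # assumed: the active session state
        session_id = "retrieval-eval"  # assumed identifier

        eval_results = await run_llm_for_eval(goldens, orc, session, session_id)

        # evaluate_retrieval_phase consumes the List[EvalData] produced above
        # and returns an evaluation_base.EvalResult (per the signature above).
        retrieval_result = evaluate_retrieval_phase(eval_results)
        print(retrieval_result.summary_metrics)


    if __name__ == "__main__":
        asyncio.run(main())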
