<a href="https://colab.research.google.com/github/ManikaNagpal/AI-Agent-Evaluation/blob/main/AI_Agent_Evaluation_(Good).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [1]:
pip install langchain langchain-huggingface sentence-transformers transformers

Collecting langchain-huggingface
  Downloading langchain_huggingface-1.0.1-py3-none-any.whl.metadata (2.1 kB)
INFO: pip is looking at multiple versions of langchain-huggingface to determine which version is compatible with other requirements. This could take a while.
  Downloading langchain_huggingface-1.0.0-py3-none-any.whl.metadata (2.1 kB)
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Downloading langchain_huggingface-0.3.1-py3-none-any.whl (27 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.3.1


# Load a Small Instruction Model

In [2]:
from transformers import pipeline

from langchain_huggingface import HuggingFacePipeline

In [None]:
# Small instruction-following seq2seq model (FLAN-T5 small); each generation
# is capped at 32 new tokens.
pipe = pipeline(
    task="text2text-generation",
    model="google/flan-t5-small",
    max_new_tokens=32,
)

# Wrap the transformers pipeline in the LangChain adapter.
# NOTE(review): `llm` is not referenced by any of the cells visible below.
llm = HuggingFacePipeline(pipeline=pipe)

# Define Tools

In [4]:
# Unit Converter Tool

import re

# Single conversion factor used for both directions (miles per kilometre).
_MILES_PER_KM = 0.621371

# Accepted spellings for each supported unit, mapped to a canonical key.
_UNIT_ALIASES = {
    "km": "km",
    "kms": "km",
    "kilometer": "km",
    "kilometers": "km",
    "kilometre": "km",
    "kilometres": "km",
    "mi": "miles",
    "mile": "miles",
    "miles": "miles",
}

# Matches "<number> <unit> to <unit>" anywhere in lower-cased text.
# The look-behind keeps us from starting in the middle of a word or of a
# number such as "1,000" (which would otherwise be misread as "000").
_CONVERSION_RE = re.compile(
    r"(?<![\w.,])"
    r"([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?)"  # numeric value
    r"\s*([a-z]+)"                                   # source unit
    r"\s+to\s+"                                      # literal separator
    r"([a-z]+)"                                      # target unit
)


def unit_converter(expr: str) -> str:
    """
    Convert a distance between kilometres and miles.

    The expression may be a bare request such as '10 km to miles' or
    '5 miles to km', or a sentence that contains one
    ('Convert 10 km to miles?'). Matching is case-insensitive.

    Args:
        expr: Text containing '<number> <unit> to <unit>'.

    Returns:
        The converted value rounded to 4 decimals followed by ' miles' or
        ' km'; 'Unsupported conversion' when the unit pair is not
        km<->miles; 'Error' when no conversion request can be parsed.
    """
    if not isinstance(expr, str):
        return "Error"

    match = _CONVERSION_RE.search(expr.lower())
    if match is None:
        return "Error"

    raw_value, raw_from, raw_to = match.groups()
    value = float(raw_value)  # always parseable: the regex only admits float syntax
    from_unit = _UNIT_ALIASES.get(raw_from)
    to_unit = _UNIT_ALIASES.get(raw_to)

    if from_unit == "km" and to_unit == "miles":
        return str(round(value * _MILES_PER_KM, 4)) + " miles"
    if from_unit == "miles" and to_unit == "km":
        return str(round(value / _MILES_PER_KM, 4)) + " km"
    # Unknown units and same-unit requests are both reported as unsupported.
    return "Unsupported conversion"

def run_unit_agent(question: str) -> str:
    """
    Route a question to the unit converter when it looks like a conversion.

    A question is routed when it mentions a supported unit keyword and
    contains the standalone word 'to'. A substring test for 'to' would also
    fire on words such as 'into' or 'town', so whole tokens are compared.

    Args:
        question: Free-text user question.

    Returns:
        The result of `unit_converter(question)` when the question is
        routed, otherwise 'Unsupported query'.
    """
    keywords = ["km", "miles"]
    lowered = question.lower()  # normalise once instead of per check

    mentions_unit = any(k in lowered for k in keywords)
    has_to_word = "to" in lowered.split()

    if mentions_unit and has_to_word:
        return unit_converter(question)
    return "Unsupported query"

# Define the Evaluation Test Cases

In [5]:
# Evaluation set: each case pairs a conversion request with the exact string
# the agent is expected to return (values rounded to 4 decimals).
test_cases = [
    {"question": question, "expected": expected}
    for question, expected in (
        ("10 km to miles", "6.2137 miles"),
        ("5 miles to km", "8.0467 km"),
        ("100 km to miles", "62.1371 miles"),
    )
]

# Run the Agent on Each Test Case

In [6]:
# Show the agent's answer next to the expected answer for every test case.
for case in test_cases:
    pred = run_unit_agent(case["question"])
    # One print call; sep="\n" puts each field on its own line, and the
    # trailing "\n" in the last field leaves a blank line between cases.
    print(
        f"Q: {case['question']}",
        f"Predicted: {pred}",
        f"Expected: {case['expected']}\n",
        sep="\n",
    )

Q: 10 km to miles
Predicted: 6.2137 miles
Expected: 6.2137 miles

Q: 5 miles to km
Predicted: 8.0467 km
Expected: 8.0467 km

Q: 100 km to miles
Predicted: 62.1371 miles
Expected: 62.1371 miles



# Define the Scoring Function

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used by the text-similarity fallback in scoring.
sim_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

def _parse_quantity(text):
    """Split '<number> [unit ...]' into (value rounded to 4 dp, unit) or None."""
    try:
        tokens = text.split()
        value = round(float(tokens[0]), 4)
    except (AttributeError, IndexError, ValueError):
        # Not a string, empty, or does not start with a number.
        return None
    return value, " ".join(tokens[1:]).lower()


def compute_score(pred, expected):
    """
    Score a prediction against the expected answer.

    When both strings start with a number, the score is an exact match:
    1.0 if the values agree to 4 decimals and the units do not conflict,
    else 0.0. A unit conflict means both sides name a unit and the units
    differ, so '6.2137 km' no longer matches '6.2137 miles'.

    Otherwise the score is the cosine similarity of the two texts'
    embeddings from the module-level `sim_model`.

    Args:
        pred: Agent output, e.g. '6.2137 miles'.
        expected: Reference answer in the same format.

    Returns:
        A float score: 1.0 / 0.0 on the numeric path, cosine similarity on
        the text path.
    """
    pred_qty = _parse_quantity(pred)
    exp_qty = _parse_quantity(expected)

    if pred_qty is not None and exp_qty is not None:
        pred_val, pred_unit = pred_qty
        exp_val, exp_unit = exp_qty
        # A missing unit on either side is not treated as a conflict.
        units_agree = not pred_unit or not exp_unit or pred_unit == exp_unit
        return 1.0 if pred_val == exp_val and units_agree else 0.0

    # For text outputs, fall back to semantic similarity.
    emb_pred = sim_model.encode(pred, convert_to_tensor=True)
    emb_exp = sim_model.encode(expected, convert_to_tensor=True)
    return util.cos_sim(emb_pred, emb_exp).item()



# Evaluate Using Exact Numeric Match (with a Semantic-Similarity Fallback)

In [8]:
# Evaluate: score every test case and report the mean score.
scores = []
for case in test_cases:
    # Re-run the agent so this cell does not depend on earlier loop state.
    pred = run_unit_agent(case["question"])
    score = compute_score(pred, case["expected"])
    scores.append(score)
    print(f"Predicted: {pred}, Expected: {case['expected']}, Score: {score:.2f}")

# Mean over all cases (assumes test_cases is non-empty).
print(f"Average Agent Similarity Score: {sum(scores)/len(scores):.2f}")

Predicted: 6.2137 miles, Expected: 6.2137 miles, Score: 1.00
Predicted: 8.0467 km, Expected: 8.0467 km, Score: 1.00
Predicted: 62.1371 miles, Expected: 62.1371 miles, Score: 1.00
Average Agent Similarity Score: 1.00
