In [19]:
import torch
import gymnasium as gym
import ddpg_reacher_simple as d
import importlib
import cv2 
import threading
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time
import plotly.graph_objects as go
import numpy as np
importlib.reload(d)

cpu


<module 'ddpg_reacher_simple' from '/app/ddpg_reacher_simple.py'>

#### Function for evaluating the agent based on reward metrics

In [20]:
def evaluate_agent(agent, env_name, n_episodes=50, cumulative_reward_threshold=-3,
                   reward_control_weight=0.1, render_mode='human'):
    """
    Roll out a trained agent for a number of episodes and summarise its rewards.

    Two metrics are reported:
    1. The mean cumulative (per-episode) reward.
    2. The success rate, where an episode counts as successful when its
       cumulative reward is strictly greater than ``cumulative_reward_threshold``.

    Args:
        agent: Object exposing ``policy(state)``. The result is expected to be a
            torch tensor; its leading dimension is squeezed before it is passed
            to the environment as a NumPy action.
        env_name (str): Gymnasium environment id (e.g. 'Reacher-v5').
        n_episodes (int): Number of evaluation episodes; must be >= 1.
        cumulative_reward_threshold (float): An episode is a success when its
            summed reward exceeds this value.
        reward_control_weight (float): Forwarded to ``gym.make``. Defaults to
            0.1 for backward compatibility.
            NOTE(review): this should presumably match the value the agent was
            trained with -- confirm against the training setup.
        render_mode (str | None): Forwarded to ``gym.make``. Defaults to
            'human' (original behaviour); pass ``None`` for a headless run.

    Returns:
        dict: ``{'avg_reward_50_episodes': <mean episode reward>,
        'success_rate_cumulative_reward': <fraction of successful episodes>}``.
        The first key keeps its historical name even when ``n_episodes`` is
        not 50, so existing callers keep working.

    Raises:
        ValueError: If ``n_episodes`` is smaller than 1.
    """
    # Fail fast instead of hitting a ZeroDivisionError after the env was built.
    if n_episodes < 1:
        raise ValueError(f"n_episodes must be a positive integer, got {n_episodes}")

    eval_env = gym.make(env_name, reward_control_weight=reward_control_weight, render_mode=render_mode)
    total_rewards = []
    successful_episodes = 0

    best_reward = -np.inf
    best_episode_index = -1

    print(f"Starting evaluation of {n_episodes} episodes...")
    print(f"Using success definition: Cumulative Reward > {cumulative_reward_threshold}")

    # try/finally guarantees the environment (and any render window) is
    # released even if the policy or the simulator raises mid-rollout.
    try:
        for episode in range(n_episodes):
            state, info = eval_env.reset()
            episode_reward = 0
            terminated = False
            truncated = False

            while not terminated and not truncated:
                action_tensor = agent.policy(state)

                # detach() makes the conversion safe even if the policy output
                # still tracks gradients; squeeze(0) drops the leading dimension.
                action = action_tensor.squeeze(0).detach().cpu().numpy()

                state, reward, terminated, truncated, info = eval_env.step(action)
                episode_reward += reward

            total_rewards.append(episode_reward)

            if episode_reward > cumulative_reward_threshold:
                successful_episodes += 1

            if episode_reward > best_reward:
                best_reward = episode_reward
                best_episode_index = episode
    finally:
        eval_env.close()

    avg_reward = np.mean(total_rewards)
    success_rate = successful_episodes / n_episodes

    print("\n--- Evaluation Complete ---")
    print(f"Accumulated Reward Average: **{avg_reward:.4f}**")
    print(f"Success Rate (Cumulative Reward > {cumulative_reward_threshold}): **{success_rate*100:.2f}%** ({successful_episodes}/{n_episodes})")
    print(f"Best Episode Reward: {best_reward:.4f} (Episode: {best_episode_index})")


    return {
        'avg_reward_50_episodes': avg_reward,
        'success_rate_cumulative_reward': success_rate
    }

#### Initialize the agent and load the trained actor and critic models

In [21]:
# Build a headless Reacher environment; it is used here to read the
# observation/action dimensions needed to construct the agent.
env = gym.make(
    "Reacher-v5",
    reward_control_weight=0.18,
    render_mode=None,
)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# The agent constructor takes a replay buffer, so one is created for it.
buffer = d.ReplayBuffer(
    capacity=1_000_000,
    state_dim=state_dim,
)
agent = d.DDPGAgent(state_dim, action_dim, buffer)

# Restore the saved actor and critic checkpoints into the agent.
agent.load_full_model(
    'actor_best.pth',
    'critic_best.pth',
)

Full critic model loaded from critic_best.pth
Full actor model loaded from actor_best.pth


#### Evaluate the agent

In [24]:
# Evaluate the loaded agent over 500 episodes of Reacher-v5 and keep the
# returned summary metrics.
test_results = evaluate_agent(agent, 'Reacher-v5', n_episodes=500)

Starting evaluation of 500 episodes...
Using success definition: Cumulative Reward > -3

--- Evaluation Complete ---
Accumulated Reward Average: **-2.7973**
Success Rate (Cumulative Reward > -3): **68.00%** (340/500)
Best Episode Reward: -0.8485 (Episode: 179)
