In [15]:
import torch
import gymnasium as gym
import ddpg_reacher_simple as d
import importlib
import cv2 
import threading
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time
import plotly.graph_objects as go
importlib.reload(d)

cpu


<module 'ddpg_reacher_simple' from '/app/ddpg_reacher_simple.py'>

#### Function for real time reward plotting

In [16]:
def plot_rewards_threaded(interval=1.0, smooth_window=10):
    fig = go.FigureWidget()
    fig.add_scatter(x=[], y=[], mode='lines+markers', name='Smoothed Reward')
    fig.update_layout(
        title='Episode Rewards Over Time',
        xaxis_title='Episode',
        yaxis_title='Reward',
        yaxis_range=[0, None]
    )
    display(fig)

    def plot_loop():
        while True:
            time.sleep(interval)
            if episode_rewards:
                smoothed = [
                    sum(episode_rewards[max(0, i - smooth_window):i + 1]) / (i - max(0, i - smooth_window) + 1)
                    for i in range(len(episode_rewards))
                ]
                with fig.batch_update():
                    fig.data[0].x = list(range(len(smoothed)))
                    fig.data[0].y = smoothed

    t = threading.Thread(target=plot_loop, daemon=True)
    t.start()

#### Initialize  agent

In [17]:
env = gym.make("Reacher-v5",reward_control_weight=0.15, render_mode=None)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
buffer = d.ReplayBuffer(capacity=1_000_000, state_dim=state_dim)
agent = d.DDPGAgent(state_dim, action_dim, buffer)
num_episodes = 5000

#### Train agent and plot the episode accumulated reward

In [18]:
episode_rewards = []
plot_rewards_threaded()

for episode in range(num_episodes):
    state, _ = env.reset()
    agent.ou_noise.reset()
    cumm_reward = 0
    terminated = truncated = False
    while not (terminated or truncated):
        action = agent.choose_actions(state).cpu().numpy().flatten()
        
        next_state, reward, terminated, truncated, _ = env.step(action)

        done = terminated or truncated
        
        buffer.push(state, action, reward, next_state, done)
        
        agent.train()

        state = next_state

        cumm_reward += reward
    
    episode_rewards.append(cumm_reward)

env.close()

FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Smoothed Reward',
              'type': 'scatter',
              'uid': '2475236d-bd4f-414c-94fa-9726204a779d',
              'x': [],
              'y': []}],
    'layout': {'template': '...',
               'title': {'text': 'Episode Rewards Over Time'},
               'xaxis': {'title': {'text': 'Episode'}},
               'yaxis': {'range': [0, None], 'title': {'text': 'Reward'}}}
})

#### Save the actor and critic models and test the agent

In [20]:
agent.save_full_model("actor_best.pth","critic_best.pth")

Full actor model saved to actor_best.pth
Full critic model saved to critic_best.pth


In [22]:
env = gym.make("Reacher-v5", render_mode="human")
test_episodes = 50

for episode in range(test_episodes):
    state, _ = env.reset()
    terminated = truncated = False
    while not (terminated or truncated):
        action = agent.policy(state).cpu().numpy().flatten()
        next_state, reward, terminated, truncated, _ = env.step(action)
        
        state = next_state

env.close()