In [1]:
import gymnasium as gym
import panda_gym
import numpy as np
from stable_baselines3 import TD3
from stable_baselines3.her import HerReplayBuffer
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback
import matplotlib.pyplot as plt

# Custom callback to log rewards for tracking purposes
class RewardCallback(BaseCallback):
    """Accumulate per-episode returns and halt training after a fixed episode count.

    Only index 0 of the ``rewards`` / ``dones`` entries in ``self.locals`` is
    read, so a single tracked environment is assumed.

    Attributes:
        episode_rewards: Summed reward of every finished episode, in order.
        total_reward: Running reward sum of the episode currently in progress.
        episode_count: Number of episodes finished so far.
        max_episodes: Episode count at which training is asked to stop.
    """

    def __init__(self, max_episodes, verbose=0):
        super().__init__(verbose)
        self.max_episodes = max_episodes
        self.episode_count = 0
        self.total_reward = 0
        self.episode_rewards = []

    def _on_step(self) -> bool:
        """Record the latest step reward; return False once the limit is hit."""
        step_reward = self.locals["rewards"][0]
        episode_finished = self.locals["dones"][0]

        self.total_reward += step_reward

        if episode_finished:
            # Store the finished episode's return and start a fresh tally.
            self.episode_rewards.append(self.total_reward)
            self.episode_count += 1
            self.total_reward = 0

        # Returning False is the signal used here to end training.
        limit_reached = self.episode_count >= self.max_episodes
        return not limit_reached

# Helper function to train TD3 with HER
def train_td3_with_her(env_id, max_episodes):
    """Train a TD3 agent with Hindsight Experience Replay for a set number of episodes.

    Args:
        env_id: Gymnasium environment id passed to ``gym.make``.
        max_episodes: Number of completed episodes after which training stops.

    Returns:
        The list of per-episode reward sums collected by ``RewardCallback``.
    """
    # Create the gym environment and wrap it in a Monitor
    env = Monitor(gym.make(env_id))

    try:
        # Set up TD3 with HER
        model = TD3(
            "MultiInputPolicy",
            env,
            replay_buffer_class=HerReplayBuffer,
            replay_buffer_kwargs=dict(
                n_sampled_goal=4,  # Number of HER samples per transition
                goal_selection_strategy="future",  # Strategy used to sample goals for HER
            ),
            buffer_size=1_000_000,
            verbose=1,
        )

        # Initialize the callback for reward tracking
        reward_callback = RewardCallback(max_episodes=max_episodes)

        # The timestep budget is deliberately huge: the callback, not the
        # budget, is what ends training once max_episodes have completed.
        model.learn(total_timesteps=int(1e10), callback=reward_callback)

        return reward_callback.episode_rewards
    finally:
        # Always release the environment, even if training raises, so the
        # underlying simulator and Monitor resources are not leaked.
        env.close()



# Main execution
if __name__ == "__main__":
    # Experiment configuration: target environment and episode budget.
    max_episodes = 30000
    env_id = "PandaPush-v3"

    print("Training TD3 with HER for 30,000 episodes...")
    rewards_td3 = train_td3_with_her(env_id=env_id, max_episodes=max_episodes)

    # Dump the collected per-episode returns.
    print("Rewards:", rewards_td3)



Training TD3 with HER for 30,000 episodes...
Using cuda device
Wrapping the env in a DummyVecEnv.


  logger.warn(


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -50      |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 4        |
|    fps             | 94       |
|    time_elapsed    | 2        |
|    total_timesteps | 200      |
| train/             |          |
|    actor_loss      | 0.388    |
|    critic_loss     | 0.209    |
|    learning_rate   | 0.001    |
|    n_updates       | 99       |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -50      |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 8        |
|    fps             | 82       |
|    time_elapsed    | 4        |
|    total_timesteps | 400      |
| train/             |          |
|    actor_loss      | 0.36     |
|    critic_loss     | 0.0865   |
|    learning_