In [2]:
import os

import numpy as np
import torch

from src.ppo import PPO
from src.tools import evaluate, load_model, train
from src.utils import mp4_to_gif


In [3]:
# --- Experiment configuration ------------------------------------------------
# Environment id handed to train() and evaluate() by the cells below.
ENV_NAME = "InvertedDoublePendulum-v5"

# Training settings forwarded to train() as num_epochs / steps_per_epoch / gamma.
EPOCHS = 300
STEPS_PER_EPOCH = 4_096
GAMMA = 0.99

# Number of episodes passed to evaluate() for each trained agent.
VAL_EPISODES = 1_000

# --- Reproducibility -----------------------------------------------------------
# Seed NumPy's and PyTorch's global random generators with the same value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Ensure the output directory exists up front; later cells load checkpoints
# from ./results and convert the videos stored there.
os.makedirs("results", exist_ok=True)

Regular reward (`reward_type="rewards"`)

In [4]:
# Re-seed right before training so this experiment starts from a known RNG
# state even when the cell is re-run on its own or after other cells have
# already drawn random numbers.
# NOTE(review): whether the environment itself is seeded is decided inside
# train(); confirm there if fully deterministic runs are required.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Train PPO (imported from src.ppo) on ENV_NAME with reward_type="rewards".
# The returned agent is reused by the evaluation cell that follows.
agent = train(
    env_name=ENV_NAME,
    agent=PPO,
    num_epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    gamma=GAMMA,
    reward_type="rewards",
)

PPO training: 100%|██████████| 300/300 [24:27<00:00,  4.89s/it]


In [5]:
# Load the "ppo-rewards-best" checkpoint into the agent returned by the
# regular-reward training cell (presumably the best model saved during that
# run — confirm in src.tools), then evaluate it for VAL_EPISODES episodes
# with video recording enabled.
agent = load_model("./results/ppo-rewards-best.pt", agent)

# Left as the last expression so the returned tuple is shown as cell output;
# judging by the recorded output it holds the average reward, average energy
# reward and average episode length.
evaluate(
    ENV_NAME,
    agent,
    num_episodes=VAL_EPISODES,
    record_video=True,
    reward_type="rewards",
)


  logger.warn(
PPO evaluation: 100%|██████████| 1000/1000 [03:55<00:00,  4.25it/s]


Evaluation Results over 1000 episodes:
Average Episode Length: 1000.00
Average Episode Reward: 9358.25
Average Episode Energy Reward: 11766.82


(np.float64(9358.254787375121),
 np.float64(11766.82459066445),
 np.float64(1000.0))

Energy reward (`reward_type="energies"`)

In [6]:
# Re-seed right before training. Without this, the RNG state seen here depends
# on everything the earlier cells consumed (the previous training run and its
# evaluation), so the experiment could not be reproduced on its own and would
# not start from the same random state as the regular-reward run.
# NOTE(review): whether the environment itself is seeded is decided inside
# train(); confirm there if fully deterministic runs are required.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Train PPO (imported from src.ppo) on ENV_NAME with reward_type="energies".
# Rebinds `agent`, replacing the regular-reward agent from the earlier cells.
agent = train(
    env_name=ENV_NAME,
    agent=PPO,
    num_epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    gamma=GAMMA,
    reward_type="energies",
)

PPO training: 100%|██████████| 300/300 [24:12<00:00,  4.84s/it]


In [7]:
# Load the "ppo-energies-best" checkpoint into the agent returned by the
# energy-reward training cell (presumably the best model saved during that
# run — confirm in src.tools).
agent = load_model("./results/ppo-energies-best.pt", agent)

# Evaluate for VAL_EPISODES episodes with video recording enabled. Left as the
# last expression so the returned tuple is shown as cell output.
evaluate(
    ENV_NAME,
    agent,
    reward_type="energies",
    record_video=True,
    num_episodes=VAL_EPISODES,
)


  logger.warn(
PPO evaluation: 100%|██████████| 1000/1000 [03:53<00:00,  4.29it/s]


Evaluation Results over 1000 episodes:
Average Episode Length: 998.04
Average Episode Reward: 9341.49
Average Episode Energy Reward: 11743.33


(np.float64(9341.494516223662),
 np.float64(11743.330215186734),
 np.float64(998.043))

In [8]:
# Convert the video files found under ./results (the recorded run processed
# two files). Presumably these are the MP4 recordings produced by the two
# evaluate(..., record_video=True) calls above, converted to GIF — confirm in
# src.utils.mp4_to_gif.
mp4_to_gif("./results")

Video files conversion: 100%|██████████| 2/2 [00:34<00:00, 17.10s/it]
