In [1]:
import os

import numpy as np
import torch

from src.trpo import TRPO
from src.tools import train, evaluate, load_model
from src.utils import mp4_to_gif

In [2]:
# Experiment constants; the cells below forward these to train()/evaluate().
ENV_NAME = "InvertedDoublePendulum-v5"  # environment id used by every run
GAMMA = 0.99            # forwarded as gamma= (presumably the discount factor; confirm in src.tools)
STEPS_PER_EPOCH = 4096  # forwarded as steps_per_epoch=
EPOCHS = 350            # forwarded as num_epochs=
VAL_EPISODES = 1000     # forwarded as num_episodes= when evaluating

# Seed the NumPy and PyTorch global RNGs with one shared value for reproducibility.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Make sure the output directory exists before anything reads from or
# writes to ./results.
os.makedirs("results", exist_ok=True)

Regular Reward

In [3]:
# Train a TRPO agent with reward_type="rewards". The meaning of that selector
# lives in src.tools.train (presumably the environment's regular reward, as
# this section's heading suggests; confirm there).
agent = train(
    agent=TRPO,
    env_name=ENV_NAME,
    reward_type="rewards",
    gamma=GAMMA,
    steps_per_epoch=STEPS_PER_EPOCH,
    num_epochs=EPOCHS,
)

TRPO training: 100%|██████████| 350/350 [12:43<00:00,  2.18s/it]


In [4]:
# Pass the current agent through load_model with the "rewards" checkpoint path
# (presumably restores the best weights saved during training; confirm in
# src.tools), then evaluate it for VAL_EPISODES episodes with video recording on.
agent = load_model("./results/trpo-rewards-best.pt", agent)

# Kept as the cell's last expression so the notebook displays evaluate()'s
# return value.
evaluate(
    ENV_NAME,
    agent,
    reward_type="rewards",
    record_video=True,
    num_episodes=VAL_EPISODES,
)


  logger.warn(
TRPO evaluation: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s]


Evaluation Results over 1000 episodes:
Average Episode Length: 921.11
Average Episode Reward: 8615.14
Average Episode Energy Reward: 10779.74


(np.float64(8615.137734428581),
 np.float64(10779.73764166097),
 np.float64(921.105))

Energy Reward

In [5]:
# Train a second TRPO agent with the same settings except
# reward_type="energies". The meaning of that selector lives in
# src.tools.train (presumably an energy-based reward, as this section's
# heading suggests; confirm there).
# NOTE: this rebinds `agent`, replacing whatever it held before.
agent = train(
    agent=TRPO,
    env_name=ENV_NAME,
    reward_type="energies",
    gamma=GAMMA,
    steps_per_epoch=STEPS_PER_EPOCH,
    num_epochs=EPOCHS,
)

TRPO training: 100%|██████████| 350/350 [12:29<00:00,  2.14s/it]


In [6]:
# Pass the current agent through load_model with the "energies" checkpoint
# path (presumably restores the best weights saved during training; confirm in
# src.tools), then evaluate it for VAL_EPISODES episodes with video recording on.
agent = load_model("./results/trpo-energies-best.pt", agent)

# Kept as the cell's last expression so the notebook displays evaluate()'s
# return value.
evaluate(
    ENV_NAME,
    agent,
    reward_type="energies",
    record_video=True,
    num_episodes=VAL_EPISODES,
)


  logger.warn(
TRPO evaluation: 100%|██████████| 1000/1000 [03:31<00:00,  4.74it/s]


Evaluation Results over 1000 episodes:
Average Episode Length: 874.74
Average Episode Reward: 8179.57
Average Episode Energy Reward: 10266.51


(np.float64(8179.569864947839),
 np.float64(10266.509319444123),
 np.float64(874.738))

In [7]:
# Run the project's mp4_to_gif helper on the results directory (presumably
# converts the videos recorded during evaluation to GIFs; confirm in src.utils).
mp4_to_gif("./results")

Video files conversion: 100%|██████████| 2/2 [00:41<00:00, 20.76s/it]
