In [1]:
# https://github.com/haarnoja/sac
import argparse
import datetime
import gym
import numpy as np
import itertools
import torch
from sac import SAC

from torch.utils.tensorboard import SummaryWriter
from replay_memory import ReplayMemory

In [2]:

class arguments():
    """Hyperparameter container for the SAC run in this notebook.

    Every setting has a default; any of them can be replaced per instance with a
    keyword argument, e.g. ``arguments(num_steps=10_000, cuda=True)``.
    Calling ``arguments()`` with no arguments yields exactly the defaults.

    Raises:
        TypeError: if a keyword does not name one of the settings below, so a
            misspelled hyperparameter fails loudly instead of being ignored.
    """

    def __init__(self, **overrides):
        # Environment id passed to gym.make.
        self.env_name = "MountainCarContinuous-v0"
        # Policy type label; also embedded in the TensorBoard run name.
        self.policy = "Gaussian"
        # The next four values are consumed inside SAC (not shown in this notebook).
        # Presumably: discount factor, soft target-update coefficient,
        # learning rate, entropy temperature -- TODO confirm against sac.py.
        self.gamma = 0.99
        self.tau = 0.005
        self.lr = 0.0003
        self.alpha = 0.2
        # Adds the "autotune" marker to the run name when True; its effect on
        # training is decided inside SAC.
        self.automatic_entropy_tuning = False
        # Seed shared by the environment, the action space, torch, numpy and the replay memory.
        self.seed = 123456
        # Batch size handed to agent.update_parameters; updates start once the
        # replay memory holds more than this many transitions.
        self.batch_size = 64
        # Training stops after the first episode that pushes the step count past this.
        self.num_steps = 500000
        # Consumed inside SAC; presumably the hidden-layer width -- TODO confirm.
        self.hidden_size = 256
        # Number of agent.update_parameters calls per environment step.
        self.updates_per_step = 1
        # Actions are sampled uniformly from the action space until this many steps have been taken.
        self.start_steps = 30000
        # Consumed inside SAC; presumably the target-network update period -- TODO confirm.
        self.target_update_interval = 1
        # First argument to ReplayMemory.
        self.replay_size = 500000
        # Consumed inside SAC; presumably selects GPU execution -- TODO confirm.
        self.cuda = False
        # When truthy, the training loop runs evaluation rollouts every 10 episodes.
        self.eval = True

        # Apply caller overrides last so they win over the defaults above.
        for name, value in overrides.items():
            if not hasattr(self, name):
                raise TypeError(f"arguments() got an unknown hyperparameter {name!r}")
            setattr(self, name, value)


args = arguments()

  and should_run_async(code)


In [3]:
# Create the environment named in the config, then seed every random source this
# notebook uses with the same configured seed.
env = gym.make(args.env_name)
env.reset(seed = args.seed)  # seed is supplied through reset(); the returned value is discarded here
env.action_space.seed(args.seed)  # action_space.sample() provides the random warm-up actions in the training loop

# Global seeds for torch and numpy.
torch.manual_seed(args.seed)
np.random.seed(args.seed)

  deprecation(
  deprecation(


In [4]:
# Quick look at one randomly sampled action and at the value reset() returns.
# The recorded output shows a 1-element action and a 2-element array from reset().
print(env.action_space.sample())
print(env.reset())

[0.2730275]
[-0.5230377  0.       ]


In [5]:
# Build the SAC agent from the first dimension of the observation shape, the
# environment's action space, and the hyperparameter container.
agent = SAC(env.observation_space.shape[0], env.action_space, args)

num_inputs: 2
action_space.shape[0]: (1,)
args.hidden_size: 256


In [6]:
# TensorBoard: each run logs to its own directory, named from the start time,
# the environment id, the policy label and an optional "autotune" marker.
run_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
if args.automatic_entropy_tuning:
    entropy_tag = "autotune"
else:
    entropy_tag = ""
run_dir = 'runs/{}_SAC_{}_{}_{}'.format(run_stamp, args.env_name, args.policy, entropy_tag)
writer = SummaryWriter(run_dir)

# Replay memory, constructed from the configured replay size and seed.
memory = ReplayMemory(args.replay_size, args.seed)

  and should_run_async(code)


In [7]:
# Show the per-episode step limit; the training loop compares episode_steps
# against this same attribute when it computes the replay mask.
# NOTE(review): `_max_episode_steps` is an underscore-prefixed attribute of the
# object returned by gym.make -- confirm it remains accessible if gym is upgraded.
print(env._max_episode_steps)

999


In [None]:
# Training Loop
def evaluate_policy(agent, env, episodes=10):
    """Return the average episode reward over `episodes` evaluation rollouts.

    Each rollout resets `env` and follows agent.select_action(state, evaluate=True)
    until the environment reports done. All bookkeeping is local, so calling this
    never disturbs the training loop's own state/reward variables.
    """
    reward_sum = 0.
    for _ in range(episodes):
        eval_state = env.reset()
        eval_reward = 0
        eval_done = False
        while not eval_done:
            eval_action = agent.select_action(eval_state, evaluate=True)
            eval_state, step_reward, eval_done, _info = env.step(eval_action)
            eval_reward += step_reward
        reward_sum += eval_reward
    return reward_sum / episodes


total_numsteps = 0
updates = 0
# The step limit does not change during training, so read it once.
max_episode_steps = env._max_episode_steps

# try/finally guarantees the TensorBoard writer and the environment are closed
# even when training is stopped by an exception or a kernel interrupt.
try:
    for i_episode in itertools.count(1):
        print(f'episode: {i_episode}')
        episode_reward = 0
        episode_steps = 0
        done = False
        state = env.reset()

        while not done:
            if args.start_steps > total_numsteps:
                action = env.action_space.sample()  # Sample random action
            else:
                action = agent.select_action(state)  # Sample action from policy

            # Only train once the memory holds more than one batch of transitions.
            if len(memory) > args.batch_size:
                # Number of updates per step in environment
                for _ in range(args.updates_per_step):
                    # Update parameters of all the networks
                    critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory, args.batch_size, updates)

                    writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                    writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                    writer.add_scalar('entropy_temprature/alpha', alpha, updates)
                    updates += 1

            next_state, reward, done, _info = env.step(action)  # Step
            episode_steps += 1
            total_numsteps += 1
            episode_reward += reward

            # Ignore the "done" signal if it comes from hitting the time horizon.
            # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
            mask = 1 if episode_steps == max_episode_steps else float(not done)

            memory.push(state, action, reward, next_state, mask)  # Append transition to memory

            state = next_state

        # Log before checking the step budget so the final episode is recorded too.
        writer.add_scalar('reward/train', episode_reward, i_episode)
        print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".format(i_episode, total_numsteps, episode_steps, round(episode_reward, 2)))

        if total_numsteps > args.num_steps:
            break

        # Periodic evaluation every 10 training episodes.
        if i_episode % 10 == 0 and args.eval:
            episodes = 10
            avg_reward = evaluate_policy(agent, env, episodes)

            writer.add_scalar('avg_reward/test', avg_reward, i_episode)

            print("----------------------------------------")
            print("Test Episodes: {}, Avg. Reward: {}".format(episodes, round(avg_reward, 2)))
            print("----------------------------------------")
finally:
    # Flush pending TensorBoard events to disk, then release the environment.
    writer.close()
    env.close()


  if not isinstance(terminated, (bool, np.bool8)):


episode: 1
Episode: 1, total numsteps: 999, episode steps: 999, reward: -33.16
episode: 2
Episode: 2, total numsteps: 1998, episode steps: 999, reward: -32.78
episode: 3
Episode: 3, total numsteps: 2997, episode steps: 999, reward: -35.29
episode: 4
Episode: 4, total numsteps: 3996, episode steps: 999, reward: -32.28
episode: 5
Episode: 5, total numsteps: 4995, episode steps: 999, reward: -33.41
episode: 6
Episode: 6, total numsteps: 5994, episode steps: 999, reward: -32.45
episode: 7
Episode: 7, total numsteps: 6993, episode steps: 999, reward: -33.84
episode: 8
Episode: 8, total numsteps: 7992, episode steps: 999, reward: -33.35
episode: 9
Episode: 9, total numsteps: 8991, episode steps: 999, reward: -33.21
episode: 10
Episode: 10, total numsteps: 9990, episode steps: 999, reward: -33.0
----------------------------------------
Test Episodes: 10, Avg. Reward: -0.0
----------------------------------------
episode: 11
Episode: 11, total numsteps: 10989, episode steps: 999, reward: -34.8