In [2]:
import gymnasium as gym
from gymnasium.wrappers import RescaleAction
import numpy as np
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import time
from distutils.util import strtobool
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from stable_baselines3.common.buffers import ReplayBuffer
from torch.utils.tensorboard import SummaryWriter

  from distutils.util import strtobool


In [3]:
ENV_NAME = 'InvertedPendulum-v4'
#env = gym.make(ENV_NAME,render_mode = "human")
def make_env(env_id, seed, idx, capture_video, run_name):
    def thunk():
        if capture_video and idx == 0:
            env = gym.make(env_id, render_mode="rgb_array")
            min_action = -20
            max_action = 20
            env = RescaleAction(env, min_action=min_action, max_action=max_action)
            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        else:
            env = gym.make(env_id)
            min_action = -20
            max_action = 20
            env = RescaleAction(env, min_action=min_action, max_action=max_action)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env.action_space.seed(seed)
        
        return env

    return thunk


In [4]:
# ALGO LOGIC: initialize agent here:
class QNetwork(nn.Module):
    def __init__(self, env):
        super().__init__()
        self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, x, a):
        x = torch.cat([x, a], 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


class Actor(nn.Module):
    def __init__(self, env):
        super().__init__()
        self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc_mu = nn.Linear(256, np.prod(env.single_action_space.shape))
        # action rescaling
        self.register_buffer(
            "action_scale", torch.tensor((env.action_space.high - env.action_space.low) / 2.0, dtype=torch.float32)
        )
        self.register_buffer(
            "action_bias", torch.tensor((env.action_space.high + env.action_space.low) / 2.0, dtype=torch.float32)
        )

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.tanh(self.fc_mu(x))
        return x * self.action_scale + self.action_bias



In [5]:
def reward_function(observation, action):
    diag_q = [1,10,1,1]; 
    r = 1;
    #print("observation:", observation)
    #print("observation:", observation[0,1])
    reward = diag_q[0]*(observation[0,0]**2) + diag_q[1]*(observation[0,1]**2) +\
                diag_q[2]*(observation[0,2]**2) + diag_q[3]*(observation[0,3]**2) +\
                r*(action**2)

    return -reward

In [None]:
if __name__ == "__main__":

    given_seed = 1
    buffer_size = int(1e6)
    batch_size = 256
    total_timesteps = 100000 #default = 1000000
    learning_starts = 25000 #default = 25e3
    exploration_noise = 0.1
    policy_frequency = 2
    tau = 0.005
    gamma = 0.99
    learning_rate = 3e-4
    
    exp_name = 'carpole_test'
    run_name = 'test'
    random.seed(given_seed)
    np.random.seed(given_seed)
    torch.manual_seed(given_seed)
    torch.backends.cudnn.deterministic = True
    writer = SummaryWriter(f"runs/{run_name}")

    #reward function parameters
    
    # if GPU is to be used
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(f"Using {device}");

    envs = gym.vector.SyncVectorEnv([make_env(ENV_NAME, given_seed, 0, False, run_name)])
    assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"

    actor = Actor(envs).to(device)
    qf1 = QNetwork(envs).to(device)
    qf1_target = QNetwork(envs).to(device)
    target_actor = Actor(envs).to(device)
    target_actor.load_state_dict(actor.state_dict())
    qf1_target.load_state_dict(qf1.state_dict())
    q_optimizer = optim.Adam(list(qf1.parameters()), lr=learning_rate)
    actor_optimizer = optim.Adam(list(actor.parameters()), lr=learning_rate)

    envs.single_observation_space.dtype = np.float32
    rb = ReplayBuffer(
        buffer_size,
        envs.single_observation_space,
        envs.single_action_space,
        device,
        handle_timeout_termination=False,
    )

    start_time = time.time()

    # TRY NOT TO MODIFY: start the game
    obs, _ = envs.reset(seed=given_seed)
    for global_step in range(total_timesteps):
        # ALGO LOGIC: put action logic here
        if global_step < learning_starts:
            actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
            
        else:
            with torch.no_grad():
                actions = actor(torch.Tensor(obs).to(device))
                actions += torch.normal(0, actor.action_scale * exploration_noise)
                actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, rewards, terminations, truncations, infos = envs.step(actions)
        rewards = reward_function(next_obs, actions)
        #print('step=', global_step, ' actions=', actions, ' rewards=', rewards,\
        #      ' obs=', next_obs, ' termination=', terminations, ' trunctions=', truncations)
        # TRY NOT TO MODIFY: record rewards for plotting purposes
        if "final_info" in infos:
            for info in infos["final_info"]:
                #print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
                writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
                writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)
                break

        # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation`
        real_next_obs = next_obs.copy()
        for idx, trunc in enumerate(truncations):
            if trunc:
                real_next_obs[idx] = infos["final_observation"][idx]
        rb.add(obs, real_next_obs, actions, rewards, terminations, infos)

        # TRY NOT TO MODIFY: CRUCIAL step easy to overlook
        obs = next_obs

        # ALGO LOGIC: training.
        if global_step > learning_starts:
            data = rb.sample(batch_size)
            with torch.no_grad():
                next_state_actions = target_actor(data.next_observations)
                qf1_next_target = qf1_target(data.next_observations, next_state_actions)
                next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * gamma * (qf1_next_target).view(-1)

            qf1_a_values = qf1(data.observations, data.actions).view(-1)
            qf1_loss = F.mse_loss(qf1_a_values, next_q_value)
            
            # optimize the model
            q_optimizer.zero_grad()
            qf1_loss.backward()
            q_optimizer.step()

            if global_step % policy_frequency == 0:
                actor_loss = -qf1(data.observations, actor(data.observations)).mean()
                print('step=', global_step, ' rewards=', rewards, ' qf1_loss = ', qf1_loss.item(), \
                      ' actor_loss = ', actor_loss.item(), ' observations=', obs, ' action=', actions)
                actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_optimizer.step()

                # update the target network
                for param, target_param in zip(actor.parameters(), target_actor.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
                for param, target_param in zip(qf1.parameters(), qf1_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

            if global_step % 100 == 0:
                writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step)
                writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step)
                writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step)
                print("SPS:", int(global_step / (time.time() - start_time)))
                writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

        if abs(next_obs[0,0])>= 10:
            print('resetting')
            obs, _ = envs.reset(seed=given_seed)


    save_model = True
    if save_model:
        model_path = f"runs/{run_name}/{exp_name}.cleanrl_model"
        torch.save((actor.state_dict(), qf1.state_dict()), model_path)
        print(f"model saved to {model_path}")
        # from cleanrl_utils.evals.ddpg_eval import evaluate

        # episodic_returns = evaluate(
        #     model_path,
        #     make_env,
        #     ENV_NAME,
        #     eval_episodes=10,
        #     run_name=f"{run_name}-eval",
        #     Model=(Actor, QNetwork),
        #     device=device,
        #     exploration_noise=exploration_noise,
        # )
        # for idx, episodic_return in enumerate(episodic_returns):
        #     writer.add_scalar("eval/episodic_return", episodic_return, idx)

    envs.close()
    writer.close()




Using cuda


  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


step= 25002  rewards= [[-1180.4208]]  qf1_loss =  235871.53125  actor_loss =  0.42932549118995667  observations= [[  6.11766913  -2.23922116  14.10842274 -26.41823206]]  action= [[13.995831]]
step= 25004  rewards= [[-1063.4685]]  qf1_loss =  198421.75  actor_loss =  1.5309642553329468  observations= [[ 7.66333603 -3.32040724 25.00867073  1.56752966]]  action= [[16.327888]]
step= 25006  rewards= [[-498.10294]]  qf1_loss =  260156.8125  actor_loss =  2.6983704566955566  observations= [[-0.00717292  3.1321351   0.00949633 -0.00939284]]  action= [[20.]]
step= 25008  rewards= [[-110.44574]]  qf1_loss =  249843.640625  actor_loss =  3.7227468490600586  observations= [[ 0.06145635  3.15663992  1.58563956 -0.08186014]]  action= [[2.8770208]]
step= 25010  rewards= [[-103.5922]]  qf1_loss =  208904.4375  actor_loss =  4.77769136428833  observations= [[0.18479149 3.14244713 1.78587828 0.2868652 ]]  action= [[1.2396215]]
step= 25012  rewards= [[-101.819374]]  qf1_loss =  215200.015625  actor_loss 