In [6]:
import copy
import pickle
from collections import defaultdict

import gym
import numpy as np
import time
from tqdm import tqdm
from typing import List, Tuple, Dict
import matplotlib.pyplot as plt

from memory_profiler import profile

from rl2023.constants import EX4_PENDULUM_CONSTANTS as PENDULUM_CONSTANTS
from rl2023.constants import EX4_BIPEDAL_CONSTANTS as BIPEDAL_CONSTANTS
from rl2023.exercise4.agents import DDPG
from rl2023.exercise3.replay import ReplayBuffer
from rl2023.util.hparam_sweeping import generate_hparam_configs
from rl2023.util.result_processing import Run

import os
import gym
import numpy as np
from torch.optim import Adam
from typing import Dict, Iterable
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Normal

from rl2023.exercise3.agents import Agent
from rl2023.exercise3.networks import FCNetwork
from rl2023.exercise3.replay import Transition

class DiagGaussian(torch.nn.Module):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def sample(self):
        eps = Variable(torch.randn(*self.mean.size()))
        return self.mean + self.std * eps


class DDPG(Agent):
    """ DDPG

        ** YOU NEED TO IMPLEMENT THE FUNCTIONS IN THIS CLASS **

        :attr critic (FCNetwork): fully connected critic network
        :attr critic_optim (torch.optim): PyTorch optimiser for critic network
        :attr policy (FCNetwork): fully connected actor network for policy
        :attr policy_optim (torch.optim): PyTorch optimiser for actor network
        :attr gamma (float): discount rate gamma
        """

    def __init__(
            self,
            action_space: gym.Space,
            observation_space: gym.Space,
            gamma: float,
            critic_learning_rate: float,
            policy_learning_rate: float,
            critic_hidden_size: Iterable[int],
            policy_hidden_size: Iterable[int],
            tau: float,
            **kwargs,
    ):
        """
        :param action_space (gym.Space): environment's action space
        :param observation_space (gym.Space): environment's observation space
        :param gamma (float): discount rate gamma
        :param critic_learning_rate (float): learning rate for critic optimisation
        :param policy_learning_rate (float): learning rate for policy optimisation
        :param critic_hidden_size (Iterable[int]): list of hidden dimensionalities for fully connected critic
        :param policy_hidden_size (Iterable[int]): list of hidden dimensionalities for fully connected policy
        :param tau (float): step for the update of the target networks
        """
        super().__init__(action_space, observation_space)
        STATE_SIZE = observation_space.shape[0]
        ACTION_SIZE = action_space.shape[0]

        self.upper_action_bound = action_space.high[0]
        self.lower_action_bound = action_space.low[0]

        # ######################################### #
        #  BUILD YOUR NETWORKS AND OPTIMIZERS HERE  #
        # ######################################### #
        # self.actor = Actor(STATE_SIZE, policy_hidden_size, ACTION_SIZE)
        self.actor = FCNetwork(#the guy generating actions
            (STATE_SIZE, *policy_hidden_size, ACTION_SIZE), output_activation=torch.nn.Tanh
        )
        self.actor_target = FCNetwork(
            (STATE_SIZE, *policy_hidden_size, ACTION_SIZE), output_activation=torch.nn.Tanh
        )

        self.actor_target.hard_update(self.actor)
        # self.critic = Critic(STATE_SIZE + ACTION_SIZE, critic_hidden_size)
        # self.critic_target = Critic(STATE_SIZE + ACTION_SIZE, critic_hidden_size)

        self.critic = FCNetwork(
            (STATE_SIZE + ACTION_SIZE, *critic_hidden_size, 1), output_activation=None
        )
        self.critic_target = FCNetwork(
            (STATE_SIZE + ACTION_SIZE, *critic_hidden_size, 1), output_activation=None
        )
        self.critic_target.hard_update(self.critic)

        self.policy_optim = Adam(self.actor.parameters(), lr=policy_learning_rate, eps=1e-3)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_learning_rate, eps=1e-3)

        for parameter in self.critic_target.parameters():
            parameter.requires_grad=False
        for parameter in self.actor_target.parameters():
            parameter.requires_grad=False

        # ############################################# #
        # WRITE ANY HYPERPARAMETERS YOU MIGHT NEED HERE #
        # ############################################# #
        self.gamma = gamma
        self.critic_learning_rate = critic_learning_rate
        self.policy_learning_rate = policy_learning_rate
        self.tau = tau



        # ################################################### #
        # DEFINE A GAUSSIAN THAT WILL BE USED FOR EXPLORATION #
        # ################################################### #
        mean = torch.zeros(ACTION_SIZE)
        std = 0.1 * torch.ones(ACTION_SIZE)
        self.noise = DiagGaussian(mean, std)

        # ############################### #
        # WRITE ANY AGENT PARAMETERS HERE #
        # ############################### #
        self.counter=0
        self.saveables.update(
            {
                "actor": self.actor,
                "actor_target": self.actor_target,
                "critic": self.critic,
                "critic_target": self.critic_target,
                "policy_optim": self.policy_optim,
                "critic_optim": self.critic_optim,
            }
        )


    def save(self, path: str, suffix: str = "") -> str:
        """Saves saveable PyTorch models under given path

        The models will be saved in directory found under given path in file "models_{suffix}.pt"
        where suffix is given by the optional parameter (by default empty string "")

        :param path (str): path to directory where to save models
        :param suffix (str, optional): suffix given to models file
        :return (str): path to file of saved models file
        """
        torch.save(self.saveables, path)
        return path


    def restore(self, filename: str, dir_path: str = None):
        """Restores PyTorch models from models file given by path

        :param filename (str): filename containing saved models
        :param dir_path (str, optional): path to directory where models file is located
        """

        if dir_path is None:
            dir_path, _ = os.path.split(os.path.abspath(__file__))
        save_path = os.path.join(dir_path, filename)
        checkpoint = torch.load(save_path)
        for k, v in self.saveables.items():
            v.load_state_dict(checkpoint[k].state_dict())


    def schedule_hyperparameters(self, timestep: int, max_timesteps: int):
        """Updates the hyperparameters

        **YOU MAY IMPLEMENT THIS FUNCTION FOR Q5**

        This function is called before every episode and allows you to schedule your
        hyperparameters.

        :param timestep (int): current timestep at the beginning of the episode
        :param max_timestep (int): maximum timesteps that the training loop will run for
        """
        ### PUT YOUR CODE HERE ###
        #print(self.epsilon)
        pass
        #self.epsilon= self.epsilon_start+min(timestep, max_timesteps*self.exploration_fraction)*(self.epsilon_min-self.epsilon_start)/(max_timesteps*self.exploration_fraction)
        #mean = torch.zeros(self.ACTION_SIZE)
        ##std = self.epsilon * torch.ones(self.ACTION_SIZE)
        #self.noise = DiagGaussian(mean, std)

    def act(self, obs: np.ndarray, explore: bool):
        """Returns an action (should be called at every timestep)

        **YOU MUST IMPLEMENT THIS FUNCTION FOR Q4**

        When explore is False you should select the best action possible (greedy). However, during exploration,
        you should be implementing exporation using the self.noise variable that you should have declared in the __init__.
        Use schedule_hyperparameters() for any hyperparameters that you want to change over time.

        :param obs (np.ndarray): observation vector from the environment
        :param explore (bool): flag indicating whether we should explore
        :return (sample from self.action_space): action the agent should perform
        """
        ### I PUT MY CODE HERE - beginning ###
        #print(123)
        
        if explore:
            return (self.actor(torch.tensor(obs))+self.noise.sample()).detach().numpy()
        return self.actor(torch.tensor(obs)).detach().numpy()
        #raise NotImplementedError("Needed for Q4")
        ### I PUT MY CODE HERE - end ###
    def update(self, batch: Transition) -> Dict[str, float]:
        #critic network update
        self.critic_optim.zero_grad()
        critic_target_input=torch.cat((batch[2],self.actor_target(batch[2])),1)
        y_batch=batch[3]+self.gamma*(1-batch[4])*self.critic_target(critic_target_input)
        critic_input=torch.cat((batch[0], batch[1]),1)
        x_batch=self.critic(critic_input)
        loss=torch.nn.MSELoss()
        loss=loss(y_batch,x_batch)
        loss.backward()
        q_loss=float(loss.detach().numpy())
        self.critic_optim.step()
        for parameter in self.critic.parameters():
            parameter.requires_grad=False

        #actor network update
        self.policy_optim.zero_grad()
        critic_input=torch.cat((batch[0], self.actor(batch[0])),1)
        critic_batch=self.critic(critic_input)
        loss=-torch.mean(critic_batch)
        p_loss=float(loss.detach().numpy())
        loss.backward()
        self.policy_optim.step()
        for parameter in self.critic.parameters():
            parameter.requires_grad=True
        self.actor_target.soft_update(self.actor, tau=self.tau)
        self.critic_target.soft_update(self
                                       .critic, tau=self.tau)
        return {"p_loss":p_loss, "q_loss":q_loss}


In [19]:

licznik=0
RENDER = True # FALSE FOR FASTER TRAINING / TRUE TO VISUALIZE ENVIRONMENT DURING EVALUATION
SWEEP = False # TRUE TO SWEEP OVER POSSIBLE HYPERPARAMETER CONFIGURATIONS
NUM_SEEDS_SWEEP = 10 # NUMBER OF SEEDS TO USE FOR EACH HYPERPARAMETER CONFIGURATION
SWEEP_SAVE_RESULTS = True # TRUE TO SAVE SWEEP RESULTS TO A FILE
SWEEP_SAVE_ALL_WEIGTHS = True # TRUE TO SAVE ALL WEIGHTS FROM EACH SEED
ENV = "BIPEDAL" #"PENDULUM" OR "BIPEDAL"

PENDULUM_CONFIG = {
    "eval_freq": 2000,
    "eval_episodes": 3,
    "policy_learning_rate": 1e-3,
    "critic_learning_rate": 1e-3,
    "critic_hidden_size": [64, 64],
    "policy_hidden_size": [64, 64],
    "tau": 0.01,
    "batch_size": 64,
    "buffer_capacity": int(1e6),
}
PENDULUM_CONFIG.update(PENDULUM_CONSTANTS)

BIPEDAL_CONFIG = {
    "eval_freq": 2000,
    "critic_hidden_size": [256, 236],
    "policy_hidden_size": [128, 128],
}
BIPEDAL_CONFIG.update(BIPEDAL_CONSTANTS)

### INCLUDE YOUR CHOICE OF HYPERPARAMETERS HERE ###
BIPEDAL_HPARAMS = {
    "critic_hidden_size": [[256,236]],
    "policy_hidden_size": [[128,128]]
    }

SWEEP_RESULTS_FILE_BIPEDAL = "DDPG-Bipedal-sweep-results-ex4.pkl"


def play_episode(
        env,
        agent,
        replay_buffer,
        train=True,
        explore=True,
        render=False,
        max_steps=200,
        batch_size=64,
):

    ep_data = defaultdict(list)
    obs = env.reset()
    done = False
    if render:
        env.render()

    episode_timesteps = 0
    episode_return = 0

    while not done:
        action = agent.act(obs, explore=explore)
        nobs, reward, done, _ = env.step(action)
        if train:
            replay_buffer.push(
                np.array(obs, dtype=np.float32),
                np.array(action, dtype=np.float32),
                np.array(nobs, dtype=np.float32),
                np.array([reward], dtype=np.float32),
                np.array([done], dtype=np.float32),
            )
            if len(replay_buffer) >= batch_size:
                batch = replay_buffer.sample(batch_size)
                new_data = agent.update(batch)
                for k, v in new_data.items():
                    ep_data[k].append(v)

        episode_timesteps += 1
        episode_return += reward

        if render:
            env.render()

        if max_steps == episode_timesteps:
            break
        obs = nobs

    return episode_timesteps, episode_return, ep_data


def train(env: gym.Env, config, output: bool = True) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]:
    """
    Execute training of DDPG on given environment using the provided configuration

    :param env (gym.Env): environment to train on
    :param config: configuration dictionary mapping configuration keys to values
    :param output (bool): flag whether evaluation results should be printed
    :return (Tuple[List[float], List[float]]): eval returns during training, times of evaluation
    """
    timesteps_elapsed = 0

    agent = DDPG(
        action_space=env.action_space, observation_space=env.observation_space, **config
    )
    replay_buffer = ReplayBuffer(config["buffer_capacity"])

    eval_returns_all = []
    eval_timesteps_all = []
    eval_times_all = []
    run_data = defaultdict(list)

    start_time = time.time()
    with tqdm(total=config["max_timesteps"]) as pbar:
        while timesteps_elapsed < config["max_timesteps"]:
            elapsed_seconds = time.time() - start_time
            if elapsed_seconds > config["max_time"]:
                pbar.write(f"Training ended after {elapsed_seconds}s.")
                break

            agent.schedule_hyperparameters(timesteps_elapsed, config["max_timesteps"])
            episode_timesteps, ep_return, ep_data = play_episode(
                env,
                agent,
                replay_buffer,
                train=True,
                explore=True,
                render=False,
                max_steps=config["episode_length"],
                batch_size=config["batch_size"],
            )
            timesteps_elapsed += episode_timesteps
            pbar.update(episode_timesteps)
            for k, v in ep_data.items():
                run_data[k].extend(v)
            run_data["train_ep_returns"].append(ep_return)

            if timesteps_elapsed % config["eval_freq"] < episode_timesteps:
                eval_returns = 0
                for _ in range(config["eval_episodes"]):
                    _, episode_return, _ = play_episode(
                        env,
                        agent,
                        replay_buffer,
                        train=False,
                        explore=False,
                        render=RENDER,
                        max_steps=config["episode_length"],
                        batch_size=config["batch_size"],
                    )
                    eval_returns += episode_return / config["eval_episodes"]
                if output:
                    pbar.write(
                        f"Evaluation at timestep {timesteps_elapsed} returned a mean returns of {eval_returns}"
                    )
                    # pbar.write(f"Epsilon = {agent.epsilon}")
                eval_returns_all.append(eval_returns)
                eval_timesteps_all.append(timesteps_elapsed)
                eval_times_all.append(time.time() - start_time)
                if eval_returns >= config["target_return"]:
                    pbar.write(
                        f"Reached return {eval_returns} >= target return of {config['target_return']}"
                    )
                    break

    if config["save_filename"]:
        print("Saving to: ", agent.save(config["save_filename"]))

    return np.array(eval_returns_all), np.array(eval_timesteps_all), np.array(eval_times_all), run_data




if ENV == "PENDULUM":
    CONFIG = PENDULUM_CONFIG
    HPARAMS_SWEEP = None # Not required for assignment
    SWEEP_RESULTS_FILE = None # Not required for assignment
elif ENV == "BIPEDAL":
    CONFIG = BIPEDAL_CONFIG
    HPARAMS_SWEEP = BIPEDAL_HPARAMS
    SWEEP_RESULTS_FILE = SWEEP_RESULTS_FILE_BIPEDAL
else:
    raise(ValueError(f"Unknown environment {ENV}"))

env = gym.make(CONFIG["env"])

if SWEEP and HPARAMS_SWEEP is not None:
    config_list, swept_params = generate_hparam_configs(CONFIG, HPARAMS_SWEEP)
    results = []
    for config in config_list:
        run = Run(config)
        hparams_values = '_'.join([':'.join([key, str(config[key])]) for key in swept_params])
        run.run_name = hparams_values
        print(f"\nStarting new run...")
        for i in range(NUM_SEEDS_SWEEP):
            print(f"\nTraining iteration: {i+1}/{NUM_SEEDS_SWEEP}")
            run_save_filename = '--'.join([run.config["algo"], run.config["env"], hparams_values, str(i)])
            if SWEEP_SAVE_ALL_WEIGTHS:
                run.set_save_filename(run_save_filename)
            eval_returns, eval_timesteps, times, run_data = train(env, run.config, output=False)
            run.update(eval_returns, eval_timesteps, times, run_data)
        results.append(copy.deepcopy(run))
        print(f"Finished run with hyperparameters {hparams_values}. "
              f"Mean final score: {run.final_return_mean} +- {run.final_return_ste}")

    if SWEEP_SAVE_RESULTS:
        print(f"Saving results to {SWEEP_RESULTS_FILE}")
        with open(SWEEP_RESULTS_FILE, 'wb') as f:
            pickle.dump(results, f)

else:
    _ = train(env, CONFIG)

env.close()


  3%|█▍                                                         | 10011/400000 [00:50<30:26, 213.57it/s]

Evaluation at timestep 10011 returned a mean returns of -2.0219146147642695


 25%|██████████████▌                                           | 100850/400000 [08:59<25:09, 198.11it/s]

Evaluation at timestep 100850 returned a mean returns of -2.8383429143561143


400118it [34:45, 191.86it/s]                                                                            

Evaluation at timestep 400118 returned a mean returns of -2.522776103474226
Saving to:  bipedal_q4_latest.pt





In [20]:
for _ in range(2):
    _, episode_return, _ = play_episode(
        env,
        agent,
        replay_buffer,
        train=False,
        explore=False,
        render=RENDER,
        max_steps=config["episode_length"],
        batch_size=config["batch_size"],
    )

NameError: name 'agent' is not defined