From 9c0658392d26204b4cdcbe4175605c1914bd96a4 Mon Sep 17 00:00:00 2001 From: Donal Date: Wed, 24 Jun 2020 07:56:24 +0100 Subject: [PATCH 01/14] Updated RL docs with latest models --- docs/source/rl.rst | 152 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 138 insertions(+), 14 deletions(-) diff --git a/docs/source/rl.rst b/docs/source/rl.rst index 3f06f01f96..a90348d623 100644 --- a/docs/source/rl.rst +++ b/docs/source/rl.rst @@ -13,9 +13,8 @@ Contributions by: `Donal Byrne `_ - Double DQN - Dueling DQN - Noisy DQN -- Prioritized Experience Replay DQN - NStep DQN -- Noisy DQN +- Prioritized Experience Replay DQN - Reinforce - Policy Gradient @@ -34,49 +33,174 @@ Original implementation by: `Donal Byrne `_ Example: - >>> from pl_bolts.models.rl import DQN + >>> from pl_bolts.models.rl.dqn.model import DQN ... - >>> dqn = DQN() + >>> dqn = DQN("PongNoFrameskip-v4") Train:: trainer = Trainer() trainer.fit(dqn) -.. autoclass:: pl_bolts.models.rl.DQN +.. autoclass:: pl_bolts.models.rl.dqn.model.DQN :noindex: Double DQN ^^^^^^^^^^^^^^^^^^^^ -Double DQN model introduced in TODO -Paper authors: TODO +Double DQN model introduced in `Deep Reinforcement Learning with Double Q-learning `_ +Paper authors: Hado van Hasselt, Arthur Guez, David Silver Original implementation by: `Donal Byrne `_ Example:: - >>> from pl_bolts.models.rl import DoubleDQN + >>> from pl_bolts.models.rl.double_dqn.model import DoubleDQN ... - >>> ddqn = DoubleDQN() + >>> ddqn = DoubleDQN("PongNoFrameskip-v4") Train:: trainer = Trainer() trainer.fit(ddqn) -.. autoclass:: pl_bolts.models.rl.DoubleDQN +.. autoclass:: pl_bolts.models.rl.double_dqn.model.DoubleDQN + :noindex: + +Dueling DQN +^^^^^^^^^^^^^^^^^^^^ +Dueling DQN model introduced in `Dueling Network Architectures for Deep Reinforcement Learning `_ +Paper authors: Ziyu Wang, Tom Schaul, Matteo Hessel, Hado van Hasselt, Marc Lanctot, Nando de Freitas + +Original implementation by: `Donal Byrne `_ + +Example:: + + >>> from pl_bolts.models.rl.dueling_dqn.model import DuelingDQN + ... + >>> dueling_dqn = DuelingDQN("PongNoFrameskip-v4") + +Train:: + + trainer = Trainer() + trainer.fit(dueling_dqn) + +.. autoclass:: pl_bolts.models.rl.dueling_dqn.model.DuelingDQN + :noindex: + +Noisy DQN +^^^^^^^^^^^^^^^^^^^^ +Noisy DQN model introduced in `Noisy Networks for Exploration `_ +Paper authors: Meire Fortunato, Mohammad Gheshlaghi Azar, Bilal Piot, Jacob Menick, Ian Osband, Alex Graves, +Vlad Mnih, Remi Munos, Demis Hassabis, Olivier Pietquin, Charles Blundell, Shane Legg + +Original implementation by: `Donal Byrne `_ + +Example:: + + >>> from pl_bolts.models.rl.noisy_dqn.model import NoisyDQN + ... + >>> noisy_dqn = NoisyDQN("PongNoFrameskip-v4") + +Train:: + + trainer = Trainer() + trainer.fit(noisy_dqn) + +.. autoclass:: pl_bolts.models.rl.noisy_dqn.model.NoisyDQN :noindex: + +N-Step DQN +^^^^^^^^^^^^^^^^^^^^ +N-Step DQN model introduced in `Learning to Predict by the Methods of Temporal Differences `_ +Paper authors: Richard S. Sutton + +Original implementation by: `Donal Byrne `_ + +Example:: + + >>> from pl_bolts.models.rl.n_step_dqn.model import NStepDQN + ... + >>> n_step_dqn = NStepDQN("PongNoFrameskip-v4") + +Train:: + + trainer = Trainer() + trainer.fit(n_step_dqn) + +.. 
autoclass:: pl_bolts.models.rl.n_step_dqn.model.NStepDQN + :noindex: + + +Prioritized Experience Replay DQN +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Double DQN model introduced in `Prioritized Experience Replay `_ +Paper authors: Tom Schaul, John Quan, Ioannis Antonoglou, David Silver + +Original implementation by: `Donal Byrne `_ + +Example:: + + >>> from pl_bolts.models.rl.per_dqn.model import PERDQN + ... + >>> per_dqn = PERDQN("PongNoFrameskip-v4") + +Train:: + + trainer = Trainer() + trainer.fit(per_dqn) + +.. autoclass:: pl_bolts.models.rl.per_dqn.model.PERDQN + :noindex: + + + -------------- Policy Gradient Models ---------------------- The following models are based on Policy gradient -Policy Gradient -^^^^^^^^^^^^^^^ -TODO: add description +REINFORCE +^^^^^^^^^^^^^^^^^^^^ +REINFORCE model introduced in `Policy Gradient Methods For Reinforcement Learning With Function Approximation `_ +Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour + +Original implementation by: `Donal Byrne `_ + +Example:: + + >>> from pl_bolts.models.rl.reinforce.model import Reinforce + ... + >>> reinforce = Reinforce("CartPole-v0") + +Train:: + + trainer = Trainer() + trainer.fit(reinforce) + +.. autoclass:: pl_bolts.models.rl.reinforce.model.Reinforce + :noindex: + + +Vanilla Policy Gradient +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Vanilla Policy Gradient model introduced in `Policy Gradient Methods For Reinforcement Learning With Function Approximation `_ +Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour + +Original implementation by: `Donal Byrne `_ + +Example:: + + >>> from pl_bolts.models.rl.vpg.model import PolicyGradient + ... + >>> vpg = PolicyGradient("CartPole-v0") + +Train:: + + trainer = Trainer() + trainer.fit(vpg) -.. autoclass:: pl_bolts.models.rl.PolicyGradient +.. 
autoclass:: pl_bolts.models.rl.vanilla_policy_gradient.model.PolicyGradient :noindex: From 86b0dee32ccbe387886d0ee9c914a92107481984 Mon Sep 17 00:00:00 2001 From: Donal Byrne Date: Fri, 3 Jul 2020 10:50:46 +0100 Subject: [PATCH 02/14] Added POC for train_batch interface when populating RL datasets What Changed: - Custom train_batch method in VPG model - This generates a batch of data at each time step - Experience source no longer gets initialized with a device, instead the correct device is passed to the step() method in the train_batch function - Moved experience methods from rl.comon to datamodules --- CHANGELOG.md | 3 + pl_bolts/datamodules/__init__.py | 2 + pl_bolts/datamodules/experience_source.py | 200 ++++++++++++++++++ pl_bolts/models/rl/__init__.py | 2 +- pl_bolts/models/rl/common/experience.py | 45 ++-- .../rl/vanilla_policy_gradient_model.py | 178 ++++++---------- tests/datamodules/test_experience_sources.py | 153 ++++++++++++++ tests/models/test_rl/unit/test_experience.py | 22 +- tests/models/test_rl/unit/test_vpg.py | 33 ++- 9 files changed, 486 insertions(+), 152 deletions(-) create mode 100644 pl_bolts/datamodules/experience_source.py create mode 100644 tests/datamodules/test_experience_sources.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 604ea16ca5..e2c3a30716 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Device is no longer set in the DQN model init - Moved RL loss function to the losses module +- Moved rl.common.experience to datamodules +- train_batch function to VPG model to generate batch of data at each step (POC) +- Experience source no longer gets initialized with a device, instead the device is passed at each step() ### Fixed diff --git a/pl_bolts/datamodules/__init__.py b/pl_bolts/datamodules/__init__.py index f867284bb6..8e88a63b89 100644 --- a/pl_bolts/datamodules/__init__.py +++ b/pl_bolts/datamodules/__init__.py @@ -6,3 +6,5 @@ from pl_bolts.datamodules.ssl_imagenet_datamodule import SSLImagenetDataModule from pl_bolts.datamodules.stl10_datamodule import STL10DataModule from pl_bolts.datamodules.fashion_mnist_datamodule import FashionMNISTDataModule +from pl_bolts.datamodules.experience_source import ExperienceSourceDataset, ExperienceSource, \ + NStepExperienceSource, EpisodicExperienceStream diff --git a/pl_bolts/datamodules/experience_source.py b/pl_bolts/datamodules/experience_source.py new file mode 100644 index 0000000000..f3c9b69a5f --- /dev/null +++ b/pl_bolts/datamodules/experience_source.py @@ -0,0 +1,200 @@ +""" +Datamodules for RL models that rely on experiences generated during training + +Based on implementations found here: https://github.com/Shmuma/ptan/blob/master/ptan/experience.py +""" +from collections import deque +from typing import Iterable, Callable, Tuple, List +import numpy as np +import torch +from gym import Env +from torch.utils.data import IterableDataset + +# Datasets +from pl_bolts.models.rl.common.agents import Agent +from pl_bolts.models.rl.common.memory import Experience + + +class ExperienceSourceDataset(IterableDataset): + """ + Basic experience source dataset. Takes a generate_batch function that returns an iterator. 
+ The logic for the experience source and how the batch is generated is defined the Lightning model itself + """ + + def __init__(self, generate_batch: Callable): + self.generate_batch = generate_batch + + def __iter__(self) -> Iterable: + iterator = self.generate_batch() + return iterator + +# Experience Sources + +class ExperienceSource: + """ + Basic single step experience source + + Args: + env: Environment that is being used + agent: Agent being used to make decisions + """ + + def __init__(self, env: Env, agent: Agent): + self.env = env + self.agent = agent + self.state = self.env.reset() + + def _reset(self) -> None: + """resets the env and state""" + self.state = self.env.reset() + + def step(self, device: torch.device) -> Tuple[Experience, float, bool]: + """Takes a single step through the environment""" + action = self.agent(self.state, device) + new_state, reward, done, _ = self.env.step(action) + experience = Experience( + state=self.state, + action=action, + reward=reward, + new_state=new_state, + done=done, + ) + self.state = new_state + + if done: + self.state = self.env.reset() + + return experience, reward, done + + def run_episode(self, device: torch.device) -> float: + """Carries out a single episode and returns the total reward. This is used for testing""" + done = False + total_reward = 0 + + while not done: + _, reward, done = self.step(device) + total_reward += reward + + return total_reward + + +class NStepExperienceSource(ExperienceSource): + """Expands upon the basic ExperienceSource by collecting experience across N steps""" + + def __init__(self, env: Env, agent: Agent, n_steps: int = 1, gamma: float = 0.99): + super().__init__(env, agent) + self.gamma = gamma + self.n_steps = n_steps + self.n_step_buffer = deque(maxlen=n_steps) + + def step(self, device: torch.device) -> Tuple[Experience, float, bool]: + """ + Takes an n-step in the environment + + Returns: + Experience + """ + exp = self.single_step(device) + + while len(self.n_step_buffer) < self.n_steps: + self.single_step(device) + + reward, next_state, done = self.get_transition_info() + first_experience = self.n_step_buffer[0] + multi_step_experience = Experience( + first_experience.state, first_experience.action, reward, done, next_state + ) + + return multi_step_experience, exp.reward, exp.done + + def single_step(self, device: torch.device) -> Experience: + """ + Takes a single step in the environment and appends it to the n-step buffer + + Returns: + Experience + """ + exp, _, _ = super().step(device) + self.n_step_buffer.append(exp) + return exp + + def get_transition_info(self) -> Tuple[np.float, np.array, np.int]: + """ + get the accumulated transition info for the n_step_buffer + Args: + gamma: discount factor + + Returns: + multi step reward, final observation and done + """ + last_experience = self.n_step_buffer[-1] + final_state = last_experience.new_state + done = last_experience.done + reward = last_experience.reward + + # calculate reward + # in reverse order, go through all the experiences up till the first experience + for experience in reversed(list(self.n_step_buffer)[:-1]): + reward_t = experience.reward + new_state_t = experience.new_state + done_t = experience.done + + reward = reward_t + self.gamma * reward * (1 - done_t) + final_state, done = (new_state_t, done_t) if done_t else (final_state, done) + + return reward, final_state, done + + +class EpisodicExperienceStream(ExperienceSource, IterableDataset): + """ + Basic experience stream that iteratively yield the current experience 
of the agent in the env + + Args: + env: Environmen that is being used + agent: Agent being used to make decisions + """ + + def __init__(self, env: Env, agent: Agent, device: torch.device, episodes: int = 1): + super().__init__(env, agent) + self.episodes = episodes + self.device = device + + def __getitem__(self, item): + return item + + def __iter__(self) -> List[Experience]: + """ + Plays a step through the environment until the episode is complete + + Returns: + Batch of all transitions for the entire episode + """ + episode_steps, batch = [], [] + + while len(batch) < self.episodes: + exp = self.step(self.device) + episode_steps.append(exp) + + if exp.done: + batch.append(episode_steps) + episode_steps = [] + + yield batch + + def step(self, device: torch.device) -> Experience: + """Carries out a single step in the environment""" + action = self.agent(self.state, device) + new_state, reward, done, _ = self.env.step(action) + experience = Experience( + state=self.state, + action=action, + reward=reward, + new_state=new_state, + done=done, + ) + self.state = new_state + + if done: + self.state = self.env.reset() + + return experience diff --git a/pl_bolts/models/rl/__init__.py b/pl_bolts/models/rl/__init__.py index f70f6bd959..f5a47f8af4 100644 --- a/pl_bolts/models/rl/__init__.py +++ b/pl_bolts/models/rl/__init__.py @@ -5,4 +5,4 @@ from pl_bolts.models.rl.noisy_dqn_model import NoisyDQN from pl_bolts.models.rl.per_dqn_model import PERDQN from pl_bolts.models.rl.reinforce_model import Reinforce -from pl_bolts.models.rl.vanilla_policy_gradient_model import PolicyGradient +# from pl_bolts.models.rl.vanilla_policy_gradient_model import PolicyGradient diff --git a/pl_bolts/models/rl/common/experience.py b/pl_bolts/models/rl/common/experience.py index e58251bcde..419ddbc046 100644 --- a/pl_bolts/models/rl/common/experience.py +++ b/pl_bolts/models/rl/common/experience.py @@ -1,11 +1,15 @@ """Experience sources to be used as datasets for Ligthning DataLoaders Based on implementations found here: https://github.com/Shmuma/ptan/blob/master/ptan/experience.py + +..note:: Deprecated, these functions have been moved to pl_bolts.datamodules.experience_source.py + """ from collections import deque from typing import List, Tuple import numpy as np +import torch from gym import Env from torch.utils.data import IterableDataset @@ -74,19 +78,18 @@ class ExperienceSource: agent: Agent being used to make decisions """ - def __init__(self, env: Env, agent: Agent, device): + def __init__(self, env: Env, agent: Agent): self.env = env self.agent = agent self.state = self.env.reset() - self.device = device def _reset(self) -> None: """resets the env and state""" self.state = self.env.reset() - def step(self) -> Tuple[Experience, float, bool]: + def step(self, device: torch.device) -> Tuple[Experience, float, bool]: """Takes a single step through the environment""" - action = self.agent(self.state, self.device) + action = self.agent(self.state, device) new_state, reward, done, _ = self.env.step(action) experience = Experience( state=self.state, @@ -102,13 +105,13 @@ def step(self) -> Tuple[Experience, float, bool]: return experience, reward, done - def run_episode(self) -> float: + def run_episode(self, device: torch.device) -> float: """Carries out a single episode and returns the total reward. 
This is used for testing""" done = False total_reward = 0 while not done: - _, reward, done = self.step() + _, reward, done = self.step(device) total_reward += reward return total_reward @@ -117,22 +120,23 @@ def run_episode(self) -> float: class NStepExperienceSource(ExperienceSource): """Expands upon the basic ExperienceSource by collecting experience across N steps""" - def __init__(self, env: Env, agent: Agent, device, n_steps: int = 1): - super().__init__(env, agent, device) + def __init__(self, env: Env, agent: Agent, n_steps: int = 1, gamma: float = 0.99): + super().__init__(env, agent) + self.gamma = gamma self.n_steps = n_steps self.n_step_buffer = deque(maxlen=n_steps) - def step(self) -> Tuple[Experience, float, bool]: + def step(self, device: torch.device) -> Tuple[Experience, float, bool]: """ Takes an n-step in the environment Returns: Experience """ - exp = self.single_step() + exp = self.single_step(device) while len(self.n_step_buffer) < self.n_steps: - self.single_step() + self.single_step(device) reward, next_state, done = self.get_transition_info() first_experience = self.n_step_buffer[0] @@ -142,18 +146,18 @@ def step(self) -> Tuple[Experience, float, bool]: return multi_step_experience, exp.reward, exp.done - def single_step(self) -> Experience: + def single_step(self, device: torch.device) -> Experience: """ Takes a single step in the environment and appends it to the n-step buffer Returns: Experience """ - exp, _, _ = super().step() + exp, _, _ = super().step(device) self.n_step_buffer.append(exp) return exp - def get_transition_info(self, gamma=0.9) -> Tuple[np.float, np.array, np.int]: + def get_transition_info(self) -> Tuple[np.float, np.array, np.int]: """ get the accumulated transition info for the n_step_buffer Args: @@ -174,7 +178,7 @@ def get_transition_info(self, gamma=0.9) -> Tuple[np.float, np.array, np.int]: new_state_t = experience.new_state done_t = experience.done - reward = reward_t + gamma * reward * (1 - done_t) + reward = reward_t + self.gamma * reward * (1 - done_t) final_state, done = (new_state_t, done_t) if done_t else (final_state, done) return reward, final_state, done @@ -189,9 +193,10 @@ class EpisodicExperienceStream(ExperienceSource, IterableDataset): agent: Agent being used to make decisions """ - def __init__(self, env: Env, agent: Agent, device, episodes: int = 1): - super().__init__(env, agent, device) + def __init__(self, env: Env, agent: Agent, device: torch.device, episodes: int = 1): + super().__init__(env, agent) self.episodes = episodes + self.device = device def __getitem__(self, item): return item @@ -206,7 +211,7 @@ def __iter__(self) -> List[Experience]: episode_steps, batch = [], [] while len(batch) < self.episodes: - exp = self.step() + exp = self.step(self.device) episode_steps.append(exp) if exp.done: @@ -215,9 +220,9 @@ def __iter__(self) -> List[Experience]: yield batch - def step(self) -> Experience: + def step(self, device: torch.device) -> Experience: """Carries out a single step in the environment""" - action = self.agent(self.state, self.device) + action = self.agent(self.state, device) new_state, reward, done, _ = self.env.step(action) experience = Experience( state=self.state, diff --git a/pl_bolts/models/rl/vanilla_policy_gradient_model.py b/pl_bolts/models/rl/vanilla_policy_gradient_model.py index 1b938f2c77..4349ae1816 100644 --- a/pl_bolts/models/rl/vanilla_policy_gradient_model.py +++ b/pl_bolts/models/rl/vanilla_policy_gradient_model.py @@ -5,7 +5,6 @@ import argparse from collections import OrderedDict 
from copy import deepcopy -from itertools import chain from typing import Tuple, List import gym @@ -17,10 +16,9 @@ from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader +from pl_bolts.datamodules.experience_source import ExperienceSourceDataset, NStepExperienceSource from pl_bolts.models.rl.common import cli from pl_bolts.models.rl.common.agents import PolicyAgent -from pl_bolts.models.rl.common.experience import EpisodicExperienceStream -from pl_bolts.models.rl.common.memory import Experience from pl_bolts.models.rl.common.networks import MLP from pl_bolts.models.rl.common.wrappers import ToTensor @@ -81,22 +79,26 @@ def __init__(self, env: str, gamma: float = 0.99, lr: float = 1e-4, batch_size: self.build_networks() self.agent = PolicyAgent(self.net) + self.source = NStepExperienceSource(env=self.env, agent=self.agent, n_steps=10) self.gamma = gamma self.lr = lr self.batch_size = batch_size self.batch_episodes = batch_episodes + self.entropy_beta = entropy_beta + self.baseline = 0 + + # Metrics + self.reward_sum = 0 + self.env_steps = 0 + self.total_steps = 0 self.total_reward = 0 - self.episode_reward = 0 self.episode_count = 0 - self.episode_steps = 0 - self.total_episode_steps = 0 - self.entropy_beta = entropy_beta self.reward_list = [] for _ in range(100): - self.reward_list.append(0) + self.reward_list.append(torch.tensor(0)) self.avg_reward = 0 def build_networks(self) -> None: @@ -140,83 +142,9 @@ def calc_qvals(self, rewards: List[Tensor]) -> List[Tensor]: mean_q = sum_q / len(res) return [q - mean_q for q in res] - def process_batch( - self, batch: List[List[Experience]] - ) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]: - """ - Takes in a batch of episodes and retrieves the q vals, the states and the actions for the batch - - Args: - batch: list of episodes, each containing a list of Experiences - - Returns: - q_vals, states and actions used for calculating the loss - """ - # get outputs for each episode - batch_rewards, batch_states, batch_actions = [], [], [] - for episode in batch: - ep_rewards, ep_states, ep_actions = [], [], [] - - # log the outputs for each step - for step in episode: - ep_rewards.append(step[2].float()) - ep_states.append(step[0]) - ep_actions.append(step[1]) - - # add episode outputs to the batch - batch_rewards.append(ep_rewards) - batch_states.append(ep_states) - batch_actions.append(ep_actions) - - # get qvals - batch_qvals = [] - for reward in batch_rewards: - batch_qvals.append(self.calc_qvals(reward)) - - # flatten the batched outputs - batch_actions, batch_qvals, batch_rewards, batch_states = self.flatten_batch( - batch_actions, batch_qvals, batch_rewards, batch_states - ) - - return batch_qvals, batch_states, batch_actions, batch_rewards - - @staticmethod - def flatten_batch( - batch_actions: List[List[Tensor]], - batch_qvals: List[List[Tensor]], - batch_rewards: List[List[Tensor]], - batch_states: List[List[Tuple[Tensor, Tensor]]], - ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: - """ - Takes in the outputs of the processed batch and flattens the several episodes into a single tensor for each - batched output - - Args: - batch_actions: actions taken in each batch episodes - batch_qvals: Q vals for each batch episode - batch_rewards: reward for each batch episode - batch_states: states for each batch episodes - - Returns: - The input batched results flattend into a single tensor - """ - # flatten all episode steps into a single list - batch_qvals = list(chain.from_iterable(batch_qvals)) - batch_states = 
list(chain.from_iterable(batch_states)) - batch_actions = list(chain.from_iterable(batch_actions)) - batch_rewards = list(chain.from_iterable(batch_rewards)) - - # stack steps into single tensor and remove extra dimension - batch_qvals = torch.stack(batch_qvals).squeeze() - batch_states = torch.stack(batch_states).squeeze() - batch_actions = torch.stack(batch_actions).squeeze() - batch_rewards = torch.stack(batch_rewards).squeeze() - - return batch_actions, batch_qvals, batch_rewards, batch_states - def loss( self, - batch_qvals: List[Tensor], + batch_scales: List[Tensor], batch_states: List[Tensor], batch_actions: List[Tensor], ) -> torch.Tensor: @@ -225,7 +153,7 @@ def loss( been flattend into a single tensor. Args: - batch_qvals: current mini batch of q values + batch_scales: current mini batch of rewards minus the baseline batch_actions: current batch of actions batch_states: current batch of states @@ -235,7 +163,7 @@ def loss( logits = self.net(batch_states) log_prob, policy_loss = self.calc_policy_loss( - batch_actions, batch_qvals, batch_states, logits + batch_actions, batch_scales, batch_states, logits ) entropy_loss_v = self.calc_entropy_loss(log_prob, logits) @@ -281,6 +209,49 @@ def calc_policy_loss( policy_loss = -log_prob_actions.mean() return log_prob, policy_loss + def train_batch(self) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: + """ + Contains the logic for generating a new batch of data to be passed to the DataLoader + + Returns: + yields a tuple of Lists containing tensors for states, actions and rewards of the batch. + """ + states = [] + actions = [] + scales = [] + + for _ in range(self.batch_size): + + # take a step in the env + exp, reward, done = self.source.step(self.device) + self.env_steps += 1 + self.total_steps += 1 + + # update the baseline + self.reward_sum += exp.reward + self.baseline = self.reward_sum / self.total_steps + + # gather the experience data + states.append(exp.new_state) + actions.append(exp.action) + scales.append(exp.reward - self.baseline) + + self.total_reward += reward + + if done: + # tracking metrics + self.episode_count += 1 + self.reward_list.append(self.total_reward) + self.avg_reward = sum(self.reward_list[-100:]) / 100 + + self.logger.experiment.add_scalar("reward", self.total_reward, self.total_steps) + + # reset metrics + self.total_reward = 0 + self.env_steps = 0 + + yield from zip(states, actions, scales) + def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict: """ Carries out a single step through the environment to update the replay buffer. 
@@ -293,45 +264,26 @@ def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedD Returns: Training loss and log metrics """ - device = self.get_device(batch) - - batch_qvals, batch_states, batch_actions, batch_rewards = self.process_batch( - batch - ) - - # get avg reward over the batched episodes - self.episode_reward = sum(batch_rewards) / len(batch) - self.reward_list.append(self.episode_reward) - self.avg_reward = sum(self.reward_list) / len(self.reward_list) + states, actions, scales = batch # calculates training loss - loss = self.loss(batch_qvals, batch_states, batch_actions) + loss = self.loss(scales, states, actions) if self.trainer.use_dp or self.trainer.use_ddp2: loss = loss.unsqueeze(0) - self.episode_count += self.batch_episodes - log = { - "episode_reward": torch.tensor(self.episode_reward).to(device), "train_loss": loss, - "avg_reward": self.avg_reward, - } - status = { - "steps": torch.tensor(self.global_step).to(device), - "episode_reward": torch.tensor(self.episode_reward).to(device), - "episodes": torch.tensor(self.episode_count), - "avg_reward": self.avg_reward, + "avg_reward": self.avg_reward, + "episode_count": self.episode_count, + "baseline": self.baseline } - self.episode_reward = 0 - return OrderedDict( { "loss": loss, - "reward": self.avg_reward, "log": log, - "progress_bar": status, + "progress_bar": log } ) @@ -342,10 +294,8 @@ def configure_optimizers(self) -> List[Optimizer]: def _dataloader(self) -> DataLoader: """Initialize the Replay Buffer dataset used for retrieving experiences""" - dataset = EpisodicExperienceStream( - self.env, self.agent, self.device, episodes=self.batch_episodes - ) - dataloader = DataLoader(dataset=dataset) + dataset = ExperienceSourceDataset(self.train_batch) + dataloader = DataLoader(dataset=dataset, batch_size=self.batch_size) return dataloader def train_dataloader(self) -> DataLoader: diff --git a/tests/datamodules/test_experience_sources.py b/tests/datamodules/test_experience_sources.py new file mode 100644 index 0000000000..18199348bf --- /dev/null +++ b/tests/datamodules/test_experience_sources.py @@ -0,0 +1,153 @@ +from unittest import TestCase +from unittest.mock import Mock +import numpy as np +import gym +import torch +from torch.utils.data import DataLoader + +from pl_bolts.datamodules.experience_source import ExperienceSourceDataset, ExperienceSource, NStepExperienceSource, \ + EpisodicExperienceStream +from pl_bolts.models.rl.common.agents import Agent +from pl_bolts.models.rl.common.memory import Experience + + +class DummyAgent(Agent): + def __call__(self, states, agent_states): + return 0 + +class TestExperienceSourceDataset(TestCase): + + def train_batch(self): + return iter([i for i in range(100)]) + + def test_iterator(self): + source = ExperienceSourceDataset(self.train_batch) + batch_size = 10 + data_loader = DataLoader(source, batch_size=batch_size) + + for idx, batch in enumerate(data_loader): + self.assertEqual(len(batch), batch_size) + self.assertEqual(batch[0], 0) + self.assertEqual(batch[5], 5) + break + +class TestExperienceSource(TestCase): + + def setUp(self) -> None: + self.net = Mock() + self.agent = DummyAgent(net=self.net) + self.env = gym.make("CartPole-v0") + self.device = torch.device('cpu') + self.source = ExperienceSource(self.env, self.agent) + + def test_step(self): + exp, reward, done = self.source.step(self.device) + self.assertEqual(len(exp), 5) + + def test_episode(self): + total_reward = self.source.run_episode(self.device) + self.assertIsInstance(total_reward, 
float) + + +class TestNStepExperienceSource(TestCase): + + def setUp(self) -> None: + self.net = Mock() + self.agent = DummyAgent(net=self.net) + self.env = gym.make("CartPole-v0") + self.n_step = 2 + self.source = NStepExperienceSource(self.env, self.agent, n_steps=self.n_step) + self.device = torch.device('cpu') + + self.state = np.zeros([32, 32]) + self.state_02 = np.ones([32, 32]) + self.next_state = np.zeros([32, 32]) + self.next_state_02 = np.ones([32, 32]) + self.action = np.zeros([1]) + self.action_02 = np.ones([1]) + self.reward = np.zeros([1]) + self.reward_02 = np.ones([1]) + self.done = np.zeros([1]) + self.done_02 = np.zeros([1]) + + self.experience01 = Experience(self.state, self.action, self.reward, self.done, self.next_state) + self.experience02 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02) + self.experience03 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02) + + def test_step(self): + self.assertEqual(len(self.source.n_step_buffer), 0) + exp, reward, done = self.source.step(self.device) + self.assertEqual(len(exp), 5) + self.assertEqual(len(self.source.n_step_buffer), self.n_step) + + def test_multi_step(self): + self.source.env.step = Mock(return_value=(self.next_state_02, self.reward_02, self.done_02, Mock())) + self.source.n_step_buffer.append(self.experience01) + self.source.n_step_buffer.append(self.experience01) + + exp, reward, done = self.source.step(self.device) + + next_state = exp[4] + self.assertEqual(next_state.all(), self.next_state_02.all()) + + def test_discounted_transition(self): + self.source = NStepExperienceSource(self.env, self.agent, n_steps=3, gamma=0.9) + + self.source.n_step_buffer.append(self.experience01) + self.source.n_step_buffer.append(self.experience02) + self.source.n_step_buffer.append(self.experience03) + + reward, next_state, done = self.source.get_transition_info() + + reward_01 = self.experience02.reward + 0.9 * self.experience03.reward * (1 - done) + reward_gt = self.experience01.reward + 0.9 * reward_01 * (1 - done) + + self.assertEqual(reward, reward_gt) + self.assertEqual(next_state.all(), self.next_state_02.all()) + self.assertEqual(self.experience03.done, done) + + def test_multi_step_discount(self): + self.source = NStepExperienceSource(self.env, self.agent, n_steps=3, gamma=0.9) + self.source.env.step = Mock(return_value=(self.next_state_02, self.reward_02, self.done_02, Mock())) + + self.source.n_step_buffer.append(self.experience01) + self.source.n_step_buffer.append(self.experience02) + + reward_gt = 1.71 + + exp, reward, done = self.source.step(self.device) + + self.assertEqual(exp[0].all(), self.experience01.state.all()) + self.assertEqual(exp[1], self.experience01.action) + self.assertEqual(exp[2], reward_gt) + self.assertEqual(exp[3], self.experience02.done) + self.assertEqual(exp[4].all(), self.experience02.new_state.all()) + + +class TestEpisodicExperience(TestCase): + """Test the standard experience stream""" + + def setUp(self) -> None: + self.env = gym.make("CartPole-v0") + self.net = Mock() + self.agent = Agent(self.net) + self.xp_stream = EpisodicExperienceStream(self.env, self.agent, torch.device('cpu'), episodes=4) + self.rl_dataloader = DataLoader(self.xp_stream) + + def test_experience_stream_SINGLE_EPISODE(self): + """Check that the experience stream gives 1 full episode per batch""" + self.xp_stream.episodes = 1 + + for i_batch, batch in enumerate(self.rl_dataloader): + self.assertEqual(len(batch), 1) + 
self.assertIsInstance(batch[0][0], Experience) + self.assertEqual(batch[0][-1].done, True) + + def test_experience_stream_MULTI_EPISODE(self): + """Check that the experience stream gives 4 full episodes per batch""" + self.xp_stream.episodes = 4 + + for i_batch, batch in enumerate(self.rl_dataloader): + self.assertEqual(len(batch), 4) + self.assertIsInstance(batch[0][0], Experience) + self.assertEqual(batch[0][-1].done, True) \ No newline at end of file diff --git a/tests/models/test_rl/unit/test_experience.py b/tests/models/test_rl/unit/test_experience.py index c715fe1638..b2a9a5c6da 100644 --- a/tests/models/test_rl/unit/test_experience.py +++ b/tests/models/test_rl/unit/test_experience.py @@ -25,7 +25,7 @@ def setUp(self) -> None: self.env = ToTensor(gym.make("CartPole-v0")) self.net = Mock() self.agent = Agent(self.net) - self.xp_stream = EpisodicExperienceStream(self.env, self.agent, device=Mock(), episodes=4) + self.xp_stream = EpisodicExperienceStream(self.env, self.agent, torch.device('cpu'), episodes=4) self.rl_dataloader = DataLoader(self.xp_stream) def test_experience_stream_SINGLE_EPISODE(self): @@ -53,14 +53,15 @@ def setUp(self) -> None: self.net = Mock() self.agent = DummyAgent(net=self.net) self.env = gym.make("CartPole-v0") - self.source = ExperienceSource(self.env, self.agent, Mock()) + self.device = torch.device('cpu') + self.source = ExperienceSource(self.env, self.agent) def test_step(self): - exp, reward, done = self.source.step() + exp, reward, done = self.source.step(self.device) self.assertEqual(len(exp), 5) def test_episode(self): - total_reward = self.source.run_episode() + total_reward = self.source.run_episode(self.device) self.assertIsInstance(total_reward, float) @@ -71,7 +72,8 @@ def setUp(self) -> None: self.agent = DummyAgent(net=self.net) self.env = gym.make("CartPole-v0") self.n_step = 2 - self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=self.n_step) + self.source = NStepExperienceSource(self.env, self.agent, n_steps=self.n_step) + self.device = torch.device('cpu') self.state = np.zeros([32, 32]) self.state_02 = np.ones([32, 32]) @@ -90,7 +92,7 @@ def setUp(self) -> None: def test_step(self): self.assertEqual(len(self.source.n_step_buffer), 0) - exp, reward, done = self.source.step() + exp, reward, done = self.source.step(self.device) self.assertEqual(len(exp), 5) self.assertEqual(len(self.source.n_step_buffer), self.n_step) @@ -99,13 +101,13 @@ def test_multi_step(self): self.source.n_step_buffer.append(self.experience01) self.source.n_step_buffer.append(self.experience01) - exp, reward, done = self.source.step() + exp, reward, done = self.source.step(self.device) next_state = exp[4] self.assertEqual(next_state.all(), self.next_state_02.all()) def test_discounted_transition(self): - self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=3) + self.source = NStepExperienceSource(self.env, self.agent, n_steps=3, gamma=0.9) self.source.n_step_buffer.append(self.experience01) self.source.n_step_buffer.append(self.experience02) @@ -121,7 +123,7 @@ def test_discounted_transition(self): self.assertEqual(self.experience03.done, done) def test_multi_step_discount(self): - self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=3) + self.source = NStepExperienceSource(self.env, self.agent, n_steps=3, gamma=0.9) self.source.env.step = Mock(return_value=(self.next_state_02, self.reward_02, self.done_02, Mock())) self.source.n_step_buffer.append(self.experience01) @@ -129,7 +131,7 @@ def 
test_multi_step_discount(self): reward_gt = 1.71 - exp, reward, done = self.source.step() + exp, reward, done = self.source.step(self.device) self.assertEqual(exp[0].all(), self.experience01.state.all()) self.assertEqual(exp[1], self.experience01.action) diff --git a/tests/models/test_rl/unit/test_vpg.py b/tests/models/test_rl/unit/test_vpg.py index a6dd5a3f24..4b310faba4 100644 --- a/tests/models/test_rl/unit/test_vpg.py +++ b/tests/models/test_rl/unit/test_vpg.py @@ -10,9 +10,10 @@ from pl_bolts.models.rl.common import cli from pl_bolts.models.rl.common.agents import Agent from pl_bolts.models.rl.common.experience import EpisodicExperienceStream +from pl_bolts.models.rl.common.memory import Experience from pl_bolts.models.rl.common.networks import MLP from pl_bolts.models.rl.common.wrappers import ToTensor -from pl_bolts.models.rl.vanilla_policy_gradient_model import PolicyGradient +from pl_bolts.models.rl.vpg import PolicyGradient class TestPolicyGradient(TestCase): @@ -23,8 +24,6 @@ def setUp(self) -> None: self.n_actions = self.env.action_space.n self.net = MLP(self.obs_shape, self.n_actions) self.agent = Agent(self.net) - self.xp_stream = EpisodicExperienceStream(self.env, self.agent, Mock(), episodes=4) - self.rl_dataloader = DataLoader(self.xp_stream) parent_parser = argparse.ArgumentParser(add_help=False) parent_parser = cli.add_base_args(parent=parent_parser) @@ -49,13 +48,33 @@ def test_loss(self): """Test the PolicyGradient loss function""" self.model.net = self.net self.model.agent = self.agent + self.model.logger = Mock() + xp_dataloader = self.model.train_dataloader() - for i_batch, batch in enumerate(self.rl_dataloader): - exp_batch = batch - batch_qvals, batch_states, batch_actions, _ = self.model.process_batch(exp_batch) + for i_batch, batch in enumerate(xp_dataloader): + states, actions, scales = batch - loss = self.model.loss(batch_qvals, batch_states, batch_actions) + loss = self.model.loss(scales, states, actions) self.assertIsInstance(loss, torch.Tensor) break + + def test_train_batch(self): + state = np.random.rand(4, 84, 84) + self.source = Mock() + exp = Experience(state=state, action=0, reward=5, done=False, new_state=state) + self.source.step = Mock(return_value=(exp, 1, False)) + self.model.source = self.source + + xp_dataloader = self.model.train_dataloader() + + for i_batch, batch in enumerate(xp_dataloader): + self.assertEqual(len(batch), 3) + self.assertEqual(len(batch[0]), self.model.batch_size) + self.assertTrue(isinstance(batch, list)) + self.assertEqual(self.model.baseline, 5) + self.assertIsInstance(batch[0], torch.Tensor) + self.assertIsInstance(batch[1], torch.Tensor) + self.assertIsInstance(batch[2], torch.Tensor) + break From 885f198f8154ddda2abca10ff1cf5edecddf15ed Mon Sep 17 00:00:00 2001 From: Donal Byrne Date: Fri, 3 Jul 2020 11:15:32 +0100 Subject: [PATCH 03/14] Updated other models to use train_batch interface --- pl_bolts/models/rl/double_dqn_model.py | 2 +- pl_bolts/models/rl/dqn_model.py | 7 +++---- pl_bolts/models/rl/n_step_dqn_model.py | 4 +--- pl_bolts/models/rl/noisy_dqn_model.py | 2 +- pl_bolts/models/rl/per_dqn_model.py | 5 ++--- tests/models/test_rl/unit/test_vpg.py | 4 +--- 6 files changed, 9 insertions(+), 15 deletions(-) diff --git a/pl_bolts/models/rl/double_dqn_model.py b/pl_bolts/models/rl/double_dqn_model.py index b4c4d62ba8..a723dfef9e 100644 --- a/pl_bolts/models/rl/double_dqn_model.py +++ b/pl_bolts/models/rl/double_dqn_model.py @@ -74,7 +74,7 @@ def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> 
OrderedD self.agent.update_epsilon(self.global_step) # step through environment with agent and add to buffer - exp, reward, done = self.source.step() + exp, reward, done = self.source.step(self.device) self.buffer.append(exp) self.episode_reward += reward diff --git a/pl_bolts/models/rl/dqn_model.py b/pl_bolts/models/rl/dqn_model.py index 702f854903..b5fcb28e30 100644 --- a/pl_bolts/models/rl/dqn_model.py +++ b/pl_bolts/models/rl/dqn_model.py @@ -138,7 +138,7 @@ def populate(self, warm_start: int) -> None: if warm_start > 0: for _ in range(warm_start): self.source.agent.epsilon = 1.0 - exp, _, _ = self.source.step() + exp, _, _ = self.source.step(self.device) self.buffer.append(exp) def build_networks(self) -> None: @@ -174,7 +174,7 @@ def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedD self.agent.update_epsilon(self.global_step) # step through environment with agent and add to buffer - exp, reward, done = self.source.step() + exp, reward, done = self.source.step(self.device) self.buffer.append(exp) self.episode_reward += reward @@ -244,8 +244,7 @@ def configure_optimizers(self) -> List[Optimizer]: def prepare_data(self) -> None: """Initialize the Replay Buffer dataset used for retrieving experiences""" - device = torch.device(self.trainer.root_gpu) if self.trainer.num_gpus >= 1 else self.device - self.source = ExperienceSource(self.env, self.agent, device) + self.source = ExperienceSource(self.env, self.agent) self.buffer = ReplayBuffer(self.replay_size) self.populate(self.warm_start_size) diff --git a/pl_bolts/models/rl/n_step_dqn_model.py b/pl_bolts/models/rl/n_step_dqn_model.py index 9e4d76bb63..20d62d9136 100644 --- a/pl_bolts/models/rl/n_step_dqn_model.py +++ b/pl_bolts/models/rl/n_step_dqn_model.py @@ -74,10 +74,8 @@ def __init__( super().__init__(env, gpus, eps_start, eps_end, eps_last_frame, sync_rate, gamma, learning_rate, batch_size, replay_size, warm_start_size, num_samples) - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - self.source = NStepExperienceSource( - self.env, self.agent, device, n_steps=n_steps + self.env, self.agent, n_steps=n_steps ) diff --git a/pl_bolts/models/rl/noisy_dqn_model.py b/pl_bolts/models/rl/noisy_dqn_model.py index 5a5a7ed9c0..1eb8534f5a 100644 --- a/pl_bolts/models/rl/noisy_dqn_model.py +++ b/pl_bolts/models/rl/noisy_dqn_model.py @@ -77,7 +77,7 @@ def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedD Training loss and log metrics """ # step through environment with agent and add to buffer - exp, reward, done = self.source.step() + exp, reward, done = self.source.step(self.device) self.buffer.append(exp) self.episode_reward += reward diff --git a/pl_bolts/models/rl/per_dqn_model.py b/pl_bolts/models/rl/per_dqn_model.py index eda52b1bf4..fca28150c4 100644 --- a/pl_bolts/models/rl/per_dqn_model.py +++ b/pl_bolts/models/rl/per_dqn_model.py @@ -79,7 +79,7 @@ def training_step(self, batch, _) -> OrderedDict: self.agent.update_epsilon(self.global_step) # step through environment with agent and add to buffer - exp, reward, done = self.source.step() + exp, reward, done = self.source.step(self.device) self.buffer.append(exp) self.episode_reward += reward @@ -126,8 +126,7 @@ def training_step(self, batch, _) -> OrderedDict: def prepare_data(self) -> None: """Initialize the Replay Buffer dataset used for retrieving experiences""" - device = torch.device(self.trainer.root_gpu) if self.trainer.num_gpus >= 1 else self.device - self.source = ExperienceSource(self.env, self.agent, 
device) + self.source = ExperienceSource(self.env, self.agent) self.buffer = PERBuffer(self.replay_size) self.populate(self.warm_start_size) diff --git a/tests/models/test_rl/unit/test_vpg.py b/tests/models/test_rl/unit/test_vpg.py index 4b310faba4..ac62aebc6d 100644 --- a/tests/models/test_rl/unit/test_vpg.py +++ b/tests/models/test_rl/unit/test_vpg.py @@ -5,15 +5,13 @@ import gym import numpy as np import torch -from torch.utils.data import DataLoader from pl_bolts.models.rl.common import cli from pl_bolts.models.rl.common.agents import Agent -from pl_bolts.models.rl.common.experience import EpisodicExperienceStream from pl_bolts.models.rl.common.memory import Experience from pl_bolts.models.rl.common.networks import MLP from pl_bolts.models.rl.common.wrappers import ToTensor -from pl_bolts.models.rl.vpg import PolicyGradient +from pl_bolts.models.rl.vanilla_policy_gradient_model import PolicyGradient class TestPolicyGradient(TestCase): From 896b0321c512eed9db046d51cdce601ab2aead09 Mon Sep 17 00:00:00 2001 From: Donal Byrne Date: Mon, 6 Jul 2020 08:46:01 +0100 Subject: [PATCH 04/14] Update tests/datamodules/test_experience_sources.py Co-authored-by: Jirka Borovec --- tests/datamodules/test_experience_sources.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/datamodules/test_experience_sources.py b/tests/datamodules/test_experience_sources.py index 18199348bf..4ce8cc422a 100644 --- a/tests/datamodules/test_experience_sources.py +++ b/tests/datamodules/test_experience_sources.py @@ -5,8 +5,8 @@ import torch from torch.utils.data import DataLoader -from pl_bolts.datamodules.experience_source import ExperienceSourceDataset, ExperienceSource, NStepExperienceSource, \ - EpisodicExperienceStream +from pl_bolts.datamodules.experience_source import (ExperienceSourceDataset, ExperienceSource, NStepExperienceSource, + EpisodicExperienceStream) from pl_bolts.models.rl.common.agents import Agent from pl_bolts.models.rl.common.memory import Experience @@ -150,4 +150,4 @@ def test_experience_stream_MULTI_EPISODE(self): for i_batch, batch in enumerate(self.rl_dataloader): self.assertEqual(len(batch), 4) self.assertIsInstance(batch[0][0], Experience) - self.assertEqual(batch[0][-1].done, True) \ No newline at end of file + self.assertEqual(batch[0][-1].done, True) From 2baa02cd0d25e32ad26a1f04858343b08720efc5 Mon Sep 17 00:00:00 2001 From: Donal Byrne Date: Mon, 6 Jul 2020 09:15:22 +0100 Subject: [PATCH 05/14] Fixing lint errors --- pl_bolts/datamodules/experience_source.py | 1 + pl_bolts/models/rl/vanilla_policy_gradient_model.py | 2 +- tests/datamodules/test_experience_sources.py | 8 +++++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pl_bolts/datamodules/experience_source.py b/pl_bolts/datamodules/experience_source.py index f3c9b69a5f..0fea3718d9 100644 --- a/pl_bolts/datamodules/experience_source.py +++ b/pl_bolts/datamodules/experience_source.py @@ -30,6 +30,7 @@ def __iter__(self) -> Iterable: # Experience Sources + class ExperienceSource: """ Basic single step experience source diff --git a/pl_bolts/models/rl/vanilla_policy_gradient_model.py b/pl_bolts/models/rl/vanilla_policy_gradient_model.py index 4349ae1816..347e8bc69a 100644 --- a/pl_bolts/models/rl/vanilla_policy_gradient_model.py +++ b/pl_bolts/models/rl/vanilla_policy_gradient_model.py @@ -274,7 +274,7 @@ def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedD log = { "train_loss": loss, - "avg_reward": self.avg_reward, + "avg_reward": self.avg_reward, 
"episode_count": self.episode_count, "baseline": self.baseline } diff --git a/tests/datamodules/test_experience_sources.py b/tests/datamodules/test_experience_sources.py index 18199348bf..421bdcc0d9 100644 --- a/tests/datamodules/test_experience_sources.py +++ b/tests/datamodules/test_experience_sources.py @@ -5,8 +5,8 @@ import torch from torch.utils.data import DataLoader -from pl_bolts.datamodules.experience_source import ExperienceSourceDataset, ExperienceSource, NStepExperienceSource, \ - EpisodicExperienceStream +from pl_bolts.datamodules.experience_source import (ExperienceSourceDataset, ExperienceSource, NStepExperienceSource, + EpisodicExperienceStream) from pl_bolts.models.rl.common.agents import Agent from pl_bolts.models.rl.common.memory import Experience @@ -15,6 +15,7 @@ class DummyAgent(Agent): def __call__(self, states, agent_states): return 0 + class TestExperienceSourceDataset(TestCase): def train_batch(self): @@ -31,6 +32,7 @@ def test_iterator(self): self.assertEqual(batch[5], 5) break + class TestExperienceSource(TestCase): def setUp(self) -> None: @@ -150,4 +152,4 @@ def test_experience_stream_MULTI_EPISODE(self): for i_batch, batch in enumerate(self.rl_dataloader): self.assertEqual(len(batch), 4) self.assertIsInstance(batch[0][0], Experience) - self.assertEqual(batch[0][-1].done, True) \ No newline at end of file + self.assertEqual(batch[0][-1].done, True) From db18cd8d6d6dd1292d5ee68d77f60bc815d1f8ad Mon Sep 17 00:00:00 2001 From: Donal Byrne Date: Mon, 6 Jul 2020 12:14:26 +0100 Subject: [PATCH 06/14] Fixed linting errors --- tests/datamodules/test_experience_sources.py | 2 +- tests/models/test_rl/unit/test_vpg.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/datamodules/test_experience_sources.py b/tests/datamodules/test_experience_sources.py index 421bdcc0d9..793a36572a 100644 --- a/tests/datamodules/test_experience_sources.py +++ b/tests/datamodules/test_experience_sources.py @@ -6,7 +6,7 @@ from torch.utils.data import DataLoader from pl_bolts.datamodules.experience_source import (ExperienceSourceDataset, ExperienceSource, NStepExperienceSource, - EpisodicExperienceStream) + EpisodicExperienceStream) from pl_bolts.models.rl.common.agents import Agent from pl_bolts.models.rl.common.memory import Experience diff --git a/tests/models/test_rl/unit/test_vpg.py b/tests/models/test_rl/unit/test_vpg.py index ac62aebc6d..8793ecf625 100644 --- a/tests/models/test_rl/unit/test_vpg.py +++ b/tests/models/test_rl/unit/test_vpg.py @@ -49,7 +49,6 @@ def test_loss(self): self.model.logger = Mock() xp_dataloader = self.model.train_dataloader() - for i_batch, batch in enumerate(xp_dataloader): states, actions, scales = batch From 0f5ca796ae788a5b938d04b94bdb2b0147fb3087 Mon Sep 17 00:00:00 2001 From: Donal Byrne Date: Mon, 6 Jul 2020 17:23:58 +0100 Subject: [PATCH 07/14] Update pl_bolts/datamodules/experience_source.py Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- pl_bolts/datamodules/experience_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pl_bolts/datamodules/experience_source.py b/pl_bolts/datamodules/experience_source.py index 0fea3718d9..84f16b1ae3 100644 --- a/pl_bolts/datamodules/experience_source.py +++ b/pl_bolts/datamodules/experience_source.py @@ -31,7 +31,7 @@ def __iter__(self) -> Iterable: # Experience Sources -class ExperienceSource: +class ExperienceSource(object): """ Basic single step experience source From 5d9dfa643ed67c4797c8408b22f5ed3ecfaed009 Mon Sep 17 00:00:00 
2001 From: Donal Date: Mon, 6 Jul 2020 18:03:01 +0100 Subject: [PATCH 08/14] Resolved comments --- pl_bolts/datamodules/experience_source.py | 13 ++++++------- pl_bolts/models/rl/common/experience.py | 5 +++++ .../models/rl/vanilla_policy_gradient_model.py | 15 ++++----------- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/pl_bolts/datamodules/experience_source.py b/pl_bolts/datamodules/experience_source.py index 84f16b1ae3..e9a1d10b4a 100644 --- a/pl_bolts/datamodules/experience_source.py +++ b/pl_bolts/datamodules/experience_source.py @@ -7,7 +7,6 @@ from typing import Iterable, Callable, Tuple, List import numpy as np import torch -from gym import Env from torch.utils.data import IterableDataset # Datasets @@ -40,7 +39,7 @@ class ExperienceSource(object): agent: Agent being used to make decisions """ - def __init__(self, env: Env, agent: Agent): + def __init__(self, env, agent: Agent): self.env = env self.agent = agent self.state = self.env.reset() @@ -82,7 +81,7 @@ def run_episode(self, device: torch.device) -> float: class NStepExperienceSource(ExperienceSource): """Expands upon the basic ExperienceSource by collecting experience across N steps""" - def __init__(self, env: Env, agent: Agent, n_steps: int = 1, gamma: float = 0.99): + def __init__(self, env, agent: Agent, n_steps: int = 1, gamma: float = 0.99): super().__init__(env, agent) self.gamma = gamma self.n_steps = n_steps @@ -95,10 +94,10 @@ def step(self, device: torch.device) -> Tuple[Experience, float, bool]: Returns: Experience """ - exp = self.single_step(device) + exp = self.n_step(device) while len(self.n_step_buffer) < self.n_steps: - self.single_step(device) + self.n_step(device) reward, next_state, done = self.get_transition_info() first_experience = self.n_step_buffer[0] @@ -108,7 +107,7 @@ def step(self, device: torch.device) -> Tuple[Experience, float, bool]: return multi_step_experience, exp.reward, exp.done - def single_step(self, device: torch.device) -> Experience: + def n_step(self, device: torch.device) -> Experience: """ Takes a single step in the environment and appends it to the n-step buffer @@ -155,7 +154,7 @@ class EpisodicExperienceStream(ExperienceSource, IterableDataset): agent: Agent being used to make decisions """ - def __init__(self, env: Env, agent: Agent, device: torch.device, episodes: int = 1): + def __init__(self, env, agent: Agent, device: torch.device, episodes: int = 1): super().__init__(env, agent) self.episodes = episodes self.device = device diff --git a/pl_bolts/models/rl/common/experience.py b/pl_bolts/models/rl/common/experience.py index 419ddbc046..c97fa812bc 100644 --- a/pl_bolts/models/rl/common/experience.py +++ b/pl_bolts/models/rl/common/experience.py @@ -5,6 +5,7 @@ ..note:: Deprecated, these functions have been moved to pl_bolts.datamodules.experience_source.py """ +import warnings from collections import deque from typing import List, Tuple @@ -28,6 +29,8 @@ class RLDataset(IterableDataset): """ def __init__(self, buffer: Buffer, sample_size: int = 1) -> None: + warnings.warn("Deprecated, these functions have been moved to pl_bolts.datamodules.experience_source.py", + DeprecationWarning) self.buffer = buffer self.sample_size = sample_size @@ -79,6 +82,8 @@ class ExperienceSource: """ def __init__(self, env: Env, agent: Agent): + warnings.warn("Deprecated, these functions have been moved to pl_bolts.datamodules.experience_source.py", + DeprecationWarning) self.env = env self.agent = agent self.state = self.env.reset() diff --git 
a/pl_bolts/models/rl/vanilla_policy_gradient_model.py b/pl_bolts/models/rl/vanilla_policy_gradient_model.py index 347e8bc69a..9b50056998 100644 --- a/pl_bolts/models/rl/vanilla_policy_gradient_model.py +++ b/pl_bolts/models/rl/vanilla_policy_gradient_model.py @@ -98,7 +98,7 @@ def __init__(self, env: str, gamma: float = 0.99, lr: float = 1e-4, batch_size: self.reward_list = [] for _ in range(100): - self.reward_list.append(torch.tensor(0)) + self.reward_list.append(torch.tensor(0, device=self.device)) self.avg_reward = 0 def build_networks(self) -> None: @@ -216,9 +216,6 @@ def train_batch(self) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torc Returns: yields a tuple of Lists containing tensors for states, actions and rewards of the batch. """ - states = [] - actions = [] - scales = [] for _ in range(self.batch_size): @@ -230,13 +227,11 @@ def train_batch(self) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torc # update the baseline self.reward_sum += exp.reward self.baseline = self.reward_sum / self.total_steps + self.total_reward += reward # gather the experience data - states.append(exp.new_state) - actions.append(exp.action) - scales.append(exp.reward - self.baseline) - - self.total_reward += reward + scale = exp.reward - self.baseline + yield exp.new_state, exp.action, scale if done: # tracking metrics @@ -250,8 +245,6 @@ def train_batch(self) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torc self.total_reward = 0 self.env_steps = 0 - yield from zip(states, actions, scales) - def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict: """ Carries out a single step through the environment to update the replay buffer. From c3f62ace393b8623381dbd2cd9ed42f39a310092 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 9 Jul 2020 00:41:53 +0200 Subject: [PATCH 09/14] req --- docs/requirements.txt | 2 +- requirements.txt | 1 + tests/requirements.txt | 4 +--- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 0c8dccd2ca..c0af87f539 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -6,7 +6,7 @@ pandoc docutils sphinxcontrib-fulltoc sphinxcontrib-mockautodoc -gym + git+https://github.com/PytorchLightning/lightning_sphinx_theme.git # pip_shims sphinx-autodoc-typehints diff --git a/requirements.txt b/requirements.txt index b982ccc34d..df55815fcb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ scikit-learn>=0.23 opencv-python test_tube>=0.7.5 trains>=0.14.1 +gym>=0.17.2 \ No newline at end of file diff --git a/tests/requirements.txt b/tests/requirements.txt index 9d1a0a9ffe..e04a75fd23 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -6,6 +6,4 @@ pytest-flake8 flake8 check-manifest twine==1.13.0 -atari-py==0.2.6 -gym==0.17.2 -trains>=0.14.1 \ No newline at end of file +atari-py==0.2.6 \ No newline at end of file From 577569cb2cd9e381cfd396e9c1621f129929696a Mon Sep 17 00:00:00 2001 From: Donal Date: Thu, 9 Jul 2020 07:35:34 +0100 Subject: [PATCH 10/14] Removed cyclic import of Agents from experience source --- pl_bolts/datamodules/__init__.py | 4 ++-- pl_bolts/datamodules/experience_source.py | 14 ++++++++------ pl_bolts/models/rl/__init__.py | 2 +- .../models/rl/vanilla_policy_gradient_model.py | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pl_bolts/datamodules/__init__.py b/pl_bolts/datamodules/__init__.py index 8e88a63b89..5f6897df2d 100644 --- a/pl_bolts/datamodules/__init__.py +++ 
b/pl_bolts/datamodules/__init__.py @@ -6,5 +6,5 @@ from pl_bolts.datamodules.ssl_imagenet_datamodule import SSLImagenetDataModule from pl_bolts.datamodules.stl10_datamodule import STL10DataModule from pl_bolts.datamodules.fashion_mnist_datamodule import FashionMNISTDataModule -from pl_bolts.datamodules.experience_source import ExperienceSourceDataset, ExperienceSource, \ - NStepExperienceSource, EpisodicExperienceStream +from pl_bolts.datamodules.experience_source import (ExperienceSourceDataset, ExperienceSource, + NStepExperienceSource, EpisodicExperienceStream) diff --git a/pl_bolts/datamodules/experience_source.py b/pl_bolts/datamodules/experience_source.py index e9a1d10b4a..d01e3545e7 100644 --- a/pl_bolts/datamodules/experience_source.py +++ b/pl_bolts/datamodules/experience_source.py @@ -3,15 +3,17 @@ Based on implementations found here: https://github.com/Shmuma/ptan/blob/master/ptan/experience.py """ -from collections import deque +from collections import deque, namedtuple from typing import Iterable, Callable, Tuple, List import numpy as np import torch from torch.utils.data import IterableDataset # Datasets -from pl_bolts.models.rl.common.agents import Agent -from pl_bolts.models.rl.common.memory import Experience + +Experience = namedtuple( + "Experience", field_names=["state", "action", "reward", "done", "new_state"] +) class ExperienceSourceDataset(IterableDataset): @@ -39,7 +41,7 @@ class ExperienceSource(object): agent: Agent being used to make decisions """ - def __init__(self, env, agent: Agent): + def __init__(self, env, agent): self.env = env self.agent = agent self.state = self.env.reset() @@ -81,7 +83,7 @@ def run_episode(self, device: torch.device) -> float: class NStepExperienceSource(ExperienceSource): """Expands upon the basic ExperienceSource by collecting experience across N steps""" - def __init__(self, env, agent: Agent, n_steps: int = 1, gamma: float = 0.99): + def __init__(self, env, agent, n_steps: int = 1, gamma: float = 0.99): super().__init__(env, agent) self.gamma = gamma self.n_steps = n_steps @@ -154,7 +156,7 @@ class EpisodicExperienceStream(ExperienceSource, IterableDataset): agent: Agent being used to make decisions """ - def __init__(self, env, agent: Agent, device: torch.device, episodes: int = 1): + def __init__(self, env, agent, device: torch.device, episodes: int = 1): super().__init__(env, agent) self.episodes = episodes self.device = device diff --git a/pl_bolts/models/rl/__init__.py b/pl_bolts/models/rl/__init__.py index f5a47f8af4..f70f6bd959 100644 --- a/pl_bolts/models/rl/__init__.py +++ b/pl_bolts/models/rl/__init__.py @@ -5,4 +5,4 @@ from pl_bolts.models.rl.noisy_dqn_model import NoisyDQN from pl_bolts.models.rl.per_dqn_model import PERDQN from pl_bolts.models.rl.reinforce_model import Reinforce -# from pl_bolts.models.rl.vanilla_policy_gradient_model import PolicyGradient +from pl_bolts.models.rl.vanilla_policy_gradient_model import PolicyGradient diff --git a/pl_bolts/models/rl/vanilla_policy_gradient_model.py b/pl_bolts/models/rl/vanilla_policy_gradient_model.py index 9b50056998..3295be3aa3 100644 --- a/pl_bolts/models/rl/vanilla_policy_gradient_model.py +++ b/pl_bolts/models/rl/vanilla_policy_gradient_model.py @@ -16,7 +16,7 @@ from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader -from pl_bolts.datamodules.experience_source import ExperienceSourceDataset, NStepExperienceSource +from pl_bolts.datamodules.experience_source import NStepExperienceSource, ExperienceSourceDataset from 
pl_bolts.models.rl.common import cli from pl_bolts.models.rl.common.agents import PolicyAgent from pl_bolts.models.rl.common.networks import MLP From 22925289a27138e0120a28a325ea3143c03d45f8 Mon Sep 17 00:00:00 2001 From: Donal Date: Thu, 9 Jul 2020 08:08:39 +0100 Subject: [PATCH 11/14] Updated reference of Experience to datamodules instead of the rl.common --- tests/datamodules/test_experience_sources.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/datamodules/test_experience_sources.py b/tests/datamodules/test_experience_sources.py index 793a36572a..0225b73186 100644 --- a/tests/datamodules/test_experience_sources.py +++ b/tests/datamodules/test_experience_sources.py @@ -6,9 +6,8 @@ from torch.utils.data import DataLoader from pl_bolts.datamodules.experience_source import (ExperienceSourceDataset, ExperienceSource, NStepExperienceSource, - EpisodicExperienceStream) + EpisodicExperienceStream, Experience) from pl_bolts.models.rl.common.agents import Agent -from pl_bolts.models.rl.common.memory import Experience class DummyAgent(Agent): From 13cc727c14226a010d299541c38005ac08da13ef Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 9 Jul 2020 11:18:35 +0200 Subject: [PATCH 12/14] timeout --- .github/workflows/ci-testing.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml index 8d56f95e46..b1a067911c 100644 --- a/.github/workflows/ci-testing.yml +++ b/.github/workflows/ci-testing.yml @@ -29,7 +29,7 @@ jobs: # requires: 'minimal' # Timeout: https://stackoverflow.com/a/59076067/4521646 - timeout-minutes: 20 + timeout-minutes: 35 steps: - uses: actions/checkout@v2 From d4c1cc77a355551fa7af300016cd40942ef8e1d3 Mon Sep 17 00:00:00 2001 From: Donal Byrne Date: Sat, 11 Jul 2020 13:26:16 +0100 Subject: [PATCH 13/14] Commented out test_dev_dataset to test run times --- tests/datamodules/test_datamodules.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/datamodules/test_datamodules.py b/tests/datamodules/test_datamodules.py index 379cafb9b9..6cd9a74017 100644 --- a/tests/datamodules/test_datamodules.py +++ b/tests/datamodules/test_datamodules.py @@ -1,8 +1,8 @@ -from pl_bolts.datamodules.cifar10_dataset import CIFAR10 - - -def test_dev_datasets(tmpdir): - - ds = CIFAR10(tmpdir) - for b in ds: - pass +# from pl_bolts.datamodules.cifar10_dataset import CIFAR10 +# +# +# def test_dev_datasets(tmpdir): +# +# ds = CIFAR10(tmpdir) +# for b in ds: +# pass From 04f02cd00cb85b394c86234da580c1586a9db154 Mon Sep 17 00:00:00 2001 From: Donal Byrne Date: Sat, 11 Jul 2020 13:52:44 +0100 Subject: [PATCH 14/14] undo commenting out of test_dev_datasets --- tests/datamodules/test_datamodules.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/datamodules/test_datamodules.py b/tests/datamodules/test_datamodules.py index 6cd9a74017..379cafb9b9 100644 --- a/tests/datamodules/test_datamodules.py +++ b/tests/datamodules/test_datamodules.py @@ -1,8 +1,8 @@ -# from pl_bolts.datamodules.cifar10_dataset import CIFAR10 -# -# -# def test_dev_datasets(tmpdir): -# -# ds = CIFAR10(tmpdir) -# for b in ds: -# pass +from pl_bolts.datamodules.cifar10_dataset import CIFAR10 + + +def test_dev_datasets(tmpdir): + + ds = CIFAR10(tmpdir) + for b in ds: + pass
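
Patches 02 and 08 above replace the EpisodicExperienceStream dataloader with an ExperienceSourceDataset that defers all batch generation to the model's own train_batch generator. The sketch below is a minimal, self-contained illustration of that pattern; the ToyModel class and its fake (state, action, scale) values are hypothetical stand-ins for the PolicyGradient model and its environment/agent loop, not part of the patch set.

```python
from typing import Callable, Iterator, Tuple

import torch
from torch.utils.data import DataLoader, IterableDataset


class ExperienceSourceDataset(IterableDataset):
    """Mirror of the dataset added in pl_bolts/datamodules/experience_source.py:
    the batching logic lives in a callable owned by the Lightning model."""

    def __init__(self, generate_batch: Callable):
        self.generate_batch = generate_batch

    def __iter__(self) -> Iterator:
        return self.generate_batch()


class ToyModel:
    """Hypothetical stand-in for PolicyGradient: yields (state, action, scale) tuples."""

    def __init__(self, batch_size: int = 8):
        self.batch_size = batch_size

    def train_batch(self) -> Iterator[Tuple[torch.Tensor, int, float]]:
        # The real model steps self.source (env + agent) here and computes
        # scale = reward - baseline; fake values keep the sketch self-contained.
        for step in range(self.batch_size):
            yield torch.zeros(4), step % 2, float(step)

    def train_dataloader(self) -> DataLoader:
        # Same wiring as PolicyGradient._dataloader() in patch 02.
        dataset = ExperienceSourceDataset(self.train_batch)
        return DataLoader(dataset=dataset, batch_size=self.batch_size)


if __name__ == "__main__":
    states, actions, scales = next(iter(ToyModel().train_dataloader()))
    print(states.shape, actions.shape, scales.shape)
    # torch.Size([8, 4]) torch.Size([8]) torch.Size([8])
```

Because the generator yields individual transitions, the DataLoader's default collation stacks them into batched tensors, so the Lightning training_step receives ready-made (states, actions, scales) batches without a replay-buffer dataset.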
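
The NStepExperienceSource added in patch 02 folds the last n rewards into a single discounted return inside get_transition_info(). The snippet below is a hedged, standalone check of that reward accumulation only (it ignores the final-state/done bookkeeping), reusing the numbers from tests/datamodules/test_experience_sources.py — rewards 0, 1, 1 with gamma = 0.9 and no terminal steps; the n_step_return helper is illustrative and not part of the library.

```python
from typing import List


def n_step_return(rewards: List[float], dones: List[int], gamma: float) -> float:
    """Accumulate rewards back-to-front, mirroring get_transition_info():
    reward = reward_t + gamma * reward * (1 - done_t)."""
    ret = rewards[-1]
    for reward_t, done_t in zip(reversed(rewards[:-1]), reversed(dones[:-1])):
        ret = reward_t + gamma * ret * (1 - done_t)
    return ret


# Numbers from test_discounted_transition / test_multi_step_discount:
# experience rewards [0, 1, 1], no episode ends, gamma = 0.9 -> expected 1.71
assert abs(n_step_return([0.0, 1.0, 1.0], [0, 0, 0], gamma=0.9) - 1.71) < 1e-9
```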