<a href="https://colab.research.google.com/github/moodlep/rl-playground/blob/main/ppo/colab_notebooks/PPO_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

In [None]:
!pip3 install box2d-py
!pip3 install gym[Box_2D]

Collecting box2d-py
  Downloading box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448 kB)
[?25l[K     |▊                               | 10 kB 26.5 MB/s eta 0:00:01[K     |█▌                              | 20 kB 29.0 MB/s eta 0:00:01[K     |██▏                             | 30 kB 21.6 MB/s eta 0:00:01[K     |███                             | 40 kB 18.0 MB/s eta 0:00:01[K     |███▋                            | 51 kB 10.2 MB/s eta 0:00:01[K     |████▍                           | 61 kB 11.8 MB/s eta 0:00:01[K     |█████▏                          | 71 kB 9.6 MB/s eta 0:00:01[K     |█████▉                          | 81 kB 10.5 MB/s eta 0:00:01[K     |██████▋                         | 92 kB 11.5 MB/s eta 0:00:01[K     |███████▎                        | 102 kB 9.6 MB/s eta 0:00:01[K     |████████                        | 112 kB 9.6 MB/s eta 0:00:01[K     |████████▊                       | 122 kB 9.6 MB/s eta 0:00:01[K     |█████████▌                      | 133 kB 9

In [None]:
# NOTE(review): unpinned install; the recorded output below resolved
# stable-baselines3 1.3.0 — consider pinning so a re-run gets the same version.
!pip install stable-baselines3

Collecting stable-baselines3
  Downloading stable_baselines3-1.3.0-py3-none-any.whl (174 kB)
[?25l[K     |█▉                              | 10 kB 26.0 MB/s eta 0:00:01[K     |███▊                            | 20 kB 32.1 MB/s eta 0:00:01[K     |█████▋                          | 30 kB 16.0 MB/s eta 0:00:01[K     |███████▌                        | 40 kB 11.7 MB/s eta 0:00:01[K     |█████████▍                      | 51 kB 8.0 MB/s eta 0:00:01[K     |███████████▎                    | 61 kB 8.4 MB/s eta 0:00:01[K     |█████████████▏                  | 71 kB 7.2 MB/s eta 0:00:01[K     |███████████████                 | 81 kB 8.0 MB/s eta 0:00:01[K     |████████████████▉               | 92 kB 7.0 MB/s eta 0:00:01[K     |██████████████████▊             | 102 kB 7.6 MB/s eta 0:00:01[K     |████████████████████▋           | 112 kB 7.6 MB/s eta 0:00:01[K     |██████████████████████▌         | 122 kB 7.6 MB/s eta 0:00:01[K     |████████████████████████▍       | 133 kB 7.6

In [None]:
import os
import Box2D
import pyglet
import imageio
# Launch a virtual X server in the background on display :1 and point DISPLAY
# at it. NOTE(review): the exit status of os.system is not checked.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
os.environ['DISPLAY'] = ':1'

In [None]:
import gym
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from stable_baselines3.common.vec_env import SubprocVecEnv
import multiprocessing
from torch.utils.tensorboard import SummaryWriter


# Test Env

In [None]:
# Single (non-vectorised) environment used by the quick API checks that follow.
env_id = "LunarLanderContinuous-v2"
env = gym.make(env_id)


In [None]:
# Reset the environment and display the initial observation.
env.reset()

array([ 0.00450096,  1.4039963 ,  0.4558782 , -0.30773082, -0.00520864,
       -0.10326321,  0.        ,  0.        ], dtype=float32)

In [None]:
# Shapes of the action and observation spaces (recorded output: (2,) and (8,)).
env.action_space.shape, env.observation_space.shape

((2,), (8,))

In [None]:
# Smoke test of the gym API: one episode, one random step.
# `observation` is left in the notebook namespace and reused by a later cell.
for episode in range(1): 
    observation = env.reset()
    for step in range(1):
        action = env.action_space.sample()  # or given a custom model, action = policy(observation)
        # step() returns a 4-tuple here: (observation, reward, done, info)
        observation, reward, done, info = env.step(action)
        print(observation, reward, done, info, action)

[ 0.01517162  1.3966358   0.76332283 -0.3303236  -0.01610824 -0.1458994
  0.          0.        ] -0.28190276977425355 False {} [-0.5224455 -0.6211104]


# ActorCritic Model

In [None]:
class ActorCritic(nn.Module):
  """Separate actor and critic MLPs with a fixed-variance Gaussian policy.

  Args:
    state_dim: size of the observation vector.
    action_dim: size of the action vector.
    std_init: standard deviation of every action dimension; it is squared to
      build a constant diagonal covariance matrix (int or float accepted).
  """

  def __init__(self, state_dim, action_dim, std_init):
    super(ActorCritic,self).__init__()

    # TBD switch to variable std

    self.action_dim = action_dim
    self.state_dim = state_dim
    self.critic = nn.Sequential(
        nn.Linear(self.state_dim, 64),
        nn.Tanh(),
        nn.Linear(64, 64),
        nn.Tanh(),
        nn.Linear(64, 1)
    )

    # The final Tanh keeps the action means inside [-1, 1].
    self.actor = nn.Sequential(
        nn.Linear(self.state_dim, 64),
        nn.Tanh(),
        nn.Linear(64, 64),
        nn.Tanh(),
        nn.Linear(64, self.action_dim),
        nn.Tanh()
    )

    # Covariance for the MultivariateNormal policy.
    # * float dtype is forced so an integer std_init cannot yield an integer
    #   covariance matrix, which the distribution cannot factorise.
    # * non-persistent buffers follow .to(device) without adding keys to
    #   state_dict, so existing checkpoints still load.
    variance = float(std_init) * float(std_init)
    action_vars = torch.full((self.action_dim,), variance, dtype=torch.float32)
    self.register_buffer('action_vars', action_vars, persistent=False)
    # Leading dim of 1 broadcasts against any batch of means.
    self.register_buffer('cov_mat', torch.diag(action_vars).unsqueeze(dim=0), persistent=False)

  def forward(self, input):
    """Return (state values, action means) for a batch of states."""
    values = self.critic(input)
    logits = self.actor(input)
    return values, logits

  def get_action(self, state, action=None):
    """Sample an action, or score a given one, under the current policy.

    Args:
      state: batch of observations.
      action: optional batch of actions; when None a fresh sample is drawn.

    Returns:
      (action, log_prob) where log_prob has one entry per batch element.
    """
    means = self.actor(state)
    policy = torch.distributions.MultivariateNormal(means, self.cov_mat)
    # Identity check: `== None` on a tensor is an element-wise comparison.
    if action is None:
      action = policy.sample()
    return action,policy.log_prob(action)
    
  def get_value(self,state):
    """Return the critic's value estimate for a batch of states."""
    return self.critic(state)

  



In [None]:
# Model sized from the test env's spaces, with a fixed std of 0.05.
ac = ActorCritic(env.observation_space.shape[0], env.action_space.shape[0], 0.05)

In [None]:
# NOTE(review): this binds the torch.rand function itself (it is never called),
# and `states` is not used elsewhere in the visible notebook — looks unfinished.
states = torch.rand

In [None]:
# Smoke test: a batch of two copies of the last observation.
# get_action returns a tuple (actions, log_probs).
pi = ac.get_action(torch.tensor([observation, observation]))
pi[0].shape, pi

(torch.Size([2, 2]), (tensor([[ 0.2333,  0.0467],
          [ 0.0868, -0.0096]]),
  tensor([2.4380, 2.4004], grad_fn=<SubBackward0>)))

# Envs

In [None]:
def make_env(env_id: str, rank: int, seed: int = 0):
  """Build a zero-argument factory that creates one seeded environment.

  Calling make_env seeds torch's global RNG with `seed` right away. The
  returned callable creates `env_id` via gym.make and seeds that environment
  with `seed + rank`.
  """
  torch.manual_seed(seed)

  def _init():
    worker_env = gym.make(env_id)
    worker_env.seed(seed + rank)
    return worker_env

  return _init

# One subprocess per environment; worker i is built with rank i (default seed 0).
num_cpu = 4
env_p = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])


In [None]:
# Reset all workers at once; the recorded output has one row per worker.
obs = torch.tensor(env_p.reset())
obs

tensor([[-5.9156e-04,  1.4135e+00, -5.9936e-02,  1.1277e-01,  6.9229e-04,
          1.3576e-02,  0.0000e+00,  0.0000e+00],
        [-5.2567e-03,  1.3989e+00, -5.3248e-01, -5.3348e-01,  6.0981e-03,
          1.2061e-01,  0.0000e+00,  0.0000e+00],
        [-4.0088e-03,  1.4072e+00, -4.0605e-01, -1.6675e-01,  4.6519e-03,
          9.1977e-02,  0.0000e+00,  0.0000e+00],
        [-7.2308e-03,  1.4090e+00, -7.3242e-01, -8.7516e-02,  8.3855e-03,
          1.6590e-01,  0.0000e+00,  0.0000e+00]])

# PPO Class: 

* TODO: sort out random seeds
* TODO: add `.to(device)` handling for the model and tensors


In [None]:
# Hyper-parameters passed to PPO(config).
config = {
    'std_init': 0.05,  # policy std handed to ActorCritic, which squares it for the covariance
    'env_id': 'LunarLanderContinuous-v2',
    'num_workers': 4,  # rank (seed) / envs / N
    'num_epochs': 10, # K: optimisation epochs per collected batch
    'num_iterations': 10, # number of times we collect a dataset 
    'max_timesteps': 500, # T: steps collected per worker in each iteration
    'epsilon': 0.2,  # clipping radius
    'gamma' : 0.99,  # discount factor used for the returns
    'minibatch_size' : 64,  # intended number of samples per gradient step
    'num_minibatches': 10 # gradient steps per epoch; need to decide how to mini-batch

}

In [None]:
class PPO:
  """Minimal PPO-clip trainer for a vectorised continuous-control environment.

  Config keys read by this class: env_id, num_workers, std_init,
  num_iterations, max_timesteps, num_epochs, num_minibatches, epsilon, gamma
  and, optionally, minibatch_size (defaults to 64, the value that used to be
  hard-coded).
  """

  def __init__(self, config):
    """Create the worker environments, the model, the optimizer and the logger."""
    self.config = config
    # self.num_cpus = multiprocessing.cpu_count()

    self.envs = SubprocVecEnv([make_env(config['env_id'], i) for i in range(self.config['num_workers'])])

    self.action_dim = self.envs.action_space.shape[0]
    self.state_dim = self.envs.observation_space.shape[0]

    self.model = ActorCritic(self.state_dim, self.action_dim, config['std_init'])

    self.optimizer = torch.optim.Adam(self.model.parameters())
    self.summary = SummaryWriter(log_dir='logs')
    # Running count of gradient steps, used as the TensorBoard x-axis.
    self.global_step = 0

  @staticmethod
  def _discounted_returns(rewards, dones, last_values, gamma):
    """Compute discounted returns-to-go with bootstrapping.

    Args:
      rewards: tensor shaped (T, N, 1).
      dones: tensor shaped (T, N, 1); non-zero where the step ended an episode.
      last_values: tensor shaped (N, 1), value estimate of the state reached
        after the final step.
      gamma: discount factor.

    Returns:
      Tensor shaped like `rewards`, indexed by the same timestep as the
      rewards. A terminal step keeps its own reward and cuts off everything
      that follows it.
    """
    returns = torch.zeros_like(rewards)
    running = last_values
    for t in range(rewards.shape[0] - 1, -1, -1):
      running = rewards[t] + gamma * running * (1.0 - dones[t])
      returns[t] = running
    return returns

  def create_rollout(self):
    """Collect max_timesteps steps from every worker with the current policy.

    Everything runs under torch.no_grad(): the stored log-probs and values
    are constants for the later optimisation.

    Returns:
      dict with tensors 'obs', 'actions', 'rewards', 'dones', 'log_probs' and
      'values' (leading shape (max_timesteps, num_workers)) and 'last_values'
      (num_workers, 1), the critic's estimate for the final observations.
    """
    T = self.config['max_timesteps']
    N = self.config['num_workers']

    obs_batch = torch.zeros((T, N, self.state_dim))
    action_batch = torch.zeros((T, N, self.action_dim))
    reward_batch = torch.zeros((T, N, 1))
    done_batch = torch.zeros((T, N, 1))
    log_prob_batch = torch.zeros((T, N, 1))
    values_batch = torch.zeros((T, N, 1))

    obs = self.envs.reset()

    with torch.no_grad():
      # capture NT rollouts
      for t in range(T):
        obs_t = torch.as_tensor(obs, dtype=torch.float32)
        actions, log_probs = self.model.get_action(obs_t)

        self.envs.step_async(actions.cpu().numpy())
        next_obs, rewards, dones, infos = self.envs.step_wait()

        obs_batch[t] = obs_t
        action_batch[t] = actions
        reward_batch[t] = torch.as_tensor(rewards, dtype=torch.float32).reshape(-1, 1)
        done_batch[t] = torch.as_tensor(dones, dtype=torch.float32).reshape(-1, 1)
        log_prob_batch[t] = log_probs.reshape(-1, 1)
        values_batch[t] = self.model.get_value(obs_t).reshape(-1, 1)

        obs = next_obs

      # Bootstrap value for the state that follows the last collected step.
      last_values = self.model.get_value(torch.as_tensor(obs, dtype=torch.float32)).reshape(-1, 1)

    return {
        'obs': obs_batch,
        'actions': action_batch,
        'rewards': reward_batch,
        'dones': done_batch,
        'log_probs': log_prob_batch,
        'values': values_batch,
        'last_values': last_values,
    }

  def train(self): 
    """Alternate between collecting a rollout and running clipped PPO updates."""

    # 10 Jan 2022: enable anomaly detection to find the operation that failed to compute its gradient
    # torch.autograd.set_detect_anomaly(True)

    gamma = self.config['gamma']
    epsilon = self.config['epsilon']
    minibatch_size = self.config.get('minibatch_size', 64)

    # num_iterations = Number of updates: 
    for it in range(self.config['num_iterations']):
      print('collect big batch - iteration number ', str(it))

      rollout = self.create_rollout()

      # Returns stay aligned with the timestep they belong to (no flip), and
      # the advantage is filled for every timestep, including the last one.
      returns = self._discounted_returns(rollout['rewards'], rollout['dones'], rollout['last_values'], gamma)
      advantages = returns - rollout['values']

      # Flatten (T, N, ...) -> (T*N, ...) once, outside the epoch loop.
      obs_batch = rollout['obs'].reshape(-1, self.state_dim)
      action_batch = rollout['actions'].reshape(-1, self.action_dim)
      log_prob_batch = rollout['log_probs'].reshape(-1, 1)
      advantage_batch = advantages.reshape(-1, 1)
      returns_batch = returns.reshape(-1, 1)
      num_samples = obs_batch.shape[0]

      # Optimization in k epochs:
      for k in range(self.config['num_epochs']):
        for nmb in range(self.config['num_minibatches']):
          # Minibatch drawn uniformly with replacement from the flat batch.
          sample = torch.randint(0, num_samples, (minibatch_size,))

          _, new_log_probs = self.model.get_action(obs_batch[sample], action_batch[sample])
          # Reshape to (B, 1) so the subtraction is element-wise rather than
          # broadcasting (B,) against (B, 1) into a (B, B) matrix.
          ratio = torch.exp(new_log_probs.reshape(-1, 1) - log_prob_batch[sample])

          objective = ratio * advantage_batch[sample]
          clipped_obj = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * advantage_batch[sample]

          # Values are re-evaluated with the current critic so the value loss
          # actually carries a gradient.
          new_values = self.model.get_value(obs_batch[sample])
          value_loss = (returns_batch[sample] - new_values) ** 2

          # The optimizer minimises: negate the surrogate, add the value error.
          loss = torch.mean(-torch.min(objective, clipped_obj) + value_loss)

          print("mini-batch ", str(nmb), " loss is: ", loss.detach(), flush=True)
          self.summary.add_scalar("loss", loss.item(), self.global_step)
          self.global_step += 1

          self.optimizer.zero_grad()
          loss.backward()
          self.optimizer.step()

    # Make sure the logged scalars reach disk.
    self.summary.flush()








In [None]:
# Build the trainer (this spawns the worker environments) and run training.
ppo = PPO(config)
ppo.train()

collect big batch - iteration number  0




mini-batch  0  loss is:  tensor(-746.0175)
mini-batch  1  loss is:  tensor(-509.5120)
mini-batch  2  loss is:  tensor(-723.3445)
mini-batch  3  loss is:  tensor(-586.2155)
mini-batch  4  loss is:  tensor(-598.7020)
mini-batch  5  loss is:  tensor(-583.2792)
mini-batch  6  loss is:  tensor(-649.8322)
mini-batch  7  loss is:  tensor(-685.8222)
mini-batch  8  loss is:  tensor(-576.3699)
mini-batch  9  loss is:  tensor(-685.5485)
mini-batch  0  loss is:  tensor(-679.0809)
mini-batch  1  loss is:  tensor(-771.6079)
mini-batch  2  loss is:  tensor(-525.7799)
mini-batch  3  loss is:  tensor(-731.4330)
mini-batch  4  loss is:  tensor(-694.0738)
mini-batch  5  loss is:  tensor(-737.4572)
mini-batch  6  loss is:  tensor(-535.1644)
mini-batch  7  loss is:  tensor(-542.3280)
mini-batch  8  loss is:  tensor(-618.5316)
mini-batch  9  loss is:  tensor(-712.6564)
mini-batch  0  loss is:  tensor(-640.3350)
mini-batch  1  loss is:  tensor(-551.5248)
mini-batch  2  loss is:  tensor(-539.7510)
mini-batch 

In [None]:
# NOTE(review): leftover post-mortem debugging; it opens an interactive ipdb
# prompt (see the recorded session below), which is unsuitable for Run All.
%debug

> [0;32m/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py[0m(156)[0;36mbackward[0;34m()[0m
[0;32m    154 [0;31m    Variable._execution_engine.run_backward(
[0m[0;32m    155 [0;31m        [0mtensors[0m[0;34m,[0m [0mgrad_tensors_[0m[0;34m,[0m [0mretain_graph[0m[0;34m,[0m [0mcreate_graph[0m[0;34m,[0m [0minputs[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 156 [0;31m        allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
[0m[0;32m    157 [0;31m[0;34m[0m[0m
[0m[0;32m    158 [0;31m[0;34m[0m[0m
[0m
ipdb> up
> [0;32m/usr/local/lib/python3.7/dist-packages/torch/_tensor.py[0m(307)[0;36mbackward[0;34m()[0m
[0;32m    305 [0;31m                [0mcreate_graph[0m[0;34m=[0m[0mcreate_graph[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    306 [0;31m                inputs=inputs)
[0m[0;32m--> 307 [0;31m        [0mtorch[0m[0;34m.[0m[0mautograd[0m[0;34m.[0m[0mbackward[0m[0;34m([0

In [None]:
 # Scratch check: torch.flip along dim 0 reverses the row order.
 # NOTE(review): the first two statements carry a stray leading space.
 x = torch.arange(40).view(10, 4)
 print(x)
torch.flip(x, [0])


tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23],
        [24, 25, 26, 27],
        [28, 29, 30, 31],
        [32, 33, 34, 35],
        [36, 37, 38, 39]])


tensor([[36, 37, 38, 39],
        [32, 33, 34, 35],
        [28, 29, 30, 31],
        [24, 25, 26, 27],
        [20, 21, 22, 23],
        [16, 17, 18, 19],
        [12, 13, 14, 15],
        [ 8,  9, 10, 11],
        [ 4,  5,  6,  7],
        [ 0,  1,  2,  3]])

In [None]:
# Scratch check of autograd history on x (no output was recorded for this cell).
x.grad_fn

In [None]:
# Scratch experiment: calling torch.no_grad with a tensor argument raised the
# TypeError recorded below; elsewhere in this notebook it is used as
# `with torch.no_grad():`.
a = torch.tensor([1])
torch.no_grad(a)

TypeError: ignored

In [None]:
# Rebinds `a` (a torch tensor in the previous cell) to a NumPy array.
a = np.array([1,2])

In [None]:
# Print the indices 9 down to 0 — the order a backwards pass over 10 timesteps visits.
for t in reversed(range(10)):
  print(t)

## covariance stuff

Src: https://github.com/nikhilbarhate99/PPO-PyTorch/blob/master/PPO_colab.ipynb

In [None]:
# Vector of per-dimension variances (std 3 squared) for a 2-D action.
torch.full((2,), 3 * 3)

In [None]:
# The same variances on the diagonal of a matrix, with a leading dim of size 1 added.
torch.diag(torch.full((2,), 3 * 3)).unsqueeze(dim=0)

In [None]:
# NOTE(review): this is a second definition of `PPO`; running this cell rebinds
# the name and shadows the class defined earlier in the notebook. It is a
# print-instrumented variant with no SummaryWriter, no minibatch loop (one
# gradient step per epoch) and a different loss expression.
class PPO:
  """Print-instrumented PPO variant used while debugging the backward pass."""

  def __init__(self, config):
    """Create the worker environments, the model and the optimizer."""

    self.config = config
    # self.num_cpus = multiprocessing.cpu_count()

    self.envs = SubprocVecEnv([make_env(config['env_id'], i) for i in range(self.config['num_workers'])])

    self.action_dim = self.envs.action_space.shape[0]
    self.state_dim = self.envs.observation_space.shape[0]

    self.model = ActorCritic(self.state_dim, self.action_dim, config['std_init'])

    self.optimizer = torch.optim.Adam(self.model.parameters())

  def create_rollout(self):
    """Placeholder; the rollout is collected inline in train()."""

    pass

  def train(self): 
    """Collect a rollout per iteration, then take one gradient step per epoch."""

    # num_iterations = Number of updates: 
    print('train')
    for it in range(self.config['num_iterations']):
    
      # Buffers are indexed (timestep, worker, feature).
      obs_batch = torch.zeros((self.config['max_timesteps'], self.config['num_workers'],  self.envs.observation_space.shape[0]))
      action_batch = torch.zeros((self.config['max_timesteps'],self.config['num_workers'], self.envs.action_space.shape[0]))
      reward_batch = torch.zeros((self.config['max_timesteps'],self.config['num_workers'], 1))
      done_batch = torch.zeros((self.config['max_timesteps'],self.config['num_workers'], 1))
      next_obs_batch = torch.zeros((self.config['max_timesteps'],self.config['num_workers'], self.envs.observation_space.shape[0]))
      ratio_batch = torch.zeros((self.config['max_timesteps'],self.config['num_workers'], 1))
      log_prob_batch = torch.zeros((self.config['max_timesteps'],self.config['num_workers'], 1))
      advantage_batch = torch.zeros((self.config['max_timesteps'],self.config['num_workers'], 1),requires_grad=True)
      returns_batch = torch.zeros((self.config['max_timesteps'],self.config['num_workers'], 1))
      print('batch')
      obs = self.envs.reset()

      # capture NT rollouts 
      for t in range(self.config['max_timesteps']):
        print('model')
        # NOTE(review): called outside torch.no_grad(); the stored log-probs are
        # only detached later, in the optimisation loop.
        actions,log_probs = self.model.get_action(torch.tensor(obs))
        actions = actions.numpy()

        print('env')

        self.envs.step_async(actions)
        next_obs, rewards, dones, infos = self.envs.step_wait()
        
        print('gather')

        obs_batch[t] = torch.tensor(obs)
        action_batch[t] = torch.tensor(actions)
        reward_batch[t] = torch.tensor(rewards.reshape(-1,1))
        done_batch[t] = torch.tensor(dones.reshape(-1,1))
        next_obs_batch[t] = torch.tensor(next_obs)
        log_prob_batch[t] = log_probs.reshape(-1,1)

        obs = next_obs
      
      print('roll_out')


      with torch.no_grad():
        # Calculate returns to go
        # Last timestep: bootstrap from the critic unless the step was terminal.
        # NOTE(review): a terminal step is assigned 0.0, so its own reward is dropped.
        returns_batch[self.config['max_timesteps']-1] = torch.where(done_batch[self.config['max_timesteps']-1]==0,
                                                                    reward_batch[self.config['max_timesteps']-1] + self.config['gamma']*self.model.get_value(next_obs_batch[self.config['max_timesteps']-1])
                                                                    ,torch.tensor(0.0))  # torch.no_grad?

        print('after where')
        # Calculate Advantage: 
        # NOTE(review): the loop starts at max_timesteps-2, so the advantage row
        # for the final timestep keeps its initial zeros.
        for t in range(self.config['max_timesteps']-2, -1, -1):
          returns_batch[t] = torch.where(done_batch[t]==0, returns_batch[t+1]*self.config['gamma'] + reward_batch[t],torch.tensor(0.0))
          advantage_batch[t] = returns_batch[t] - self.model.get_value(obs_batch[t])  # torch.no_grad?

        # Reverse the whole batch?? 
        # NOTE(review): this reverses returns along the time axis while every
        # other buffer keeps its order; returns_batch is not read by the loss below.
        returns_batch = torch.flip(returns_batch,dims=[0])
        print('adv')


      # Optimization:
      for k in range(self.config['num_epochs']):
        # Create some mini-batches and update TODO ****

          print('opt', flush=True)
          # Flatten (timestep, worker, feature) -> (timestep*worker, feature).
          obs_batch  = obs_batch.reshape(-1,self.envs.observation_space.shape[0])
          action_batch = action_batch.reshape(-1, self.envs.action_space.shape[0])
          reward_batch = reward_batch.reshape(-1,1)
          done_batch = done_batch.reshape(-1,1)
          next_obs_batch = next_obs_batch.reshape(-1,self.envs.observation_space.shape[0])
          ratio_batch = ratio_batch.reshape(-1,1)
          advantage_batch = advantage_batch.reshape(-1,1)
          returns_batch = returns_batch.reshape(-1,1)
          log_prob_batch = log_prob_batch.reshape(-1,1).detach()  # <--- notice detach
          print('opt2', flush=True)

          # 64 indices drawn with replacement from the flattened batch.
          sample = torch.randint(0,self.config['num_workers'] * self.config['max_timesteps'],(64,))  # is this the right way to train the epochs? whole epochs maybe??

          _,new_log_probs = self.model.get_action(obs_batch[sample],action_batch[sample])   
          # NOTE(review): log_prob_batch[sample] is (64, 1); the recorded smoke
          # test above shows get_action returning 1-D log-probs, in which case
          # this subtraction broadcasts to (64, 64) — confirm.
          ratio = torch.exp(new_log_probs - log_prob_batch[sample]) 
          print('opt3', flush=True)
          
          print(advantage_batch[sample].requires_grad)  # showing up as False... 
          obj = ratio*advantage_batch[sample].detach()
          clipped_obj = torch.clamp(ratio,1-self.config['epsilon'],1+self.config['epsilon'])*advantage_batch[sample].detach()  # missing *A? 
          # NOTE(review): the advantages were written under torch.no_grad(), so the
          # squared term presumably sends no gradient to the critic — confirm.
          loss = torch.mean(-torch.min(obj,clipped_obj) + advantage_batch[sample]**2)
          print(loss, flush=True)
          print('opt4', flush=True)
          
          self.optimizer.zero_grad()
          loss.backward()
          self.optimizer.step()









In [None]:
# Runs whichever PPO definition was executed last; the recorded trace below
# ('model' / 'env' / 'gather') comes from the print-instrumented variant.
ppo = PPO(config)
ppo.train()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
gather
model
env
