<a href="https://colab.research.google.com/github/moodlep/rl-playground/blob/main/ppo/colab_notebooks/PPO_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

In [None]:
!pip3 install box2d-py
!pip3 install gym[Box_2D]



In [None]:
!pip install stable-baselines3



In [None]:
import os
import Box2D
import pyglet
import imageio
# Launch an Xvfb virtual framebuffer as display :1 (1024x768, 24-bit); the trailing
# "&" backgrounds it so this cell returns immediately.
# NOTE(review): presumably needed so pyglet/gym rendering works on a headless Colab VM -- confirm.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point this process (and anything it spawns afterwards) at that virtual display.
os.environ['DISPLAY'] = ':1'

In [None]:
import gym
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from stable_baselines3.common.vec_env import SubprocVecEnv
import multiprocessing


# Test Env

In [None]:
env_id = "LunarLanderContinuous-v2"
env = gym.make(env_id)


In [None]:
env.reset()

array([-3.8595201e-04,  1.4057657e+00, -3.9102796e-02, -2.2908369e-01,
        4.5396067e-04,  8.8573862e-03,  0.0000000e+00,  0.0000000e+00],
      dtype=float32)

In [None]:
env.action_space.shape, env.observation_space.shape

((2,), (8,))

In [None]:
# Smoke test of the single environment: one episode, one random action, print the transition.
# The names assigned here are notebook globals; `observation` is reused by a later cell.
for episode in range(1): 
    observation = env.reset()
    for step in range(1):
        action = env.action_space.sample()  # or given a custom model, action = policy(observation)
        # step() is unpacked as the 4-tuple (observation, reward, done, info).
        observation, reward, done, info = env.step(action)
        print(observation, reward, done, info, action)

[-0.01583881  1.4240321  -0.79821694  0.29294428  0.01856153  0.18684572
  0.          0.        ] -1.3144958995434195 False {} [ 0.84353274 -0.34593865]


# ActorCritic Model

In [None]:
class ActorCritic(nn.Module):
  """Separate actor and critic MLPs with a fixed-covariance Gaussian policy.

  The critic maps a state to a single value; the actor maps a state to
  tanh-squashed action means. Actions are sampled from a multivariate normal
  whose diagonal covariance is constant (``std_init`` squared on every axis).
  """

  def __init__(self, state_dim, action_dim, std_init):
    """Build both networks and the constant covariance matrix.

    Args:
      state_dim: size of the state vector (input width of both networks).
      action_dim: size of the action vector (output width of the actor).
      std_init: standard deviation used for every action dimension.
    """
    super().__init__()

    # TBD switch to variable std

    self.action_dim = action_dim
    self.state_dim = state_dim
    self.critic = nn.Sequential(
        nn.Linear(self.state_dim, 64),
        nn.Tanh(),
        nn.Linear(64, 64),
        nn.Tanh(),
        nn.Linear(64, 1)
    )

    self.actor = nn.Sequential(
        nn.Linear(self.state_dim, 64),
        nn.Tanh(),
        nn.Linear(64, 64),
        nn.Tanh(),
        nn.Linear(64, self.action_dim),
        nn.Tanh()
    )

    # covariance for Multivariate Normal policy
    # float() keeps the tensor floating point even if std_init is an int.
    variance = float(std_init) * float(std_init)
    action_vars = torch.full((self.action_dim,), variance)
    # Registered as buffers so .to(device) / .double() move them together with
    # the network weights; persistent=False keeps them out of state_dict, so
    # checkpoints look exactly as they did when these were plain attributes.
    self.register_buffer("action_vars", action_vars, persistent=False)
    # Leading batch axis of size 1 broadcasts against a batch of means.
    self.register_buffer("cov_mat", torch.diag(action_vars).unsqueeze(dim=0), persistent=False)

  def get_action(self, state):
    """Sample an action for each state from the current policy.

    Args:
      state: tensor of states accepted by the actor network.

    Returns:
      Tensor of actions drawn from N(actor(state), cov_mat).
    """
    means = self.actor(state)
    policy = torch.distributions.MultivariateNormal(means, self.cov_mat)

    return policy.sample()

  def get_value(self, state):
    """Return the critic's value estimate for ``state``."""
    return self.critic(state)

  



In [None]:
ac = ActorCritic(env.observation_space.shape[0], env.action_space.shape[0], 0.05)

In [None]:
# NOTE(review): this binds the function torch.rand itself (it is never called), so
# `states` is not a tensor -- looks like an unfinished cell.
states = torch.rand

In [None]:
# Feed a batch made of two copies of the last observation through the policy.
# Despite the name, `pi` holds the sampled actions returned by get_action, not a distribution.
pi = ac.get_action(torch.tensor([observation, observation]))

# Buffer

In [None]:
class Buffer:
  """Placeholder store for collected transitions (storage is not implemented yet)."""

  def __init__(self,state_dim, action_dim ):
    """Remember the state/action sizes and start with an empty buffer.

    Args:
      state_dim: size of a state vector.
      action_dim: size of an action vector.
    """
    # TODO: also track num_actors and max_timesteps once the layout is decided.
    self.state_dim = state_dim
    self.action_dim = action_dim
    # torch.zeros_like() only accepts a tensor, so zeros_like([]) raised
    # TypeError on construction; start from an explicit empty tensor instead.
    self.buffer = torch.zeros(0)

  def add_transion(self):
    """Stub: will append a transition to the buffer; currently returns None."""
    return None

  # Correctly spelled alias; the original (misspelled) name is kept for callers.
  add_transition = add_transion

# Envs

In [None]:
def make_env(env_id: str, rank: int, seed: int = 0):
  """Return a zero-argument factory that builds one seeded environment.

  Args:
    env_id: gym environment id passed to ``gym.make``.
    rank: index of the worker; added to ``seed`` so each worker's env differs.
    seed: base seed; also applied to torch when this function is called.

  Returns:
    A callable that creates the environment, seeds it with ``seed + rank``
    and returns it.
  """
  # Seeds torch in the calling process, every time a factory is requested.
  torch.manual_seed(seed)

  def _init():
    environment = gym.make(env_id)
    environment.seed(seed + rank)
    return environment

  return _init

# Number of parallel environments, hard-coded here.
num_cpu = 4
# One make_env factory per worker; the index i is the rank that offsets each env's seed.
env_p = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])


In [None]:
obs = torch.tensor(env_p.reset())
obs

tensor([[-5.9156e-04,  1.4135e+00, -5.9936e-02,  1.1277e-01,  6.9229e-04,
          1.3576e-02,  0.0000e+00,  0.0000e+00],
        [-5.2567e-03,  1.3989e+00, -5.3248e-01, -5.3348e-01,  6.0981e-03,
          1.2061e-01,  0.0000e+00,  0.0000e+00],
        [-4.0088e-03,  1.4072e+00, -4.0605e-01, -1.6675e-01,  4.6519e-03,
          9.1977e-02,  0.0000e+00,  0.0000e+00],
        [-7.2308e-03,  1.4090e+00, -7.3242e-01, -8.7516e-02,  8.3855e-03,
          1.6590e-01,  0.0000e+00,  0.0000e+00]])

In [None]:
actions = ac.get_action(obs).numpy()
actions

array([[-0.03697643, -0.03650609],
       [-0.2617246 , -0.001944  ],
       [-0.18491448, -0.11163113],
       [-0.10428213, -0.03950775]], dtype=float32)

In [None]:
env_p.step_async(actions)

In [None]:
env_p.step_wait()

(array([[-1.18331914e-03,  1.41541684e+00, -5.98544367e-02,
          8.70871544e-02,  1.36317231e-03,  1.34195890e-02,
          0.00000000e+00,  0.00000000e+00],
        [-1.05140684e-02,  1.38633800e+00, -5.31758964e-01,
         -5.59106767e-01,  1.20580401e-02,  1.19209744e-01,
          0.00000000e+00,  0.00000000e+00],
        [-8.01782589e-03,  1.40283966e+00, -4.05504376e-01,
         -1.92408934e-01,  9.19678714e-03,  9.09073725e-02,
          0.00000000e+00,  0.00000000e+00],
        [-1.44620892e-02,  1.40640640e+00, -7.31432617e-01,
         -1.13186173e-01,  1.65834818e-02,  1.63975552e-01,
          0.00000000e+00,  0.00000000e+00]], dtype=float32),
 array([ 1.94054617, -1.12638708, -1.01123009, -0.82140293]),
 array([False, False, False, False]),
 ({}, {}, {}, {}))

In [None]:
env_p.step_async(actions)
observation, reward, done, information = env_p.step_wait()
print(len(done))



4


In [None]:
env_p.action_space.shape

(2,)

# PPO Class: 

* TODO: sort out seeding
* TODO: move the model and tensors to the target device with `.to(device)`


In [None]:
# Colab exposes too few CPU cores, so we keep our own hard-coded worker count instead of using cpu_count().
num_cpus = multiprocessing.cpu_count()
num_cpus

2

In [None]:
# Hyper-parameters consumed by PPO (looked up by key).
config = {
    'std_init': 0.05,  # passed to ActorCritic, which squares it to fill the diagonal action covariance
    'env_id': 'LunarLanderContinuous-v2',  # gym id handed to make_env for every worker
    'num_workers': 4,  # N: number of parallel envs; the worker index (rank) also offsets each env's seed
    'num_epochs': 10, # K; presumably optimisation epochs per collected dataset -- not read by the code shown, TODO confirm
    'num_iterations': 10, # number of times we collect a dataset (outer loop of PPO.train)
    'max_timesteps': 1000, # T: environment steps taken per iteration (inner loop of PPO.train)
    'epsilon': 0.2,  # clipping radius; not read by the code shown yet

}

In [None]:
class PPO:
  """Work-in-progress PPO trainer: builds the vectorised envs and the model,
  and collects rollouts. The policy/value update itself is not implemented yet.
  """

  def __init__(self, config):
    """Create the parallel environments and the actor-critic model.

    Args:
      config: dict of hyper-parameters. Reads 'env_id', 'num_workers' and
        'std_init' here; train() reads 'num_iterations', 'max_timesteps'
        and 'num_workers'.
    """
    self.config = config

    self.envs = SubprocVecEnv([make_env(config['env_id'], i) for i in range(self.config['num_workers'])])

    self.action_dim = self.envs.action_space.shape[0]
    self.state_dim = self.envs.observation_space.shape[0]

    self.model = ActorCritic(self.state_dim, self.action_dim, config['std_init'])

  def create_rollout(self):
    """Placeholder; rollout collection currently lives in train()."""
    pass

  def train(self):
    """Collect 'num_iterations' rollouts of 'max_timesteps' steps per worker.

    Each step samples actions from the model for the current observations,
    steps every environment and appends the transition to per-worker lists.
    The collected data is not used yet (no update step) and nothing is returned.
    """
    num_workers = self.config['num_workers']

    for it in range(self.config['num_iterations']):
      obs = torch.tensor(self.envs.reset())
      obs_batch = [[] for _ in range(num_workers)]
      action_batch = [[] for _ in range(num_workers)]
      reward_batch = [[] for _ in range(num_workers)]
      next_obs_batch = [[] for _ in range(num_workers)]
      done_batch = [[] for _ in range(num_workers)]
      # Workers that have not reported `done` yet in this iteration. A set
      # keyed by worker index: the previous list + `del list[i]` shifted
      # indices (IndexError on the following step) and treated worker 0 as
      # falsy so it could never be removed.
      # slow => because if one worker is done it still continues to do rollouts
      active_workers = set(range(num_workers))

      for t in range(self.config['max_timesteps']):
        actions = self.model.get_action(obs).numpy()
        self.envs.step_async(actions)
        next_obs, reward, done, information = self.envs.step_wait()

        for i in range(num_workers):
          obs_batch[i].append(obs[i])
          action_batch[i].append(actions[i])
          reward_batch[i].append(reward[i])
          # NOTE(review): vectorised envs typically auto-reset on done, in which
          # case this is the first observation of the next episode -- confirm.
          next_obs_batch[i].append(next_obs[i])
          done_batch[i].append(done[i])

          if done[i]:
            active_workers.discard(i)

        # Advance the state; previously the policy kept acting on the
        # observation returned by reset() for the whole rollout.
        obs = torch.tensor(next_obs)
        




SyntaxError: ignored

In [None]:
a = np.array([1,2,3,4])

array([1, 4])

## covariance stuff

Src: https://github.com/nikhilbarhate99/PPO-PyTorch/blob/master/PPO_colab.ipynb

In [None]:
torch.full((2,), 3 * 3)

In [None]:
torch.diag(torch.full((2,), 3 * 3)).unsqueeze(dim=0)