<a href="https://colab.research.google.com/github/HassanChowdhry/DeepReinforcementLearning/blob/main/PPO_InvertedPendulum_Mujoco.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# imports
%pip install "gymnasium[mujoco]" "tensorboardX"
import gymnasium as gym
from gymnasium.vector import SyncVectorEnv
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import MultivariateNormal
from tqdm import tqdm
from tensorboardX import SummaryWriter

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [6]:
class Memory:   # collected from old policy
  def __init__(self):
    self.states = []
    self.actions = []
    self.rewards = []
    self.is_terminals = []
    self.logprobs = []

  def clear_memory(self):
    del self.states[:]
    del self.actions[:]
    del self.rewards[:]
    del self.is_terminals[:]
    del self.logprobs[:]

In [18]:
class ActorCritic(nn.Module):
  def __init__(self, obs_dim, action_dim, action_std):
    super(ActorCritic, self).__init__()

    self.actor = nn.Sequential(
      nn.Linear(obs_dim, 64),
      nn.Tanh(),
      nn.Linear(64, 32),
      nn.Tanh(),
      nn.Linear(32, action_dim),
      nn.Tanh()
  )

    self.critic = nn.Sequential(
      nn.Linear(obs_dim, 64),
      nn.Tanh(),
      nn.Linear(64, 32),
      nn.Tanh(),
      nn.Linear(32, 1)
    )

    self.action_var = torch.full((action_dim, ), action_std * action_std).to(device)    #(4, ) variance of a gaussian dist

  def act(self, state, memory):
    action_mean = self.actor(state)
    cov_mat = torch.diag(self.action_var).to(device) # covariance matrix for multivariate distribution
    dist = MultivariateNormal(action_mean, cov_mat)

    action = dist.sample()
    logprob = dist.log_prob(action)

    memory.states.append(state)
    memory.actions.append(action)
    memory.logprobs.append(logprob)

    return action.detach()

  def evaluate(self, state, action):
    state_value = self.critic(state)    # (4000, 1)

    # to calculate action score(logprobs) and distribution entropy
    action_mean = self.actor(state)                     # (4000,4)
    action_var = self.action_var.expand_as(action_mean) # (4000,4)
    cov_mat = torch.diag_embed(action_var).to(device)   # (4000,4,4)
    dist = MultivariateNormal(action_mean, cov_mat)
    action_logprobs = dist.log_prob(action.unsqueeze(1)) # Unsqueeze action to match expected shape
    dist_entropy = dist.entropy()

    return action_logprobs, torch.squeeze(state_value), dist_entropy

In [19]:
class PPO:
  def __init__(self, obs_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip):
    self.lr = lr
    self.betas = betas
    self.gamma = gamma
    self.eps_clip = eps_clip
    self.K_epochs = K_epochs

    self.policy = ActorCritic(obs_dim, action_dim, action_std).to(device)
    self.old_policy = ActorCritic(obs_dim, action_dim, action_std).to(device)
    self.old_policy.load_state_dict(self.policy.state_dict())

    self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)

    self.MSE = nn.MSELoss()

  def get_action(self, state, memory):
    # reshape(1, -1) turns it into shape (1, 4) from (4,)
    state = torch.FloatTensor(state.reshape(1, -1)).to(device)
    return self.old_policy.act(state, memory).cpu().numpy().flatten()

  def update(self, memory):
    returns = []
    g = 0

    for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
      if is_terminal:
        g = 0
      g = reward + (self.gamma * g)
      returns.insert(0, g)

    returns = torch.tensor(returns).to(device)
    returns = (returns - returns.mean()) / (returns.std() + 1e-5)

    old_states = torch.squeeze(torch.stack(memory.states).to(device)).detach()
    old_actions = torch.squeeze(torch.stack(memory.actions).to(device)).detach()
    old_logprobs = torch.squeeze(torch.stack(memory.logprobs)).to(device).detach()

    for _ in range(self.K_epochs):
      logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

      # importance ratio
      ratios = torch.exp(logprobs - old_logprobs.detach())

      # advantages
      advantages = returns - state_values.detach()

      # actor_loss
      surr1 = ratios * advantages
      surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
      actor_loss = -torch.min(surr1, surr2)

      # critic_loss
      critic_loss = 0.5 * self.MSE(state_values, returns)

      # loss
      loss = actor_loss + critic_loss - 0.001 * dist_entropy

      self.optimizer.zero_grad()
      loss.mean().backward()
      self.optimizer.step()

    # copy new weights into old_policy
    self.old_policy.load_state_dict(self.policy.state_dict())


In [20]:
env_name="InvertedPendulum-v5"
env = gym.make(env_name, reset_noise_scale=0.1)

obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_std = 0.5
lr = 3e-3
K_epochs = 80
eps_clip = 0.2
gamma = 0.99
betas = (0.9, 0.999)

max_episodes = int(1e4)
max_timesteps = 1500
update_timestep = 4000
solved_reward = 1000

render = False
print_interval = 500
save_interval = 500
writer = SummaryWriter()
tb = True


# Train Loop
memory = Memory()
agent = PPO(obs_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)

r_reward = time_step = avg_length = 0

for ep in tqdm(range(1, max_episodes + 1), desc="Training"):
  state, _ = env.reset()
  x = 1
  for t in range(1, max_timesteps + 1):
    time_step += 1

    # run policy
    action = agent.get_action(state, memory)

    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated

    memory.rewards.append(reward)
    memory.is_terminals.append(done)

    if time_step % update_timestep == 0:
      agent.update(memory)
      memory.clear_memory()
      time_step = 0

    r_reward += reward

    x = t
    if render: env.render()
    if done: break
  avg_length += x

  if r_reward > (print_interval * solved_reward):
    print("########## Solved! ##########")
    torch.save(agent.policy.state_dict(), 'PPO_continuous_{}.pth'.format(env_name))
    print('Save a checkpoint!')
    break

  if ep % save_interval == 0:
      torch.save(agent.policy.state_dict(), '/PPO_continuous_{}.pth'.format(env_name))
      print('Save a checkpoint!')

  if ep % print_interval == 0:
      avg_length = int(avg_length / print_interval)
      running_reward = int((r_reward / print_interval))

      print('Episode {} \t Avg length: {} \t Avg reward: {}'.format(ep, avg_length, running_reward))

      if tb:
        writer.add_scalar('scalar/reward', running_reward, ep)
        writer.add_scalar('scalar/length', avg_length, ep)

      running_reward, avg_length = 0, 0




Training:   5%|▌         | 509/10000 [00:07<02:49, 56.13it/s]

Save a checkpoint!
Episode 500 	 Avg length: 9 	 Avg reward: 8


Training:  10%|█         | 1003/10000 [00:24<07:43, 19.39it/s]

Save a checkpoint!
Episode 1000 	 Avg length: 17 	 Avg reward: 24


Training:  15%|█▌        | 1501/10000 [06:59<2:20:16,  1.01it/s]

Save a checkpoint!
Episode 1500 	 Avg length: 504 	 Avg reward: 528


Training:  18%|█▊        | 1815/10000 [13:44<1:01:57,  2.20it/s]

########## Solved! ##########
Save a checkpoint!



