# Humanoid Standup Project

Enviroment Setup and imports

In [32]:
!pip install gymnasium[mujoco]
import gymnasium as gym
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from torch.distributions import Normal
import numpy as np
from google.colab import files
import matplotlib.pyplot as plt
from google.colab import drive


drive.mount('/content/drive')

# device setup
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Memory

In [33]:
class Memory():
  def __init__(self, batch_size):
    self.states = []
    self.values = []
    self.actions = []
    self.probs = []
    self.rewards = []
    self.finished = []
    self.batch_size = batch_size

  def store_transition_data(self, state, value, a, prob, reward, finished):
    self.states.append(state)
    if torch.is_tensor(value):
      value = value.detach().squeeze().item()
    self.values.append(value)

    if torch.is_tensor(a):
      a = a.detach().cpu().squeeze(0)
    self.actions.append(a)

    if torch.is_tensor(prob):
      prob = prob.detach().cpu().squeeze().item()
    self.probs.append(prob)

    self.rewards.append(reward)
    self.finished.append(finished)

  def create_batches(self):
    states = torch.tensor(np.array(self.states), dtype = torch.float32).to(device)
    values = torch.tensor(self.values, dtype = torch.float32).to(device)
    actions = torch.tensor(np.array(self.actions), dtype = torch.float32).to(device)
    probs = torch.tensor(self.probs, dtype = torch.float32).to(device)
    rewards = torch.tensor(self.rewards, dtype = torch.float32).to(device)
    finished = torch.tensor(self.finished, dtype = torch.bool).to(device)

    random_indices = np.random.permutation(len(states))
    batch_starting_indices = np.arange(0,len(states), self.batch_size)
    batches = []

    for b in batch_starting_indices:
      batches.append(random_indices[b:b+self.batch_size])

    return states, values, actions, probs, rewards, finished, batches

  def restart_memory(self):
    self.states = []
    self.values = []
    self.actions = []
    self.probs = []
    self.rewards = []
    self.finished = []

###Actor

In [34]:
class Actor(nn.Module):
  def __init__(self, observation_dim, action_dim, hidden_size = 256):
    super().__init__()
    self.actor = nn.Sequential(
        nn.Linear(observation_dim, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, action_dim) # represents the mean
    )
    self.std_logged = nn.Parameter(torch.zeros(action_dim)) # trainable std parameter

  def forward(self,observation):
    mean = self.actor(observation)
    std = torch.exp(self.std_logged)
    return mean, std

  def compute_distr(self, observation):
    mean, std = self.forward(observation)
    distr = torch.distributions.Normal(mean, std)
    return distr

  def take_action(self, observation):
    distr = self.compute_distr(observation)
    action_sampled = distr.sample()
    log_probabilty  = distr.log_prob(action_sampled).sum(dim= -1)
    return action_sampled, log_probabilty

###Critic

In [35]:
class Critic(nn.Module):
  def __init__(self, observation, hidden_size = 256):
    super().__init__()
    self.critic = nn.Sequential(
        nn.Linear(observation, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, 1) # 1 dim represents the value of the state
    )
  def forward(self, observation):
    return self.critic(observation)

### Interacting Agent

In [36]:
class PPOController:
  def __init__(self,observation_dim, action_dim, lambda_, gamma, alpha,  clip, N, epochs, batch_size, ent_coef = 0.01):
    self.observation_dim = observation_dim
    self.controller_memory = Memory(batch_size)
    self.action_dim = action_dim
    self.lambda_ = lambda_
    self.gamma = gamma
    self.alpha = alpha
    self.clip = clip
    self.N = N
    self.epochs = epochs
    self.batch_size = batch_size
    self.actor = Actor(observation_dim, action_dim).to(device)
    self.critic = Critic(observation_dim).to(device)
    self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr = alpha)
    self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr = alpha)
    self.ent_coef = ent_coef

  def sample_action(self,observation):
    observation_torch = torch.as_tensor(observation, dtype = torch.float32).to(device)
    if observation_torch.dim() == 1:
      observation_torch = observation_torch.unsqueeze(0) # add batch dim
    action_sampled, log_probabilty = self.actor.take_action(observation_torch)
    state_value = self.critic(observation_torch)
    return action_sampled, log_probabilty, state_value



  def compute_gae(self, last_value = None):
    states, values, actions, prev_probs, rewards, finished, batches  = self.controller_memory.create_batches()
    if last_value is None:
      last_value_tensor = torch.zeros(1, dtype = values.dtype, device = device)
    else:
      if torch.is_tensor(last_value):
        last_value_tensor = last_value.detach().view(1).to(device)
      else:
        last_value_tensor = torch.tensor([last_value], dtype = values.dtype, device = device)


    vals = torch.cat([values, last_value_tensor], dim = 0)
    advantages = torch.zeros_like(rewards).to(device)
    prev_advantage = 0.0
    T = len(rewards)

    # computes gae (reversed)
    for t in reversed(range(T)):
      finished_mask = 1.0 - finished[t].float()
      td_error  = rewards[t] + self.gamma * vals[t+1] *finished_mask - vals[t]
      advantage = td_error + self.gamma * self.lambda_ * prev_advantage * finished_mask
      prev_advantage = advantage
      advantages[t]  = advantage

    return states, values, actions, prev_probs, rewards, finished, batches, advantages



  def iterate_batches(self, last_value = None):
    states, values, actions, prev_probs, rewards, finished, batches, advantages = self.compute_gae(last_value)
    returns = advantages + values
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    batch_loss = []
    for batch in batches:
      batch_states = states[batch]
      batch_actions = actions[batch]
      batch_prev_probs = prev_probs[batch]
      batch_advantages = advantages[batch]
      batch_rewards = rewards[batch]
      batch_returns = returns[batch]

      distribution = self.actor.compute_distr(batch_states)
      new_log_probabilty = distribution.log_prob(batch_actions).sum(dim = -1)
      prob_ratio = (new_log_probabilty - batch_prev_probs).exp()

      value_pred = self.critic(batch_states).squeeze(-1)

      # computing actor loss
      term1 = prob_ratio * batch_advantages
      term2 = torch.clamp(prob_ratio, 1-self.clip, 1+ self.clip)*batch_advantages
      actor_loss =  -torch.min(term1, term2).mean()

      entropy = distribution.entropy().sum(-1).mean()


      # computing critic loss
      return_ = batch_returns
      critic_loss = 0.5 * ((value_pred - return_)**2).mean()

      total_loss = actor_loss + critic_loss - self.ent_coef * entropy
      batch_loss.append(total_loss)

    batch_loss = torch.stack(batch_loss).mean()

    return batch_loss


  def train(self, last_value = None):
    for ep in range(self.epochs):
      total_loss = self.iterate_batches(last_value)
      self.actor_opt.zero_grad()
      self.critic_opt.zero_grad()
      total_loss.backward()
      self.actor_opt.step()
      self.critic_opt.step()
    self.controller_memory.restart_memory()


# Running

In [None]:
import random
SEED = 24
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# enviroment
env = gym.make('HumanoidStandup-v5', uph_cost_weight = 1.5)
obs, info = env.reset(seed = SEED)

# parameters
observation_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
alpha = 3e-4
batch_size = 32
gamma = 0.99
lambda_ = 0.9
clip = 0.2
epochs = 10
runs = 4000
done= False
scores = []
mean_score = 0.0
top_score = float('-inf')
steps = 0
N = 512
mean_scores_list = []
entropy_coef = 0.01
hidden_size =256

PPOAgent = PPOController(observation_dim, action_dim, lambda_, gamma, alpha,  clip, N, epochs, batch_size, entropy_coef)

for i in range(runs):
  observation, info = env.reset()
  score = 0
  done = False
  while done == False:
    with torch.no_grad():
      action, log_prob, val = PPOAgent.sample_action(observation)

    unclipped_a = action.squeeze(0).cpu().numpy()
    env_action = np.clip(unclipped_a, env.action_space.low, env.action_space.high)
    obs, r, terminated, truncated, info = env.step(env_action)
    done = terminated or truncated
    steps += 1

    PPOAgent.controller_memory.store_transition_data(observation, val, unclipped_a, log_prob, r, done)
    if steps % N == 0:
      with torch.no_grad():
        obs_t = torch.as_tensor(obs, dtype = torch.float32).to(device)
        last_value = PPOAgent.critic(obs_t.unsqueeze(0)).squeeze(-1)
      PPOAgent.train(last_value)
    score += r
    observation = obs

  scores.append(score)
  mean_score = np.mean(scores[-100:]) # prev 100 runs
  mean_scores_list.append(mean_score)

  if mean_score > top_score:
    top_score = mean_score

  if i % 50 ==0 :
    print(f'EP: {i}, BEST SCORE: {top_score}, AVG 100: {mean_score}')

EP: 0, BEST SCORE: 35843.47715060286, AVG 100: 35843.47715060286
EP: 50, BEST SCORE: 49768.86361556381, AVG 100: 49768.86361556381
EP: 100, BEST SCORE: 51067.12129006974, AVG 100: 50661.47814757825
EP: 150, BEST SCORE: 51422.13540758381, AVG 100: 50885.04611256516
EP: 200, BEST SCORE: 51422.13540758381, AVG 100: 50981.65526969561
EP: 250, BEST SCORE: 52083.61353625544, AVG 100: 51630.155966836115
EP: 300, BEST SCORE: 52083.61353625544, AVG 100: 51271.83670743125
EP: 350, BEST SCORE: 52083.61353625544, AVG 100: 51210.40699296785
EP: 400, BEST SCORE: 52083.61353625544, AVG 100: 51522.9688316711
EP: 450, BEST SCORE: 53043.28905707785, AVG 100: 53043.28905707785
EP: 500, BEST SCORE: 54980.57596583747, AVG 100: 54980.57596583747
EP: 550, BEST SCORE: 55423.25422180104, AVG 100: 55369.4945887741
EP: 600, BEST SCORE: 55914.95989915313, AVG 100: 55424.373419228854
EP: 650, BEST SCORE: 56878.07330474533, AVG 100: 56662.87058388232
EP: 700, BEST SCORE: 59273.424416691574, AVG 100: 59273.424416691

In [None]:
print(
    f'hidden_size: {hidden_size}\n'
    f'observation_dim: {observation_dim}\n'
    f'action_dim: {action_dim}\n'
    f'alpha: {alpha}\n'
    f'batch_size: {batch_size}\n'
    f'gamma: {gamma}\n'
    f'lambda_: {lambda_}\n'
    f'clip: {clip}\n'
    f'epochs: {epochs}\n'
    f'runs: {runs}\n'
    f'N steps: {N}\n'
    f'entropy_coef: {entropy_coef}\n'
    f'lr: {alpha}\n' )

fig,ax = plt.subplots(figsize=(10,5))
ep = np.arange(len(scores))

ax.plot(ep, scores, alpha = 0.3, label = 'Episode return')
ax.plot(ep, mean_scores_list, label = '100-episode MA')
ax.set_xlabel('Episode')
ax.set_ylabel('Episode Return')
ax.set_title('PPO HumanoidStandup')
ax.legend()
plt.show()