In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
import torch.nn.functional as F
import time
import pickle
import os
from stable_baselines3.common.atari_wrappers import (
    NoopResetEnv,
    MaxAndSkipEnv,
    EpisodicLifeEnv,
    FireResetEnv,
    ClipRewardEnv,
)

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'Device: {device}')

Device: cuda


In [3]:
# env = gym.make('CartPole-v1', render_mode="rgb_array")
# env = gym.wrappers.RecordEpisodeStatistics(env)
# env = gym.experimental.wrappers.RecordVideoV0(env, './video', episode_trigger=lambda t: t % 50 == 0, video_length=200)

# observation = env.reset()
# for _ in range(200):
#     action = env.action_space.sample()
#     observation, reward, terminated, truncated, info = env.step(action)
#     if terminated or truncated:
#         observation = env.reset()
#         print(f"Episodic return {info['episode']['r']}")
#     env.close()

In [4]:
# Vectorized environment
# def make_env(gym_id, render_mode=False):
#     def _thunk():
#         env = gym.make(gym_id, render_mode=render_mode)
#         env = gym.wrappers.RecordEpisodeStatistics(env)
#         # env = gym.experimental.wrappers.RecordVideoV0(env, './video', episode_trigger=lambda t: t % 100 == 0, video_length=200)
#         return env
#     return _thunk

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise a layer in place with orthogonal weights and a constant bias.

    Args:
        layer: module exposing a ``weight`` tensor and, optionally, a ``bias``
            tensor (e.g. ``nn.Linear`` or ``nn.Conv2d``).
        std: gain passed to ``nn.init.orthogonal_``.
        bias_const: value every bias element is set to.

    Returns:
        The same ``layer`` object, so the call can be nested inside a model
        definition.
    """
    nn.init.orthogonal_(layer.weight, std)
    # Layers created with bias=False expose `bias` as None; skip them instead
    # of crashing inside nn.init.constant_.
    if getattr(layer, "bias", None) is not None:
        nn.init.constant_(layer.bias, bias_const)
    return layer

def make_env(gym_id, render_mode=False, capture_video=False):
    """Build a factory for one preprocessed Atari environment.

    The returned zero-argument callable is what ``gym.vector.SyncVectorEnv``
    expects: each call creates a fresh environment and wraps it in the
    preprocessing stack below.

    Args:
        gym_id: environment id forwarded to ``gym.make``.
        render_mode: render mode forwarded to ``gym.make``. Any falsy value
            (including the historical default ``False``) is treated as "no
            rendering" and forwarded as ``None``.
        capture_video: when true, record episodes to ``./video`` (every 50th
            episode, at most 2000 frames each).

    Returns:
        A callable that takes no arguments and returns the wrapped environment.
    """
    # gym.make expects None or a mode string; normalise the boolean default so
    # make_env(gym_id) works without an explicit render mode.
    resolved_render_mode = render_mode or None

    def thunk():
        env = gym.make(gym_id, render_mode=resolved_render_mode)
        # Statistics are recorded on the raw env, before reward clipping and
        # before EpisodicLifeEnv splits a game into per-life episodes.
        env = gym.wrappers.RecordEpisodeStatistics(env)
        if capture_video:
            env = gym.experimental.wrappers.RecordVideoV0(env, './video', episode_trigger=lambda t: t % 50 == 0, video_length=2000)
        env = NoopResetEnv(env, noop_max=30)
        # NOTE(review): "ALE/...-v5" ids presumably already apply frameskip=4 and
        # sticky actions by default; stacked with skip=4 here that would mean 16
        # emulator frames per agent step. Confirm against the ALE docs.
        env = MaxAndSkipEnv(env, skip=4)
        env = EpisodicLifeEnv(env)
        if "FIRE" in env.unwrapped.get_action_meanings():
            env = FireResetEnv(env)
        env = ClipRewardEnv(env)
        env = gym.wrappers.ResizeObservation(env, shape=84)
        env = gym.wrappers.GrayScaleObservation(env)
        env = gym.wrappers.FrameStack(env, num_stack=4)
        return env
    return thunk

In [5]:
# Eight independent Breakout copies, stepped one after another in this process.
envs = gym.vector.SyncVectorEnv(
    [make_env(gym_id='ALE/Breakout-v5', render_mode="rgb_array") for _ in range(8)]
)

In [6]:
# Smoke test: drive the vectorised envs with random actions for 200 steps.
# reset() returns an (observation, info) pair, so unpack it.
observation, info = envs.reset()
for _ in range(200):
    action = envs.action_space.sample()
    # The vector env resets finished sub-environments on its own, so no manual
    # reset is needed when `terminated` / `truncated` fire.
    observation, reward, terminated, truncated, info = envs.step(action)

# `envs` is deliberately left open: later cells reuse it for training. The
# original closed it inside the loop, i.e. after the very first step.

In [7]:
class Agent(nn.Module):
    """Actor-critic network with a shared convolutional trunk.

    The trunk takes a batch of 4-channel images; the ``64*7*7`` flatten size
    implies 84x84 inputs. It produces a 512-dimensional feature vector that
    feeds two linear heads: ``actor`` (one logit per discrete action) and
    ``critic`` (a scalar state value).

    Args:
        envs: vectorised environment; only ``envs.single_action_space.n`` is
            read, to size the actor head.
    """

    def __init__(self, envs):
        super().__init__()
        self.network = nn.Sequential(
            layer_init(nn.Conv2d(4, 32, kernel_size=8, stride=4)),
            nn.ReLU(),
            layer_init(nn.Conv2d(32, 64, kernel_size=4, stride=2)),
            nn.ReLU(),
            layer_init(nn.Conv2d(64, 64, kernel_size=3, stride=1)),
            nn.ReLU(),
            nn.Flatten(),
            layer_init(nn.Linear(64*7*7, 512)),
            nn.ReLU()
        )
        # The small actor gain keeps the initial policy close to uniform.
        self.actor = layer_init(nn.Linear(512, envs.single_action_space.n), std=0.01)
        self.critic = layer_init(nn.Linear(512, 1), std=1.0)

    def get_value(self, x):
        """Return the critic's value estimate, shape ``(batch, 1)``.

        ``x`` is divided by 255 before entering the trunk (presumably raw
        0-255 pixel intensities; confirm against the observation wrapper).
        """
        return self.critic(self.network(x / 255.0))

    def get_action_and_value(self, x, action=None):
        """Sample or re-evaluate actions and return the value estimate.

        Args:
            x: batch of observations, scaled by 1/255 internally.
            action: optional batch of actions to evaluate. When ``None``, an
                action is sampled from the current policy.

        Returns:
            Tuple ``(action, log_prob, entropy, value)`` where ``value`` has
            shape ``(batch, 1)`` and the other three have shape ``(batch,)``.
        """
        hidden = self.network(x / 255.0)
        logits = self.actor(hidden)
        probs = Categorical(logits=logits)
        if action is None:
            action = probs.sample()
        # Reuse the trunk features for the critic instead of running a second
        # forward pass over the same input.
        return action, probs.log_prob(action), probs.entropy(), self.critic(hidden)
        
# Build the policy/value network and place its parameters on the chosen device.
agent = Agent(envs)
agent = agent.to(device)
print(agent)

Agent(
  (network): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=3136, out_features=512, bias=True)
    (8): ReLU()
  )
  (actor): Linear(in_features=512, out_features=4, bias=True)
  (critic): Linear(in_features=512, out_features=1, bias=True)
)


In [8]:
# One Adam optimiser over every parameter of the agent (trunk and both heads).
optimizer = optim.Adam(
    params=agent.parameters(),
    lr=1e-3,
    eps=1e-5,
)
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-05
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [9]:
# Rollout geometry: each update consumes num_steps transitions from every env.
num_steps = 128
num_envs = 8
total_timesteps = 1000000
batch_size = num_envs * num_steps

# Rollout buffers, indexed [step, env] and allocated directly on the device.
rollout_shape = (num_steps, num_envs)
obs = torch.zeros(rollout_shape + envs.single_observation_space.shape, device=device)
actions = torch.zeros(rollout_shape + envs.single_action_space.shape, device=device)
logprobs = torch.zeros(rollout_shape, device=device)
rewards = torch.zeros(rollout_shape, device=device)
dones = torch.zeros(rollout_shape, device=device)
values = torch.zeros(rollout_shape, device=device)

# Initial state of the sampling loop.
global_step = 0
start_time = time.time()
initial_obs, _ = envs.reset()
next_obs = torch.Tensor(np.array(initial_obs)).to(device)
next_done = torch.zeros(num_envs, device=device)
num_updates = total_timesteps // batch_size
num_updates

976

In [10]:
# Sanity check: push the initial observation batch through both heads.
value_estimate = agent.get_value(next_obs)
print(f'next_obs.shape: {next_obs.shape}')
print(f'agent.get_value(next_obs): {value_estimate}')
print(f'agent.get_value(next_obs).shape: {value_estimate.shape}')
print()
sampled = agent.get_action_and_value(next_obs)
print("agent.get_action_and_value(next_obs): ", sampled)

next_obs.shape: torch.Size([8, 4, 84, 84])
agent.get_value(next_obs): tensor([[-0.0894],
        [-0.1026],
        [-0.0905],
        [-0.0596],
        [-0.0855],
        [-0.0971],
        [-0.0703],
        [-0.0649]], device='cuda:0', grad_fn=<AddmmBackward0>)
agent.get_value(next_obs).shape: torch.Size([8, 1])

agent.get_action_and_value(next_obs):  (tensor([2, 1, 3, 2, 0, 1, 2, 3], device='cuda:0'), tensor([-1.3887, -1.3868, -1.3838, -1.3887, -1.3859, -1.3867, -1.3887, -1.3839],
       device='cuda:0', grad_fn=<SqueezeBackward1>), tensor([1.3863, 1.3863, 1.3863, 1.3863, 1.3863, 1.3863, 1.3863, 1.3863],
       device='cuda:0', grad_fn=<NegBackward0>), tensor([[-0.0894],
        [-0.1026],
        [-0.0905],
        [-0.0596],
        [-0.0855],
        [-0.0971],
        [-0.0703],
        [-0.0649]], device='cuda:0', grad_fn=<AddmmBackward0>))


# Training Loop

In [11]:
# PPO hyperparameters, defined once instead of being reassigned every update.
learning_rate = 1e-3   # initial Adam step size, linearly annealed towards 0
gamma = 0.99           # discount factor
gae_lambda = 0.95      # GAE bias/variance trade-off
num_minibatches = 4
minibatch_size = batch_size // num_minibatches
update_epochs = 4
clip_coef = 0.1        # clip range for both the policy ratio and the value update
ent_coef = 0.01
vf_coef = 0.5
max_grad_norm = 0.5
target_kl = None       # set to e.g. 0.015 to stop an update's epochs early

global_step = 0
for update in range(1, num_updates + 1):

    # Learning Rate Annealing Schedule
    frac = 1.0 - (update - 1.0) / num_updates
    lr_now = learning_rate * frac
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr_now

    # Rollout: collect exactly num_steps transitions from every environment.
    for step in range(0, num_steps):
        global_step += 1 * num_envs
        obs[step] = next_obs
        dones[step] = next_done

        with torch.no_grad():
            action, logprob, _, value = agent.get_action_and_value(next_obs)
            values[step] = value.flatten()
        actions[step] = action
        logprobs[step] = logprob

        next_obs, reward, terminated, truncated, info = envs.step(action.cpu().numpy())
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        next_obs = torch.Tensor(np.array(next_obs)).to(device)
        next_done = torch.Tensor(np.logical_or(terminated, truncated)).to(device)
        # Do NOT break out of the rollout when an episode ends: the vector env
        # resets finished sub-environments itself, and stopping early would
        # leave stale rows from the previous update in the buffers that the
        # GAE pass below consumes.

    # Generalized Advantage Estimation
    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        advantages = torch.zeros_like(rewards).to(device)
        lastgaelam = 0
        for t in reversed(range(num_steps)):
            if t == num_steps - 1:
                nextnonterminal = 1.0 - next_done
                nextvalues = next_value
            else:
                nextnonterminal = 1.0 - dones[t+1]
                nextvalues = values[t+1]
            delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
            advantages[t] = lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
        returns = advantages + values

    # Flatten the batch
    b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
    b_logprobs = logprobs.reshape(-1)
    b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = values.reshape(-1)

    # Minibatch Update
    b_inds = np.arange(batch_size)
    clipfracs = []
    for epoch in range(update_epochs):
        np.random.shuffle(b_inds)
        for start in range(0, batch_size, minibatch_size):
            end = start + minibatch_size
            mb_inds = b_inds[start:end]

            # Everything below must run once per minibatch. It used to sit one
            # indentation level out, so only the last minibatch was trained on.
            _, newlogproba, entropy, newvalues = agent.get_action_and_value(
                b_obs[mb_inds], b_actions.long()[mb_inds]
            )
            logratio = newlogproba - b_logprobs[mb_inds]
            ratio = logratio.exp()

            with torch.no_grad():
                old_approx_kl = (-logratio).mean()
                approx_kl = ((ratio - 1.0) * logratio).mean()
                clipfracs.append(((ratio - 1.0).abs() > clip_coef).float().mean())

            # Advantages Normalization
            mb_advantages = b_advantages[mb_inds]
            mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

            # Clipped Surrogate Objective
            pg_loss1 = -mb_advantages * ratio
            pg_loss2 = -mb_advantages * torch.clamp(ratio, 1.0 - clip_coef, 1.0 + clip_coef)
            pg_loss = torch.max(pg_loss1, pg_loss2).mean()

            # Value Loss Clipping
            # The critic returns (mb, 1); flatten it so it lines up with the
            # (mb,) returns instead of broadcasting to an (mb, mb) matrix.
            newvalues = newvalues.view(-1)
            v_loss_unclipped = (newvalues - b_returns[mb_inds]) ** 2
            v_clipped = b_values[mb_inds] + torch.clamp(newvalues - b_values[mb_inds], -clip_coef, clip_coef)
            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
            v_loss = 0.5 * v_loss_max.mean()

            # Entropy bonus and total loss
            entropy_loss = entropy.mean()
            loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef

            optimizer.zero_grad()
            loss.backward()
            # Global Gradient Clipping
            nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
            optimizer.step()

        # Target KL Divergence: optional early stop for this update's epochs.
        if target_kl is not None and approx_kl > target_kl:
            break

# Diagnostics below describe the final update only (last minibatch for the losses).
y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
var_y = np.var(y_true)
explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

print(f'Learning Rate {optimizer.param_groups[0]["lr"]}')
print(f'Value Loss: {v_loss.item()}')
print(f'Policy Loss: {pg_loss.item()}')
print(f'Entropy Loss: {entropy_loss.item()}')
print(f'Approx KL: {approx_kl.item()}')
print(f'Clipfrac {torch.stack(clipfracs).mean()}')
print(f'Explained Var: {explained_var}')
print(f'Elapsed time (s): {time.time() - start_time}')

torch.save(agent.state_dict(), 'ppo_breakout.pth')

envs.close()

KeyboardInterrupt: 

In [12]:
num_envs = 1
# Single-environment vector env with video capture enabled.
envs = gym.vector.SyncVectorEnv([make_env('ALE/Breakout-v5', "rgb_array", True)])

# Load the trained weights. map_location lets a checkpoint saved on one device
# (e.g. the GPU) be restored on whichever device is active now.
agent = Agent(envs).to(device)
agent.load_state_dict(torch.load('ppo_breakout.pth', map_location=device))
agent.eval()

next_obs = torch.Tensor(np.array(envs.reset()[0])).to(device)

try:
    for _ in range(10000):
        with torch.no_grad():
            action, _, _, _ = agent.get_action_and_value(next_obs)
        # The vector env resets a finished episode itself and returns the first
        # observation of the next one, so no manual reset is issued here.
        next_obs, _, next_term, next_trun, info = envs.step(action.cpu().numpy())
        next_obs = torch.Tensor(np.array(next_obs)).to(device)
finally:
    # Close once, after the rollout, so the video recorder can finalise its
    # files. The original closed the env inside the loop after the first step.
    envs.close()

  logger.warn(


Moviepy - Building video c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-0.mp4.
Moviepy - Writing video c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-0.mp4




Moviepy - Building video c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-50.mp4.
Moviepy - Writing video c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-50.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-50.mp4
Moviepy - Building video c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-100.mp4.
Moviepy - Writing video c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-100.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-100.mp4
Moviepy - Building video c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-150.mp4.
Moviepy - Writing video c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-150.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-150.mp4
Moviepy - Building video c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-200.mp4.
Moviepy - Writing video c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-200.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready c:\Users\andre\dev\ucdavis\spring24\ecs170\project\ProximalPolicyOptimization\video\rl-video-episode-200.mp4
