Deep Deterministic Policy Gradients

In [1]:
# Install the Python packages (gym, pyvirtualdisplay) and the system packages
# (xvfb, python-opengl, ffmpeg) used for rendering/recording; all output is discarded.
# NOTE(review): nothing is version-pinned. Later cells rely on `gym.wrappers.Monitor`,
# the "Pendulum-v0" id and a 4-tuple return from `env.step` — presumably an older
# gym release is required; confirm and pin it.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [2]:
# Refresh the apt index, install cmake, upgrade setuptools and install ez_setup
# and gym's Atari extra.
# Unlike its neighbours, the setuptools line only merges stderr into stdout and
# does not redirect to /dev/null, so pip's output is shown under this cell.
# NOTE(review): no Atari environment is created in the visible cells — confirm
# whether `gym[atari]` (and cmake/ez_setup) are actually needed.
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import gym
from gym.wrappers import Monitor
import glob
import io
import base64
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Create and start a pyvirtualdisplay Display (visible=0, 800x600) before any
# environment is rendered — presumably a headless virtual screen so that
# env.render() works without a physical display; confirm.
display = Display(visible=0, size=(800, 600))
display.start()

"""
Utility functions to enable video recording of a gym environment
and to display the recording.
To enable video, just do "env = wrap_env(env)"
"""


def show_video():
    """Embed the first MP4 found under ``video/`` in the notebook output.

    Globs ``video/*.mp4``, base64-encodes the first match and displays it
    inside an HTML ``<video>`` element. Prints ``"Could not find video"``
    when the glob matches nothing.
    """
    mp4list = glob.glob('video/*.mp4')
    if not mp4list:
        print("Could not find video")
        return

    # The file is only read, so open it read-only (works on read-only
    # recordings too) and let the context manager close the handle.
    with open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())

    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))


def wrap_env(env):
    """Return ``env`` wrapped in a ``Monitor`` that writes to ``./video``.

    ``force=True`` is forwarded to ``Monitor`` (presumably so earlier
    recordings in that directory may be replaced — confirm against the
    gym documentation).
    """
    return Monitor(env, './video', force=True)

In [4]:
# HIDE OUTPUT
env = wrap_env(gym.make("Pendulum-v0"))

# Run a single episode with randomly sampled actions, rendering before
# every step, then show whatever was recorded.
observation = env.reset()
done = False

while not done:
    env.render()

    # your agent goes here
    action = env.action_space.sample()

    observation, reward, done, info = env.step(action)

env.close()
show_video()

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim

import random
from collections import deque

In [12]:
class PolicyNetwork(nn.Module):
    """Actor MLP that maps a state vector to an action vector.

    Layers: Linear(state, 64) -> Tanh -> Linear(64, 64) -> ReLU ->
    Linear(64, action). No activation is applied to the output.
    """

    def __init__(self, state_space_dim, action_space_dim):
        super().__init__()

        hidden_width = 64
        layers = [
            nn.Linear(state_space_dim, hidden_width),
            nn.Tanh(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, action_space_dim),
        ]
        # One Sequential stored as `network`, so parameters are named
        # `network.<index>.weight` / `network.<index>.bias`.
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP and return the raw output."""
        action = self.network(x)
        return action


In [13]:
class QNetwork(nn.Module):
    """Critic MLP that scores a (state, action) pair with one value per row.

    Layers: Linear(state + action, 64) -> ReLU -> Linear(64, 64) -> ReLU ->
    Linear(64, 1).
    """

    def __init__(self, state_space_dim, action_space_dim):
        super().__init__()

        hidden_width = 64
        input_width = state_space_dim + action_space_dim
        self.network = nn.Sequential(
            nn.Linear(input_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, 1),
        )

    def forward(self, state, action):
        """Concatenate ``state`` and ``action`` along dim 1 and apply the MLP."""
        state_action = torch.cat((state, action), dim=1)
        return self.network(state_action)


In [8]:
class Memory:
    """Bounded replay buffer of (state, action, reward, next_state, done) tuples."""

    def __init__(self, max_size):
        # A deque with maxlen discards its oldest entry once it is full.
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        """Append one transition; the reward is stored as ``np.array([reward])``."""
        self.buffer.append((state, action, np.array([reward]), next_state, done))

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions with ``random.sample``.

        Returns five parallel lists, in the order states, actions, rewards,
        next states, done flags.
        """
        transitions = random.sample(self.buffer, batch_size)

        state_batch = [transition[0] for transition in transitions]
        action_batch = [transition[1] for transition in transitions]
        reward_batch = [transition[2] for transition in transitions]
        next_state_batch = [transition[3] for transition in transitions]
        done_batch = [transition[4] for transition in transitions]

        return state_batch, action_batch, reward_batch, next_state_batch, done_batch

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)


In [9]:
class DdpgAgent:
    """DDPG agent: actor, critic, a target copy of each, and a replay buffer.

    The actor is a ``PolicyNetwork`` and the critic a ``QNetwork``; each
    online network has its own Adam optimizer, and transitions are kept in
    a ``Memory`` instance exposed as ``self.memory``.
    """

    def __init__(self, env, gamma=0.99, tau=1e-2, max_memory_size=10000):
        """Build the networks, optimizers and replay buffer for ``env``.

        Args:
            env: Environment whose ``observation_space.shape[0]`` and
                ``action_space.shape[0]`` give the state and action sizes.
            gamma: Discount factor applied to the bootstrapped target value.
            tau: Weight of the online parameters in each soft target update.
            max_memory_size: Capacity handed to ``Memory``.
        """
        # Params
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau

        # Networks (constructed in this order so seeded initialisation is stable)
        self.actor = PolicyNetwork(self.num_states, self.num_actions)
        self.actor_target = PolicyNetwork(self.num_states, self.num_actions)
        self.q = QNetwork(self.num_states, self.num_actions)
        self.q_target = QNetwork(self.num_states, self.num_actions)

        # Target networks start as exact copies of the online networks.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.q_target.load_state_dict(self.q.state_dict())

        # Training
        self.q_criterion = nn.MSELoss()
        self.memory = Memory(max_memory_size)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optimizer = optim.Adam(self.q.parameters(), lr=1e-3)

    @staticmethod
    def _to_float_tensor(batch):
        """Stack a list of equally shaped arrays into one float32 tensor."""
        # Converting through a single ndarray avoids the slow element-wise
        # path PyTorch takes for a Python list of ndarrays.
        return torch.as_tensor(np.asarray(batch, dtype=np.float32))

    def _soft_update(self, target_net, source_net):
        """Blend ``source_net`` into ``target_net``: tau * source + (1 - tau) * target."""
        with torch.no_grad():
            for target_param, param in zip(target_net.parameters(), source_net.parameters()):
                target_param.copy_(param * self.tau + target_param * (1.0 - self.tau))

    def choose_action(self, state, sd):
        """Return the actor's output for ``state`` plus ``sd``-scaled Gaussian noise.

        Args:
            state: Observation convertible to a float32 tensor.
            sd: Multiplier on standard-normal noise; ``0.0`` gives the
                actor's output unchanged.

        Returns:
            The (unclipped) action as a NumPy array.
        """
        state = torch.as_tensor(state, dtype=torch.float32)
        with torch.no_grad():
            net_out = self.actor(state)
            # Noise is always drawn, even for sd == 0, so the RNG stream
            # advances identically regardless of sd.
            eps = torch.randn_like(net_out)
            action = net_out + sd * eps

        return action.cpu().numpy()

    def update(self, batch_size):
        """Do one actor step, one critic step and a soft update of both targets.

        Returns immediately (``None``) while the replay buffer holds fewer
        than ``batch_size`` transitions.
        """
        if len(self.memory) < batch_size:
            return

        # NOTE(review): the sampled `done` flags are discarded, so the target
        # always bootstraps from the next state. That is only appropriate when
        # `done` marks a time-limit cut-off rather than a true terminal state —
        # confirm for the environment in use.
        states, actions, rewards, next_states, _ = self.memory.sample(batch_size)
        states = self._to_float_tensor(states)
        actions = self._to_float_tensor(actions)
        rewards = self._to_float_tensor(rewards)
        next_states = self._to_float_tensor(next_states)

        # Bootstrapped target from the target networks; no gradient needed.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            next_q = self.q_target(next_states, next_actions)
            q_prime = rewards + self.gamma * next_q

        # Critic loss
        q_vals = self.q(states, actions)
        critic_loss = self.q_criterion(q_vals, q_prime)

        # Actor loss: maximise the critic's value of the actor's own actions.
        policy_loss = -self.q(states, self.actor(states)).mean()

        # update networks
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # zero_grad also clears the critic gradients that the policy-loss
        # backward pass left behind, so only critic_loss drives this step.
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # update target networks
        self._soft_update(self.actor_target, self.actor)
        self._soft_update(self.q_target, self.q)



In [None]:
env = gym.make("Pendulum-v0")
total_episodes = 100

# Run settings, named so they are easy to find and tune.
seed = 0
max_steps = 200        # step cap per episode
batch_size = 64        # transitions sampled per agent.update call
exploration_sd = 0.1   # scale of the Gaussian noise added to each action

# Seed every RNG the run draws from *before* the agent is built, so network
# initialisation, exploration noise and replay sampling are repeatable.
# `env.seed` is the seeding call of the older gym API this notebook uses
# (4-tuple `env.step`).
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
env.seed(seed)

agent = DdpgAgent(env)

rewards = []          # total reward of each episode
smooth_rewards = []   # mean of (up to) the last 10 episode rewards

for episode in tqdm(range(total_episodes)):
    state = env.reset()
    episode_reward = 0

    for step in range(max_steps):
        action = agent.choose_action(state, sd=exploration_sd)

        new_state, reward, done, info = env.step(action)

        # Store the transition first, then learn from a sampled batch.
        agent.memory.push(state, action, reward, new_state, done)
        agent.update(batch_size)

        episode_reward += reward

        if done:
            break

        state = new_state

    rewards.append(episode_reward)
    smooth_rewards.append(np.mean(rewards[-10:]))

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Plot the raw per-episode reward together with its trailing 10-episode mean.
fig, ax = plt.subplots()
ax.plot(rewards, label="episode reward")
ax.plot(smooth_rewards, label="mean of last 10 episodes")
ax.set(title="DDPG on Pendulum-v0", xlabel="episode", ylabel="total reward")
ax.legend()
plt.show()

In [18]:
env = wrap_env(gym.make("Pendulum-v0"))

# Run one episode with the trained agent, rendering before every step,
# then show whatever was recorded.
observation = env.reset()
done = False

while not done:
    env.render()

    # A noise scale of 0.0 means the action is the actor's output as-is.
    action = agent.choose_action(observation, 0.0)

    observation, reward, done, info = env.step(action)

env.close()
show_video()