In [7]:
import numpy as np
import scipy.signal

import torch
import torch.nn as nn


In [8]:
import math
import random
from copy import deepcopy
from this import d
import numpy as np
import torch
from torch.optim import Adam
import gym
import time
from collections import namedtuple, deque

In [9]:
env = gym.make('Reacher-v2')
test_env = gym.make('Reacher-v2')
device= torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device = ', device)

device =  cuda


In [4]:
def combined_shape(length, shape=None):
    if shape is None:
        return (length,)
    return (length, shape) if np.isscalar(shape) else (length, *shape)

def mlp(sizes, activation, output_activation=nn.Identity):
    layers = []
    for j in range(len(sizes)-1):
        act = activation if j < len(sizes)-2 else output_activation
        layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
    return nn.Sequential(*layers)

def count_vars(module):
    return sum([np.prod(p.shape) for p in module.parameters()])

class MLPActor(nn.Module):

    def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit):
        super().__init__()
        pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim]
        self.pi = mlp(pi_sizes, activation, nn.Tanh)
        self.act_limit = act_limit

    def forward(self, obs):
        # Return output from network scaled to action space limits.
        return self.act_limit * self.pi(obs)

class MLPQFunction(nn.Module):

    def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
        super().__init__()
        self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation)

    def forward(self, obs, act):
        q = self.q(torch.cat([obs, act], dim=-1))
        return torch.squeeze(q, -1) # Critical to ensure q has right shape.

class MLPActorCritic(nn.Module):

    def __init__(self, observation_space, action_space, hidden_sizes=(256,256),
                 activation=nn.ReLU, device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
        super().__init__()

        obs_dim = observation_space.shape[0]
        act_dim = action_space.shape[0]
        act_limit = action_space.high[0]

        # build policy and value functions
        self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit).to(device)
        self.q = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation).to(device)

    def act(self, obs):
        with torch.no_grad():
            return self.pi(obs)

In [5]:
Transition = namedtuple('Transition',
                        ('obs', 'act', 'rew', 'next_obs', 'done'))

class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [6]:
hid = 256
l = 2


ac_kwargs=dict(hidden_sizes=[hid]*l)
seed=0
steps_per_epoch=3000
epochs=100
replay_size=int(1e6)
gamma=0.99
polyak=0.995
pi_lr=1e-4
q_lr=1e-4
batch_size=500
start_steps=10000 
update_after=1000
update_every=50
act_noise=0.01
num_test_episodes=10
max_ep_len=1000



torch.manual_seed(seed)
np.random.seed(seed)
obs_dim = env.observation_space.shape[0]

print('obs_dim = ', obs_dim)
act_dim = env.action_space.shape[0]
print('act_dim = ', act_dim)
# Action limit for clamping: critically, assumes all dimensions share the same bound!
act_limit = env.action_space.high[0]

# Create actor-critic module and target networks
ac = MLPActorCritic(env.observation_space, env.action_space, **ac_kwargs)
ac_targ = deepcopy(ac)

# Freeze target networks with respect to optimizers (only update via polyak averaging)
for p in ac_targ.parameters():
    p.requires_grad = False

memory = ReplayMemory(replay_size)

# Count variables (protip: try to get a feel for how different size networks behave!)
var_counts = tuple(count_vars(module) for module in [ac.pi, ac.q])

# Set up function for computing DDPG Q-loss
def compute_loss_q(data):

    o = torch.cat(data.obs).float()
    a = torch.cat(data.act).float()
    r = torch.cat(data.rew).float()
    o2 =torch.cat(data.next_obs).float()
    d = torch.cat(data.done).float()

    q = ac.q(o,a)


    # Bellman backup for Q function
    with torch.no_grad():
        q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2))
        backup = r + gamma * (1 - d) * q_pi_targ

    # MSE loss against Bellman backup
    loss_q = ((q - backup)**2).mean()

    return loss_q

# Set up function for computing DDPG pi loss
def compute_loss_pi(data):

    o = torch.cat(data.obs).float()

    q_pi = ac.q(o, ac.pi(o))

    return -q_pi.mean()

pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
q_optimizer = Adam(ac.q.parameters(), lr=q_lr)

def update(data):
    # First run one gradient descent step for Q.


    q_optimizer.zero_grad()
    loss_q = compute_loss_q(data)

    loss_q.backward()

    q_optimizer.step()


    # Freeze Q-network so you don't waste computational effort 
    # computing gradients for it during the policy learning step.
    for p in ac.q.parameters():
        p.requires_grad = False

    # Next run one gradient descent step for pi.
    pi_optimizer.zero_grad()
    loss_pi = compute_loss_pi(data)
    loss_pi.backward()
    pi_optimizer.step()

    # Unfreeze Q-network so you can optimize it at next DDPG step.
    for p in ac.q.parameters():
        p.requires_grad = True


    # Finally, update target networks by polyak averaging.
    with torch.no_grad():
        for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
            # NB: We use an in-place operations "mul_", "add_" to update target
            # params, as opposed to "mul" and "add", which would make new tensors.
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)


def get_action(o, noise_scale):
    a = ac.act(torch.tensor(o, dtype=torch.float32, device=device))
    a += noise_scale * torch.randn(act_dim).to(device)
    return torch.clip(a, -act_limit, act_limit)

def test_agent():
    for j in range(num_test_episodes):
        o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
        while not(d or (ep_len == max_ep_len)):
            # Take deterministic actions at test time (noise_scale=0)
            a_cpu = get_action(o, 0).cpu().data.numpy()
            o, r, d, _ = test_env.step(a_cpu)
            ep_ret += r
            ep_len += 1




# Prepare for interaction with environment
total_steps = steps_per_epoch * epochs
start_time = time.time()
o, ep_ret, ep_len = env.reset(), 0, 0
o = torch.tensor([o], device=device)


# Main loop: collect experience in env and update/log each epoch
for t in range(total_steps):


    # Until start_steps have elapsed, randomly sample actions
    # from a uniform distribution for better exploration. Afterwards, 
    # use the learned policy (with some noise, via act_noise). 
    if t > start_steps:
        a = get_action(o, act_noise)
    else:
        a = env.action_space.sample()
        a = torch.tensor([a], device=device)
    a_cpu = get_action(o, 0).cpu().data.numpy()
    # Step the env
    o2, r, d, _ = env.step(a_cpu)
    ep_ret += r
    ep_len += 1
    ep_ret_main = ep_ret/ep_len


    # Ignore the "done" signal if it comes from hitting the time
    # horizon (that is, when it's an artificial terminal signal
    # that isn't based on the agent's state)
    d = False if ep_len==max_ep_len else d

    # a = torch.tensor([a], device=device)
    o2 = torch.tensor([o2], device=device)
    r = torch.tensor([r], device=device)
    d = torch.tensor([d], device=device)

    # Store experience to replay buffer
    memory.push(o, a, r, o2, d)
    # Super critical, easy to overlook step: make sure to update 
    # most recent observation!
    o=o2

    # End of trajectory handling
    if d or (ep_len == max_ep_len):
        o, ep_ret, ep_len = env.reset(), 0, 0
        o = torch.tensor([o], device=device)


    # Update handling
    if t >= update_after and t % update_every == 0:
        print('t = ', t)
        print('main od rewards = ', ep_ret_main)
        for i in range(update_every):

            transitions = memory.sample(batch_size)
            # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
            # detailed explanation). This converts batch-array of Transitions
            # to Transition of batch-arrays.
            batch = Transition(*zip(*transitions))

            # print('batch = ', str(batch))
            update(data=batch)

    # End of epoch handling
    if (t+1) % steps_per_epoch == 0:
        epoch = (t+1) // steps_per_epoch

        # Test the performance of the deterministic version of the agent.
        test_agent()


obs_dim =  11
act_dim =  2


  o = torch.tensor([o], device=device)
  a = ac.act(torch.tensor(o, dtype=torch.float32, device=device))


t =  1000
main od rewards =  -0.39225072074416983
t =  1050
main od rewards =  -0.4189915411928245
t =  1100
main od rewards =  -0.3316896402893972
t =  1150
main od rewards =  -0.619270515888225
t =  1200
main od rewards =  -1.0872440833218924
t =  1250
main od rewards =  -1.0428896960137006
t =  1300
main od rewards =  -1.1284602238202552
t =  1350
main od rewards =  -1.0666593072279789
t =  1400
main od rewards =  -1.2504264116329733
t =  1450
main od rewards =  -1.2069146208856154
t =  1500
main od rewards =  -1.592110906673233
t =  1550
main od rewards =  -1.886543932385081
t =  1600
main od rewards =  -1.8076372298501318
t =  1650
main od rewards =  -1.9317593043480152
t =  1700
main od rewards =  -1.9955999531008408
t =  1750
main od rewards =  -1.8474744672041892
t =  1800
main od rewards =  -1.854689505652942
t =  1850
main od rewards =  -2.0243859194717717
t =  1900
main od rewards =  -1.984808734219329
t =  1950
main od rewards =  -2.0214688553435805
t =  2000
main od reward

t =  9450
main od rewards =  -2.185223021064987
t =  9500
main od rewards =  -2.2461417988814483
t =  9550
main od rewards =  -2.1498866943644144
t =  9600
main od rewards =  -2.2866247083911326
t =  9650
main od rewards =  -2.2729645511613006
t =  9700
main od rewards =  -2.1366899463114835
t =  9750
main od rewards =  -2.2506834630605344
t =  9800
main od rewards =  -2.139110674413405
t =  9850
main od rewards =  -2.2895278979380764
t =  9900
main od rewards =  -2.2722197680447787
t =  9950
main od rewards =  -2.2824051193186143
t =  10000
main od rewards =  -2.3059103210444674
t =  10050
main od rewards =  -2.236121005770428
t =  10100
main od rewards =  -2.1399204947160566
t =  10150
main od rewards =  -2.176090521099632
t =  10200
main od rewards =  -2.193799999767783
t =  10250
main od rewards =  -2.25189131448729
t =  10300
main od rewards =  -2.10549664988322
t =  10350
main od rewards =  -2.0922338966611917
t =  10400
main od rewards =  -2.245843153999257
t =  10450
main od re

KeyboardInterrupt: 