# Deep Reinforcement Learning in Action - Chapter 5 - extended and refactored to compare the actor-critic to the n-step actor-critic

In [1]:
from tqdm.notebook import trange

##### Listing 5.1

In [2]:
import multiprocessing as mp

import numpy as np
def square(x):
    return np.square(x)
x = np.arange(64)
print(x)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63]


In [3]:
mp.cpu_count()

12

In [4]:
if __name__ == '__main__': # added this line for process safety
    pool = mp.Pool(8)
    squared = pool.map(square, [x[8*i:8*i+8] for i in range(8)])
    squared

##### Listing 5.2

In [5]:
def square(i, x, queue):
    print("In process {}".format(i,))

queue = mp.Queue()
queue.put(np.square(x))
processes = []
if __name__ == '__main__': #adding this for process safety
    x = np.arange(64)
    for i in range(8):
        start_index = 8*i
        proc = mp.Process(target=square,args=(i,x[start_index:start_index+8],
                         queue)) 
        proc.start()
        processes.append(proc)

    for proc in processes:
        proc.join()

    for proc in processes:
        proc.terminate()

    results = []
    while not queue.empty():
        results.append(queue.get())

In process 0
In process 1
In process 2
In process 3
In process 4
In process 5
In process 6
In process 7


In [6]:
results

[array([   0,    1,    4,    9,   16,   25,   36,   49,   64,   81,  100,
         121,  144,  169,  196,  225,  256,  289,  324,  361,  400,  441,
         484,  529,  576,  625,  676,  729,  784,  841,  900,  961, 1024,
        1089, 1156, 1225, 1296, 1369, 1444, 1521, 1600, 1681, 1764, 1849,
        1936, 2025, 2116, 2209, 2304, 2401, 2500, 2601, 2704, 2809, 2916,
        3025, 3136, 3249, 3364, 3481, 3600, 3721, 3844, 3969])]

## Cartpole-v1 with actor-critic and N-step actor-critic trained distrubuted over CPU threads

In [7]:
import torch
from torch import nn
from torch import optim
import numpy as np
from torch.nn import functional as F
import gym
import torch.multiprocessing as mp

CPU_COUNT = mp.cpu_count()

class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.l1 = nn.Linear(4,25)
        self.l2 = nn.Linear(25,50)
        self.actor_lin1 = nn.Linear(50,2)
        self.l3 = nn.Linear(50,25)
        self.critic_lin1 = nn.Linear(25,1)
    def forward(self,x):
        x = F.normalize(x,dim=0)
        y = F.relu(self.l1(x))
        y = F.relu(self.l2(y))
        actor = F.log_softmax(self.actor_lin1(y),dim=0)
        c = F.relu(self.l3(y.detach()))
        critic = torch.tanh(self.critic_lin1(c))
        return actor, critic


def worker(t, worker_model, counter, params, N_steps):
    worker_env = gym.make("CartPole-v1")
    worker_env._max_episode_steps = 1000
    worker_env.reset()
    worker_opt = optim.Adam(lr=1e-4,params=worker_model.parameters())
    worker_opt.zero_grad()
    for i in range(params['epochs']):
        worker_opt.zero_grad()
        if N_steps > 1:
            values, logprobs, rewards, G = run_episode(worker_env,worker_model, N_steps)
            actor_loss,critic_loss,eplen = update_params(worker_opt,values,logprobs,rewards, G)
        else:
            values, logprobs, rewards = run_episode(worker_env,worker_model, N_steps)
            actor_loss,critic_loss,eplen = update_params(worker_opt,values,logprobs,rewards)            
        counter.value = counter.value + 1

        
def run_episode(worker_env, worker_model, N_steps=10):
    
    if N_steps > 1:
        raw_state = np.array(worker_env.env.state)
        state = torch.from_numpy(raw_state).float()
        values, logprobs, rewards = [],[],[]
        done = False
        j=0
        G=torch.Tensor([0])
        while (j < N_steps and done == False):
            j+=1
            policy, value = worker_model(state)
            values.append(value)
            logits = policy.view(-1)
            action_dist = torch.distributions.Categorical(logits=logits)
            action = action_dist.sample()
            logprob_ = policy.view(-1)[action]
            logprobs.append(logprob_)
            state_, _, done, info = worker_env.step(action.detach().numpy())
            state = torch.from_numpy(state_).float()
            if done:
                reward = -10
                worker_env.reset()
            else:
                reward = 1.0
                G = value.detach()
            rewards.append(reward)
        return values, logprobs, rewards, G

    else:
        state = torch.from_numpy(worker_env.env.state).float()
        values, logprobs, rewards = [],[],[]
        done = False
        j=0
        while (done == False):
            j+=1
            policy, value = worker_model(state)
            values.append(value)
            logits = policy.view(-1)
            action_dist = torch.distributions.Categorical(logits=logits)
            action = action_dist.sample()
            logprob_ = policy.view(-1)[action]
            logprobs.append(logprob_)
            state_, _, done, info = worker_env.step(action.detach().numpy())
            state = torch.from_numpy(state_).float()
            if done:
                reward = -10
                worker_env.reset()
            else:
                reward = 1.0
            rewards.append(reward)        
        return values, logprobs, rewards

    
def update_params(worker_opt,values,logprobs,rewards,G=None,clc=0.1,gamma=0.95):
    rewards = torch.Tensor(rewards).flip(dims=(0,)).view(-1)
    logprobs = torch.stack(logprobs).flip(dims=(0,)).view(-1)
    values = torch.stack(values).flip(dims=(0,)).view(-1)
    Returns = []
    
    # this is where G is ultimately used
    ret_ = G if G is not None else torch.Tensor([0])
    
    for r in range(rewards.shape[0]):
        ret_ = rewards[r] + gamma * ret_
        Returns.append(ret_)
    Returns = torch.stack(Returns).view(-1)
    Returns = F.normalize(Returns,dim=0)
    actor_loss = -1*logprobs * (Returns - values.detach())
    critic_loss = torch.pow(values - Returns,2)
    loss = actor_loss.sum() + clc*critic_loss.sum()
    loss.backward()
    worker_opt.step()
    return actor_loss, critic_loss, len(rewards)


def train_and_test(N_steps):
    """
    This will not record losses for plotting. 
    If you want to record losses, you'll need to create a multiprocessing shared array and
        modify the worker function to write each loss to it.
        See < https://docs.python.org/3/library/multiprocessing.html > 
    Alternatively, you could use process locks to safely write to a file.
    """
    
    MasterNode = ActorCritic()
    MasterNode.share_memory()
    processes = []
    params = {
        'epochs':1000,
        'n_workers':(CPU_COUNT-1),
    }
    counter = mp.Value('i',0)
    if __name__ == '__main__': #adding this for process safety
        for i in trange(params['n_workers']):
            p = mp.Process(target=worker, args=(i,MasterNode,counter,params,N_steps))
            p.start() 
            processes.append(p)
        for p in processes:
            p.join()
        for p in processes:
            p.terminate()

    print(counter.value,processes[1].exitcode)
    
    env = gym.make("CartPole-v1")
    env._max_episode_steps = 1000
    env.reset()

    last_loss = 0
    durations = []
    for i in trange(5000):
        state_ = np.array(env.env.state)
        state = torch.from_numpy(state_).float()
        logits,value = MasterNode(state)
        action_dist = torch.distributions.Categorical(logits=logits)
        action = action_dist.sample()
        state2, reward, done, info = env.step(action.detach().numpy())
        if done:
            duration = i - last_loss
            durations.append(duration)
            print(f"Lost after {duration} steps.")
            env.reset()
            last_loss = i
        state_ = np.array(env.env.state)
        state = torch.from_numpy(state_).float()
        env.render()

    env.close()
    print(f"durations: mean: {np.mean(durations)}, std: {np.std(durations)}")

In [8]:
train_and_test(1)

  0%|          | 0/11 [00:00<?, ?it/s]

10992 0


  0%|          | 0/5000 [00:00<?, ?it/s]

Lost after 121 steps.
Lost after 335 steps.
Lost after 358 steps.
Lost after 177 steps.
Lost after 230 steps.
Lost after 209 steps.
Lost after 173 steps.
Lost after 372 steps.
Lost after 322 steps.
Lost after 162 steps.
Lost after 98 steps.
Lost after 246 steps.
Lost after 118 steps.
Lost after 212 steps.
Lost after 209 steps.
Lost after 421 steps.
Lost after 152 steps.
Lost after 218 steps.
Lost after 155 steps.
Lost after 182 steps.
Lost after 355 steps.
durations: mean: 229.76190476190476, std: 91.62340351985598


In [9]:
train_and_test(10)

  0%|          | 0/11 [00:00<?, ?it/s]

10732 0


  0%|          | 0/5000 [00:00<?, ?it/s]

Lost after 292 steps.
Lost after 147 steps.
Lost after 472 steps.
Lost after 285 steps.
Lost after 171 steps.
Lost after 550 steps.
Lost after 248 steps.
Lost after 197 steps.
Lost after 203 steps.
Lost after 158 steps.
Lost after 189 steps.
Lost after 301 steps.
Lost after 886 steps.
Lost after 379 steps.
Lost after 169 steps.
Lost after 140 steps.
Lost after 138 steps.
durations: mean: 289.70588235294116, std: 188.56603296704137


In [10]:
train_and_test(20)

  0%|          | 0/11 [00:00<?, ?it/s]

10915 0


  0%|          | 0/5000 [00:00<?, ?it/s]

Lost after 205 steps.
Lost after 497 steps.
Lost after 556 steps.
Lost after 520 steps.
Lost after 250 steps.
Lost after 529 steps.
Lost after 262 steps.
Lost after 648 steps.
Lost after 206 steps.
Lost after 653 steps.
Lost after 524 steps.
durations: mean: 440.90909090909093, std: 166.3705910121107


In [11]:
train_and_test(30)

  0%|          | 0/11 [00:00<?, ?it/s]

10915 0


  0%|          | 0/5000 [00:00<?, ?it/s]

Lost after 168 steps.
Lost after 357 steps.
Lost after 237 steps.
Lost after 394 steps.
Lost after 223 steps.
Lost after 357 steps.
Lost after 158 steps.
Lost after 421 steps.
Lost after 286 steps.
Lost after 772 steps.
Lost after 588 steps.
Lost after 402 steps.
Lost after 258 steps.
Lost after 292 steps.
durations: mean: 350.92857142857144, std: 160.20186564176473


In [12]:
train_and_test(40)

  0%|          | 0/11 [00:00<?, ?it/s]

10949 0


  0%|          | 0/5000 [00:00<?, ?it/s]

Lost after 204 steps.
Lost after 865 steps.
Lost after 278 steps.
Lost after 332 steps.
Lost after 339 steps.
Lost after 272 steps.
Lost after 1000 steps.
Lost after 343 steps.
Lost after 281 steps.
Lost after 555 steps.
durations: mean: 446.9, std: 259.55594772611164


In [13]:
train_and_test(50)

  0%|          | 0/11 [00:00<?, ?it/s]

10960 0


  0%|          | 0/5000 [00:00<?, ?it/s]

Lost after 501 steps.
Lost after 546 steps.
Lost after 289 steps.
Lost after 360 steps.
Lost after 409 steps.
Lost after 327 steps.
Lost after 363 steps.
Lost after 453 steps.
Lost after 362 steps.
Lost after 283 steps.
Lost after 422 steps.
Lost after 305 steps.
Lost after 341 steps.
durations: mean: 381.61538461538464, std: 77.83368705282237
