In [1]:
import numpy as np
import time
import random
import gym
import math

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.multiprocessing as mp

import matplotlib.pyplot as plt

from IPython.display import clear_output
%matplotlib inline

### Checking for cuda device

In [2]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = 'cpu'
print(device)

cpu


In [3]:
env = gym.make('HalfCheetah-v2')
print('Observation Shape:', env.observation_space.shape, '\nAction Shape:', env.action_space)

Observation Shape: (17,) 
Action Shape: Box(6,)


In [4]:
LEARNING_RATE = 0.001
DISCOUNT = 0.99
EPS = 1
EPS_DECAY = 0.9999
END_EPS = 0.1

N_EPISODE = 3000

UPDATE_GLOBAL = 100


# Dimensions of input and output of environment
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

In [5]:
class ActorCritic(nn.Module):
    def __init__(self, observations, actions):
        super(ActorCritic, self).__init__()
        self.a = nn.Linear(observations, 128)
        self.mean = nn.Linear(128, actions)
        self.variance = nn.Linear(128, actions)
        self.acti_tanh = nn.Tanh()

        self.c = nn.Linear(observations, 128)
        self.value = nn.Linear(128, actions)

    def forward(self, x):
        a = F.relu6(self.a(x))
        mean = 2*self.acti_tanh(self.mean(a))
        variance = F.softplus(self.variance(a)) + 0.0001    # to avoid zero value

        c = F.relu6(self.c(x))
        value = self.value(c)

        return mean, variance, value
    
    def actor_action(self, state):
        mean, variance, _ = self.forward(state)
        
        m = torch.distributions.Normal(mean, torch.sqrt(variance))
        action = m.sample()
        log_prob = m.log_prob(action)
        entropy = 0.5 + 0.5*math.log(2*math.pi) + torch.log(m.scale)
        return action.detach().cpu().numpy(), log_prob, entropy

In [6]:
def compute_returns(local_net, next_state, rewards, done, discount = DISCOUNT):
#     q_val = critic(state)

    next_state = torch.FloatTensor(next_state).to(device)
    _,_, next_q_val = local_net(next_state)
    returns = []
#     print(next_q_val, rewards)
    for step in reversed(range(len(rewards))):
        next_q_val = rewards[step] + discount*next_q_val*(1-done[step])
        returns.append(next_q_val)
        
    returns.reverse()
    
    return returns  

In [7]:
def update(log_probs, q_vals, values, entropies, local_net):

#     print('Entropy:',entropies)
#     print('log_probs', log_probs)
    advantage = q_vals - values

    critic_loss = advantage.pow(2)

    actor_loss = -(log_probs*advantage.detach())
    
    total_loss = (actor_loss+critic_loss).mean()
    
    optimizer.zero_grad()
    total_loss.backward()
    
    for lp, gp in zip(local_net.parameters(), global_net.parameters()):
        gp._grad = lp.grad
        
    optimizer.step()
    
    local_net.load_state_dict(global_net.state_dict())
    
    return total_loss

In [8]:
def record(global_ep, global_ep_r, ep_r, res_queue, name):
    with global_ep.get_lock():
        global_ep.value += 1
    with global_ep_r.get_lock():
        if global_ep_r.value == 0.:
            global_ep_r.value = ep_r
        else:
            global_ep_r.value = global_ep_r.value * 0.99 + ep_r * 0.01
    res_queue.put(global_ep_r.value)
    print(
        name,
        "Ep:", global_ep.value,
        "| Ep_r: %.0f" % global_ep_r.value,
    )

In [9]:
def Trainer(global_net, optimizer, name):
    name = 'trainer_' + str(name)
    env = gym.make('HalfCheetah-v2')
    local_net = ActorCritic(env.observation_space.shape[0], env.action_space.shape[0])
    
    total_steps = 0
    for i in range(1, N_EPISODE+1):
        ep_rewards = []
        log_probs = []
        done_states = []
        values = []
        entropies = []
        total_ep_reward = 0
        state = env.reset()
        
        while True:
            state = torch.FloatTensor(state).to(device)
            action, log_prob, entropy = local_net.actor_action(state)
            
            _,_, value = local_net(state)
            
            next_state, reward, done, _ = env.step(action)

            ep_rewards.append(torch.tensor(reward, dtype = torch.float, device = device))
            done_states.append(torch.tensor(done, dtype = torch.float, device = device))
            log_probs.append(log_prob)
            values.append(value)
            entropies.append(entropy)
            total_ep_reward += reward
            
#             if total_steps%UPDATE_GLOBAL == 0 or done:
                
            
            if done:
                break
                
            state = next_state
            total_steps += 1
        
#         queue.put(total_ep_reward)
        q_vals = compute_returns(local_net, next_state, ep_rewards, done_states)
                
        q_vals = torch.cat(q_vals)
        values = torch.cat(values)
        log_probs = torch.cat(log_probs)
        entropies = torch.cat(entropies)
                
        loss = update(log_probs, q_vals, values, entropies, local_net)
        if i%10 == 0:
            print(name,"Ep:", i,"| Ep_r: %.2f" % total_ep_reward, "|  Loss: %.2f" %loss)
#     queue.put(None)

In [10]:
global_net = ActorCritic(obs_dim, action_dim)
global_net.share_memory()
optimizer = optim.Adam(global_net.parameters(), lr = LEARNING_RATE)
# queue = mp.Queue()


processes = []

for i in range(mp.cpu_count()):
    p = mp.Process(target = Trainer, args = (global_net, optimizer, i))
    p.start()
    processes.append(p)
    
for p in processes:
    p.join()
# trainers = [Trainer(global_net, optimizer, queue, i) for i in range(mp.cpu_count())]
# [t.start() for t in trainers]


# res = []
# while True:
#     r = queue.get()
#     if r is not None:
#         res.append(r)
#     else:
#         break

# [t.join() for t in trainers]
# plt.plot(res)
# plt.ylabel('Ep_reward')
# plt.xlabel('Step')
# plt.show()

trainer_2 Ep: 10 | Ep_r: -2071.77 |  Loss: 32542.86
trainer_3 Ep: 10 | Ep_r: -1999.54 |  Loss: 30562.99
trainer_7 Ep: 10 | Ep_r: -2107.89 |  Loss: 33215.70
trainer_6 Ep: 10 | Ep_r: -2054.51 |  Loss: 31554.07
trainer_4 Ep: 10 | Ep_r: -2102.92 |  Loss: 33656.02
trainer_5 Ep: 10 | Ep_r: -2252.59 |  Loss: 37880.36
trainer_1 Ep: 10 | Ep_r: -2145.64 |  Loss: 34636.93
trainer_0 Ep: 10 | Ep_r: -2230.55 |  Loss: 36800.64
trainer_2 Ep: 20 | Ep_r: -2769.58 |  Loss: 49351.78
trainer_1 Ep: 20 | Ep_r: -2778.99 |  Loss: 49148.40
trainer_5 Ep: 20 | Ep_r: -2792.61 |  Loss: 49051.68
trainer_3 Ep: 20 | Ep_r: -2813.89 |  Loss: 49846.34
trainer_4 Ep: 20 | Ep_r: -2997.36 |  Loss: 56841.24
trainer_7 Ep: 20 | Ep_r: -2931.78 |  Loss: 53985.32
trainer_6 Ep: 20 | Ep_r: -2858.35 |  Loss: 50350.19
trainer_0 Ep: 20 | Ep_r: -2969.24 |  Loss: 55903.90
trainer_2 Ep: 30 | Ep_r: -2963.77 |  Loss: 42695.28
trainer_3 Ep: 30 | Ep_r: -2998.93 |  Loss: 43786.80
trainer_4 Ep: 30 | Ep_r: -2949.65 |  Loss: 40324.19
trainer_1 Ep

trainer_0 Ep: 200 | Ep_r: -2260.31 |  Loss: 2571.10
trainer_5 Ep: 200 | Ep_r: -2242.54 |  Loss: 2597.28
trainer_7 Ep: 210 | Ep_r: -2243.07 |  Loss: 2123.15
trainer_2 Ep: 210 | Ep_r: -2264.74 |  Loss: 2054.47
trainer_6 Ep: 210 | Ep_r: -2238.38 |  Loss: 1973.29
trainer_1 Ep: 210 | Ep_r: -2228.27 |  Loss: 1977.40
trainer_3 Ep: 210 | Ep_r: -2251.82 |  Loss: 1999.02
trainer_4 Ep: 210 | Ep_r: -2268.15 |  Loss: 2034.87
trainer_0 Ep: 210 | Ep_r: -2237.38 |  Loss: 2030.99
trainer_5 Ep: 210 | Ep_r: -2254.79 |  Loss: 2018.12
trainer_7 Ep: 220 | Ep_r: -2227.60 |  Loss: 2273.39
trainer_2 Ep: 220 | Ep_r: -2216.73 |  Loss: 2256.51
trainer_6 Ep: 220 | Ep_r: -2216.03 |  Loss: 2321.46
trainer_4 Ep: 220 | Ep_r: -2198.67 |  Loss: 2243.32
trainer_3 Ep: 220 | Ep_r: -2220.79 |  Loss: 2226.12
trainer_0 Ep: 220 | Ep_r: -2214.68 |  Loss: 2280.45
trainer_1 Ep: 220 | Ep_r: -2218.14 |  Loss: 2189.52
trainer_5 Ep: 220 | Ep_r: -2209.16 |  Loss: 2260.36
trainer_7 Ep: 230 | Ep_r: -2169.23 |  Loss: 2146.54
trainer_2 Ep

trainer_7 Ep: 400 | Ep_r: -1968.09 |  Loss: 2022.93
trainer_3 Ep: 400 | Ep_r: -1949.35 |  Loss: 1750.27
trainer_4 Ep: 400 | Ep_r: -1935.63 |  Loss: 1688.08
trainer_0 Ep: 400 | Ep_r: -1938.13 |  Loss: 1598.02
trainer_1 Ep: 410 | Ep_r: -1982.24 |  Loss: 1632.11
trainer_6 Ep: 410 | Ep_r: -1925.69 |  Loss: 1763.31
trainer_4 Ep: 410 | Ep_r: -1939.12 |  Loss: 1532.30
trainer_7 Ep: 410 | Ep_r: -1931.80 |  Loss: 1700.32
trainer_5 Ep: 410 | Ep_r: -1922.53 |  Loss: 1550.73
trainer_2 Ep: 410 | Ep_r: -1919.23 |  Loss: 1632.46
trainer_3 Ep: 410 | Ep_r: -1920.56 |  Loss: 1507.30
trainer_0 Ep: 410 | Ep_r: -1961.15 |  Loss: 1431.44
trainer_1 Ep: 420 | Ep_r: -1870.19 |  Loss: 1649.61
trainer_6 Ep: 420 | Ep_r: -1895.02 |  Loss: 1383.85
trainer_5 Ep: 420 | Ep_r: -1937.18 |  Loss: 1609.98
trainer_7 Ep: 420 | Ep_r: -1935.47 |  Loss: 1587.66
trainer_3 Ep: 420 | Ep_r: -1910.00 |  Loss: 1431.23
trainer_4 Ep: 420 | Ep_r: -1936.53 |  Loss: 1326.49
trainer_2 Ep: 420 | Ep_r: -1956.01 |  Loss: 1585.26
trainer_0 Ep

trainer_7 Ep: 600 | Ep_r: -1817.74 |  Loss: 1375.00
trainer_3 Ep: 600 | Ep_r: -1825.39 |  Loss: 1308.56
trainer_5 Ep: 600 | Ep_r: -1819.37 |  Loss: 1394.16
trainer_4 Ep: 600 | Ep_r: -1803.19 |  Loss: 1258.36
trainer_0 Ep: 600 | Ep_r: -1846.96 |  Loss: 1318.57
trainer_2 Ep: 600 | Ep_r: -1846.70 |  Loss: 1405.12


Process Process-4:


KeyboardInterrupt: 

Traceback (most recent call last):
  File "/home/himanshu/anaconda3/envs/rl/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/himanshu/anaconda3/envs/rl/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-f3c8cfd8d4af>", line 18, in Trainer
    action, log_prob, entropy = local_net.actor_action(state)
  File "<ipython-input-5-f3c54046eff6>", line 23, in actor_action
    mean, variance, _ = self.forward(state)
  File "<ipython-input-5-f3c54046eff6>", line 17, in forward
    c = F.relu6(self.c(x))
  File "/home/himanshu/anaconda3/envs/rl/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/himanshu/anaconda3/envs/rl/lib/python3.7/site-packages/torch/nn/modules/linear.py", line 87, in forward
    return F.linear(input, self.weight, self.bias)
  File "/home/himanshu/anaconda3/envs/rl/l

In [None]:
mean_avg = []
def plot(n_rewards):
    clear_output(True)
    plt.figure(figsize=(20,7))
    mean = np.mean(n_rewards[-20:])
#     plt.subplot(131)
    mean_avg.append(mean)
    plt.title('Reward: %s' % (mean))
    plt.plot(n_rewards)
    plt.plot(mean_avg)
#   plt.subplot(132)
#   plt.title('loss')
#   plt.plot(losses)
    plt.show()

In [None]:
n_reward = []
for i in range(100):
    state = env.reset()
    total_reward = 0
    done = False

#     actor.load_state_dict(torch.load('/home/himanshu/RL/Policy-based-RL/adv-actor.pth'))
#     critic.load_state_dict(torch.load('/home/himanshu/RL/Policy-based-RL/adv-critic.pth'))
#     print("Model Loaded")
#     actor.to(device)
#     critic.to(device)
    while not done:
        state = torch.FloatTensor(state).to(device)
        action, log_prob, _ = global_net.actor_action(state)

        next_state, reward, done, _ = env.step(action)
        total_reward += reward

#         env.render()
#         time.sleep(0.01)
        state = next_state
    n_reward.append(total_reward)
    plot(n_reward)
#     print('Duration till which pole is balanced: ', total_reward)

#     env.close()