In [2]:
import numpy as np
import time
import random
import gym

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import matplotlib.pyplot as plt

from IPython.display import clear_output
%matplotlib inline

In [3]:
env = gym.make('InvertedPendulum-v2')

print('Observation Shape:', env.observation_space.shape, '\nAction Shape:', env.action_space)

Observation Shape: (4,) 
Action Shape: Box(1,)


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [5]:
## Hyperparameters

BATCH_SIZE = 128
LEARNING_RATE = 0.001
DISCOUNT = 0.99
EPS = 1
EPS_DECAY = 0.9999
END_EPS = 0.1

N_EPISODE = 2000

obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

In [6]:
class Actor(nn.Module):
    def __init__(self, observations, actions):
        super(Actor, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(observations, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU()
        )
        self.l1 = nn.Linear(16, actions)
        self.l2 = nn.Linear(16, actions)
        
    def forward(self, x):
        x = self.actor(x)
        mean = self.l1(x)
        variance = F.softplus(self.l2(x))
        
        return mean, variance

In [7]:
actor = Actor(obs_dim, action_dim).to(device)
optimizerA = optim.Adam(actor.parameters())

In [8]:
def actors_action(state):
    state = torch.FloatTensor(state).to(device)
    
    mean, variance = actor(state)
    
    m = torch.distributions.Normal(mean, torch.sqrt(variance))
    action = m.sample()
    log_prob = m.log_prob(action)
    
    return action.detach().cpu().numpy(), log_prob

In [9]:
class Critic(nn.Module):
    def __init__(self, observations, actions):
        super(Critic, self).__init__()
        self.critic = nn.Sequential(
            nn.Linear(observations, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, actions)
        )
    
    def forward(self, x):
        return self.critic(x)

In [16]:
critic = Critic(obs_dim, action_dim).to(device)
optimizerC = optim.Adam(critic.parameters())
criterionC = nn.MSELoss().to(device)

In [11]:
def critics_action(state):
    global EPS, END_EPS, EPS_DECAY
    EPS = max(EPS*EPS_DECAY, END_EPS)
    
    if random.random()<EPS:
        action = env.action_space.sample()
    else:
        state = torch.FloatTensor(state).to(device)
        action = torch.argmax(critic(state).item())
    
    return action

In [15]:
def compute_returns(next_state, rewards, done, discount = DISCOUNT):
#     q_val = critic(state)

    next_state = torch.FloatTensor(next_state).to(device)
    next_q_val = critic(next_state)
    returns = []
    
    for step in reversed(range(len(rewards))):
        next_q_val = rewards[step] + discount*next_q_val*done[step]
        returns.append(next_q_val)
        
    returns.reverse()
    return returns
    

In [None]:
def ACupdate(log_probs, ret):
    optimizerA.zero_grad()
    optimizerC.zero_grad()
    
    actor_loss = 0
    critic_loss = 0
    
    for itr, log_prob in enumerate(log_probs):
        retr = ret[itr]
        actor_loss -= torch.sum(log_prob*retr)
        critic_loss += criterionC(values, retr)
        
    actor_loss.backward()
    critic_loss.backward()
    
    optimizerA.step()
    optimizerC.step()

In [14]:
n_rewards = []

for i in range(1, N_EPISODE+1):
    ep_rewards = []
    log_probs = []
    returns
    total_reward = 0
    done = False
    values = []
    
    state = env.reset()
    ret = 0
    
    while not done:
        action, log_prob = actors_action(state)
        value = critic(state)
        next_state, reward, done, _ = env.step(action)
        
        done = torch.tensor([done], dtype = torch.float, device = device)
        
        ep_rewards.append(torch.tensor([reward], dtype = torch.float, device = device))
        log_probs.append(log_prob)
        done_states.append(done)
        values.append(value)
        
        total_reward += reward
#         ret = compute_returns(next_state, reward, done, ret)
#         if i%5 == 0:
#             env.render()
        state = next_state
        
    ret = compute_returns(next_state, ep_rewards, done_states)
    
    ACupdate(log_probs, ret, values)
    
    n_rewards.append(total_reward)

NameError: name 'action' is not defined