In [1]:
import numpy as np
# import brica
import gym

from skimage.transform import resize
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical

In [2]:
# Function from https://github.com/ikostrikov/pytorch-a2c-ppo-acktr/blob/master/model.py
def initialize_parameters(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        m.weight.data.normal_(0, 1)
        m.weight.data *= 1 / torch.sqrt(m.weight.data.pow(2).sum(1, keepdim=True))
        if m.bias is not None:
            m.bias.data.fill_(0)

In [3]:
step_size = 100000
memory_size = 64
rollout_steps = 20
save_steps = 100
value_coeff = 0.5
entropy_coeff = 0.01
grad_norm_limit = 40
gamma = 0.99
lambd = 1.00
lr = 3e-4

In [4]:
class ACModel(nn.Module):
    '''
    Actor-Critic Model
    '''
    def __init__(self, action_num=2, memory_size=64, d_limit=0.5):
        super().__init__()

        self.memory_size = memory_size
        self.action_num = action_num

        # Define image embedding
        self.image_embedding_size = 2
        
        # Define memory
        self.memory_rnn = nn.GRUCell(self.image_embedding_size, self.memory_size)

       
        # Resize image embedding
        self.embedding_size = self.memory_size

        # Define actor's model
        self.actor = nn.Sequential(
            nn.Linear(self.embedding_size, 64),
            nn.Tanh(),
            nn.Linear(64, action_num),
        )
        self.actor_mu = nn.Sequential(
            nn.Linear(self.action_num, self.action_num),
            nn.Tanh()
        )
        self.actor_sigma = nn.Sequential(
            nn.Linear(self.action_num, self.action_num),
            nn.Softplus()
        )

        # Define critic's model
        self.critic = nn.Sequential(
            nn.Linear(self.embedding_size, 64),
            nn.Tanh(),
            nn.Linear(64, 1)
        )

        # Initialize parameters correctly
        self.apply(initialize_parameters)

    def forward(self, obs, memory):
        hidden = self.memory_rnn(obs, memory)
        embedding = hidden
        memory = hidden

        x = self.actor(embedding)
        mu = self.actor_mu(x)
        sigma = self.actor_sigma(x)
        dist_params = [mu, sigma]

        x = self.critic(embedding)
        value = x.squeeze(1)

        return dist_params, value, memory

In [5]:
class BG(object):
    def __init__(self, training=True, init_weight_path=None, use_cuda=False):
#         self.timing = brica.Timing(5, 1, 0)
        self.training = training
        self.total_steps = 0
        self.ep_rewards = [0.]
        self.cuda = use_cuda
        self.ac_model = ACModel()
        if init_weight_path is not None:
            self.ac_model.load_state_dict(torch.load(init_weight_path))
        self.optimizer = optim.Adam(self.ac_model.parameters(), lr=lr)
        if self.cuda: self.ac_model = self.ac_model.cuda()
        self.init_params()

    def __call__(self, inputs):
#         if 'from_environment' not in inputs:
#             raise Exception('BG did not recieve from Environment')
#         if 'from_pfc' not in inputs:
#             raise Exception('BG did not recieve from PFC')
#         if 'from_fef' not in inputs:
#             raise Exception('BG did not recieve from FEF')
        
        obs = inputs['from_fef']
        reward, done = inputs['from_environment']
        dones = [done]
        rewards = np.array([reward])

        # reset the LSTM state for done envs
        masks = (1. - torch.from_numpy(np.array(dones, dtype=np.float32).reshape(-1))).unsqueeze(1)
        if self.cuda: masks = masks.cuda()

        self.total_steps += 1
        self.ep_rewards = self.ep_rewards + rewards
        # if done:
        #     ep_rewards = 0
        rewards = torch.from_numpy(rewards).float().unsqueeze(1)
        if self.cuda: rewards = rewards.cuda()
        if self.prev_actions is not None:
            self.steps.append((
                rewards,
                masks,
                self.prev_actions.clone(),
                self.prev_policies,
                self.prev_values.clone()
            ))
        obs = np.expand_dims(obs, axis=0).astype("float32")
        obs = Variable(torch.tensor(obs))
#         print(obs.size())

        # network forward pasa
        dist_params, values, self.memory = self.ac_model(obs, self.memory)
        mu , sigma = dist_params
        policies = Normal(mu, sigma)
        actions = policies.sample()
        prob = policies.log_prob(actions)
        self.prev_actions = actions
        self.prev_policies = policies
        self.prev_values = values

        if self.total_steps % rollout_steps == 0 and self.training:
            self.update()
            self.init_params()
        
        if self.total_steps % save_steps == 0 and self.training:
            cur_weights = self.ac_model.state_dict()
#             torch.save(cur_weights, "./data/bg.pth")

        
        return actions.cpu().numpy().reshape(-1)
        

    def update(self):
        self.steps.append((None, None, None, None, self.prev_values.clone()))
        actions, values, returns, advantages, entropies = process_rollout(self.steps, self.cuda)
        # calculate action probabilities
        log_action_probs = self.prev_policies.log_prob(actions)

        policy_loss = (-log_action_probs * Variable(advantages)).sum()
        value_loss = (.5 * (values - Variable(returns)) ** 2.).sum()
        entropy_loss = (-1)*entropies.sum()
#         entropy_loss = (log_probs * probs).sum()

        loss = policy_loss + value_loss * value_coeff + entropy_loss * entropy_coeff
        loss.backward()

        nn.utils.clip_grad_norm(self.ac_model.parameters(), grad_norm_limit)
        self.optimizer.step()
        self.optimizer.zero_grad()
#         print("total step", self.total_steps)
#         print("Loss:", loss.data[0])
#         print("Return:", torch.mean(returns).data[0])
        

    def init_params(self):
        self.steps = []
        self.memory = torch.zeros(1, memory_size)
        if self.cuda: self.memory = self.memory.cuda()
        self.prev_actions = None
        self.prev_policies = None
        self.prev_values = None

In [6]:
def process_rollout(steps, cuda, num_workers=1):
    # bootstrap discounted returns with final value estimates
    _, _, _, _, last_values = steps[-1]
    returns = last_values.data

    advantages = torch.zeros(num_workers, 1)
    if cuda: advantages = advantages.cuda()

    out = [None] * (len(steps) - 1)
    out_actions = [None] * (len(steps) - 1)
    out_policies = [None] * (len(steps) - 1)
    out_values = [None] * (len(steps) - 1)
    out_returns = [None] * (len(steps) - 1)
    out_advantages = [None] * (len(steps) - 1)
    out_entropies = [None] * (len(steps) - 1)

    # run Generalized Advantage Estimation, calculate returns, advantages
    for t in reversed(range(len(steps) - 1)):
        rewards, masks, actions, policies, values = steps[t]
        _, _, _, _, next_values = steps[t + 1]

        returns = rewards + returns * gamma * masks

        deltas = rewards + next_values.data * gamma * masks - values.data
        advantages = advantages * gamma * lambd * masks + deltas
        
        out_actions[t] = actions
        out_entropies[t] = policies.entropy()
        out_policies[t] = policies
        out_values[t] = values
        out_returns[t] = returns
        out_advantages[t] = advantages

    # return data as batched Tensors, Variables
    out_actions = torch.cat(out_actions, dim=0)
    out_values = torch.cat(out_values, dim=0)
    out_returns = torch.cat(out_returns, dim=0)
    out_advantages = torch.cat(out_advantages, dim=0)
    out_entropies = torch.cat(out_entropies, dim=0)
    return (out_actions, out_values, out_returns, out_advantages, out_entropies)

In [7]:
env = gym.make("MountainCarContinuous-v0")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [8]:
obs = env.reset()
reward = 0
done = False
episode_reward = 0
episode_count = 0
step = 0

agent = BG()

for i in range(step_size):
    inputs = {}
    inputs['from_fef'] = obs
    inputs['from_environment'] = (reward, done)
    action = agent(inputs)
    obs, reward, done, _ = env.step(action)
    env.render()
    episode_reward += reward
    step += 1
    if done:
        obs = env.reset()
        print("episode reward={}".format(episode_reward))

        # Store log for tensorboard graph
        episode_count += 1
#         logger.log("episode_reward", episode_reward, episode_count)
            
        episode_reward = 0
        step = 0



episode reward=-53.901073329474215
episode reward=-1176.2990973812205
episode reward=-6400.248468657024
episode reward=-9954.48756541229
episode reward=-14413.643939915111
episode reward=-17395.142354084575
episode reward=-18547.0403373371
episode reward=-22166.558170443564
episode reward=-24924.99375387171
episode reward=-23576.960319049766
episode reward=-24038.312941941276
episode reward=-25562.250651363975
episode reward=-24986.590582880723
episode reward=-23668.48060844528
episode reward=-19141.03602959034
episode reward=-24793.387644529215
episode reward=-23811.635532418983
episode reward=-12787.151739147503
episode reward=-24155.38748681925
episode reward=-24964.774952392898
episode reward=-12429.21613582478
episode reward=-18019.73949442265
episode reward=-23708.3749800286
episode reward=-24292.510322589176
episode reward=-23633.06946201703
episode reward=-22720.015925285126
episode reward=-12423.431305826398
episode reward=-24555.13153167581
episode reward=-16969.55379245074
e