In [1]:
import os
import gym
from tqdm import trange

import matplotlib.pyplot as plt 
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
from torch.utils.tensorboard import SummaryWriter

# Notebook switch: when True, the final cell launches TensorBoard on the logged run.
open_tensorboard = True

### Network
Created: Jan 4 2022

In [2]:
class actor(nn.Module):
    '''
    Gaussian policy network: maps a state to a Normal distribution over actions.
    '''

    def __init__(self, input_dim, output_dim):
        '''
        Build the shared trunk and the two output heads.

        args:
        input_dim (int): size of the state vector
        output_dim (int): size of the action vector

        '''
        super().__init__()

        # Trunk: Linear -> ReLU pairs with widths 64, 128, 128.
        trunk = []
        in_features = input_dim
        for width in (64, 128, 128):
            trunk.append(nn.Linear(in_features, width))
            trunk.append(nn.ReLU())
            in_features = width
        self.lin = nn.Sequential(*trunk)

        # Separate heads for the mean and for the (pre-activation) spread.
        self.mu_layer = nn.Linear(in_features, output_dim)
        self.std_layer = nn.Linear(in_features, output_dim)

    def forward(self, state):
        '''
        Compute the action distribution for a state and draw one action from it.

        args:
        state (torch.tensor): the state variable

        returns:
        (torch.tensor, Normal): the sampled action and the distribution it came from

        '''
        features = self.lin(state)

        # tanh squashes to [-1, 1]; the factor 2 rescales to the action space [-2, 2].
        mean = 2 * torch.tanh(self.mu_layer(features))

        # softplus is non-negative, so its exponential keeps std at 1 or above.
        spread = F.softplus(self.std_layer(features)).exp()

        policy = Normal(mean, spread)
        # sample() is not reparameterised: no gradient flows through the action.
        return policy.sample(), policy


class critic(nn.Module):
    '''
    State-value network: maps a state to a single scalar estimate.
    '''

    def __init__(self, input_dim):
        '''
        Build the value network.

        args:
        input_dim (int): size of the state vector

        '''
        super().__init__()

        # Hidden stack: Linear -> ReLU pairs with widths 64, 128, 128, 64.
        hidden = []
        in_features = input_dim
        for width in (64, 128, 128, 64):
            hidden.append(nn.Linear(in_features, width))
            hidden.append(nn.ReLU())
            in_features = width
        self.lin = nn.Sequential(*hidden)

        # Final projection to one value per state.
        self.output = nn.Linear(in_features, 1)

    def forward(self, state):
        '''
        Evaluate the value estimate of a state.

        args:
        state (torch.tensor): the state variable

        returns:
        torch.tensor: the value estimate, with a trailing dimension of size 1

        '''
        return self.output(self.lin(state))


In [3]:
class A2Cagent:
    '''
    Advantage actor-critic agent that learns online from one transition at a time.

    `self.transition` holds the data of the step in progress, in this order:
    [state, log_prob, entropy, next_state, reward, terminal]. The first three
    entries are written by `select_action`, the last three by `step`, and the
    whole list is consumed by `update`.
    '''

    def __init__(self, env, gamma, learning_rate, entropy_weight) -> None:
        '''
        init the agent

        paras:
            env (gym.Env): An openAI Gym environment
            gamma (float): discount factor
            learning_rate (tuple): learning rate for (actor_lr, critic_lr)
            entropy_weight (float): weight of the entropy bonus in the actor loss

        '''
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )

        self.env = env
        self.gamma = gamma
        self.entropy_weight = entropy_weight
        self.lr = learning_rate

        # init the networks (actor first, then critic)
        obs_dim = self.env.observation_space.shape[0]
        act_dim = self.env.action_space.shape[0]
        self.actor = actor(obs_dim, act_dim).to(self.device)
        self.critic = critic(obs_dim).to(self.device)

        # init the optimizers; the two networks use separate learning rates
        actor_lr, critic_lr = learning_rate[0], learning_rate[1]
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_opt = optim.Adam(self.critic.parameters(), lr=critic_lr)

        # init the transition buffer and the global update counter
        self.transition = []
        self.total_step = 0

        # init Tensorboard for visualization
        self.writer = SummaryWriter('result/a2c')

    def select_action(self, state: np.ndarray) -> np.ndarray:
        '''
        Select an action given a state

        Starts a new transition with the state tensor, the log-probability of
        the sampled action and the entropy of the policy at that state.

        returns:
            np.ndarray: the sampled action, clamped to [-2, 2]
        '''

        state = torch.FloatTensor(state).to(self.device)

        action, dist = self.actor(state)
        # Sum over action dimensions: joint log-prob / entropy of independent components.
        log_prob = dist.log_prob(action).sum(dim=-1)
        entropy = dist.entropy().sum(dim=-1)
        self.transition = [state, log_prob, entropy]

        return action.clamp(-2.0, 2.0).cpu().detach().numpy()

    def step(self, action):
        '''
        Take the action and record the information from env

        Accepts both step conventions: the 4-tuple (obs, reward, done, info)
        and the 5-tuple (obs, reward, terminated, truncated, info).

        returns:
            (next_state, reward, done): `done` is True when the episode ended
            for any reason, so the caller knows when to reset.
        '''
        outcome = self.env.step(action)

        if len(outcome) == 5:
            next_state, reward, terminated, truncated, _ = outcome
            done = bool(terminated or truncated)
            terminal = bool(terminated)
        else:
            next_state, reward, done, info = outcome
            # Gym's TimeLimit wrapper flags episodes cut off by the step limit.
            truncated = bool((info or {}).get("TimeLimit.truncated", False))
            terminal = bool(done) and not truncated

        # Store `terminal`, not `done`: a time-limit cut-off is not a real terminal
        # state, so the value of next_state must still be bootstrapped.
        self.transition.extend([next_state, reward, terminal])

        return next_state, reward, done

    def update(self):
        '''
        update the network and return the loss value

        Performs one critic step on the one-step TD error, then one actor step
        on the advantage-weighted log-probability with an entropy bonus.

        returns:
            (float, float): actor loss and critic loss
        '''

        state, log_prob, entropy, next_state, reward, terminal = self.transition
        next_state = torch.FloatTensor(next_state).to(self.device)

        # One-step TD target; it is a constant for both losses, so no graph is built.
        not_terminal = 0.0 if terminal else 1.0
        with torch.no_grad():
            target_function = (
                float(reward) + self.gamma * self.critic(next_state) * not_terminal
            )

        pred_function = self.critic(state)
        critic_loss = F.smooth_l1_loss(pred_function, target_function)

        # update critic network
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # Advantage uses the value predicted before the critic step above.
        advantage = (target_function - pred_function).detach()

        # The loss is minimised, so the entropy bonus is subtracted: higher
        # entropy lowers the loss and keeps the policy exploring.
        actor_loss = (-advantage * log_prob - self.entropy_weight * entropy).mean()

        # update actor network
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        return actor_loss.item(), critic_loss.item()

    def _reset_env(self):
        '''
        Reset the environment and return only the observation
        (some Gym versions return an (observation, info) tuple).
        '''
        result = self.env.reset()
        if isinstance(result, tuple):
            return result[0]
        return result

    def train(self, num_steps):
        '''
        Training

        Runs `num_steps` environment steps with one network update per step.
        Losses are logged per step and the episode return per finished episode,
        both to TensorBoard and to `actor_losses` / `critic_losses` / `scores`.
        '''
        self.actor_losses, self.critic_losses, self.scores = [], [], []
        state = self._reset_env()
        num_done = 0
        score = 0

        with trange(num_steps) as pbar:
            for idx in pbar:
                action = self.select_action(state)
                next_state, reward, done = self.step(action)
                actor_loss, critic_loss = self.update()
                self.total_step += 1

                # record the losses
                self.actor_losses.append(actor_loss)
                self.critic_losses.append(critic_loss)
                self.writer.add_scalar("Actor_Losses", actor_loss, idx)
                self.writer.add_scalar("Critic_Losses", critic_loss, idx)

                score += reward

                if done:
                    state = self._reset_env()
                    self.scores.append(score)
                    self.writer.add_scalar("Scores", score, num_done)
                    num_done += 1
                    score = 0
                else:
                    state = next_state

        # add all data to tensorboard
        self.writer.flush()


In [4]:
# --- Training hyperparameters ---
NUM_EPISODES = 100
NUM_STEPS = 100_000          # total environment steps (one update per step)
GAMMA = 0.9                  # discount factor
ENTROPY_WEIGHT = 0.01        # weight of the entropy term in the actor loss

# The actor and the critic are optimised with separate learning rates.
ACTOR_LEARNING_RATE, CRITIC_LEARNING_RATE = 1e-4, 1e-3
LEARNING_RATE = (ACTOR_LEARNING_RATE, CRITIC_LEARNING_RATE)

In [5]:
# Create the Gym environment and the agent that will be trained on it.
env = gym.make('Pendulum-v0')
agent = A2Cagent(
    env=env,
    gamma=GAMMA,
    learning_rate=LEARNING_RATE,
    entropy_weight=ENTROPY_WEIGHT,
)

In [6]:
# Run the online training loop for the configured number of environment steps.
agent.train(num_steps=NUM_STEPS)

100%|██████████████████████████████████| 100000/100000 [05:05<00:00, 327.24it/s]


In [None]:
# Serve the logged run with TensorBoard (same directory the agent's SummaryWriter writes to).
# NOTE: `!tensorboard` runs in the foreground, so this cell does not finish until the
# server is stopped (the server output below says "Press CTRL+C to quit").
if open_tensorboard :
    !tensorboard --logdir result/a2c

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.6.0 at http://localhost:6006/ (Press CTRL+C to quit)
