# Actor-Critic Methods

In [1]:
import numpy as np

import matplotlib.pyplot as plt

import gym

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as distributions

## A3C (Asynchronous Advantage Actor-Critic)

In [2]:
# Run on the first CUDA device when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [3]:
class Actor(nn.Module):
    """Policy network: an MLP that maps a state to one logit per action.

    The module owns its own Adam optimizer and moves itself to the
    module-level ``device`` on construction.

    Args:
        input_dims: Number of input features of the first linear layer.
        hidden_dims: Sequence of hidden-layer widths; must be non-empty
            because its first and last entries size the input/output layers.
        output_dims: Number of logits produced by the output layer.
        lr: Learning rate passed to ``optim.Adam``.
    """

    def __init__(self, input_dims, hidden_dims, output_dims, lr):
        super().__init__()
        self.input_layer = nn.Linear(input_dims, hidden_dims[0])
        # Pair consecutive widths: (h0 -> h1), (h1 -> h2), ...
        layer_shapes = zip(hidden_dims[:-1], hidden_dims[1:])
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in layer_shapes]
        )
        self.output_layer = nn.Linear(hidden_dims[-1], output_dims)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.to(device)

    def forward(self, state):
        """Return the raw (unnormalised) logits for ``state``.

        ``state`` is converted to a float32 tensor on ``device`` first.
        """
        activations = torch.tensor(state, dtype=torch.float32).to(device)
        # Every layer except the last is followed by a ReLU.
        for layer in (self.input_layer, *self.hidden_layers):
            activations = F.relu(layer(activations))
        return self.output_layer(activations)

    def step(self, state):
        """Sample an action from the categorical policy at ``state``.

        Returns:
            A tuple ``(action, log_prob, entropy)``: the sampled action as a
            Python scalar, the log-probability tensor of that action, and the
            entropy tensor of the action distribution.
        """
        policy = distributions.Categorical(logits=self.forward(state))
        sampled = policy.sample()
        chosen_log_prob = policy.log_prob(sampled)
        policy_entropy = policy.entropy()
        return sampled.cpu().item(), chosen_log_prob, policy_entropy

In [4]:
class Critic(nn.Module):
    """Value network: an MLP that maps a state to a single scalar output.

    The module owns its own Adam optimizer and moves itself to the
    module-level ``device`` on construction.

    Args:
        input_dims: Number of input features of the first linear layer.
        hidden_dims: Sequence of hidden-layer widths; must be non-empty
            because its first and last entries size the input/output layers.
        lr: Learning rate passed to ``optim.Adam``.
    """

    def __init__(self, input_dims, hidden_dims, lr):
        super().__init__()
        self.input_layer = nn.Linear(input_dims, hidden_dims[0])
        # Pair consecutive widths: (h0 -> h1), (h1 -> h2), ...
        layer_shapes = zip(hidden_dims[:-1], hidden_dims[1:])
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in layer_shapes]
        )
        self.output_layer = nn.Linear(hidden_dims[-1], 1)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.to(device)

    def forward(self, state):
        """Return the critic's output for ``state`` (last dimension of size 1).

        ``state`` is converted to a float32 tensor on ``device`` first.
        """
        activations = torch.tensor(state, dtype=torch.float32).to(device)
        # Every layer except the last is followed by a ReLU.
        for layer in (self.input_layer, *self.hidden_layers):
            activations = F.relu(layer(activations))
        return self.output_layer(activations)

In [5]:
class A3C_Agent():
    """Actor-critic agent that learns from one recorded trajectory at a time.

    ``act`` records the log-probability, entropy and critic value of every
    step, ``add_reward`` records the matching reward, and ``learn`` turns the
    recorded trajectory into one actor update and one critic update.

    NOTE(review): despite the name, nothing in this class is asynchronous; it
    drives a single actor and a single critic from the calling thread.

    Args:
        state_dims: Size of the state vector fed to both networks.
        actor_hidden_dims: Hidden-layer widths of the actor.
        critic_hidden_dims: Hidden-layer widths of the critic.
        action_dims: Number of discrete actions (actor output size).
        gamma: Discount factor used when computing returns.
        beta: Weight of the entropy bonus in the actor loss.
        actor_lr: Learning rate of the actor's optimizer.
        critic_lr: Learning rate of the critic's optimizer.
    """

    def __init__(self,
                 state_dims,
                 actor_hidden_dims,
                 critic_hidden_dims,
                 action_dims,
                 gamma,
                 beta,
                 actor_lr,
                 critic_lr):
        self.actor = Actor(
            input_dims=state_dims,
            hidden_dims=actor_hidden_dims,
            output_dims=action_dims,
            lr=actor_lr)
        self.critic = Critic(
            input_dims=state_dims,
            hidden_dims=critic_hidden_dims,
            lr=critic_lr
        )
        self.gamma = gamma
        self.beta = beta
        self.reset()

    def reset(self):
        """Discard everything recorded for the current trajectory."""
        self.rewards = []
        self.values = []
        self.log_probs = []
        self.entropies = []

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return the discounted reward-to-go of every step as a list.

        Uses the backward recursion ``G_t = r_t + gamma * G_{t+1}`` so the
        whole trajectory is processed in a single O(n) pass.
        """
        returns = [0.0] * len(rewards)
        running = 0.0
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + gamma * running
            returns[t] = running
        return returns

    def learn(self):
        """Update actor and critic from the recorded trajectory, then reset.

        The actor minimises ``-(advantage * log_prob + beta * entropy)`` with
        the advantage detached; the critic minimises half the squared
        advantage. Calling this with no recorded rewards is a no-op (the
        buffers are still cleared).
        """
        if not self.rewards:
            # Nothing to learn from; stacking empty lists below would raise.
            self.reset()
            return

        log_probs = torch.vstack(self.log_probs).squeeze(1)
        values = torch.vstack(self.values).squeeze(1)
        entropies = torch.vstack(self.entropies).squeeze(1)

        # Build the targets on the same device as the critic's predictions.
        returns = torch.tensor(
            self._discounted_returns(self.rewards, self.gamma),
            dtype=torch.float32,
            device=values.device,
        )
        advantages = returns - values

        # Detach so the policy gradient does not flow into the critic.
        actor_loss = -(advantages.detach() * log_probs + self.beta * entropies).mean()

        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

        critic_loss = advantages.pow(2).mul(0.5).mean()
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        self.reset()

    def add_reward(self, reward):
        """Record the reward received for the most recent action."""
        self.rewards.append(reward)

    def act(self, state):
        """Sample an action for ``state`` and record what ``learn`` needs.

        Stores the action's log-probability, the policy entropy and the
        critic's value for ``state``; returns the sampled action.
        """
        action, log_prob, entropy = self.actor.step(state)
        self.log_probs.append(log_prob)
        self.entropies.append(entropy)
        self.values.append(self.critic(state))
        return action

In [44]:
class HogwildAdam(optim.Adam):
    """Adam subclass that prints each parameter's optimizer state on creation.

    Apart from that printout it currently behaves like ``optim.Adam``.
    NOTE(review): the name suggests it is meant to grow into a Hogwild-style
    optimizer with shared state; nothing of that is implemented here yet.

    Args:
        params: Iterable of parameters (or parameter groups) to optimize.
        lr: Learning rate.
        betas: Coefficients for the running averages of the gradient and
            its square.
        eps: Term added to the denominator for numerical stability.
        weight_decay: L2 penalty coefficient.
        amsgrad: Whether to use the AMSGrad variant.
    """

    def __init__(self, params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False):
        # Forward the caller's hyperparameters; previously the defaults were
        # hard-coded here, so every argument except ``params`` was ignored.
        super().__init__(
            params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad)
        # Debug aid: show the per-parameter state right after construction
        # (the recorded cell output shows an empty dict for each parameter).
        for param_group in self.param_groups:
            for parameter in param_group['params']:
                print(self.state[parameter])

In [45]:
# Smoke test: build a small actor and wrap its parameters in HogwildAdam.
# The optimizer stays the cell's final expression so its repr is displayed.
test = Actor(input_dims=10, hidden_dims=(10, 10), output_dims=10, lr=0.001)
HogwildAdam(params=test.parameters())

{}
{}
{}
{}
{}
{}


HogwildAdam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

In [6]:
# parameters

# Environment and training budget.
env_name = 'LunarLander-v2'
episodes = 10_000
# Training stops once the mean reward of the last 100 episodes exceeds this.
win_condition = 200

# Network sizes (hidden-layer widths).
actor_hidden_dims = (128, 64)
critic_hidden_dims = (256, 128)

# Optimisation: learning rates for the actor and the critic.
actor_alpha = 5e-4
critic_alpha = 7e-4

# Loss shaping: discount factor and entropy-bonus weight.
gamma = 0.99
beta = 1e-3

In [7]:
# main loop
# Train one agent on `env_name`, one learning step per finished episode, until
# the rolling mean reward exceeds `win_condition` or `episodes` is exhausted.
# Uses the gym API where `reset()` returns the observation and `step()`
# returns a 4-tuple.
env = gym.make(env_name)
state_dims = env.observation_space.shape[0]
action_dims = env.action_space.n 
agent = A3C_Agent(state_dims, 
                  actor_hidden_dims, 
                  critic_hidden_dims, 
                  action_dims, 
                  gamma, 
                  beta, 
                  actor_alpha,
                  critic_alpha)
reward_mean = []  # rolling mean reward, one entry per reported episode
rewards = []      # total reward of every episode played so far
# Start below any reachable value: with the previous initial value of 0 the
# reported "Best Mean" stayed at 0 for as long as the rolling mean was negative.
best_mean = float('-inf')
for episode in range(episodes):
    obs = env.reset()
    done = False
    reward_sum = 0
    while not done:
        action = agent.act(obs)
        new_obs, reward, done, _ = env.step(action)
        agent.add_reward(reward)
        obs = new_obs
        reward_sum += reward
    
    rewards.append(reward_sum)
    # Report only once more than 100 episodes have been played, so the mean
    # always covers a full window of the last 100 rewards.
    if episode > 100:
        mean = np.mean(rewards[-100:])
        if mean > best_mean:
            best_mean = mean
        reward_mean.append(mean)
        print(f'Episode: {episode}, Current Rewards: {reward_sum}, Reward Mean: {mean}, Best Mean: {best_mean}', 
             end='\r')
        if mean > win_condition:
            # Solved: stop without learning from this final episode.
            break
    agent.learn()

Episode: 1866, Current Rewards: -38.35780145951844, Reward Mean: -16.53960616165882, Best Mean: 5.1440667374994737333

KeyboardInterrupt: 

In [None]:
# Plot the rolling mean reward recorded by the training loop (one point per
# reported episode) and display the figure.
plt.plot(reward_mean)
plt.show()