In [1]:
import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd

# hyperparameters
hidden_size = 256
learning_rate = 3e-4

# Constants
GAMMA = 0.99
num_steps = 300
max_episodes = 3000

In [2]:

class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super(ActorCritic, self).__init__()

        self.num_actions = num_actions
        self.critic_linear1 = nn.Linear(num_inputs, hidden_size)
        self.critic_linear2 = nn.Linear(hidden_size, 1)

        self.actor_linear1 = nn.Linear(num_inputs, hidden_size)
        self.actor_linear2 = nn.Linear(hidden_size, num_actions)
    
    def forward(self, state):
        state = Variable(torch.from_numpy(state).float().unsqueeze(0))
        value = F.relu(self.critic_linear1(state))
        value = self.critic_linear2(value)
        
        policy_dist = F.relu(self.actor_linear1(state))
        policy_dist = F.softmax(self.actor_linear2(policy_dist), dim=1)

        return value, policy_dist

In [3]:

def a2c(env):
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.n
    
    actor_critic = ActorCritic(num_inputs, num_outputs, hidden_size)
    ac_optimizer = optim.Adam(actor_critic.parameters(), lr=learning_rate)

    all_lengths = []
    average_lengths = []
    all_rewards = []
    entropy_term = 0

    for episode in range(max_episodes):
        log_probs = []
        values = []
        rewards = []

        state = env.reset()
        for steps in range(num_steps):
            value, policy_dist = actor_critic.forward(state)
            value = value.detach().numpy()[0,0]
            dist = policy_dist.detach().numpy()

            action = np.random.choice(num_outputs, p=np.squeeze(dist))
            log_prob = torch.log(policy_dist.squeeze(0)[action])
            entropy = -np.sum(np.mean(dist) * np.log(dist))
            new_state, reward, done, _ = env.step(action)

            rewards.append(reward)
            values.append(value)
            log_probs.append(log_prob)
            entropy_term += entropy
            state = new_state
            
            if done or steps == num_steps-1:
                Qval, _ = actor_critic.forward(new_state)
                Qval = Qval.detach().numpy()[0,0]
                all_rewards.append(np.sum(rewards))
                all_lengths.append(steps)
                average_lengths.append(np.mean(all_lengths[-10:]))
                if episode % 10 == 0:                    
                    sys.stdout.write("episode: {}, reward: {}, total length: {}, average length: {} \n".format(episode, np.sum(rewards), steps, average_lengths[-1]))
                break
        
        # compute Q values
        Qvals = np.zeros_like(values)
        for t in reversed(range(len(rewards))):
            Qval = rewards[t] + GAMMA * Qval
            Qvals[t] = Qval
  
        #update actor critic
        values = torch.FloatTensor(values)
        Qvals = torch.FloatTensor(Qvals)
        log_probs = torch.stack(log_probs)
        
        advantage = Qvals - values
        actor_loss = (-log_probs * advantage).mean()
        critic_loss = 0.5 * advantage.pow(2).mean()
        ac_loss = actor_loss + critic_loss + 0.001 * entropy_term
        
        ac_optimizer.zero_grad()
        ac_loss.backward()
        ac_optimizer.step()
        
        #====================================================================================================
        #to see the change of the weights====================================================================
        #====================================================================================================
        print("=======================================")
        print("actor_critic.actor_fc1.weight : {}".format(actor_critic.actor_linear1.weight))
        print("actor_critic.critic_fc2.weight : {}".format(actor_critic.critic_linear1.weight))
        #====================================================================================================
    
    # Plot results
    smoothed_rewards = pd.Series.rolling(pd.Series(all_rewards), 10).mean()
    smoothed_rewards = [elem for elem in smoothed_rewards]
    plt.plot(all_rewards)
    plt.plot(smoothend_rewards)
    plt.plot()
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.show()

    plt.plot(all_lengths)
    plt.plot(average_lengths)
    plt.xlabel('Episode')
    plt.ylabel('Episode length')
    plt.show()

In [4]:
if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    a2c(env)    

episode: 0, reward: 16.0, total length: 15, average length: 15.0 
actor_critic.actor_fc1.weight : Parameter containing:
tensor([[-0.2298,  0.2770, -0.1036, -0.3950],
        [ 0.4676, -0.4538, -0.2897, -0.1354],
        [ 0.1991,  0.3025, -0.1911,  0.2680],
        ...,
        [ 0.4285, -0.3660, -0.0486, -0.3200],
        [-0.1304, -0.1343,  0.2468,  0.0368],
        [-0.0309,  0.3450,  0.1751,  0.4247]], requires_grad=True)
actor_critic.critic_fc2.weight : Parameter containing:
tensor([[-0.4247, -0.4649, -0.4878,  0.2030],
        [-0.1702,  0.2981, -0.3766,  0.1905],
        [-0.0766, -0.3158, -0.3515, -0.2254],
        ...,
        [-0.0020,  0.3974,  0.0123, -0.3448],
        [ 0.0831,  0.0301,  0.2025, -0.3015],
        [-0.0452, -0.2938, -0.1895, -0.4559]], requires_grad=True)
actor_critic.actor_fc1.weight : Parameter containing:
tensor([[-0.2301,  0.2770, -0.1034, -0.3950],
        [ 0.4679, -0.4539, -0.2900, -0.1353],
        [ 0.1988,  0.3024, -0.1908,  0.2680],
        ...,


KeyboardInterrupt: 