In [None]:
import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal, Categorical
from IPython.display import clear_output
import matplotlib.pyplot as plt

In [None]:
# Identifier of the Gym environment used throughout the notebook.
env_name = 'CartPole-v0'
# Single environment instance shared by the training loop and test_agent().
env = gym.make(env_name)

In [None]:
class RL(nn.Module):
    """Actor-critic network with two independent two-layer heads.

    The actor head maps an observation to the logits of a categorical
    distribution over actions; the critic head maps the same observation to
    a single value estimate. The heads share no parameters.

    Args:
        action_num: Number of discrete actions (width of the actor output).
        hidden_size: Width of the hidden layer in each head.
        state_dim: Number of features in one observation. Defaults to 4.
    """

    def __init__(self, action_num=2, hidden_size=256, state_dim=4):
        super(RL, self).__init__()
        # Layers are created actor-first, then critic, so that the parameter
        # initialisation obtained under a fixed seed does not change.
        self.fc_actor1 = nn.Linear(state_dim, hidden_size)
        self.fc_actor2 = nn.Linear(hidden_size, action_num)

        self.fc_critic1 = nn.Linear(state_dim, hidden_size)
        self.fc_critic2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the action distribution and the value estimate for `x`.

        Args:
            x: Float tensor whose last dimension has `state_dim` features.

        Returns:
            A `(dist, value)` tuple. `dist` is a `Categorical` built from the
            actor logits; `value` is the critic output, with a last dimension
            of size 1.
        """
        # Actor head: hidden ReLU layer, then unnormalised action logits.
        ax = F.relu(self.fc_actor1(x))
        dist = Categorical(logits=self.fc_actor2(ax))

        # Critic head: separate hidden ReLU layer, then a scalar value.
        cx = F.relu(self.fc_critic1(x))
        value = self.fc_critic2(cx)
        return dist, value

In [None]:
def calc_returns(rewards, gamma = 0.99):
    """Compute the discounted return-to-go for every step of one episode.

    Args:
        rewards: Per-step rewards in chronological order. Elements may be
            plain numbers or tensors; they only need to support `+` and
            multiplication by `gamma`.
        gamma: Discount factor applied once per step.

    Returns:
        A list with one entry per reward, where entry `t` equals
        `rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...`.
        An empty `rewards` yields an empty list.
    """
    returns = []
    delta = 0
    # Walk the episode backwards so each return reuses the one that follows it:
    # G_t = r_t + gamma * G_{t+1}.
    for reward in rewards[::-1]:
        delta = reward + gamma * delta
        returns.append(delta)
    # Appending and reversing once is linear; inserting at index 0 on every
    # step would shift the whole list each time.
    returns.reverse()
    return returns

def test_agent():
    """Play one episode with the current policy and return its total reward.

    Uses the module-level `env` and `rl_model`. Actions are sampled from the
    policy distribution, and no gradients are tracked while the episode runs.
    """
    episode_return = 0
    # Add a leading batch dimension so the observation matches the model input.
    state = torch.FloatTensor(env.reset()).unsqueeze(0)
    finished = False

    with torch.no_grad():
        while not finished:
            policy, _ = rl_model(state)
            chosen_action = policy.sample().cpu().item()
            next_obs, step_reward, finished, _info = env.step(chosen_action)

            episode_return += step_reward
            state = torch.FloatTensor(next_obs).unsqueeze(0)

    return episode_return

In [None]:
# Actor-critic network built with its default sizes.
rl_model = RL()
# Initial learning rate; the training loop rescales it as steps are consumed.
lr = 1e-3
# One Adam optimizer over all parameters, so actor and critic update together.
optimizer = optim.Adam(rl_model.parameters(), lr=lr)

In [None]:
# Total number of environment steps the training loop may consume.
max_steps = 100000
# Count of completed training episodes.
rollouts = 0
# Environment steps taken so far, accumulated across all training episodes.
step = 0
# Mean evaluation reward, appended by the training loop every 10 rollouts.
score_logger = []

In [None]:
# Train with one gradient update per complete episode until the step budget
# is used up. The budget is only checked between episodes, so the last episode
# may push `step` past `max_steps`.
while step < max_steps:
    # Leading batch dimension of 1 so the observation matches the model input.
    observation = torch.FloatTensor(env.reset()).unsqueeze(0)
    done = False
    rewards = []
    values = []
    log_probs = []

    # Collect one full episode by sampling actions from the current policy.
    while not done:
        dist, value = rl_model(observation)
        action = dist.sample()
        # The extra leading dimension lets the per-step log-probs be
        # concatenated along dim 1 together with values and rewards.
        log_prob = dist.log_prob(action.unsqueeze(0))

        observation, reward, done, info = env.step(action.cpu().item())

        observation = torch.FloatTensor(observation).unsqueeze(0)
        reward = torch.FloatTensor([reward]).unsqueeze(0)

        rewards.append(reward)
        values.append(value)
        log_probs.append(log_prob)
        step += 1

    # Discounted return-to-go for each step of the episode.
    returns = calc_returns(rewards)

    # Stack the per-step tensors along dim 1, one column per time step.
    returns = torch.cat(returns, 1)
    log_probs = torch.cat(log_probs, 1)
    values = torch.cat(values, 1)
    # Detached so the policy-gradient term does not backpropagate into the critic.
    advantage = (returns - values).detach()

    # Policy-gradient loss weighted by the advantage, plus the critic's
    # squared error against the returns.
    action_loss = - (log_probs * advantage).mean()
    critic_loss = (returns - values).pow(2).mean()
    agent_loss = action_loss + critic_loss

    optimizer.zero_grad()
    agent_loss.backward()
    optimizer.step()
    rollouts += 1

    if rollouts % 10 == 0:
        # Linearly anneal the learning rate with the fraction of the step
        # budget that is left. Clamp at zero because the final episode can
        # overshoot `max_steps`, which would otherwise leave a negative
        # learning rate on the optimizer.
        new_lr = max(0.0, (max_steps - step) / max_steps) * lr
        optimizer.param_groups[0]["lr"] = new_lr

        # Evaluate over 10 episodes and redraw the learning curve in place.
        score_logger.append(np.mean([test_agent() for _ in range(10)]))
        clear_output(True)
        plt.plot(score_logger)
        plt.show()

env.close()

In [None]:
# Display the critic output left over from the last forward pass of the
# training loop. NOTE(review): looks like a leftover inspection cell; it only
# works after the training cell has run.
value

In [None]:
# Plot the logged evaluation scores; the training loop appends one point
# every 10 rollouts.
plt.plot(score_logger)