In [None]:
# Notebook imports
from IPython.display import clear_output, display

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions.multinomial import Multinomial
from torch.distributions.categorical import Categorical

import gym
import numpy as np

class PolicyNetwork(nn.Module):
    """Linear softmax policy with no hidden layers.

    A single ``nn.Linear`` layer maps a state vector to one logit per action,
    and a softmax over the last dimension turns those logits into a
    probability distribution over the available actions given the state.

    Args:
        input_size: Number of features in a state vector.
        output_size: Number of available actions.
    """

    def __init__(self, input_size, output_size):
        super(PolicyNetwork, self).__init__()
        self.output = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Return action probabilities for the state(s) ``x``.

        ``x`` may be a single state of shape ``(input_size,)`` or carry any
        number of leading batch dimensions; the result keeps those leading
        dimensions and holds ``output_size`` probabilities on the last axis.
        """
        # dim is passed explicitly: omitting it is deprecated in PyTorch and
        # makes softmax guess the axis from the input rank, which normalises
        # over the wrong axis for some batched inputs.
        return F.softmax(self.output(x), dim=-1)

    
def REINFORCE(env, state_space_size, action_space_size, num_episodes, num_repeats=50, gamma=0.99, disp=print):
    """Train a softmax policy on ``env`` with the REINFORCE policy-gradient method.

    Every episode is played to completion with actions sampled from the
    current policy. Afterwards one Adam step is taken per recorded timestep
    ``t`` on the loss ``-log pi(a_t | s_t) * G_t``, where ``G_t`` is the
    discounted return from ``t`` to the end of the episode.

    Args:
        env: Environment exposing ``seed(seed)``, ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done, info)``. States must
            be NumPy arrays (they are passed to ``torch.from_numpy``).
        state_space_size: Number of input features of the policy network.
        action_space_size: Number of discrete actions (policy outputs).
        num_episodes: Number of episodes to play and train on.
        num_repeats: A fresh environment seed is drawn every ``num_repeats``
            episodes; the episodes in between reuse the previous seed.
        gamma: Discount factor applied to future rewards.
        disp: Callable invoked with a one-line summary string at the end of
            every episode.

    Returns:
        list: The undiscounted total reward of each episode, in play order.
    """

    # Check to see if we can run on the GPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Create policy network (i.e. differentiable policy function)
    policy = PolicyNetwork(state_space_size, action_space_size).to(device)
    optimizer = optim.Adam(policy.parameters(), lr=1e-3)

    # Record all of the scores to review later
    scores = []

    for i in range(num_episodes):

        # All states, actions and rewards need to be recorded for training
        states = []
        actions = []
        rewards = []

        # Re-seed every few episodes. This gives REINFORCE a few extra shots at the same episode
        # so it has time to learn before being overwhelmed by too much randomness.
        if i % num_repeats == 0:
            # The dtype is pinned to int64 because randint's default integer
            # type is platform dependent; where it is 32-bit this upper bound
            # is rejected as out of range. int() hands env.seed a plain
            # Python integer rather than a NumPy scalar.
            seed = int(np.random.randint(0, np.iinfo(np.int64).max, dtype=np.int64))
        env.seed(seed)

        # Reset the score and the environment for this episode
        score = 0
        state = env.reset()

        # Play out an episode using the current policy
        while True:

            # The state must first be turned into a tensor and sent to the device
            state_tensor = torch.from_numpy(state).float().to(device)

            # No autograd graph is needed while acting: the probabilities are
            # re-evaluated from the stored states during the update below.
            with torch.no_grad():
                action_probs = policy(state_tensor)
                # Sample from softmax output to get next action
                action = Categorical(action_probs).sample()

            # Take another step, update the state, and check the reward
            # Calling item retrieves the action value from the action tensor
            next_state, reward, done, _ = env.step(action.item())
            score += reward

            # Record all of our episode stats
            rewards.append(reward)
            states.append(state)
            actions.append(action)

            # Update the state for the next step
            state = next_state

            if done:
                disp("Episode: {} Score: {}".format(i, score))
                break

        # Discounted returns G[t] for all t, built in a single backward pass
        # using G[t] = r[t] + gamma * G[t + 1].
        returns = [0.0] * len(rewards)
        running_return = 0.0
        for t in reversed(range(len(rewards))):
            running_return = rewards[t] + gamma * running_return
            returns[t] = running_return

        # Now that the episode is done, update our policy for each timestep
        for t in range(len(states)):

            # Update our weights. First, zero the gradients
            optimizer.zero_grad()

            # Convert state to a tensor and re-evaluate probability distribution
            state_tensor = torch.from_numpy(states[t]).float().to(device)
            probs = policy(state_tensor)

            # Evaluate performance as per the policy gradient theorem and update our
            # weights to take a step in the direction of increased performance.
            # The sign is flipped because the optimizer minimises.
            loss = -Categorical(probs).log_prob(actions[t]) * returns[t]
            loss.backward()
            optimizer.step()

        scores.append(score)

    return scores
        

In [None]:
def jupyterDisplay(s):
    """Display ``s`` in the cell output after clearing what was shown before.

    Used in this notebook as the ``disp`` callback of ``REINFORCE`` so that
    each per-episode message replaces the previous one instead of piling up.
    """
    # wait=True: per IPython's clear_output documentation the clear is
    # deferred until new output is available, which avoids flicker.
    clear_output(wait=True)
    display(s)

In [None]:
# Seed NumPy (REINFORCE draws its per-episode environment seeds from it) and
# PyTorch (weight initialisation and action sampling) so that a fresh
# Restart & Run All repeats the same training run. NOTE: exact repeatability
# is only expected on CPU; GPU kernels may still be nondeterministic.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Number of training episodes; lower this for a quick smoke run.
NUM_EPISODES = 5000

env = gym.make("CartPole-v0")
action_space_size = env.action_space.n
state_space_size = env.observation_space.shape[0]
scores = REINFORCE(env,
                   state_space_size=state_space_size,
                   action_space_size=action_space_size,
                   num_episodes=NUM_EPISODES,
                   disp=jupyterDisplay)

In [None]:
# Raw per-episode scores. The trailing semicolon on the last call keeps
# Jupyter from echoing the returned Text object underneath the figure.
fig = plt.figure(figsize=(10, 8), dpi=80, facecolor='w', edgecolor='k')
plt.plot(scores)
plt.title("Episode scores over training")
plt.xlabel("Training episodes")
plt.ylabel("Score");

In [None]:
# Smooth scores with a moving average over (up to) 50 episodes.
# The window is clamped to the number of recorded scores: with a window longer
# than the data, mode='valid' would swap the operands and return values that
# are not moving averages at all.
N = min(50, len(scores))
smooth_scores = np.convolve(scores, np.ones((N,)) / N, mode='valid')

In [None]:
# Moving-average scores. The trailing semicolon on the last call keeps
# Jupyter from echoing the returned Text object underneath the figure.
fig = plt.figure(figsize=(10, 8), dpi=80, facecolor='w', edgecolor='k')
plt.plot(smooth_scores)
plt.title("Episode scores over training (smoothed over {} episodes)".format(N))
plt.xlabel("Training episodes")
plt.ylabel("Score (smoothed over {} episodes)".format(N));