In [122]:
import pathlib

import gym
import torch

from matplotlib import pyplot as plt
from torch.utils.tensorboard import SummaryWriter

In [123]:
ENV = "MountainCar-v0" # 'Acrobot-v1' "LunarLander-v2" 'CartPole-v1'
RECORD_PATH = "../videos/reinforce/" + ENV
BASELINE_NUMBER = 1
BASELINE = "policy_gradient_" + ENV
LOG_FOLDER = "../runs/" + BASELINE + "/"
SEEDS = [1]
VIDEO_RECORD_DIR = pathlib.Path(RECORD_PATH)

In [124]:
env = gym.make(ENV, new_step_api=True)

In [125]:
# Part of the library

class reinforce(torch.nn.Module):

    def __init__(self, policy: torch.nn.Module) -> None:
        super().__init__()
        self.policy = policy
    
    def forward(self, state):
        return self.policy(state)

class ActionReplayBuffer(torch.nn.Module):

    def __init__(self, gamma: float) -> None:
        super().__init__()
        self.gamma = gamma

        self.state_buffer = []
        self.action_buffer = []
        self.reward_buffer = []
        self.done_buffer = []

    def __len__(self):
        return len(self.state_buffer)
    
    def push(self, state, action, reward, done):
        state = torch.FloatTensor(state).unsqueeze(dim=0)

        self.state_buffer.append(state)
        self.action_buffer.append(action)
        self.reward_buffer.append(reward)
        self.done_buffer.append(done)
    
    def sample(self):
        self.reward_buffer.reverse()
        self.done_buffer.reverse()

        for i in range(len(self.reward_buffer) - 1):
            if not self.done_buffer[i + 1]:
                self.reward_buffer[i + 1] = self.reward_buffer[i + 1] + (self.gamma * self.reward_buffer[i])

        self.reward_buffer.reverse()
        self.done_buffer.reverse()

        state_tensor = torch.cat(self.state_buffer)
        action_tensor = torch.tensor(self.action_buffer)
        reward_tensor = torch.FloatTensor(self.reward_buffer)

        return state_tensor, action_tensor, reward_tensor

    def empty(self):
        self.__init__(self.gamma)

def categorical_policy(action_probabilities):
    return torch.distributions.Categorical(action_probabilities)

def categorical_action(action_probabilities):
    categorical = torch.distributions.Categorical(action_probabilities)
    return categorical.sample().item()

def get_prob_from_pred(pred_batch, action_batch):
    return pred_batch.gather(dim=1,index=action_batch.long().view(-1,1)).squeeze()

def reinforce_loss(prob_batch, expected_returns_batch, epsilon=1e-14):
    error = torch.log(prob_batch + epsilon) * expected_returns_batch
    return -torch.mean(error)

def get_grad_max(module):
    grad_max = 0.0

    for parameter in module.parameters():
        grad_max = max(grad_max, parameter.grad.abs().max().item())

    return grad_max

def get_grad_l2(module):
    grad_means = 0.0
    grad_count = 0

    for parameter in module.parameters():
        grad_means += (parameter.grad ** 2).mean().sqrt().item()
        grad_count += 1

    return grad_means / grad_count


## Func Tools:

In [126]:
def evaluate_agent(agent, env, nb_episodes=10):
    env = gym.make(env, new_step_api=True)
    rewards = []

    for n_episode in range(nb_episodes):
        episode_reward = 0

        observation = env.reset()
        done = False

        while not done:
            pred = agent(torch.from_numpy(observation).float())
            action = torch.argmax(pred).item()

            observation, reward, terminated, truncarted, info = env.step(action)
            done = terminated or truncarted

            episode_reward += reward

        rewards.append(episode_reward)

    env.close()
    score = sum(rewards) / len(rewards)

    return score

In [127]:
MAX_EPISODE_STEPS = 1000
MAX_EPISODES = 7000
GOAL_SCORE = -150
BUFFER_SIZE = 20

EARLY_STOPPING = True
LOG_TIME = 100

obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

learning_rate = 0.0001
gamma = 0.99

In [128]:
from collections import deque

def torch_reinforce_train(agent, replay_buffer, optimizer, buffer_size=500, seed=0):
    writer = SummaryWriter(log_dir=LOG_FOLDER + "run_" + str(BASELINE_NUMBER) + "_seed_" + str(seed))

    # Metrics
    losses = []
    cumulative_reward = deque(maxlen=100)
    episode_length = deque(maxlen=100)

    for trajectory in range(MAX_EPISODES + 1):
        state = env.reset(seed=seed)
        done = False

        total_reward = 0
        nb_step = 0
    
        for t in range(MAX_EPISODE_STEPS):
            torch_state = torch.from_numpy(state).float()
            action_probs = agent(torch_state)

            action = categorical_action(action_probs)

            prev_state = state

            step_state = env.step(action)
            state, reward, done = step_state[0], step_state[1], step_state[2]

            replay_buffer.push(prev_state, action, reward, done)
            nb_step += 1
            total_reward += reward

            if done:
                break

        cumulative_reward.append(total_reward)
        episode_length.append(nb_step)

        if len(replay_buffer) > buffer_size:
            state_batch, action_batch, reward_batch = replay_buffer.sample()

            # reward_batch = (reward_batch - reward_batch.std()) /  reward_batch.mean()
            reward_batch = reward_batch / reward_batch.max()

            pred_batch = agent(state_batch)
            prob_batch = get_prob_from_pred(pred_batch, action_batch)
    
            optimizer.zero_grad()

            loss = reinforce_loss(prob_batch, reward_batch)
            loss.backward()

            # torch.nn.utils.clip_grad_norm_(agent.parameters(), 0.5)
            optimizer.step()

            losses.append(loss)
            replay_buffer.empty()

            writer.add_scalar("grad_max", get_grad_max(agent), trajectory)
            writer.add_scalar("grad_l2", get_grad_l2(agent), trajectory)

        # logging
        vector_cumulative_reward = torch.FloatTensor(cumulative_reward)
        vector_episode_length = torch.FloatTensor(episode_length)

        mean_reward = torch.mean(vector_cumulative_reward)
        max_reward = torch.max(vector_cumulative_reward)
        min_reward = torch.min(vector_cumulative_reward)
        std_reward = torch.std(vector_cumulative_reward)

        mean_episode_length = torch.mean(vector_episode_length)
        max_episode_length = torch.max(vector_episode_length)
        min_episode_length = torch.min(vector_episode_length)
        std_episode_length = torch.std(vector_episode_length)

        if trajectory % LOG_TIME == 0 and trajectory > 0:
            writer.add_scalar("mean_reward", mean_reward, trajectory)
            writer.add_scalar("max_reward", max_reward, trajectory)
            writer.add_scalar("min_reward", min_reward, trajectory)
            writer.add_scalar("std_reward", std_reward, trajectory)

            writer.add_scalar("mean_episode_length", mean_episode_length, trajectory)
            writer.add_scalar("max_episode_length", max_episode_length, trajectory)
            writer.add_scalar("min_episode_length", min_episode_length, trajectory)
            writer.add_scalar("std_episode_length", std_episode_length, trajectory)

        # Early stopping
        if EARLY_STOPPING and mean_reward > GOAL_SCORE:
            print("Environment resolved, nb episodes:", trajectory, "score:", mean_reward)
            break
    
    writer.close()
    return mean_reward

In [129]:
def baseline(seeds):
    best_score = float("-inf")
    best_agent = None

    for seed in seeds:
        torch.manual_seed(0)

        model = torch.nn.Sequential(
            torch.nn.Linear(obs_size, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, n_actions),
            torch.nn.Softmax(dim=0)
        )

        agent = reinforce(model)
        replay_buffer = ActionReplayBuffer(gamma)

        optimizer = torch.optim.Adam(agent.parameters(), lr=learning_rate)

        score = torch_reinforce_train(agent, replay_buffer, optimizer, buffer_size=BUFFER_SIZE, seed=seed)

        if score > best_score:
            best_score = score
            best_agent = agent
    
    return best_agent, best_score

In [130]:
agent, score = baseline(SEEDS)

print("Best mean score:", score)

Best mean score: tensor(-997.1100)


In [131]:
score = evaluate_agent(agent, ENV, nb_episodes=100)

print("Evaluation score:", score)

Evaluation score: -200.0


In [132]:
from gym.wrappers import RecordVideo

def watch_agent(agent, env, max_steps=500):
    env = gym.make(env, new_step_api=True)
    env = RecordVideo(env, video_folder=VIDEO_RECORD_DIR)

    state = env.reset()
    score = 0

    for t in range(max_steps):
        pred = agent(torch.from_numpy(state).float())
        action = torch.argmax(pred).item()

        step_state = env.step(action)
        state, reward, done = step_state[0], step_state[1], step_state[2]

        score += reward

        if done:
            break

    env.close()
    return score

In [133]:
watch_agent(agent, ENV, max_steps=500)

  deprecation(
  logger.warn(
  logger.deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


-200.0