In [61]:
import gym
import torch

In [62]:
ENVIRONMENT_ID = "CartPole-v1" # "LunarLander-v2"
RECORD_PATH = "../videos/reinforce/" + ENVIRONMENT_ID

## Torch RL Library

In [63]:
import torch
import torch.nn as nn
import torch.distributions as dist

class Stochastic(nn.Module):
    def __init__(self, num_actions, distribution: torch.distributions.distribution.Distribution) -> None:
        super().__init__()

        self.num_actions = num_actions
        self.distribution = distribution
    
    def forward(self, states):
        probs = torch.full((states.shape[0], self.num_actions), fill_value=1/self.num_actions)
        return probs

    def policy(self, state):
        values = self.forward(state)
        if hasattr(self.distribution, 'logits'):
            policy = self.distribution(logits=values)
        else:
            policy = self.distribution(values[0], values[1])
        return policy
    
    def action(self, state: torch.Tensor):
        return self.policy(state).sample()


class Reinforce(torch.nn.Module):
    def __init__(self, model: torch.nn.Module, distribution: torch.distributions.distribution.Distribution) -> None:
        super().__init__()

        self.model = model
        self.distribution = distribution

    def forward(self, state: torch.Tensor):
        values = self.model(state)
        return values
    
    def policy(self, state: torch.Tensor):
        values = self.forward(state)
        if hasattr(self.distribution, 'logits'):
            policy = self.distribution(logits=values)
        else:
            policy = self.distribution(values[0], values[1])
        return policy

    def action(self, state: torch.Tensor):
        return self.policy(state).sample()

    def loss(self, state_batch: torch.Tensor, action_batch: torch.Tensor, reward_batch: torch.Tensor, done_batch: torch.Tensor):
        distribution = self.policy(state_batch)
        entropy = distribution.entropy().mean()
        logp = distribution.log_prob(action_batch)
        loss = -(logp * reward_batch).mean()
        return loss, entropy


class ActionBuffer():
    def __init__(self, gamma) -> None:
        super().__init__()
        self.gamma = gamma

        self.state_buffer = []
        self.action_buffer = []
        self.reward_buffer = []
        self.done_buffer = []

    def __len__(self):
        return len(self.state_buffer)
    
    def push(self, state, action, reward, done):
        self.state_buffer.append(state)
        self.action_buffer.append(action)
        self.reward_buffer.append(reward)
        self.done_buffer.append(done)
    
    def flush(self):
        for i in reversed(range(self.__len__())):
            self.reward_buffer[i - 1] = self.reward_buffer[i - 1] + (self.gamma * (self.reward_buffer[i] * (not self.done_buffer[i - 1])))
        
        state_batch = torch.cat(self.state_buffer)
        action_batch = torch.FloatTensor(self.action_buffer)
        reward_batch = torch.FloatTensor(self.reward_buffer)
        done_batch = torch.BoolTensor(self.done_buffer)

        self.__init__(self.gamma)

        return state_batch, action_batch, reward_batch, done_batch

def get_prob_from_pred(pred_batch, action_batch):
    return pred_batch.gather(dim=1,index=action_batch.long().view(-1,1)).squeeze()

def categorical_policy(probs):
    distribution = dist.Categorical(logits=probs)
    return distribution

def categorical_action(probs):
    distribution = dist.Categorical(logits=probs)
    return distribution.sample()

def normal_policy(mu, sigma):
    distribution = dist.Normal(mu, sigma)
    return distribution

def normal_action(mu, sigma):
    distribution = dist.Normal(mu, sigma)
    return distribution.sample()

def ascent_log_loss(action_values_batch, action_batch, reward_batch):
    distribution = dist.Categorical(logits=action_values_batch)
    logp = distribution.log_prob(action_batch)
    loss = -(logp * reward_batch).mean()
    return loss

def entropy_loss(action_values_batch, beta=0.1):
    p = torch.softmax(action_values_batch, dim=1)
    log_p = torch.log_softmax(action_values_batch, dim=1)
    entropy = -1 * torch.mean(torch.sum(p * log_p, dim=1), dim=0)
    entropy_bonus = -1 * beta * entropy
    return entropy_bonus

# make loss function whose gradient, for the right data, is policy gradient
def log_loss(action_values, reward_to_go):
    error = torch.log(action_values) * reward_to_go
    return -torch.mean(error)


## Gym Helpers

In [64]:
from gym.wrappers import RecordVideo

# Play episode
def play_episode(env, agent, record_path=None):
    if record_path:
        env = RecordVideo(env, video_folder=record_path, new_step_api=True)

    cumulative_reward = 0

    observation = env.reset()
    done = False

    while not done:
        torch_state = torch.FloatTensor(observation).unsqueeze(dim=0)
        action = agent.action(torch_state)

        observation, reward, terminated, truncarted, info = env.step(action.item())

        done = terminated or truncarted

        cumulative_reward += reward
    
    env.close()
    
    return cumulative_reward

def evaluate_agent(env, agent, num_episodes=100):
    reward = 0
    for episode in range(num_episodes):
        reward += play_episode(env, agent)
    return reward / num_episodes

def train_epoch(env, agent, optimizer, gamma=0.99, num_episode=32):
    memory = ActionBuffer(gamma)

    cumulative_rewards = []
    episodes_length = []

    for episode in range(num_episode):
        observation = env.reset()
        done = False

        tt_reward = 0
        ep_length = 0

        while not done:
            torch_state = torch.as_tensor(observation).unsqueeze(dim=0)
            torch_action = agent.action(torch_state)

            next_observation, reward, terminated, truncarted, info = env.step(torch_action.item())

            done = terminated or truncarted
            memory.push(torch_state, torch_action, reward, done)

            observation = next_observation

            tt_reward += reward
            ep_length += 1

        cumulative_rewards.append(tt_reward)
        episodes_length.append(ep_length)

    state_batch, action_batch, reward_batch, done_batch = memory.flush()

    loss, entropy = agent.loss(state_batch, action_batch, reward_batch, done_batch)
    loss = loss + entropy

    #action_values_batch = agent(state_batch)

    #loss = ascent_log_loss(action_values_batch, action_batch, reward_batch)
    #loss += entropy_loss(action_values_batch)

    #action_values_batch = torch.softmax(action_values_batch, dim=1)
    #action_value_batch = get_prob_from_pred(action_values_batch, action_batch)
    #loss = log_loss(action_value_batch, reward_batch)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return cumulative_rewards, loss, entropy, episodes_length

## Run

In [65]:
# Parameters:
ALPHA = 1e-2
GAMMA = 0.99

BATCH_SIZE = 64
NB_EPOCH = 100

In [66]:
env = gym.make(ENVIRONMENT_ID, new_step_api=True)

# Infos Env
observation_space_size = env.observation_space.shape[0]
num_actions = env.action_space.n

In [67]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(comment=f'_PG_CP_Gamma={GAMMA},'
                                        f'LR={ALPHA},'
                                        f'BS={BATCH_SIZE}'
                            )

In [68]:
model = nn.Sequential(
            nn.Linear(in_features=observation_space_size, out_features=16, bias=True),
            nn.PReLU(),
            nn.Linear(in_features=16, out_features=16, bias=True),
            nn.PReLU(),
            nn.Linear(in_features=16, out_features=num_actions, bias=True),
            nn.Identity()
        )

random_agent = Stochastic(num_actions=num_actions, distribution=torch.distributions.Categorical)
agent = Reinforce(model=model, distribution=torch.distributions.Categorical)

optimizer = torch.optim.Adam(params=agent.parameters(), lr=ALPHA)

In [69]:
reward_random = evaluate_agent(env, agent=random_agent, num_episodes=100)
reward_agent = evaluate_agent(env, agent=agent, num_episodes=100)

print("Mean Reward, agent:", reward_agent, "random,", reward_random)

Mean Reward, agent: 22.37 random, 22.69


In [70]:
for epoch in range(NB_EPOCH):
    rewards, loss, entropy, episodes_length = train_epoch(env, agent=agent, optimizer=optimizer, gamma=GAMMA, num_episode=BATCH_SIZE)

    print("Epoch", epoch, "reward:", torch.FloatTensor(rewards).mean().item())

    writer.add_scalar("mean_reward", torch.FloatTensor(rewards).mean(), epoch)
    writer.add_scalar("mean_length", torch.FloatTensor(episodes_length).mean(), epoch)
    writer.add_scalar("loss", loss, epoch)
    writer.add_scalar("entropy", entropy, epoch)

Epoch 0 reward: 20.5625
Epoch 1 reward: 19.625
Epoch 2 reward: 26.4375
Epoch 3 reward: 25.15625
Epoch 4 reward: 23.375
Epoch 5 reward: 27.890625
Epoch 6 reward: 30.453125
Epoch 7 reward: 29.53125
Epoch 8 reward: 32.296875
Epoch 9 reward: 34.34375
Epoch 10 reward: 44.515625
Epoch 11 reward: 39.71875
Epoch 12 reward: 54.0625
Epoch 13 reward: 50.65625
Epoch 14 reward: 61.8125
Epoch 15 reward: 63.125
Epoch 16 reward: 78.359375
Epoch 17 reward: 79.09375
Epoch 18 reward: 85.546875
Epoch 19 reward: 91.0625
Epoch 20 reward: 97.875
Epoch 21 reward: 115.4375
Epoch 22 reward: 122.046875
Epoch 23 reward: 148.4375
Epoch 24 reward: 165.1875
Epoch 25 reward: 180.953125
Epoch 26 reward: 229.609375
Epoch 27 reward: 307.671875
Epoch 28 reward: 374.546875
Epoch 29 reward: 394.4375
Epoch 30 reward: 424.515625
Epoch 31 reward: 457.625
Epoch 32 reward: 482.578125
Epoch 33 reward: 496.78125
Epoch 34 reward: 494.359375
Epoch 35 reward: 495.703125
Epoch 36 reward: 482.734375
Epoch 37 reward: 478.75
Epoch 38 re

In [71]:
reward_random = evaluate_agent(env, agent=random_agent, num_episodes=100)
reward_agent = evaluate_agent(env, agent=agent, num_episodes=100)

print("Mean Reward, agent:", reward_agent, "random,", reward_random)

Mean Reward, agent: 460.39 random, 22.08


In [72]:
reward = play_episode(env, agent=agent, record_path=RECORD_PATH)

print("Reward:", reward)

  logger.warn(
  logger.deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


Reward: 454.0
