In [1]:
import gym
import torch

In [2]:
ENVIRONMENT_ID = "Pendulum-v1"
RECORD_PATH = "../videos/reinforce/" + ENVIRONMENT_ID

## Torch RL Library

In [46]:
import torch
import torch.nn as nn
import torch.distributions as dist

class Stochastic(nn.Module):

    def __init__(self, num_actions, distribution: torch.distributions.distribution.Distribution) -> None:
        super().__init__()

        self.num_actions = num_actions
        self.distribution = distribution
    
    def forward(self, states):
        probs = torch.full((states.shape[0], self.num_actions), fill_value=1/self.num_actions)
        return probs

    def policy(self, state):
        values = self.forward(state)
        if hasattr(self.distribution, 'logits'):
            policy = self.distribution(logits=values)
        else:
            policy = self.distribution(values, 0.1)
        return policy
    
    def action(self, state: torch.Tensor):
        action = self.policy(state).sample()
        return action


class Reinforce(torch.nn.Module):

    def __init__(self, model: torch.nn.Module, distribution: torch.distributions.distribution.Distribution) -> None:
        super().__init__()

        self.model = model
        self.distribution = distribution

    def forward(self, state: torch.Tensor):
        values = self.model(state)
        return values
    
    def policy(self, state: torch.Tensor):
        values = self.forward(state)
        if hasattr(self.distribution, 'logits'):
            policy = self.distribution(logits=values)
        else:
            policy = self.distribution(torch.tanh(values[:1]) * 2, torch.sigmoid(values[1:]))
            print(policy)
        return policy

    def action(self, state: torch.Tensor):
        action = self.policy(state).sample()
        print(action)
        return action

    def loss(self, state_batch: torch.Tensor, action_batch: torch.Tensor, reward_batch: torch.Tensor, done_batch: torch.Tensor):
        distribution = self.policy(state_batch)
        entropy = distribution.entropy().mean()
        logp = distribution.log_prob(action_batch)
        loss = -(logp * reward_batch).mean()
        return loss, entropy


class ActionBuffer():
    def __init__(self, gamma) -> None:
        super().__init__()
        self.gamma = gamma

        self.state_buffer = []
        self.action_buffer = []
        self.reward_buffer = []
        self.done_buffer = []

    def __len__(self):
        return len(self.state_buffer)
    
    def push(self, state, action, reward, done):
        self.state_buffer.append(state)
        self.action_buffer.append(action)
        self.reward_buffer.append(reward)
        self.done_buffer.append(done)
    
    def flush(self):
        for i in reversed(range(self.__len__())):
            self.reward_buffer[i - 1] = self.reward_buffer[i - 1] + (self.gamma * (self.reward_buffer[i] * (not self.done_buffer[i - 1])))
        
        state_batch = torch.cat(self.state_buffer)
        action_batch = torch.FloatTensor(self.action_buffer)
        reward_batch = torch.FloatTensor(self.reward_buffer)
        done_batch = torch.BoolTensor(self.done_buffer)

        self.__init__(self.gamma)

        return state_batch, action_batch, reward_batch, done_batch

def get_prob_from_pred(pred_batch, action_batch):
    return pred_batch.gather(dim=1,index=action_batch.long().view(-1,1)).squeeze()

def ascent_log_loss(action_values_batch, action_batch, reward_batch):
    distribution = dist.Categorical(logits=action_values_batch)
    logp = distribution.log_prob(action_batch)
    loss = -(logp * reward_batch).mean()
    return loss

def entropy_loss(action_values_batch, beta=0.1):
    p = torch.softmax(action_values_batch, dim=1)
    log_p = torch.log_softmax(action_values_batch, dim=1)
    entropy = -1 * torch.mean(torch.sum(p * log_p, dim=1), dim=0)
    entropy_bonus = -1 * beta * entropy
    return entropy_bonus

## Gym Helpers

In [80]:
from math import exp
from gym.wrappers import RecordVideo

# Play episode
def play_episode(env, agent, record_path=None):
    if record_path:
        env = RecordVideo(env, video_folder=record_path, new_step_api=True)

    cumulative_reward = 0

    observation = env.reset()
    done = False

    while not done:
        torch_state = torch.FloatTensor(observation)
        action = agent.action(torch_state)

        observation, reward, terminated, truncarted, info = env.step(action.numpy())
        done = terminated or truncarted

        cumulative_reward += reward
    
    env.close()
    
    return cumulative_reward

def evaluate_agent(env, agent, num_episodes=100):
    reward = 0
    for episode in range(num_episodes):
        reward += play_episode(env, agent)
    return reward / num_episodes

def train_epoch(env, agent, optimizer, gamma=0.99, num_episode=32):
    memory = ActionBuffer(gamma)

    cumulative_rewards = []
    episodes_length = []

    for episode in range(num_episode):
        observation = env.reset()
        done = False

        tt_reward = 0
        ep_length = 0

        while not done:
            torch_state = torch.as_tensor(observation)
            torch_action = agent.action(torch_state)

            next_observation, reward, terminated, truncarted, info = env.step(torch_action.numpy())
            if type(reward) != "int":
                reward = reward[0]
            done = terminated or truncarted

            memory.push(torch_state, torch_action, reward, done)

            observation = next_observation

            tt_reward += reward 
            ep_length += 1

        cumulative_rewards.append(tt_reward)
        episodes_length.append(ep_length)

    state_batch, action_batch, reward_batch, done_batch = memory.flush()

    loss, entropy = agent.loss(state_batch, action_batch, reward_batch, done_batch)
    loss = loss + 0.1 * entropy

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return cumulative_rewards, loss, entropy, episodes_length

## Pre processing state:

In [None]:
def pre_processing_state(state):
    torch_state = torch.as_tensor(state)
    return torch_state

## Run

In [81]:
# Parameters:
ALPHA = 1e-2
GAMMA = 0.99

BATCH_SIZE = 128
NB_EPOCH = 100

In [82]:
# Create environment
env = gym.make(ENVIRONMENT_ID, new_step_api=True)

# Infos Env
observation_space_size = env.observation_space.shape[0]
num_actions = env.action_space.n if type(env.action_space.sample()) == "int" else env.action_space.sample().shape[0]
is_continuous = env.action_space.dtype == "float32"

print("Environment Infos:")
print("  Observation Shape:", observation_space_size)
print("  Number of actions:", num_actions)
print("  Problem is continuous:", is_continuous)

Environment Infos:
  Observation Shape: 3
  Number of actions: 1
  Problem is continuous: True


In [83]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(comment=f'_PG_CP_Gamma={GAMMA},'
                                f'LR={ALPHA},'
                                f'BS={BATCH_SIZE}')

In [84]:
if is_continuous:
    distribution = torch.distributions.Normal
else:
    distribution = torch.distributions.Categorical

In [85]:
model = nn.Sequential(
            nn.Linear(in_features=observation_space_size, out_features=32, bias=True),
            nn.PReLU(),
            nn.Linear(in_features=32, out_features=16, bias=True),
            nn.PReLU(),
            nn.Linear(in_features=16, out_features=num_actions + is_continuous, bias=True),
            nn.Identity()
        )

random_agent = Stochastic(num_actions=num_actions, distribution=distribution)
agent = Reinforce(model=model, distribution=distribution)

optimizer = torch.optim.Adam(params=agent.parameters(), lr=ALPHA)

In [86]:
reward_random = evaluate_agent(env, agent=random_agent, num_episodes=100)
reward_agent = evaluate_agent(env, agent=agent, num_episodes=100)

print("Mean Reward, agent:", reward_agent, "random,", reward_random)

torch.Size([3])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 

  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(


torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([3, 1])
torch.Size([

In [87]:
for epoch in range(NB_EPOCH):
    rewards, loss, entropy, episodes_length = train_epoch(env, agent=agent, optimizer=optimizer, gamma=GAMMA, num_episode=BATCH_SIZE)

    print("Epoch", epoch, "reward:", torch.FloatTensor(rewards).mean().item())

    writer.add_scalar("mean_reward", torch.FloatTensor(rewards).mean(), epoch)
    writer.add_scalar("mean_length", torch.FloatTensor(episodes_length).mean(), epoch)
    writer.add_scalar("loss", loss, epoch)
    writer.add_scalar("entropy", entropy, epoch)

torch.Size([1, 3])
Normal(loc: torch.Size([0, 2]), scale: torch.Size([0, 2]))
tensor([], size=(0, 2))


IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
reward_random = evaluate_agent(env, agent=random_agent, num_episodes=100)
reward_agent = evaluate_agent(env, agent=agent, num_episodes=100)

print("Mean Reward, agent:", reward_agent, "random,", reward_random)

Mean Reward, agent: -1602.2788122885897 random, [-1380.9838]


In [None]:
reward = play_episode(env, agent=agent, record_path=RECORD_PATH)

print("Reward:", reward)

  logger.deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


Reward: -1593.0687357262032
