In [9]:
!pip install --upgrade gym==0.26.2

  and should_run_async(code)


Collecting gym==0.26.2
  Downloading gym-0.26.2.tar.gz (721 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/721.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/721.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m716.8/721.7 kB[0m [31m11.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.26.2-py3-none-any.whl size=827628 sha256=faf7ff23cd0dd13a0e6a13fcfd38939a289e3243f67269c3ce3e759eb9c006

In [1]:
import gym
import numpy as np
import torch.distributions
import torch.nn as nn
import torch.nn.functional as F
import numpy

from collections import  deque


In [2]:
class Policy(nn.Module):
    """Stochastic policy: an MLP that maps an observation to action probabilities.

    Args:
        in_dim: Size of the observation vector.
        hidden_dim: Width of both hidden layers.
        out_dim: Number of discrete actions.
    """

    def __init__(self, in_dim: int = 4, hidden_dim: int = 128, out_dim: int = 2):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(in_features=in_dim, out_features=hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(in_features=hidden_dim, out_features=hidden_dim),
            nn.ReLU(),
        )
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=out_dim)

    def forward(self, x):
        """Return action probabilities (softmax over the last dimension) for `x`."""
        logits = self.fc_out(self.mlp(x))
        return F.softmax(logits, dim=-1)

    def act(self, state):
        """Sample one action for a single observation.

        Args:
            state: One observation (NumPy array or any array-like of length `in_dim`).

        Returns:
            Tuple `(action, log_prob, entropy)`: the sampled action as a Python int,
            and the log-probability of that action and the entropy of the action
            distribution, each a tensor of shape `(1,)`.
        """
        # as_tensor with an explicit dtype accepts float64 arrays and plain sequences;
        # torch.from_numpy would keep float64 and then fail inside the float32 Linear layers.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        probs = self(state)
        m = torch.distributions.Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action), m.entropy()


class ValueFunction(nn.Module):
    """State-value estimator: one GELU hidden layer followed by a linear head.

    Args:
        input_dim: Size of the observation vector.
        hidden_dim: Half of the hidden-layer width (the layer has `2 * hidden_dim` units).
        output_dim: Size of the output vector.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        width = 2 * hidden_dim
        self.mlp = nn.Sequential(nn.Linear(input_dim, width), nn.GELU())
        self.fc_out = nn.Linear(width, output_dim)

    def forward(self, x):
        """Return the raw (unactivated) head output for `x`."""
        return self.fc_out(self.mlp(x))



In [3]:
def train(num_episodes: int, policy, env, max_episode_reward: float = 1900):
    """Train `policy` with REINFORCE using normalised returns and an entropy bonus.

    Args:
        num_episodes: Number of episodes to run; one optimiser step is taken per episode.
        policy: Module exposing `parameters()`, `train()` and
            `act(state) -> (action, log_prob, entropy)`.
        env: Environment whose `reset(seed=...)` returns `(observation, info)` and whose
            `step(action)` returns a 5-tuple starting with `(observation, reward, done)`.
        max_episode_reward: An episode is cut off once its accumulated reward exceeds
            this value.

    Returns:
        None. Per-episode loss and total reward are printed.
    """
    optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3, weight_decay=1e-3)
    policy.train()
    gamma = 0.99
    alpha = 0.08
    seeds = [42, 1234, 555, 52]
    for episode in range(num_episodes):
        log_probs = []
        rewards = []
        entropies = []
        total_reward = 0.0
        state = env.reset(seed=seeds[episode % len(seeds)])[0]
        done = False
        while not done:
            action, log_prob, entropy = policy.act(state)
            # The last two elements of the step tuple are ignored, as in the original
            # loop; the reward cap below is what bounds the episode length.
            state, reward, done, _, _ = env.step(action)
            log_probs.append(log_prob)
            rewards.append(reward)
            entropies.append(entropy)

            # Compare the running episode total (not the single-step reward) to the cap.
            total_reward += reward
            if total_reward > max_episode_reward:
                break

        # Discounted return for every step, accumulated back to front.
        G = 0.0
        discounted_rewards = deque()
        for reward in reversed(rewards):
            G = reward + gamma * G
            discounted_rewards.appendleft(G)

        discounted_rewards = torch.tensor(list(discounted_rewards), dtype=torch.float32)
        discounted_rewards = discounted_rewards - discounted_rewards.mean()
        if discounted_rewards.numel() > 1:
            # std() of a single element is NaN, which would corrupt the weights.
            discounted_rewards = discounted_rewards / (discounted_rewards.std() + 1e-9)

        # Entropy is detached: it only scales the log-prob weight and carries no
        # gradient of its own (same as the original torch.tensor(...) conversion).
        entropies = torch.stack(entropies).reshape(-1).detach().float()
        log_probs = torch.stack(log_probs).reshape(-1)

        baseline = discounted_rewards.mean()
        weights = discounted_rewards - baseline + alpha * entropies
        policy_loss = -(weights * log_probs).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
        print(f'Episode: {episode}, loss: {policy_loss.item()}, reward: {np.sum(rewards)}')
        print('===========')



In [4]:
def train_with_value_function(num_episodes: int, policy, value_function, env,
                              max_episode_reward: float = 1900):
    """Train `policy` with REINFORCE using a learned state-value baseline.

    For every step the policy-gradient weight is
    `return_t - V(state_t) + alpha * entropy_t`, where `V` is `value_function`
    (detached) and `return_t` is the per-episode normalised discounted return.
    `value_function` is fitted to the same normalised returns with an MSE loss.

    Args:
        num_episodes: Number of episodes; both networks take one optimiser step per episode.
        policy: Module exposing `parameters()`, `train()` and
            `act(state) -> (action, log_prob, entropy)`.
        value_function: Module mapping a float32 observation tensor to a single value.
        env: Environment whose `reset(seed=...)` returns `(observation, info)` and whose
            `step(action)` returns a 5-tuple starting with `(observation, reward, done)`.
        max_episode_reward: An episode is cut off once its accumulated reward exceeds
            this value, so the rollout loop cannot run unbounded.

    Returns:
        None. Per-episode losses and total reward are printed.
    """
    optimizer = torch.optim.Adam(policy.parameters(), lr=1e-2, weight_decay=1e-3)
    optimizer_value_function = torch.optim.Adam(value_function.parameters(), lr=1e-3, weight_decay=1e-3)
    mse_loss_value = nn.MSELoss()
    policy.train()
    value_function.train()
    gamma = 0.99
    alpha = 0.1
    # Cycle through several reset seeds instead of always reusing seed 42.
    seeds = [42, 1234, 555, 52]
    for episode in range(num_episodes):
        log_probs = []
        rewards = []
        state_values = []
        entropies = []
        total_reward = 0.0
        state = env.reset(seed=seeds[episode % len(seeds)])[0]
        done = False
        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32)
            action, log_prob, entropy = policy.act(state)
            next_state, reward, done, _, _ = env.step(action)
            log_probs.append(log_prob)
            rewards.append(reward)
            entropies.append(entropy)
            # V(state_t) for the state the action was taken in, flattened to shape (1,).
            state_values.append(value_function(state_tensor).reshape(-1))
            state = next_state

            total_reward += reward
            if total_reward > max_episode_reward:
                break

        # Discounted return for every step, accumulated back to front.
        G = 0.0
        discounted_rewards = deque()
        for reward in reversed(rewards):
            G = reward + gamma * G
            discounted_rewards.appendleft(G)

        discounted_rewards = torch.tensor(list(discounted_rewards), dtype=torch.float32)
        discounted_rewards = discounted_rewards - discounted_rewards.mean()
        if discounted_rewards.numel() > 1:
            # std() of a single element is NaN, which would corrupt the weights.
            discounted_rewards = discounted_rewards / (discounted_rewards.std() + 1e-9)

        entropies = torch.stack(entropies).reshape(-1).detach().float()
        log_probs = torch.stack(log_probs).reshape(-1)
        values = torch.cat(state_values)

        # Baseline-subtracted weight. The earlier formulation subtracted V(state_t)
        # from itself (so the critic contributed exactly zero) and, through zip(),
        # dropped the final step of every episode. The critic is detached so the
        # policy loss does not push gradients into it.
        advantages = discounted_rewards - values.detach() + alpha * entropies
        policy_loss = -(advantages * log_probs).sum()
        value_loss = mse_loss_value(values, discounted_rewards)

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        optimizer_value_function.zero_grad()
        value_loss.backward()
        optimizer_value_function.step()
        print(f'Episode: {episode}, loss: {policy_loss.item()}, value_loss: {value_loss.item()}, reward: {np.sum(rewards)}')
        print('===========')


In [5]:
def train_with_rloo(num_episodes: int, policy, env, max_episode_reward: float = 1900):
    """Train `policy` with REINFORCE using a leave-one-out baseline over an episode's steps.

    Args:
        num_episodes: Number of episodes; one optimiser and scheduler step per episode.
        policy: Module exposing `parameters()`, `train()` and
            `act(state) -> (action, log_prob, entropy)`.
        env: Environment whose `reset(seed=...)` returns `(observation, info)` and whose
            `step(action)` returns a 5-tuple starting with `(observation, reward, done)`.
        max_episode_reward: An episode is cut off once its accumulated reward exceeds
            this value.

    Returns:
        None. Per-episode loss and total reward are printed.
    """
    def get_rloo(returns):
        """Per-step baseline: mean of all *other* steps' returns, divided by `n`."""
        n = returns.numel()
        if n < 2:
            # No "other" steps exist; avoid dividing by zero.
            return torch.zeros_like(returns)
        # Vectorised form of (sum(returns) - returns[i]) / (n - 1) / n.
        # NOTE(review): the extra division by n is kept from the original code; a plain
        # leave-one-out mean would omit it -- confirm whether it is intended.
        return (returns.sum() - returns) / (n - 1) / n

    optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3, weight_decay=1e-3)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[300, 420], gamma=0.9)
    policy.train()
    gamma = 0.99
    alpha = 0.08
    seeds = [42, 1234, 555, 52]
    for episode in range(num_episodes):
        log_probs = []
        rewards = []
        entropies = []
        total_reward = 0.0
        state = env.reset(seed=seeds[episode % len(seeds)])[0]
        done = False
        while not done:
            action, log_prob, entropy = policy.act(state)
            state, reward, done, _, _ = env.step(action)
            log_probs.append(log_prob)
            rewards.append(reward)
            entropies.append(entropy)

            # Running total instead of re-summing the whole reward list every step.
            total_reward += reward
            if total_reward > max_episode_reward:
                break

        # Discounted return for every step, accumulated back to front.
        G = 0.0
        discounted_rewards = deque()
        for reward in reversed(rewards):
            G = reward + gamma * G
            discounted_rewards.appendleft(G)

        discounted_rewards = torch.tensor(list(discounted_rewards), dtype=torch.float32)
        discounted_rewards = discounted_rewards - discounted_rewards.mean()
        if discounted_rewards.numel() > 1:
            # std() of a single element is NaN, which would corrupt the weights.
            discounted_rewards = discounted_rewards / (discounted_rewards.std() + 1e-9)

        # Entropy is detached: it only scales the log-prob weight.
        entropies = torch.stack(entropies).reshape(-1).detach().float()
        log_probs = torch.stack(log_probs).reshape(-1)

        rloo_values = get_rloo(discounted_rewards)
        weights = discounted_rewards - rloo_values + alpha * entropies
        policy_loss = -(weights * log_probs).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
        scheduler.step()
        print(f'Episode: {episode}, loss: {policy_loss.item()}, reward: {np.sum(rewards)}')
        print('===========')



In [6]:
def eval(policy, env, seeds=(42, 1234, 555, 52), max_episode_reward: float = 1900):
    """Roll out `policy` once per seed and print the total reward of each episode.

    Note: this function shadows the built-in `eval` in the notebook namespace.

    Args:
        policy: Object exposing `eval()` and `act(state) -> (action, log_prob, entropy)`.
        env: Environment whose `reset(seed=...)` returns `(observation, info)` and whose
            `step(action)` returns a 5-tuple starting with `(observation, reward, done)`.
        seeds: Reset seeds; one episode is played for each.
        max_episode_reward: An episode stops once its total reward reaches this value.

    Returns:
        None. The policy is left in evaluation mode.
    """
    policy.eval()
    # No gradients are needed for evaluation; skip building autograd graphs.
    with torch.no_grad():
        for seed in seeds:
            total_reward = 0.0
            state = env.reset(seed=seed)[0]
            done = False
            while not done and total_reward < max_episode_reward:
                action, _, _ = policy.act(state)
                state, reward, done, _, _ = env.step(action)
                total_reward += reward
            print(f'Seed: {seed}, total_reward: {total_reward}')


In [8]:
# Seed torch so that action sampling and weight initialisation are reproducible
# across Restart & Run All (the environment itself is seeded on every reset).
torch.manual_seed(42)

env = gym.make("CartPole-v1")
env.reset(seed=42)

input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

policy = Policy(in_dim=input_dim, hidden_dim=128, out_dim=output_dim)
value_function = ValueFunction(input_dim=input_dim, hidden_dim=16, output_dim=1)

# The three trainers run back to back on the SAME policy instance, so each one
# continues from the weights left by the previous one.
train(num_episodes=500, policy=policy, env=env)

train_with_value_function(num_episodes=500, policy=policy, value_function=value_function, env=env)

train_with_rloo(num_episodes=500, policy=policy, env=env)

eval(policy, env)

# Release the environment's resources once the experiment is finished.
env.close()

