In [26]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions.categorical import Categorical
from collections import deque
import warnings
import itertools
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

In [10]:
# !pip install gym[box2d]

In [11]:
gamma = 0.99 # discount factor; kept high so that future rewards are not diminished too much
# MAX_EPISODES = 4000

In [12]:
# Environment used for both training and the later recorded rollout.
env = gym.make('LunarLander-v2')
# env = gym.make('CartPole-v1')

In [13]:
class PolicyNet(nn.Module):
    """Two-layer MLP policy that maps observations to action probabilities.

    Args:
        input_size: number of features in a single observation.
        hidden_units: width of the single hidden layer.
        output_size: number of discrete actions (length of the output
            probability vector).
    """

    def __init__(self, input_size, hidden_units, output_size):
        super().__init__()
        self.l1 = nn.Linear(input_size, hidden_units)
        self.l2 = nn.Linear(hidden_units, output_size)

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: float tensor whose last dimension has ``input_size`` entries.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``output_size``; softmax is applied over that last
            dimension, so each row sums to 1.
        """
        hidden = F.relu(self.l1(x))
        logits = self.l2(hidden)
        return F.softmax(logits, dim=-1)

    # NOTE: __call__ is intentionally NOT overridden. nn.Module.__call__
    # already dispatches to forward() and additionally runs registered
    # hooks; overriding it would silently bypass them.

In [21]:
# Size the network from the environment's spaces so that switching
# environments (e.g. to one with a different number of discrete actions)
# cannot leave the policy emitting actions the environment does not accept.
policy = PolicyNet(
    input_size=env.observation_space.shape[0],
    hidden_units=32,
    output_size=int(env.action_space.n),
)

In [22]:
# AdamW over all policy parameters with a learning rate of 0.01.
optimizer = torch.optim.AdamW(policy.parameters(), lr=0.01)

In [23]:
def discount_rewards(rewards, discount=None):
    """Compute the discounted return-to-go for every step of one episode.

    Entry ``i`` of the result is
    ``rewards[i] + discount * rewards[i+1] + discount**2 * rewards[i+2] + ...``.

    Args:
        rewards: sequence (list or 1-D array) of per-step rewards.
        discount: discount factor to apply. When ``None`` (the default) the
            notebook-level ``gamma`` is used, which keeps existing
            ``discount_rewards(rewards)`` calls working unchanged.

    Returns:
        ``numpy.ndarray`` of floats with ``len(rewards)`` entries; empty when
        ``rewards`` is empty.
    """
    if discount is None:
        discount = gamma  # fall back to the notebook-level discount factor

    discounted_rewards = np.zeros(len(rewards))
    running_return = 0.0
    # Walk backwards so each step reuses the already-discounted tail.
    for i in reversed(range(len(rewards))):
        running_return = running_return * discount + rewards[i]
        discounted_rewards[i] = running_return
    return discounted_rewards

### Run 2

In [24]:
returns_log = deque(maxlen=50)  # undiscounted returns of the most recent 50 episodes
scores = []                     # mean of returns_log after each episode

for episode in itertools.count():
    rewards = []
    actions = []
    states = []
    done = False
    state = env.reset()

    # Roll out one full episode with the current policy. Gradients are not
    # needed here; log-probabilities are recomputed in one batch below.
    while not done:
        with torch.no_grad():
            probs = policy(torch.tensor(state).unsqueeze(0).float())
            # Store the action as a plain int rather than a 1-element tensor.
            action = Categorical(probs).sample().item()

        new_state, reward, done, info = env.step(action)

        states.append(state)
        actions.append(action)
        rewards.append(reward)

        state = new_state

    rewards = np.array(rewards)
    R = torch.tensor(discount_rewards(rewards))

    # Stack the observations into a single ndarray first: calling
    # torch.tensor() directly on a list of ndarrays is a known slow path
    # (PyTorch warns about it, but warnings are silenced in this notebook).
    states = torch.tensor(np.array(states)).float()
    actions = torch.tensor(actions)

    # REINFORCE objective: maximise E[log pi(a|s) * return-to-go], i.e.
    # minimise its negative mean over the episode's steps.
    probs = policy(states)
    sampler = Categorical(probs)
    log_probs = sampler.log_prob(actions)
    loss = -torch.mean(log_probs * R)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    returns_log.append(np.sum(rewards))
    score = np.mean(returns_log)
    scores.append(score)
    if episode > 0 and episode % 50 == 0:
        print(f"Episode: {episode}  Average Return: {score}")

    # NOTE(review): 195 looks like the CartPole-v1 threshold; LunarLander-v2
    # is conventionally considered solved at 200 -- confirm the intended value.
    if score >= 195:
        print(f"Solved! Episode: {episode} Average Return: {score}")
        break

Episode: 50  Average Return: -196.01112879556578
Episode: 100  Average Return: -174.7032256679
Episode: 150  Average Return: -178.14168698939324
Episode: 200  Average Return: -208.1071649408758
Episode: 250  Average Return: -184.43530783587482
Episode: 300  Average Return: -198.5906182062771
Episode: 350  Average Return: -66.33789416522777
Episode: 400  Average Return: -44.2536101481303
Episode: 450  Average Return: -36.00914857018242
Episode: 500  Average Return: -33.51351951429248
Episode: 550  Average Return: -42.528271577275056
Episode: 600  Average Return: -36.171622832990835
Episode: 650  Average Return: -26.67167775048077
Episode: 700  Average Return: -39.67647810095923
Episode: 750  Average Return: -13.949491669543162
Episode: 800  Average Return: -7.193367231394318
Episode: 850  Average Return: 4.786907524169813
Episode: 900  Average Return: 16.473253433170285
Episode: 950  Average Return: 16.330298387456804
Episode: 1000  Average Return: 28.17602358390906
Episode: 1050  Avera

KeyboardInterrupt: 

In [1]:
# Plot the running-average return recorded in `scores` against episode number.
fig, ax = plt.subplots(nrows=1, ncols=1)
episode_axis = range(1, len(scores) + 1)
ax.plot(episode_axis, scores, color='blue', linestyle='-', linewidth=1, label="Reward")
# Dashed red line at 195, the same value the training loop uses as its stop condition.
ax.axhline(y=195, color='r', linestyle='--')
ax.set(xlabel='Episode', ylabel='Reward', title='Reward vs Episode')
ax.grid()

NameError: name 'plt' is not defined

In [28]:
# Wrap the environment in colabgymrender's Recorder so the next rollout can be
# played back with env.play(); './video' is presumably the output directory.
# NOTE(review): this rebinds `env`, so re-running the cell wraps an
# already-wrapped environment -- recreate `env` before running it again.
from colabgymrender.recorder import Recorder
env = Recorder(env, './video')

In [None]:
# Roll out a single episode with the trained policy on the recording
# environment, then play back the captured video.
state = env.reset()
done = False
while not done:
    with torch.no_grad():
        probs = policy(torch.tensor(state).unsqueeze(0).float())
        sampler = Categorical(probs)
        action = sampler.sample()
    new_state, reward, done, info = env.step(action.item())
    # Advance the observation; without this the policy would be fed the
    # initial state at every step of the episode.
    state = new_state
env.play()
