In [1]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
from collections import deque
import random
from itertools import count

In [2]:
class NeuralNet(nn.Module):
    """Fully connected network with two equally sized ReLU hidden layers.

    Args:
        input_size: Number of input features.
        output_size: Number of output units; they are returned without a
            final activation.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, input_size, output_size, hidden_size=128):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay stable
        # for checkpoints written from this model to remain loadable.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the raw outputs of the last linear layer for input ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

In [3]:
def select_action(state, policy_net):
    """Sample one action from the categorical policy produced by ``policy_net``.

    Args:
        state: Observation convertible to a float32 tensor (array, list or
            tensor).
        policy_net: Callable mapping the state tensor to unnormalised action
            scores (logits) along the last dimension.

    Returns:
        tuple: ``(action, log_prob)`` where ``action`` is the sampled index as
        a Python int and ``log_prob`` is the log-probability of that action as
        a tensor that stays attached to ``policy_net``'s autograd graph.
    """
    # as_tensor avoids an extra copy for arrays that are already float32 and
    # does not warn when the caller passes a tensor.
    state = torch.as_tensor(state, dtype=torch.float32)
    logits = policy_net(state)
    # Passing logits lets the distribution normalise with log-sum-exp instead
    # of taking log() of softmax probabilities, which loses precision (and is
    # clamped) once the policy becomes nearly deterministic.
    dist = Categorical(logits=logits)
    action = dist.sample()
    return action.item(), dist.log_prob(action)


def update_policy(rewards, log_probs, optimizer_policy, std_baseline_values = None, optimizer_value = None, gamma = 0.99):
    """Run one REINFORCE update (optionally with a learned baseline) for an episode.

    Args:
        rewards: Sequence of per-step rewards for one episode.
        log_probs: Sequence of log-probability tensors of the actions taken,
            one per step, still attached to the policy's autograd graph.
        optimizer_policy: Optimizer that owns the policy parameters.
        std_baseline_values: Optional tensor of per-step baseline predictions
            (one value per step, e.g. shape ``(T, 1)`` or ``(T,)``). When
            given, it is subtracted from the normalised returns.
        optimizer_value: Optional optimizer for the network that produced
            ``std_baseline_values``. The value update only happens when both
            this and ``std_baseline_values`` are supplied.
        gamma: Discount factor applied to future rewards.

    Returns:
        None. Parameters are updated in place through the optimizers.
    """
    # Nothing to learn from an empty episode (and torch.stack([]) would raise).
    if len(rewards) == 0:
        return

    # Accumulate discounted returns back-to-front, then flip once (O(T)).
    returns = []
    discounted_reward = 0.0
    for r in reversed(rewards):
        discounted_reward = r + gamma * discounted_reward
        returns.append(discounted_reward)
    returns.reverse()

    log_probs = torch.stack(list(log_probs)).reshape(-1)
    returns = torch.tensor(returns, dtype=torch.float32, device=log_probs.device)

    # The sample std of a single value is NaN; treat it as zero spread so a
    # one-step episode yields a zero (not NaN) normalised return.
    std = returns.std() if returns.numel() > 1 else returns.new_zeros(())
    returns = (returns - returns.mean()) / (std + 1e-9) # normalize

    if std_baseline_values is not None:
        # The baseline is a constant w.r.t. the policy objective: detach it so
        # the policy loss does not back-propagate into the value network.
        advantages = returns - std_baseline_values.detach().reshape(-1)
    else:
        advantages = returns
    policy_loss = -(log_probs * advantages).sum()

    update_value = optimizer_value is not None and std_baseline_values is not None

    optimizer_policy.zero_grad()
    # Keep the graph only when a value update follows, in case the caller's
    # policy and value predictions share part of a graph.
    policy_loss.backward(retain_graph=update_value)
    optimizer_policy.step()

    if update_value:
        value_loss = F.mse_loss(std_baseline_values.reshape(-1), returns)
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

In [15]:
# Seed every RNG the notebook uses so network initialisation and action
# sampling are repeatable. NOTE(review): the environment itself is not seeded
# here; pass `seed=` to `env.reset` as well if fully reproducible episodes are
# required.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env_name = 'LunarLander-v2'
# Training runs headless. A rendering environment is created only in the
# evaluation cell, so no unused (and never closed) "human" window is opened here.
env = gym.make(env_name)

# Policy net outputs one score per discrete action; value net outputs a scalar.
obs_dim = env.observation_space.shape[0]
policy_net = NeuralNet(obs_dim, env.action_space.n)
value_net = NeuralNet(obs_dim, 1)
optimizer_policy = optim.Adam(policy_net.parameters(), lr=0.001)
optimizer_value = optim.Adam(value_net.parameters(), lr=0.001)

In [16]:
# Toggle the learned value baseline; when False this is plain REINFORCE.
use_baseline = True

num_episodes = 10000
# Rolling window over the most recent 100 episode rewards.
avg_rewards = deque(maxlen=100)
values = None
for episode in range(num_episodes):
    state, info = env.reset()
    rewards = []
    log_probs = []
    states = []

    # Roll out one full episode with the current policy.
    while True:
        action, log_prob = select_action(state, policy_net)
        next_state, reward, term, trunc, _ = env.step(action)
        rewards.append(reward)
        log_probs.append(log_prob)
        states.append(state)
        state = next_state
        if term or trunc:
            break

    # Reset per episode so a stale baseline can never leak into the update.
    values = None
    if use_baseline:
        # Stack into one ndarray first: building a tensor straight from a list
        # of ndarrays is very slow and triggers a PyTorch warning.
        state_batch = torch.as_tensor(np.asarray(states, dtype=np.float32))
        values = value_net(state_batch)
        # NOTE(review): the baseline is standardised per episode so that it is
        # on the same scale as the normalised returns used in update_policy.
        values = (values - values.mean()) / (values.std() + 1e-9) # normalize

    # Only hand over the value optimizer when a baseline was actually computed;
    # otherwise update_policy would try to fit a value loss against None.
    update_policy(
        rewards,
        log_probs,
        optimizer_policy,
        values,
        optimizer_value if use_baseline else None,
    )

    episode_reward = sum(rewards)
    avg_rewards.append(episode_reward)
    running_avg = np.mean(avg_rewards)

    if episode % 100 == 0:
        print(f"Episode {episode} - Reward: {episode_reward} - Avg Reward: {running_avg}")

    # Stop once the 100-episode average exceeds 200, but only after the
    # window has had a chance to fill.
    if running_avg > 200 and episode > 100:
        break


Episode 0 - Reward: -387.79145183747715 - Avg Reward: -387.79145183747715
Episode 100 - Reward: -371.44402140854476 - Avg Reward: -283.4727905960478
Episode 200 - Reward: -64.3440700754955 - Avg Reward: -179.36925727585017
Episode 300 - Reward: -93.180336179497 - Avg Reward: -108.54195807691464
Episode 400 - Reward: 3.542618977622027 - Avg Reward: -108.06957258292965
Episode 500 - Reward: -189.53711814849322 - Avg Reward: -83.63851261783132
Episode 600 - Reward: -40.27523320665416 - Avg Reward: -30.18043016992075
Episode 700 - Reward: -15.61083559310822 - Avg Reward: -13.942305555519397
Episode 800 - Reward: 16.69398692111811 - Avg Reward: -8.416047814670804
Episode 900 - Reward: -93.27271479894037 - Avg Reward: 7.3520389952558105
Episode 1000 - Reward: -45.49195198574582 - Avg Reward: 32.14753319610036
Episode 1100 - Reward: -18.093579720998136 - Avg Reward: 23.075630397722406
Episode 1200 - Reward: -5.188329038563694 - Avg Reward: 28.084911462885184
Episode 1300 - Reward: 4.354825902

In [138]:
# Persist only the policy network's parameters; the file name embeds the
# environment id so checkpoints for different tasks do not collide.
torch.save(obj=policy_net.state_dict(), f=f"policy_{env_name}.pth")

In [17]:
# Evaluate the trained policy over 100 episodes.
# Flip to True to watch the agent. The default stays headless, matching the
# environment the original cell ended up using, and avoids creating a "human"
# environment that is immediately discarded without being closed.
render_eval = False
env_render = gym.make(env_name, render_mode="human" if render_eval else None)
deq = deque(maxlen=100)
try:
    for _episode in range(100):
        state, info = env_render.reset()
        total_reward = 0
        while True:
            # No gradients are needed for evaluation; skip building the graph.
            with torch.no_grad():
                action, _ = select_action(state, policy_net)
            state, reward, term, trunc, _ = env_render.step(action)
            total_reward += reward
            if term or trunc:
                break
        deq.append(total_reward)

    # "Total Reward" reports the last evaluated episode only.
    print(f"Total Reward: {total_reward}")
    print(f"Average Reward: {np.mean(deq)}")
finally:
    # Release the environment even if an episode raises.
    env_render.close()


Total Reward: 42.50689201443953
Average Reward: 34.538132552292616
