In [1]:
%pip install "swig"
%pip install "gymnasium[box2d]" "tensorboardX"

Collecting swig
  Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.9 MB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m1.2/1.9 MB[0m [31m17.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.1
Collecting tensorboardX
  Downloading tensorboardx-2.6.4-py3-none-any.whl.metadata (6.2 kB)
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [79]:
import gymnasium as gym
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from itertools import count
from tensorboardX import SummaryWriter
import random
from collections import namedtuple, deque

In [78]:
class Net(nn.Module):
  def __init__(self,
               state_dims,
               action_dims,
               activation_fn=F.relu
  ):
    super().__init__()

    self.layer1 = nn.Linear(state_dims, 256)
    self.layer2 = nn.Linear(256, 256)
    self.layer3 = nn.Linear(256, action_dims)

  def forward(self, x):
      x = F.relu(self.layer1(x))
      x = F.relu(self.layer2(x))
      return self.layer3(x)

In [92]:
class ReplayBuffer():
    def __init__(self, n_actions, memory_size, batch_size):
        self.n_actions = n_actions
        self.batch_size = batch_size
        self.memory = deque(maxlen = memory_size)
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

    def __len__(self):
        return len(self.memory)

    def add(self, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)

        states, actions, rewards, next_states, dones = zip(*experiences)

        states = torch.cat(states)
        next_states = torch.cat(next_states)
        actions = torch.cat(actions)
        rewards = torch.tensor(rewards, dtype=torch.float32, device=device).unsqueeze(1)
        dones = torch.tensor(dones, dtype=torch.bool, device=device).unsqueeze(1)


        # print(states.shape, actions.shape, rewards.shape, next_states.shape, dones.shape)
        return (states, actions, rewards, next_states, dones)

class DQN:
  def __init__(self, state_dims, action_dims, memory_capacity, batch_size, epsilon, gamma, lr, tau):
    super().__init__()
    self.q_net = Net(state_dims, action_dims)
    self.target_net = Net(state_dims, action_dims)
    self.optimizer = optim.AdamW(self.q_net.parameters(), lr=lr)

    self.memory = ReplayBuffer(action_dims, memory_capacity, batch_size)

    self.gamma = gamma
    self.epsilon = epsilon
    self.tau = tau

    self.learn_counter = 0

    self.MSE = nn.MSELoss()
    self.Q_NETWORK_ITERATION = 5

  def save(self, experiences):
    self.memory.add(*experiences)
    self.learn_counter += 1
    if self.learn_counter % self.Q_NETWORK_ITERATION == 0:
      if len(self.memory) > self.memory.batch_size:
        experiences = self.memory.sample()
        self.learn(experiences)

  def learn(self, experiences):
    states, actions, rewards, next_states, is_terminals = experiences

    q_target = (self.gamma * self.target_net(next_states).detach().max(1)[0].unsqueeze(1) * ~is_terminals) + rewards

    q_eval = self.q_net(states).gather(1, actions)

    loss = self.MSE(q_eval, q_target)

    self.optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(self.q_net.parameters(), max_norm=1.0)
    self.optimizer.step()

    # soft update
    self.softUpdate()
  def softUpdate(self):
    for eval_param, target_param in zip(self.q_net.parameters(), self.target_net.parameters()):
      target_param.data.copy_(self.tau*eval_param.data + (1.0-self.tau)*target_param.data)

  def select_action(self, state, env):
    if random.random() > self.epsilon: # eps greedy
      with torch.no_grad():
        action = self.q_net(state).max(1).indices.view(1, 1)
    else: # random
      action = torch.tensor([[env.action_space.sample()]], dtype=torch.long, device=device)

    return action

In [93]:
BATCH_SIZE = 128
LR = 1e-4
GAMMA = 0.99
EPSILON = 1.0
EPSILON_END = 0.01
MEMORY_CAPACITY = 50000
Q_NETWORK_ITERATION = 100
LEARN_STEP = 5
TAU = 0.01

env = gym.make("LunarLander-v3")
NUM_ACTIONS = env.action_space.n
NUM_STATES = env.observation_space.shape[0]
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
agent = DQN(NUM_STATES, NUM_ACTIONS, MEMORY_CAPACITY, BATCH_SIZE, EPSILON, GAMMA, LR, TAU)
tb_writer = SummaryWriter()
max_eps = 10000
ep_rewards = []

solved_reward   = 200
print_interval  = 100

running_reward = 0.0

for ep in tqdm(range(1, max_eps + 1), desc="TRAINING"):
  state, _ = env.reset()
  state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
  done = False
  ep_reward = 0.0

  for step in count():
    action = agent.select_action(state, env)
    next_state, reward, terminated, truncated, _ = env.step(action.item())

    ep_reward += reward
    done = terminated or truncated
    reward = torch.tensor([reward], device=device)
    next_state = torch.tensor(next_state, dtype=torch.float32, device=device).unsqueeze(0)

    agent.save((state, action, reward, next_state, done))
    state = next_state
    if done:
      break


  ep_rewards.append(ep_reward)
  running_reward += ep_reward
  agent.epsilon = max(agent.epsilon * 0.995, EPSILON_END)
  if ep % print_interval == 0:
    avg_ep_reward = running_reward / print_interval

    print(f"Episode {ep:5d}  "
          f"Avg reward: {avg_ep_reward:.2f}")

    if tb_writer is not None:
        tb_writer.add_scalar('Reward', avg_ep_reward, ep)
    if avg_ep_reward > solved_reward:
        print("\n########## Solved! ##########")
        break

    running_reward = 0.0


TRAINING:   1%|          | 100/10000 [00:13<25:02,  6.59it/s]

Episode   100  Avg reward: -162.81


TRAINING:   2%|▏         | 200/10000 [00:58<2:39:26,  1.02it/s]

Episode   200  Avg reward: -67.39


TRAINING:   3%|▎         | 300/10000 [03:02<3:35:48,  1.33s/it]

Episode   300  Avg reward: -18.54


TRAINING:   4%|▍         | 400/10000 [04:43<1:47:25,  1.49it/s]

Episode   400  Avg reward: 77.47


TRAINING:   5%|▌         | 500/10000 [05:58<2:00:50,  1.31it/s]

Episode   500  Avg reward: 199.31


TRAINING:   6%|▌         | 599/10000 [07:07<1:51:44,  1.40it/s]

Episode   600  Avg reward: 202.06

########## Solved! ##########



