In [1]:
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Environment the agent interacts with during training.
env = gym.make("LunarLander-v2")


# Place networks and tensors on the GPU when CUDA is available, else on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# One recorded environment step; field order matches the arguments of
# ReplayMemory.push.
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward'],
)


class ReplayMemory:
    """Bounded FIFO store of Transition records.

    Once `capacity` records are held, appending a new one discards the
    oldest (behaviour of a deque created with `maxlen`).
    """

    def __init__(self, capacity):
        # maxlen makes the deque evict from the left when it is full.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a Transition from the positional arguments and store it."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random."""
        picked = random.sample(self.memory, batch_size)
        return picked

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [4]:
class DQN(nn.Module):
    """Three-layer fully connected network mapping observations to one
    value per action.

    Args:
        n_observations: size of the input feature vector.
        n_actions: number of output values (one per action).
    """

    def __init__(self, n_observations, n_actions):
        super().__init__()
        hidden_width = 128
        self.layer1 = nn.Linear(n_observations, hidden_width)
        self.layer2 = nn.Linear(hidden_width, hidden_width)
        self.layer3 = nn.Linear(hidden_width, n_actions)

    def forward(self, x):
        """Apply the two ReLU hidden layers and the linear output layer.

        Used both for a single observation (action selection) and for a
        batch (optimization); the last dimension of the result has
        `n_actions` entries.
        """
        for hidden in (self.layer1, self.layer2):
            x = F.relu(hidden(x))
        # No activation on the output layer.
        return self.layer3(x)

In [5]:
# Hyperparameters
BATCH_SIZE = 128   # transitions sampled from replay memory per optimization step
GAMMA = 0.99       # discount factor applied to next-state values
EPS_START = 0.9    # initial exploration probability used by select_action
EPS_END = 0.05     # exploration probability the schedule decays towards
EPS_DECAY = 1000   # decay constant (in steps) of the exponential epsilon schedule
TAU = 0.005        # interpolation factor of the soft target-network update
LR = 1e-4          # optimizer learning rate
SEED = 42          # seed shared by every random number source below

# Seed each source of randomness so that a fresh "Restart & Run All"
# starts from the same state: Python's `random` (epsilon draws and replay
# sampling), torch (network initialisation) and the action space
# (exploratory actions drawn with env.action_space.sample()).
random.seed(SEED)
torch.manual_seed(SEED)
env.action_space.seed(SEED)

# Get number of actions from gym action space
n_actions = env.action_space.n
# Get the number of state observations (this reset also seeds the env's RNG)
state, info = env.reset(seed=SEED)
n_observations = len(state)

# The target network starts as an exact copy of the policy network.
policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = ReplayMemory(10000)


# Global count of select_action calls; drives the epsilon decay schedule.
steps_done = 0


def select_action(state):
    """Pick an action for `state` with an epsilon-greedy rule.

    The exploration probability decays exponentially from EPS_START towards
    EPS_END as the global `steps_done` counter grows; the counter is
    incremented on every call.

    Returns:
        A 1x1 tensor holding the chosen action index: either the argmax of
        policy_net's output for `state`, or a random sample from the
        environment's action space.
    """
    global steps_done
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    roll = random.random()
    steps_done += 1

    # Explore: draw a random action from the environment.
    if roll <= eps_threshold:
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)

    # Exploit: max(1) yields (values, indices) per row; take the index of
    # the largest output and reshape it to 1x1.
    with torch.no_grad():
        return policy_net(state).max(1)[1].view(1, 1)



In [6]:
def optimize_model():
    """Run one gradient step on policy_net using a batch sampled from memory.

    Does nothing until the replay memory holds at least BATCH_SIZE
    transitions. Otherwise it regresses Q(s_t, a_t) from policy_net towards
    reward + GAMMA * max_a target_net(s_{t+1}) with a Huber loss, where the
    next-state value is 0 for transitions whose next_state is None.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Mask of transitions that have a next state (a final state is stored
    # as None), plus the list of those next states.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of the actions that were actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states with the "older" target_net,
    # taking the best action value via max(1)[0]. Entries for final states
    # keep their initial value of 0.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # torch.cat rejects an empty list, so skip the target-network pass when
    # every sampled transition is final; all next-state values stay 0.
    if non_final_next_states:
        with torch.no_grad():
            next_state_values[non_final_mask] = target_net(
                torch.cat(non_final_next_states)).max(1)[0]
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

In [7]:
# Number of training episodes. The former CUDA/CPU branches both assigned
# 600, so a single assignment is equivalent.
num_episodes = 600

for i_episode in range(num_episodes):
    # Initialize the environment and get its state
    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    rewards = []
    for t in count():
        action = select_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        rewards.append(reward)
        # Pin the dtype so the stored reward is float32 like the states,
        # whatever numeric type the environment returns.
        reward = torch.tensor([reward], dtype=torch.float32, device=device)
        done = terminated or truncated

        # Only a terminated episode stores None as next state; a truncated
        # one keeps its observation, so optimize_model still uses the
        # target network's value for it.
        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()

        # Soft update of the target network's weights
        # θ′ ← τ θ + (1 −τ )θ′
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_net.load_state_dict(target_net_state_dict)

        if done:
            break
    # Report the undiscounted sum of rewards collected in this episode.
    print(f'Episode: {i_episode}   Performance: {sum(rewards)}')


Episode: 0   Performance: -293.560037560002
Episode: 1   Performance: -34.21483081813844
Episode: 2   Performance: -400.6470162478713
Episode: 3   Performance: -203.06309074397535
Episode: 4   Performance: -172.92067530991196
Episode: 5   Performance: -163.47534437112716
Episode: 6   Performance: -229.29612142300203
Episode: 7   Performance: 2.7261605909323094
Episode: 8   Performance: -252.31593668445004
Episode: 9   Performance: -146.0550995957277
Episode: 10   Performance: -101.71053482620184
Episode: 11   Performance: -152.0524397100358
Episode: 12   Performance: -145.84569049219778
Episode: 13   Performance: -237.9503321010779
Episode: 14   Performance: -144.07934712517468
Episode: 15   Performance: 146.07785008325976
Episode: 16   Performance: -243.7225294973615
Episode: 17   Performance: -238.80471284840735
Episode: 18   Performance: -77.17386072607324
Episode: 19   Performance: -105.40357600370334
Episode: 20   Performance: -119.85869102584454
Episode: 21   Performance: -107.84

Episode: 176   Performance: -3.5459976352660334
Episode: 177   Performance: 84.73347509265832
Episode: 178   Performance: 110.25230875147098
Episode: 179   Performance: 116.0488873464355
Episode: 180   Performance: 200.78321017531266
Episode: 181   Performance: 192.00849624446198
Episode: 182   Performance: 225.03041332650142
Episode: 183   Performance: 165.57464575331588
Episode: 184   Performance: 136.22997158954
Episode: 185   Performance: 170.8957463592492
Episode: 186   Performance: 19.232486573861465
Episode: 187   Performance: 116.88772013120976
Episode: 188   Performance: -29.280390344248968
Episode: 189   Performance: -7.1018309647018745
Episode: 190   Performance: -7.000596029251546
Episode: 191   Performance: -37.935426318842474
Episode: 192   Performance: -61.59498865513602
Episode: 193   Performance: -53.961935446428285
Episode: 194   Performance: 2.1296081546156227
Episode: 195   Performance: 4.090225143900007
Episode: 196   Performance: 204.78193316644283
Episode: 197   

KeyboardInterrupt: 

In [8]:
# Write the policy network's state_dict (its weights) to "LunarLander_policy.pt".
torch.save(policy_net.state_dict(), "LunarLander_policy.pt")

In [9]:
@torch.no_grad()
def make_action(observation):
    """Return the greedy action of policy_net for a single observation.

    Args:
        observation: one unbatched observation (e.g. the array returned by
            env.reset / env.step).

    Returns:
        The index of the largest network output, as a Python int.
    """
    # Build the input on the same device and with the same dtype as the
    # network's parameters; a CPU tensor fed to a CUDA model would raise.
    reference = next(policy_net.parameters())
    batch = torch.as_tensor(
        observation, dtype=reference.dtype, device=reference.device
    ).unsqueeze(0)  # add the batch dimension the network's max(1) expects
    action = policy_net(batch).max(1)[1]
    return action.item()

In [13]:
# Roll out the policy greedily in a rendered environment for one episode
# (at most 2000 steps). Note that this rebinds the global `env`.
env = gym.make("LunarLander-v2", render_mode="human")
try:
    observation, info = env.reset()
    for _ in range(2000):
        action = make_action(observation)
        observation, reward, terminated, truncated, info = env.step(action)

        if terminated or truncated:
            break
finally:
    # Always release the environment, even if the rollout is interrupted
    # or raises.
    env.close()
