# Defining the DQN Agent

In [23]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque, namedtuple
import random
from torch.utils.tensorboard import SummaryWriter
import time

# Record type for one replay-buffer entry: a single environment step.
Transition = namedtuple(
    'Transition',
    [
        'state',
        'action',
        'reward',
        'next_state',
        'done',
    ],
)

class DQNAgent:
    """DQN agent with a target network, replay buffer and Boltzmann exploration.

    The online network is trained with Adam on an MSE TD loss; the target
    network is a frozen copy that is hard-synced every ``target_update_freq``
    training steps. Actions are sampled from a softmax over the Q-values whose
    temperature ``tau`` is decayed by ``update_tau``.

    Note: ``tau`` here is the softmax temperature, not a soft-update
    (Polyak) coefficient.
    """

    def __init__(self, state_dim, action_dim, lr=3e-5, gamma=0.99, tau=1.5,
                 tau_min=0.05, tau_decay=0.998, batch_size=128,
                 buffer_size=100000, target_update_freq=1000, reward_clip=1.0):
        """Build the networks, optimizer, replay buffer and TensorBoard writer.

        Args:
            state_dim: Size of the observation vector fed to the network.
            action_dim: Number of discrete actions (network output size).
            lr: Adam learning rate.
            gamma: Discount factor used in the TD target.
            tau: Initial softmax temperature for action sampling.
            tau_min: Lower bound the temperature is never decayed below.
            tau_decay: Multiplicative factor applied by ``update_tau``.
            batch_size: Number of transitions sampled per training step.
            buffer_size: Maximum number of transitions kept in the buffer.
            target_update_freq: Training steps between target-network syncs.
            reward_clip: Stored rewards are clipped to
                ``[-reward_clip, reward_clip]``; ``None`` disables clipping.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.tau = tau
        self.tau_min = tau_min
        self.tau_decay = tau_decay
        self.reward_clip = reward_clip

        # Online network and a structurally identical target network.
        self.model = self._build_network(state_dim, action_dim)
        self.target_model = self._build_network(state_dim, action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()  # Never trained directly, only synced.

        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

        self.replay_buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.train_step_count = 0
        self.target_update_freq = target_update_freq  # in training steps
        self.writer = SummaryWriter(f"runs/lunar_lander_boltzmann_{time.time()}")

    @staticmethod
    def _build_network(state_dim, action_dim):
        """Return the Q-network: two 256-unit ReLU layers and a linear head."""
        return nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, action_dim)
        )

    def get_action(self, state, eval_mode=False):
        """Choose an action for a single state.

        Args:
            state: One observation (array-like of length ``state_dim``).
            eval_mode: If true, return the greedy (argmax-Q) action;
                otherwise sample from the softmax over ``Q / tau``.

        Returns:
            The selected action index as a plain ``int``.
        """
        state_t = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            # squeeze(0) drops only the batch axis, so a one-action network
            # still yields a 1-D vector. float64 keeps the probability sum
            # accurate for np.random.choice.
            q_values = self.model(state_t).squeeze(0).numpy().astype(np.float64)

        if eval_mode:
            return int(np.argmax(q_values))

        # Subtract the max before exponentiating for numerical stability.
        exp_q = np.exp((q_values - np.max(q_values)) / self.tau)
        probs = exp_q / np.sum(exp_q)
        return int(np.random.choice(self.action_dim, p=probs))

    def update_tau(self):
        """Decay the temperature by ``tau_decay``, bounded below by ``tau_min``."""
        self.tau = max(self.tau_min, self.tau * self.tau_decay)

    def store_transition(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer, clipping the reward."""
        if self.reward_clip is not None:
            reward = np.clip(reward, -self.reward_clip, self.reward_clip)
        self.replay_buffer.append(Transition(state, action, reward, next_state, done))

    def sample_batch(self):
        """Sample ``batch_size`` transitions uniformly without replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of
            tensors; ``actions`` is int64, the others are float32.

        Raises:
            ValueError: If the buffer holds fewer than ``batch_size`` items
                (raised by ``random.sample``).
        """
        batch = random.sample(self.replay_buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            torch.as_tensor(np.array(states), dtype=torch.float32),
            torch.as_tensor(np.array(actions), dtype=torch.int64),
            torch.as_tensor(np.array(rewards), dtype=torch.float32),
            torch.as_tensor(np.array(next_states), dtype=torch.float32),
            torch.as_tensor(np.array(dones), dtype=torch.float32),
        )

    def train_step(self):
        """Run one gradient step on a sampled batch and log to TensorBoard.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. Syncs the target network every ``target_update_freq``
        training steps.
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        states, actions, rewards, next_states, dones = self.sample_batch()

        # Q(s, a) for the actions actually taken. squeeze(1) removes only the
        # gathered column, so a batch of one keeps shape (1,).
        current_q = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # TD target from the frozen target network; (1 - done) stops
        # bootstrapping past terminal transitions.
        with torch.no_grad():
            next_q = self.target_model(next_states).max(1)[0]
            target_q = rewards + (1 - dones) * self.gamma * next_q

        loss = self.loss_fn(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optimizer.step()

        self.train_step_count += 1

        # Periodic hard update of the target network.
        if self.train_step_count % self.target_update_freq == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        self.writer.add_scalar('Loss/train', loss.item(), self.train_step_count)
        self.writer.add_scalar('Temperature', self.tau, self.train_step_count)

    def test_episode(self, env, render=False):
        """Play one greedy episode and return its undiscounted total reward.

        Args:
            env: Environment whose ``reset()`` returns ``(obs, info)`` and
                whose ``step()`` returns
                ``(obs, reward, terminated, truncated, info)``.
            render: If true, call ``env.render()`` before every step.

        Returns:
            Sum of the rewards collected until the episode terminates or
            is truncated.
        """
        state, _ = env.reset()
        total_reward = 0

        while True:
            if render:
                env.render()
            action = self.get_action(state, eval_mode=True)
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            # Either flag ends the episode.
            if terminated or truncated:
                break

        return total_reward


# Training Loop

In [24]:
# Train a Boltzmann-exploration DQN agent on LunarLander and periodically
# evaluate it greedily, logging rewards to TensorBoard.
env = gym.make("LunarLander-v3")
agent = DQNAgent(
    state_dim=env.observation_space.shape[0],
    action_dim=env.action_space.n,
    tau=1.0,        # Initial temperature
    tau_min=0.01,    # Minimum temperature
    tau_decay=0.995  # Decay rate
)

episodes = 1000
print_interval = 20

try:
    for ep in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        terminated = False
        truncated = False
        step_count = 0

        # An episode ends on either flag. Looping on `terminated` alone keeps
        # stepping the environment after a time-limit truncation.
        while not (terminated or truncated):
            action = agent.get_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)

            # Reward shaping - encourage staying centered: the bonus is
            # positive when |state[0]| shrinks over the step.
            shaped_reward = reward + 0.1*(abs(state[0]) - abs(next_state[0]))

            # Store only `terminated` as the done flag so transitions cut off
            # by truncation still bootstrap from the next state.
            agent.store_transition(state, action, shaped_reward, next_state, terminated)

            # Train every 4 steps
            if step_count % 4 == 0:
                agent.train_step()

            state = next_state
            total_reward += reward  # log the unshaped environment reward
            step_count += 1

        agent.update_tau()
        agent.writer.add_scalar('Reward/train', total_reward, ep)

        if ep % print_interval == 0:
            test_reward = agent.test_episode(env)
            print(f"Episode {ep}: Train Reward = {total_reward:.1f}, Test Reward = {test_reward:.1f}, Tau = {agent.tau:.3f}")
            agent.writer.add_scalar('Reward/test', test_reward, ep)
finally:
    # Release the environment and flush TensorBoard logs even if training
    # is interrupted.
    env.close()
    agent.writer.close()

Episode 0: Train Reward = -345.9, Test Reward = -131.6, Tau = 0.995
Episode 20: Train Reward = -106.7, Test Reward = -384.0, Tau = 0.900
Episode 40: Train Reward = -258.5, Test Reward = -783.2, Tau = 0.814
Episode 60: Train Reward = -172.7, Test Reward = -363.2, Tau = 0.737
Episode 80: Train Reward = -262.5, Test Reward = -144.6, Tau = 0.666
Episode 100: Train Reward = -214.9, Test Reward = -157.7, Tau = 0.603
Episode 120: Train Reward = -58.8, Test Reward = -749.2, Tau = 0.545
Episode 140: Train Reward = -258.4, Test Reward = -757.1, Tau = 0.493
Episode 160: Train Reward = -276.1, Test Reward = -165.8, Tau = 0.446
Episode 180: Train Reward = -370.7, Test Reward = -478.8, Tau = 0.404
Episode 200: Train Reward = -57.7, Test Reward = -175.1, Tau = 0.365
Episode 220: Train Reward = -166.4, Test Reward = -235.9, Tau = 0.330
Episode 240: Train Reward = -140.6, Test Reward = -177.7, Tau = 0.299
Episode 260: Train Reward = -150.0, Test Reward = -158.5, Tau = 0.270
Episode 280: Train Reward = 

In [1]:
%load_ext tensorboard
%tensorboard --logdir=runs/

Launching TensorBoard...