## **Double DQN**

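A variant of DQN that reduces overestimation bias by decoupling action selection from action evaluation: the online network picks the greedy next action, and the target network estimates that action's value.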

**Import**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import numpy as np
from collections import deque


**Define the Q-Network**

In [None]:
class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one Q-value per action.

    Args:
        input_size: Number of features in a state vector.
        output_size: Number of discrete actions (one output per action).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_size)

    def forward(self, x):
        """Return the Q-values for `x`, whose last dimension must be `input_size`."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        # No activation on the output layer: Q-values are unbounded.
        return self.fc3(hidden)


**Initialize the Environment and Agent**

In [None]:
# Environment: network input/output sizes are read from its observation and action spaces.
env = gym.make('CartPole-v1')
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

# Online network: the one being trained, so the optimizer is bound to its parameters only.
online_net = QNetwork(input_size, output_size)
optimizer = optim.Adam(online_net.parameters(), lr=0.001)

# Target network: starts as an exact copy of the online network and is kept in eval mode.
target_net = QNetwork(input_size, output_size)
target_net.load_state_dict(online_net.state_dict())
target_net.eval()


**Experience Replay Buffer and Hyperparameters**

In [None]:
# Learning and exploration hyperparameters.
gamma = 0.99     # discount factor
epsilon = 0.1    # exploration probability
batch_size = 64  # minibatch size

# Bounded transition store: once 10,000 entries are held, appending drops the oldest one.
replay_buffer = deque(maxlen=10_000)


**Training Loop**

In [None]:
# Double DQN training loop.
#
# Each environment step stores one transition in the replay buffer and, once the
# buffer holds at least `batch_size` transitions, performs one gradient step on
# the online network. The target network is synchronised with the online
# network every 10 episodes.

# The loss module is stateless, so build it once instead of on every gradient step.
loss_fn = nn.MSELoss()

for episode in range(1000):
    # gym >= 0.26 returns (observation, info) from reset(); older versions
    # return the observation alone.
    reset_result = env.reset()
    observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = torch.tensor(observation, dtype=torch.float32)
    done = False
    total_reward = 0

    while not done:
        # Epsilon-greedy action selection using the online network.
        if np.random.rand() < epsilon:
            action = int(np.random.choice(output_size))
        else:
            with torch.no_grad():
                action = online_net(state).argmax().item()

        # gym >= 0.26 returns (obs, reward, terminated, truncated, info);
        # older versions return (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result
            # The legacy API reports a single flag, so treat it as termination.
            terminated = done
        next_state = torch.tensor(next_state, dtype=torch.float32)
        reward = float(reward)

        # Store `terminated` rather than `done`: a time-limit truncation ends the
        # episode but should still bootstrap from the next state's value.
        replay_buffer.append((state, action, reward, next_state, bool(terminated)))
        state = next_state
        total_reward += reward

        if len(replay_buffer) >= batch_size:
            # Sample a minibatch of distinct transitions uniformly at random.
            batch = np.random.choice(len(replay_buffer), batch_size, replace=False)
            states, actions, rewards, next_states, dones = zip(*[replay_buffer[idx] for idx in batch])

            states = torch.stack(states)
            actions = torch.tensor(actions, dtype=torch.int64)
            rewards = torch.tensor(rewards, dtype=torch.float32)
            next_states = torch.stack(next_states)
            # Float mask: `1 - dones` is not defined for bool tensors.
            dones = torch.tensor(dones, dtype=torch.float32)

            # Q(s, a) of the actions that were actually taken.
            q_value = online_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

            # Double DQN target: the online network selects the greedy next
            # action and the target network evaluates it. No gradient is needed
            # for the target, so it is computed under no_grad.
            with torch.no_grad():
                best_next_actions = online_net(next_states).argmax(dim=1, keepdim=True)
                next_q_value = target_net(next_states).gather(1, best_next_actions).squeeze(1)
                target_q_value = rewards + (1 - dones) * gamma * next_q_value

            loss = loss_fn(q_value, target_q_value)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Periodically copy the online weights into the target network and report progress.
    if episode % 10 == 0:
        target_net.load_state_dict(online_net.state_dict())
        print(f'Episode {episode}, Total Reward: {total_reward}')


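**Example Output** (illustrative only; exact rewards vary from run to run because exploration and minibatch sampling are random)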

In [None]:
Episode 0, Total Reward: 21.0
Episode 10, Total Reward: 35.0
Episode 20, Total Reward: 50.0
...
