## **Deep Q-Networks (DQN)**

A value-based RL algorithm that uses a neural network to approximate Q-values for discrete actions.

**Import**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
import numpy as np


**Define the DQN Model**

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network mapping a state vector to per-action Q-values.

    The network is two hidden layers of 128 ReLU units followed by a linear
    output layer with one unit per action.

    Args:
        input_size: Number of features in a state vector.
        output_size: Number of discrete actions (one Q-value each).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_size)

    def forward(self, x):
        """Return the Q-values for state(s) ``x``; the last dim is per action."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        # No activation on the output: Q-values are unbounded real numbers.
        return self.fc3(hidden)


**Initialize Environment and Model**

In [None]:
env = gym.make('CartPole-v1')

# Read the network dimensions from the environment instead of hard-coding
# them, so changing the environment id above does not silently leave the
# networks with mismatched input/output sizes. This assumes a flat (1-D)
# observation vector and a discrete action space, as CartPole provides.
state_dim = int(env.observation_space.shape[0])
n_actions = int(env.action_space.n)

# Online network (trained) and target network (periodically synced copy).
model = DQN(input_size=state_dim, output_size=n_actions)
target_model = DQN(input_size=state_dim, output_size=n_actions)
target_model.load_state_dict(model.state_dict())

# Only the online network's parameters are optimised.
optimizer = optim.Adam(model.parameters(), lr=0.001)


**Experience Replay Buffer**

In [None]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions for experience replay.

    Transitions are stored as ``(state, action, reward, next_state, done)``
    tuples. Once ``capacity`` transitions are stored, each new one overwrites
    the oldest.

    Args:
        capacity: Maximum number of transitions to keep; must be positive.

    Raises:
        ValueError: If ``capacity`` is not positive.
    """

    def __init__(self, capacity):
        if capacity <= 0:
            raise ValueError("capacity must be a positive integer")
        self.buffer = []
        self.capacity = capacity
        self.position = 0  # index of the next slot to write

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest when the buffer is full."""
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            # Still filling up: position always equals len(buffer) here.
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions, chosen uniformly.

        Returns:
            A list of ``(state, action, reward, next_state, done)`` tuples.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``np.random.choice``).
        """
        # Sample indices rather than the transitions themselves:
        # np.random.choice needs a 1-D input, and a list of tuples is
        # converted to a multi-dimensional array and rejected.
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        return [self.buffer[i] for i in indices]

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


**Training Loop**

In [None]:
def train(env, model, target_model, optimizer, episodes=1000, batch_size=64, gamma=0.99):
    """Train ``model`` with DQN using experience replay and a target network.

    Each environment step is stored in a replay buffer. Once the buffer holds
    more than ``batch_size`` transitions, every step also performs one
    gradient update on a random minibatch, regressing ``Q(s, a)`` towards
    ``r + gamma * max_a' Q_target(s', a')``.

    Args:
        env: Gymnasium-style environment; ``reset()`` returns ``(obs, info)``
            and ``step()`` returns
            ``(obs, reward, terminated, truncated, info)``.
        model: Online Q-network that is optimised.
        target_model: Target Q-network, synced from ``model`` every 10 episodes.
        optimizer: Optimiser over ``model``'s parameters.
        episodes: Number of episodes to run.
        batch_size: Minibatch size for each gradient update.
        gamma: Discount factor.
    """
    replay_buffer = ReplayBuffer(10000)
    for episode in range(episodes):
        state, _ = env.reset()
        state = torch.tensor(state, dtype=torch.float32)
        done = False
        total_reward = 0
        while not done:
            action = select_action(state, model)
            next_state, reward, terminated, truncated, _ = env.step(action)
            # The episode ends on either signal; ignoring `truncated` would
            # keep stepping an environment that has hit its time limit.
            done = terminated or truncated
            next_state = torch.tensor(next_state, dtype=torch.float32)
            # Store only `terminated` as the done flag: a time-limit
            # truncation is not a real terminal state, so the target should
            # still bootstrap from the next state's value.
            replay_buffer.push(state, action, reward, next_state, terminated)
            state = next_state
            total_reward += reward
            if len(replay_buffer) > batch_size:
                experiences = replay_buffer.sample(batch_size)
                states, actions, rewards, next_states, dones = zip(*experiences)
                # States are already tensors, so they must be stacked;
                # torch.tensor() cannot build a tensor from a tuple of tensors.
                states = torch.stack(states)
                next_states = torch.stack(next_states)
                actions = torch.tensor(
                    [int(a) for a in actions], dtype=torch.int64
                ).unsqueeze(1)
                rewards = torch.tensor(
                    [float(r) for r in rewards], dtype=torch.float32
                )
                dones = torch.tensor(
                    [float(d) for d in dones], dtype=torch.float32
                )

                # Q(s, a) for the actions that were actually taken.
                q_values = model(states).gather(1, actions).squeeze(1)
                # Targets come from the frozen target network and carry no
                # gradient; terminal transitions do not bootstrap.
                with torch.no_grad():
                    next_q = target_model(next_states).max(dim=1).values
                    targets = rewards + gamma * next_q * (1.0 - dones)

                loss = torch.nn.functional.mse_loss(q_values, targets)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        if episode % 10 == 0:
            target_model.load_state_dict(model.state_dict())
        print(f"Episode {episode}, Total Reward: {total_reward}")


**Action Selection**

In [None]:
def select_action(state, model, epsilon=0.1, num_actions=2):
    """Pick an action with an epsilon-greedy policy.

    Args:
        state: State tensor passed to ``model``.
        model: Callable returning the Q-values for ``state``.
        epsilon: Probability of taking a uniformly random action.
        num_actions: Size of the discrete action space used for exploration.
            The default of 2 matches the previously hard-coded action set
            ``[0, 1]``.

    Returns:
        The chosen action index as a Python ``int``.
    """
    if np.random.rand() < epsilon:
        # Explore: an integer first argument samples uniformly from
        # range(num_actions). Cast so both branches return a plain int.
        return int(np.random.choice(num_actions))
    # Exploit: greedy action under the current Q-estimates, no gradients.
    with torch.no_grad():
        q_values = model(state)
        return torch.argmax(q_values).item()


**Run Training**

In [None]:
# Start training with the default hyperparameters defined by train().
train(env=env, model=model, target_model=target_model, optimizer=optimizer)
