# Week 8, Day 3: Deep Q-Networks (DQN)

## Learning Objectives
- Understand DQN architecture
- Learn experience replay
- Master target networks
- Practice implementing DQN

## Topics Covered
1. Neural Network Q-Learning
2. Experience Replay
3. Target Networks
4. DQN Variants

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import gym
from collections import deque
import random

## 1. DQN Implementation

In [None]:
class DQN:
    """Deep Q-Network agent with experience replay and a target network.

    The online network (``model``) chooses actions and is trained on
    minibatches sampled from a bounded replay memory. A second network with
    the same architecture (``target_model``) supplies the bootstrap values
    and only changes when ``update_target_model`` copies the online weights
    into it.
    """

    def __init__(self, state_size, action_size):
        """Set up hyperparameters, the replay memory and both networks.

        Args:
            state_size: Number of features in a single observation.
            action_size: Number of discrete actions the agent can take.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest transitions drop out first
        self.gamma = 0.95                 # discount applied to next-state value
        self.epsilon = 1.0                # probability of a random action
        self.epsilon_min = 0.01           # epsilon stops decaying at this floor
        self.epsilon_decay = 0.995        # multiplier applied after each replay
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        # Start with both networks in agreement.
        self.update_target_model()

    def _build_model(self):
        """Build and compile the Q-network (two 24-unit ReLU layers, linear head)."""
        model = tf.keras.Sequential([
            tf.keras.Input(shape=(self.state_size,)),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        # `learning_rate` is the supported keyword; the legacy `lr` alias is
        # deprecated and rejected by recent Keras releases.
        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        model.compile(loss='mse', optimizer=optimizer)
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: Observation batch containing a single state; only the
                first row of the prediction is used.

        Returns:
            Index of the chosen action.
        """
        if random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 avoids a progress bar being printed on every step.
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def _td_targets(self, q_values, next_q_values, actions, rewards, dones):
        """Write the TD targets for the taken actions into ``q_values``.

        Only the entry of the action actually taken is replaced in each row,
        so the other actions contribute zero error to the MSE loss.

        Args:
            q_values: Online-network predictions, one row per transition.
                Modified in place.
            next_q_values: Target-network predictions for the next states.
            actions: Integer action index per transition.
            rewards: Reward per transition.
            dones: Boolean terminal flag per transition.

        Returns:
            The updated ``q_values`` array.
        """
        rows = np.arange(len(actions))
        # Terminal transitions have no future value to bootstrap from.
        bootstrap = np.where(dones, 0.0, np.amax(next_q_values, axis=1))
        q_values[rows, actions] = rewards + self.gamma * bootstrap
        return q_values

    def replay(self, batch_size):
        """Train the online network on one random minibatch from memory.

        Does nothing (and leaves epsilon untouched) while the memory holds
        fewer than ``batch_size`` transitions.

        Args:
            batch_size: Number of transitions to sample.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Explicit reshape instead of np.squeeze: squeeze would also drop the
        # batch axis when batch_size == 1 (or the feature axis when
        # state_size == 1).
        states = np.asarray(states).reshape(batch_size, self.state_size)
        next_states = np.asarray(next_states).reshape(batch_size, self.state_size)
        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=bool)

        targets = self.model.predict(states, verbose=0)
        target_next = self.target_model.predict(next_states, verbose=0)
        targets = self._td_targets(targets, target_next, actions, rewards, dones)

        # One gradient step over the whole minibatch.
        self.model.fit(states, targets, batch_size=batch_size, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

## 2. Training DQN

In [None]:
def train_dqn():
    """Train a DQN agent on CartPole-v1 and return the per-episode scores.

    Runs 100 episodes of at most 500 steps. After every non-final step the
    agent trains on a minibatch of 32 once the replay memory holds more than
    32 transitions; the target network is synced at the end of each episode.

    Returns:
        list: One score per episode (index of the last step taken).
    """
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQN(state_size, action_size)
    batch_size = 32
    episodes = 100
    max_steps = 500

    scores = []

    try:
        for episode in range(episodes):
            # gym >= 0.26 returns (observation, info); older versions return
            # the observation alone.
            reset_result = env.reset()
            state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
            state = np.reshape(state, [1, state_size])

            for step in range(max_steps):
                action = agent.act(state)

                # gym >= 0.26 returns 5 values (terminated/truncated split);
                # older versions return 4 and flag time-limit truncation in
                # the info dict.
                step_result = env.step(action)
                if len(step_result) == 5:
                    next_state, reward, terminated, truncated, _ = step_result
                else:
                    next_state, reward, done, info = step_result
                    truncated = bool(done and info.get('TimeLimit.truncated', False))
                    terminated = bool(done and not truncated)
                done = terminated or truncated

                # Penalise only real failures: hitting the time limit means
                # the pole was still balanced.
                reward = reward if not terminated else -10
                next_state = np.reshape(next_state, [1, state_size])
                # Store `terminated` so truncated transitions still bootstrap.
                agent.remember(state, action, reward, next_state, terminated)
                state = next_state

                if done:
                    break

                if len(agent.memory) > batch_size:
                    agent.replay(batch_size)

            # Runs whether the episode ended or the step budget ran out, so
            # every episode contributes exactly one score.
            agent.update_target_model()
            scores.append(step)
            print(f"Episode: {episode+1}/{episodes}, Score: {step}, Epsilon: {agent.epsilon:.2}")
    finally:
        env.close()

    return scores

# Train the agent; keep the per-episode scores for plotting.
scores = train_dqn()

# Plot results
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(scores)
ax.set_title('DQN Training Progress')
ax.set_xlabel('Episode')
ax.set_ylabel('Score')
plt.show()

## 3. DQN Variants

In [None]:
class DoubleDQN(DQN):
    """DQN variant that decouples action selection from action evaluation.

    The online network picks the best next action and the target network
    supplies that action's value, instead of taking the target network's own
    maximum.
    """

    def replay(self, batch_size):
        """Train the online network on one minibatch using Double DQN targets.

        Does nothing (and leaves epsilon untouched) while the memory holds
        fewer than ``batch_size`` transitions.

        Args:
            batch_size: Number of transitions to sample.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Explicit reshape instead of np.squeeze: squeeze would also drop the
        # batch axis when batch_size == 1 (or the feature axis when
        # state_size == 1).
        states = np.asarray(states).reshape(batch_size, self.state_size)
        next_states = np.asarray(next_states).reshape(batch_size, self.state_size)
        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=bool)
        rows = np.arange(batch_size)

        targets = self.model.predict(states, verbose=0)

        # Double DQN: Use online network for action selection
        next_actions = np.argmax(self.model.predict(next_states, verbose=0), axis=1)
        # Use target network for value estimation
        target_next = self.target_model.predict(next_states, verbose=0)

        # Terminal transitions have no future value to bootstrap from.
        bootstrap = np.where(dones, 0.0, target_next[rows, next_actions])
        targets[rows, actions] = rewards + self.gamma * bootstrap

        # One gradient step over the whole minibatch.
        self.model.fit(states, targets, batch_size=batch_size, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

## 4. Comparing DQN Variants

In [None]:
def compare_dqn_variants():
    """Train DQN and Double DQN on CartPole-v1 and plot their learning curves.

    Both agents are trained in the same environment instance, DQN first.
    The plot shows the 10-episode rolling mean of each agent's scores.
    """
    env = gym.make('CartPole-v1')
    n_features = env.observation_space.shape[0]
    n_actions = env.action_space.n

    # Each agent is constructed immediately before it is trained.
    history = {}
    for label, agent_cls in (("DQN", DQN), ("Double DQN", DoubleDQN)):
        agent = agent_cls(n_features, n_actions)
        history[label] = train_agent(env, agent, label)

    # Plot comparison
    plt.figure(figsize=(12, 6))
    for label, episode_scores in history.items():
        smoothed = pd.Series(episode_scores).rolling(10).mean()
        plt.plot(smoothed, label=label)
    plt.title('DQN vs Double DQN')
    plt.xlabel('Episode')
    plt.ylabel('Score')
    plt.legend()
    plt.show()

def train_agent(env, agent, name, episodes=100):
    """Train ``agent`` in ``env`` and return the per-episode scores.

    Each episode runs for at most 500 steps. After every non-final step the
    agent trains on a minibatch of 32 once its replay memory holds more than
    32 transitions; the target network is synced at the end of each episode.

    Args:
        env: Gym environment; both the 4-tuple and 5-tuple ``step`` APIs are
            accepted. The caller keeps ownership and must close it.
        agent: Object providing ``act``, ``remember``, ``replay``,
            ``update_target_model`` and a ``memory`` with a length.
        name: Label prefixed to the progress line printed per episode.
        episodes: Number of episodes to run.

    Returns:
        list: One score per episode (index of the last step taken).
    """
    scores = []
    batch_size = 32
    max_steps = 500
    state_size = env.observation_space.shape[0]

    for episode in range(episodes):
        # gym >= 0.26 returns (observation, info); older versions return the
        # observation alone.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        state = np.reshape(state, [1, state_size])

        for step in range(max_steps):
            action = agent.act(state)

            # gym >= 0.26 returns 5 values (terminated/truncated split);
            # older versions return 4 and flag time-limit truncation in the
            # info dict.
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
            else:
                next_state, reward, done, info = step_result
                truncated = bool(done and info.get('TimeLimit.truncated', False))
                terminated = bool(done and not truncated)
            done = terminated or truncated

            # Penalise only real failures: hitting the time limit means the
            # episode was still going well.
            reward = reward if not terminated else -10
            next_state = np.reshape(next_state, [1, state_size])
            # Store `terminated` so truncated transitions still bootstrap.
            agent.remember(state, action, reward, next_state, terminated)
            state = next_state

            if done:
                break

            if len(agent.memory) > batch_size:
                agent.replay(batch_size)

        # Runs whether the episode ended or the step budget ran out, so every
        # episode contributes exactly one score.
        agent.update_target_model()
        scores.append(step)
        print(f"{name} Episode: {episode+1}/{episodes}, Score: {step}")

    return scores

# Run the comparison: trains a DQN and a Double DQN agent, then plots both curves.
compare_dqn_variants()

## Practical Exercises

In [None]:
# Exercise 1: Implement DQN

def dqn_exercise():
    """Print the task checklist for Exercise 1 (basic DQN)."""
    checklist = (
        "Task: Implement basic DQN",
        "1. Create neural network",
        "2. Implement experience replay",
        "3. Add target network",
        "4. Train and evaluate",
    )
    print("\n".join(checklist))

    # Your code here

dqn_exercise()

In [None]:
# Exercise 2: Implement Double DQN

def double_dqn_exercise():
    """Print the task checklist for Exercise 2 (Double DQN)."""
    checklist = (
        "Task: Implement Double DQN",
        "1. Modify DQN architecture",
        "2. Update target calculation",
        "3. Train model",
        "4. Compare with basic DQN",
    )
    print("\n".join(checklist))

    # Your code here

double_dqn_exercise()

## MCQ Quiz

1. What is DQN?
   - a) Policy gradient method
   - b) Deep Q-learning
   - c) Model-based method
   - d) Supervised learning

2. What is experience replay?
   - a) Model architecture
   - b) Memory mechanism
   - c) Learning rate
   - d) Activation function

3. What is a target network?
   - a) Main network
   - b) A periodically synced copy of the Q-network that provides stable target values
   - c) Policy network
   - d) Feature extractor

4. What is Double DQN?
   - a) Training two unrelated agents in parallel
   - b) Selecting the next action with the online network and evaluating it with the target network
   - c) Learning rate method
   - d) Memory mechanism

5. What is the epsilon-greedy strategy in DQN?
   - a) Learning rate
   - b) Exploration method
   - c) Network architecture
   - d) Loss function

6. What is the purpose of target network?
   - a) Faster learning
   - b) Stable learning
   - c) Better exploration
   - d) Memory management

7. What is the replay buffer?
   - a) Neural network
   - b) Experience storage
   - c) Action selection
   - d) Value function

8. What is bootstrapping in DQN?
   - a) Memory mechanism
   - b) Updating a value estimate using the estimated value of the next state
   - c) Network architecture
   - d) Action selection

9. What is the advantage of Double DQN?
   - a) Faster training
   - b) Reduced overestimation
   - c) Less memory usage
   - d) Simpler architecture

10. What is the role of batch size in DQN?
    - a) Network architecture
    - b) Training stability
    - c) Action selection
    - d) Memory size

Answers: 1-b, 2-b, 3-b, 4-b, 5-b, 6-b, 7-b, 8-b, 9-b, 10-b