# DQN Implmentation

### Installations

In [2]:
# # Install environment and agent
# !pip install highway-env
# !pip install --upgrade sympy torch


### Learning using existing model

The following is the pesudocode that will be followed when creating the DQN

Useful: https://www.youtube.com/watch?v=RVMpm86equc&list=PL58zEckBH8fCMIVzQCRSZVPUp3ZAVagWi&index=2

https://github.com/saashanair/rl-series/tree/master/dqn

https://github.com/johnnycode8/gym_solutions/blob/main/frozen_lake_dql.py

<img src="DQN.png" style="width: 900px;" align="left"/>


Potential Problems: https://www.reddit.com/r/reinforcementlearning/comments/1555wgi/dqn_loss_increasing_and_rewards_decreasing/


For CNN:

https://www.reddit.com/r/MachineLearning/comments/3l5qu7/rules_of_thumb_for_cnn_architectures/


In [3]:
import gymnasium as gym
import highway_env
import numpy as np
import random
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
import torch.nn.init as init

# the paper: https://arxiv.org/pdf/1706.10295
class NoisyLayer(nn.Module):
    #sigma is σi,j for all param where 3.2 INITIALISATION OF NOISY NETWORKS in the paper 0.017(for indimendent gaussain distri)
    def __init__(self, in_features, out_features, sigmaparam=0.4):
        super(NoisyLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.sigmaparam = sigmaparam

        # learnable parameters (sigma and mu)
        self.weight_mu = nn.Parameter(torch.empty(out_features, in_features))
        self.bias_mu = nn.Parameter(torch.empty(out_features))
        self.sigma_weight = nn.Parameter(torch.empty(out_features, in_features))
        self.sigma_bias = nn.Parameter(torch.empty(out_features))

        # noise param using a distribution
        # register buffer so it persists through gradient descent updates, as epsilon doesn't change 
        self.register_buffer("epsilon_weight", torch.empty(out_features, in_features))
        self.register_buffer("epsilon_bias", torch.empty(out_features))
        
        self.reset_parameters()
        self.factorized_noise()


    def reset_parameters(self):
      # xavier uniform due to DQN sigma activation function for actions
        mu_range = 1 / (self.in_features ** 0.5)
        self.weight_mu.data.uniform_(-mu_range, mu_range)
        self.bias_mu.data.uniform_(-mu_range, mu_range)

        self.sigma_weight.data.fill_(self.sigmaparam)
        self.sigma_bias.data.fill_(self.sigmaparam)
      
    def noise(self,size):
        """generate noise (factorized gaussian noise): f(x) = sign(x) * sqrt(abs(x))"""
        factor_noise = torch.randn(size)
        return factor_noise.sign().mul_(factor_noise.abs().sqrt_()) 

    def factorized_noise(self):
        """Create a new noise"""
        epsilon_in = self.noise(self.in_features)
        epsilon_out = self.noise(self.out_features)
        self.epsilon_weight.copy_(epsilon_out.outer(epsilon_in))
        self.epsilon_bias.copy_(epsilon_out)
    def forward(self, x):
        '''
        Comparison between training and evaluation different differ, so it won't be handled differently here
        '''
        noisy_weight = self.weight_mu + (self.sigma_weight * self.epsilon_weight)
        noisy_bias = self.bias_mu + (self.sigma_bias * self.epsilon_bias)
        return F.linear(x, noisy_weight, noisy_bias)

# Define model
class MLPNetwork(nn.Module):
    def __init__(self, in_states, out_actions, vehicle_count, noisy_net=False):
        super(MLPNetwork, self).__init__()
        
        self.layers = nn.Sequential(
            nn.Linear(in_states, 128), 
            nn.ReLU(),
            nn.Linear(128, 128), 
            nn.ReLU(), 
            nn.Linear(128, out_actions),
            nn.ReLU(), 
        )

        self.out = nn.Linear(out_actions * vehicle_count, out_actions)
        
        self.noisy_net = noisy_net
        if self.noisy_net:
            self.noisy_layer = NoisyLayer(out_actions * vehicle_count, out_actions)

    def forward(self, x):
        x = self.layers(x)
        x = torch.flatten(x, start_dim=1)
        if self.noisy_net:
            x = self.noisy_layer(x)
        else:
            x = self.out(x)
        return x
    
    def reset_noise(self):
        self.noisy_layer.factorized_noise()

class CNN(nn.Module):
    def __init__(self, input_shape, num_actions, noisy_net=False):
        super(CNN, self).__init__()
        # greyscale Image is(stack,height,width)
        stack, height, width = input_shape
        self.conv = nn.Sequential(
            nn.Conv2d(stack,16,kernel_size=4, stride=2, padding=1),
            nn.ReLU(),

            nn.Conv2d(16,32,kernel_size=3),
            nn.ReLU(),

            nn.Conv2d(32,64, kernel_size=2),
            nn.ReLU(),
        )
        
        # This is for finding the size to dense more robust compared to decision manually
        with torch.no_grad():
                # Torch uses(1,channels,height,width)
                test = torch.zeros(1, stack, height, width)
                find_conv_size = self.conv(test)
                conv_size = find_conv_size.numel()
        self.out1 = nn.Linear(conv_size,num_actions)
        
        self.noisy_net = noisy_net
        if self.noisy_net:
            self.noisy_layer1 = NoisyLayer(conv_size, 64)
            self.noisy_layer2 = NoisyLayer(64, num_actions)
    
    def forward(self,x):
        x = self.conv(x)
        x = torch.flatten(x, start_dim=1)
        if self.noisy_net:
            x = F.relu(self.noisy_layer1(x))
            x = self.noisy_layer2(x)
        else:
            x = self.out1(x)
        return x

    def reset_noise(self):
        """Reset noisy layers."""
        self.noisy_layer1.factorized_noise()
        self.noisy_layer2.factorized_noise()

In [4]:
from collections.__init__ import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'done'))

class ReplayMemory():
    def __init__(self, config, transition_type=Transition):
        self.capacity = config.get("capacity", 1000)
        self.index = 0
        self.transition_type = transition_type
        self.discount = config.get("discount", 0.99)

        self.memory = []
        self.device = config.get("device", torch.device("cpu"))
        self.n_steps = config.get("n_steps", 2)
    
    def store(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
            self.index = len(self.memory) - 1
        elif len(self.memory) > self.capacity:
            self.memory = self.memory[:self.capacity]
        # Faster than append and pop
        self.memory[self.index] = self.transition_type(*args)
        
        self.index = (self.index+1)%self.capacity # for circular memory

    def sample(self, batch_size, collapsed=True):
        if batch_size > len(self.memory):
            batch_size = len(self.memory)
            
        if self.n_steps <= 1:
            # Directly sample transitions
            memories = random.sample(self.memory, batch_size)
            return self.unwrap_transition(*memories)
        else:
            # Sample initial transition indexes
            indexes = random.sample(range(len(self.memory)), batch_size)
            # Get the batch of n-consecutive-transitions starting from sampled indexes
            all_transitions = [self.memory[i:i+self.n_steps] for i in indexes]
            
            memories = map(self.collapse_n_steps, all_transitions) if collapsed else all_transitions

            return self.unwrap_transition(*memories)
        
    def collapse_n_steps(self, transitions):
        state, action, next_state, reward, done = transitions[0]
        discount = self.discount
        for transition in transitions[1:]:
            if done:
                break
            else:
                _, _, next_state, reward, done = transition
                discount *= self.discount
                reward += discount * reward
        return state, action, next_state, reward, done
    
    def unwrap_transition(self, *transition):
        state, action, next_state, reward, done = zip(*transition)
        
        states = torch.from_numpy(np.array(state)).float().to(self.device)
        actions = torch.from_numpy(np.array(action)).to(self.device)
        next_states = torch.from_numpy(np.array(next_state)).float().to(self.device)
        rewards = torch.from_numpy(np.array(reward)).float().to(self.device)
        dones = torch.from_numpy(np.array(done)).to(self.device)

        return states, actions, next_states, rewards, dones 

from segment_tree import MinSegmentTree, SumSegmentTree

class PrioritizedReplayMemory(ReplayMemory):
    def __init__(self, config, transition_type=Transition):
        super().__init__(config)

        self.capacity = config.get("capacity", 1000)

        tree_size = 1
        while tree_size < self.capacity:
            tree_size *= 2
        
        # size needs to be power of 2
        self.sum_tree = SumSegmentTree(tree_size)
        self.min_tree = MinSegmentTree(tree_size)        
        
        self.index = 0
        self.transition_type = transition_type

        self.memory = []
        self.memory_indexes = []
        self.device = config.get("device", torch.device("cpu"))
        
        # emphasis on the priority
        self.alpha = config.get("alpha", 0.6)
        # control degree of correction B=1 fully corrects bias B=0 no correction
        self.beta = config.get("beta", 0.2) #  will go to 1
        self.max_priority = 1  # priority for new samples, init as eps
    
    def store(self, *args):
        # store the transition in memory
        super().store(*args)
        # add the index to the tree and the loss value
        self.sum_tree[self.index] = self.max_priority ** self.alpha
        self.min_tree[self.index] = self.max_priority ** self.alpha
        # circular
        self.index = (self.index + 1) % self.capacity
        
    def get_sample_weights(self):
        return np.array([self.calculate_weight(i) for i in self.memory_indexes])
    
    def sample(self, batch_size, collapsed=True):
        self.memory_indexes = self.sample_proportional_indexes(batch_size)
        memories = [self.memory[i] for i in self.memory_indexes]
        
        return super().unwrap_transition(*memories)
    
    def sample_proportional_indexes(self, batch_size):
        """
        proportional to the loss for this particular example in the Bellman update
        By adjusting the priorities, we introduce bias to data distribution
        To get the value of the weights for each sample using beta
        """
        indexes = []
        p_total = self.sum_tree.sum(0, len(self.memory) - 1)
        segment = p_total / batch_size
        
        for i in range(batch_size):
            a = segment * i
            b = segment * (i + 1)
            upperbound = random.uniform(a, b)
            idx = self.sum_tree.retrieve(upperbound)
            indexes.append(idx)
        return indexes
    
    def calculate_weight(self, index):
        """Calculate the weight of the experience at index using the paper."""
        sum_weights = self.sum_tree.sum()
        
        # get max weight
        p_min = self.min_tree.min() / sum_weights
        max_weight = (p_min * len(self.memory)) ** (-self.beta)
        
        # calculate weights
        p_sample = self.sum_tree[index] / sum_weights
        weight = (p_sample * len(self.memory)) ** (-self.beta)
        
        # normalize the weight
        return weight / max_weight
    
    def update_priorities(self, priorities):
        for index, priority in zip(self.memory_indexes, priorities):
            if priority < 0 or (index < 0 and index > len(self.memory)):
                return
            
            self.sum_tree[index] = priority ** self.alpha
            self.min_tree[index] = priority ** self.alpha

            self.max_priority = max(self.max_priority, priority)

In [9]:
import gymnasium as gym
import highway_env
import numpy as np
import random
import torch
import torch.optim as optim
import os
import datetime
from tqdm import tqdm
import json
import time

import sys
sys.path.append(os.path.abspath('..'))
from metrics import Metrics

class DQNAgent:
    def __init__(self, params):
        self.q_net = {}
        self.q_target_net = {} 
        self.optimizer= {}        
        
        self.policy = params.get("policy", "CnnPolicy")        
        self.episode_num = params.get("episode_num", 10)

        self.epsilon = params.get("epsilon_max ", 1)
        self.epsilon_min = params.get("epsilon_min ", 0.1)
        self.epsilon_decay = params.get("epsilon_decay", 0.995)
        
        self.learning_rate = params.get("learning_rate", 5e-4)
        self.discount = params.get("discount", 0.2)
        self.batch_size = params.get("batch_size", 32)
        self.device = params.get("device", torch.device("cpu"))
        
        self.memory_capacity = params.get("memory_capacity", 1000)
        self.memory = {} # this is the memory buffer -> setting a limit
        self.n_steps = params.get("n_steps", 2)
        self.prioritize_memory = params.get("prioritize_memory", False)
        
        self.double = params.get("double", False)
        self.noisy_net = params.get("noisy_net", False)
        self.prioritized_memory = params.get("prioritized_memory", False)
        
        self.timeout = params.get("timeout_minute", 0) * 60 # in minutes
        self.time = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        self.to_save_model = params.get("save_model", False)
        
        use_metrics = params.get("use_metrics", False)
        save_params = params.get("save_params", False)
        
        if save_params and use_metrics:
            self.save_params(params)
            
        self.metrics = Metrics(self.policy, "training_results", use_metrics)
        
    def initialize_weights(self, m):
        if isinstance(m, nn.Conv2d):
            # Xavier initialization for Conv2d weights
            init.xavier_uniform_(m.weight)
            init.zeros_(m.bias)
        elif isinstance(m, nn.Linear):
            # Xavier initialization for Linear weights
            init.xavier_uniform_(m.weight)
            init.zeros_(m.bias)

    def create_network(self, env):
        if self.policy == "CnnPolicy":
            self.create_CNN(env)
        
        if self.policy == "MlpPolicy":
            self.create_MLP_Network(env)
        
        self.q_net.apply(self.initialize_weights)    
        self.update_target_network()
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=self.learning_rate)
    
    def create_CNN(self, env):
        self.num_states = env.observation_space.shape
        self.num_actions = env.action_space.n

        self.q_net = CNN(self.num_states, self.num_actions, self.noisy_net).to(self.device)
        self.q_target_net = CNN(self.num_states, self.num_actions, self.noisy_net).to(self.device)
        
    
    def create_MLP_Network(self, env):
        # the lanes
        self.vehicle_count = env.observation_space.shape[0]
        self.features = env.observation_space.shape[1]
        self.num_actions = env.action_space.n

        self.q_net = MLPNetwork(self.features, self.num_actions, self.vehicle_count,  self.noisy_net).to(self.device)
        self.q_target_net = MLPNetwork(self.features, self.num_actions, self.vehicle_count, self.noisy_net).to(self.device)
    
    def update_target_network(self):
        self.q_target_net.load_state_dict(self.q_net.state_dict())
    
    def learn(self, env):
        self.create_network(env)
        
        self.memory = ReplayMemory({
            "capacity": self.memory_capacity,
            "device": self.device,
            "n_steps": self.n_steps,
        }) if not self.prioritize_memory else PrioritizedReplayMemory({
            "capacity": self.memory_capacity,
            "device": self.device,
            "alpha": 0.6,
            "beta": 0.2,
        })

        self.prefill_memory(env, self.batch_size)

        start_time = time.time()        
        
        for epoch in tqdm(range(self.episode_num), desc="Training Model"):
            state = env.reset()[0]
            
            # True when agent reaches the end states (colliding or passing the time)
            done = False 
            
            truncated = False
            episode_rewards = []
            episode_loss = []
            episode_len = 0
            while(not done and not truncated):
                # choose best action
                action = self.get_action(state)
                next_state, reward, done, truncated, _ = env.step(action)
                self.memory.store(state, action, next_state, reward, done)
                
                episode_loss.append(self.experience_replay())
                
                state = next_state
                
                episode_rewards.append(reward)
                episode_len += 1
                
            self.metrics.add("rollout/rewards", sum(episode_rewards) / len(episode_rewards), epoch)
            self.metrics.add("rollout/exploration-rate", self.epsilon, epoch)
            self.metrics.add("rollout/episode-length", episode_len, epoch)
            self.metrics.add("train/loss", sum(episode_loss) / len(episode_loss), epoch)
            
            if self.timeout:
                elapsed_time = time.time() - start_time
                if elapsed_time > self.timeout:
                    print("Timeout reached. Stopping training.\n")
                    break
            
            # if epoch % 10 == 0:
            self.decay_epsilon()
            self.update_target_network()
            
        self.metrics.close()
        
        if self.to_save_model:
            self.save_model()
        
    # either the policies are able to get miltuple actions and into the NN or the input of NN should be able to handle all of these
    # output (one of): {0: 'LANE_LEFT', 1: 'IDLE', 2: 'LANE_RIGHT', 3: 'FASTER', 4: 'SLOWER'}
    def get_action(self, state, eval_mode=False):
        # amount of exploration reduces with the epsilon value
        if random.random() <= self.epsilon and not eval_mode and self.epsilon_decay != 0: 
            return random.randrange(self.num_actions)
        
        state = torch.tensor(np.array([state]), dtype=torch.float32).to(self.device)
        actions = self.q_net(state)
        return torch.argmax(actions).item()             


    
    def experience_replay(self):
        # I need the weights here for the loss
        states, actions, next_states, rewards, dones = self.memory.sample(self.batch_size)
        if self.prioritize_memory:
            memory_weights = self.memory.get_sample_weights()
            memory_weights = torch.tensor(memory_weights).float().to(self.device)
        
        q_pred = self.q_net(states)
        # q value of the action taken
        q_pred = q_pred.gather(1, actions.view(-1, 1)) 
        q_pred = q_pred.squeeze(1)

        # Double DQN
        if self.double:
             # pick best actions from policy network
            q_best_action = self.q_net(next_states)
            _, q_best_action = q_best_action.max(dim=1)
            q_best_action = q_best_action.unsqueeze(1)
            
            # use those actions for the target policy
            q_target = self.q_target_net(next_states)
            q_target = q_target.gather(1, q_best_action)
            q_target = q_target.squeeze(1)
        else:
            q_target = self.q_target_net(next_states)
            q_target = q_target.max(dim=1).values
        
        # setting Q(s',a') to 0 when the current state is a terminal state
        q_target[dones] = 0.0
        
        y_j = rewards + (self.discount * q_target)
        
        
        # calculate the loss as the mean-squared error of yj and qpred
        element_loss = F.mse_loss(y_j, q_pred, reduction="none")
        if self.prioritize_memory:
            element_loss = element_loss * memory_weights
        loss = element_loss.mean()
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        if self.noisy_net:
            self.q_net.reset_noise()
            self.q_target_net.reset_noise()
        
        if self.prioritize_memory:
            loss_for_prior = np.array(element_loss.detach().cpu())
            # so that each transition is sampled
            new_priorities = loss_for_prior + 1e-5
            self.memory.update_priorities(new_priorities)
        
        return loss.item()
        
    def prefill_memory(self, env, prefill_num):
        for _ in tqdm(range(prefill_num), desc="Prefilling Memory "):
            done = False
            truncated = False
            state = env.reset()[0]

            while not done and not truncated:
                action = env.action_space.sample()
                next_state, reward, done, truncated, info = env.step(action)
                self.memory.store(state, action, next_state, reward, done)    
                
    def decay_epsilon(self):
        if self.epsilon_decay != 0:
            self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)
    
    def evaluate(self, env, episode_num):
        # add camera here
        for _ in tqdm(range(episode_num), desc="Evaluating Model"):
            state = env.reset()[0]  
            done = False      
            truncated = False 

            # Agent navigates map until it falls into a hole (terminated), reaches goal (terminated), or has taken 200 actions (truncated).
            while(not done and not truncated):  
                # Select best action   
                action = self.get_action(state, eval_mode=True)
                next_state, reward, done, truncated, info = env.step(action)
                state = next_state
                env.render()
        
    def save_model(self):
        folder_name = self.policy + "_save_models"
        self.metrics.create_folder(folder_name)
        new_model_num = str(len(os.listdir("./" +folder_name)) + 1)
        file_name = f'{folder_name}/DQN_{new_model_num}_{self.time}.pth'
        state = {'state_dict': self.q_net.state_dict(),
            'optimizer': self.optimizer.state_dict()}
        torch.save(state, file_name)
        
    def load_model(self, env, file_name):
        folder_name = self.policy + "_save_models"
                
        filename = folder_name + "/" + file_name + ".pth"
        self.create_network(env)
        
        models = torch.load(filename, map_location=self.device)
        
        self.q_net.load_state_dict(models['state_dict'])
        self.optimizer.load_state_dict(models['optimizer'])
    
    def save_params(self, params):
        folder_name = "hyperparameters"
        self.metrics.create_folder(folder_name)
        
        file_name = f'./{folder_name}/{self.policy}_DQN_{self.time}'
        with open(file_name + '.txt', 'w') as file:
            file.write(json.dumps(str(params)))

In [6]:
config = {}
policy = "CnnPolicy"
# policy = "MlpPolicy"

if policy == "CnnPolicy":
    config={
        "lanes_count" : 3,
        "observation": {
            "type": "GrayscaleObservation",
            "observation_shape": (128, 64),
            "stack_size": 4,
            "weights": [0.2989, 0.5870, 0.1140],  # weights for RGB conversion keep this conversion this is in the highway env page
            "scaling": 1.75,
        },
    }
else:
    config = {
        "lanes_count" : 3,
        "observation": {
            "type": "Kinematics",
            "vehicles_count": 5,
            "features": ["presence", "x", "y", "vx", "vy", "cos_h", "sin_h"],
            "features_range": {
                "x": [-100, 100],
                "y": [-100, 100],
                "vx": [-20, 20],
                "vy": [-20, 20]
            },
            "absolute": False,
            "order": "sorted"
        }
    }

In [None]:
import torch
params = {
    'policy' : policy,
    'episode_num' : 10,
    'discount' : 0.7,
    'batch_size' : 3,
    'learning_rate': 4e-5,
    'n_steps': 3,
    'double': True,
    # 'epsilon_decay': 0, # To deactivate epsilon decay 
    'noisy_net': False,
    'prioritize_memory': False,
    'device' : torch.device("mps"),
    'memory_capacity' : 10000,
    'timeout_minute': 15,
    'use_metrics' : False,
    'save_model': False,
    'save_params': True, # metrics also need to be on
}

seed = 72 # Our group number
for i in range(1):
    seed += i
    np.random.seed(seed)
    torch.manual_seed(seed)
    
    # so that the GPU seed is also random
    if torch.backends.cudnn.enabled:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # not optimize for when the network doesn't get the same input size 
        # (because of batch sizes and get best action input different sizes)
        torch.backends.cudnn.benchmark = False 
        torch.backends.cudnn.deterministic = True # slows down training
    
    dqn_agent = DQNAgent(params)
    env = gym.make('highway-fast-v0', render_mode='rgb_array', config=config)
    dqn_agent.learn(env)
    
    env = gym.make('highway-v0', render_mode='rgb_array', config=config)
    dqn_agent.evaluate(env, 5)
    

# # if you wanna save a model again
# # dqn_agent.save_model("highway_dqn_model")

Prefilling Memory : 100%|██████████| 3/3 [00:00<00:00,  5.20it/s]
Training Model: 100%|██████████| 10/10 [00:04<00:00,  2.05it/s]
Evaluating Model:   0%|          | 0/5 [00:00<?, ?it/s]

In [218]:
env = gym.make('highway-v0', render_mode='rgb_array', config=config)

dqn_agent_test = DQNAgent(params)
dqn_agent_test.load_model(env, "DQN_1_20241230164948")

dqn_agent_test.evaluate(env, 20)

  models = torch.load(filename, map_location=self.device)


FileNotFoundError: [Errno 2] No such file or directory: 'CnnPolicy_save_models/DQN_1_20241230164948.pth'

### Run the Tensorboard

In [13]:
%reload_ext tensorboard

%tensorboard --logdir training_results --host localhost --port 6010