# DQN Implmentation

### Installations

In [12]:
# Install environment and agent
!pip install highway-env
!pip install --upgrade sympy torch

2024-12-17 16:24:05.748 python[2021:12877892] +[IMKClient subclass]: chose IMKClient_Legacy
2024-12-17 16:24:05.748 python[2021:12877892] +[IMKInputSession subclass]: chose IMKInputSession_Legacy



### Learning using existing model

The following is the pesudocode that will be followed when creating the DQN

Useful: https://www.youtube.com/watch?v=RVMpm86equc&list=PL58zEckBH8fCMIVzQCRSZVPUp3ZAVagWi&index=2

https://github.com/saashanair/rl-series/tree/master/dqn

https://github.com/johnnycode8/gym_solutions/blob/main/frozen_lake_dql.py

<img src="DQN.png" style="width: 900px;" align="left"/>




In [112]:
import gymnasium as gym
import highway_env
import numpy as np
import random
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist

# Define model
class MLPNetwork(nn.Module):
    def __init__(self, in_states, h1_nodes, out_actions):
        super(MLPNetwork, self).__init__()

        # Define network layers
        self.fc1 = nn.Linear(in_states, h1_nodes)   # first fully connected layer
        self.out = nn.Linear(h1_nodes, out_actions) # output layer
        self.out2 = nn.Linear(out_actions, 1) # output layer
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = F.relu(self.fc1(x)) # Apply rectified linear unit (ReLU) activation
        x = F.relu(self.out(x))         
        x = self.out2(x)
        x = self.softmax(x)
        return x

class CNN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(CNN, self).__init__()
        # greyscale Image is(stack,height,width)
        stack, height, width = input_shape
        self.conv = nn.Sequential(
            # first Conv layer
            nn.Conv2d(stack,32,kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2,stride=2),

            # second Conv layer
            nn.Conv2d(32,64,kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2,stride=2),
        )
        
        # This is for finding the size to dense more robust compared to decision manually
        with torch.no_grad():
                # Torch uses(1,channels,height,width)
                test = torch.zeros(1, stack, height, width)
                find_conv_size = self.conv(test)
                conv_size = find_conv_size.numel()
        self.fc = nn.Linear(conv_size,num_actions)
        self.softmax = nn.Softmax(dim=0)

    def forward(self,x):
        x = self.conv(x)
        x = torch.flatten(x, start_dim=1)
        x = self.fc(x)
        x = x.unsqueeze(2)
        x = self.softmax(x)
        return x
    
# Define memory for Experience Replay
# TODO: Prioritize, n-steps
class ReplayMemory():
    def __init__(self, config):
        self.capacity = config.get("capacity", 1000)
        self.buffer_state = []
        self.buffer_action = []
        self.buffer_next_state = []
        self.buffer_reward = []
        self.buffer_done = []
        self.index = 0
        self.memory = []
        self.device = config.get("device", torch.device("cpu"))
        self.n_steps = config.get("n_steps", 0)
    
    def store(self, state, action, next_state, reward, done):
        if len(self.buffer_state) < self.capacity:
            self.buffer_state.append(state)
            self.buffer_action.append(action)
            self.buffer_next_state.append(next_state)
            self.buffer_reward.append(reward)
            self.buffer_done.append(done)
        else:
            self.buffer_state[self.index] = state
            self.buffer_action[self.index] = action
            self.buffer_next_state[self.index] = next_state
            self.buffer_reward[self.index] = reward
            self.buffer_done[self.index] = done

        self.index = (self.index+1)%self.capacity # for circular memory

    def sample(self, batch_size):
        if batch_size >  len(self.buffer_state):
            batch_size = len(self.buffer_state)
        
        # stop this and add n-steps
        
        indices_to_sample = random.sample(range(len(self.buffer_state)), batch_size)
        # if self.n_steps != 0:
            # indices_to_sample = 
        
        
        
        states = torch.from_numpy(np.array(self.buffer_state)[indices_to_sample]).float().to(self.device)
        actions = torch.from_numpy(np.array(self.buffer_action)[indices_to_sample]).to(self.device)
        next_states = torch.from_numpy(np.array(self.buffer_next_state)[indices_to_sample]).float().to(self.device)
        rewards = torch.from_numpy(np.array(self.buffer_reward)[indices_to_sample]).float().to(self.device)
        done = torch.from_numpy(np.array(self.buffer_done)[indices_to_sample]).to(self.device)

        return states, actions, next_states, rewards, done
    def collapse_n_steps(self):
        print("dwadaw")

In [44]:
from torch.utils.tensorboard import SummaryWriter
import os

class Metrics:
    def __init__(self, policy, result_file_name, use_metrics, time):
        self.use_metrics = use_metrics
        if not self.use_metrics:
            return
        new_num = str(len(os.listdir("./" +result_file_name)) + 1)
        file_name = f'{result_file_name}/{policy}_DQN_{new_num}_{time}'
        self.writer = SummaryWriter(log_dir=file_name, flush_secs=60)
            
    def add(self, type, y, x):
        if not self.use_metrics:
            return
        self.writer.add_scalar(type, y, x)
    def close(self):
        if not self.use_metrics:
            return
        self.writer.close()

In [110]:
import gymnasium as gym
import highway_env
import numpy as np
import random
import torch
import torch.optim as optim
import os
import datetime
from tqdm import tqdm
import json

class DQNAgent:
    def __init__(self, params):
        self.q_net = {}
        self.q_target_net = {} 
        self.optimizer= {}        
        
        self.policy = params.get("policy", "CnnPolicy")        
        self.episode_num = params.get("episode_num", 10)

        self.epsilon = params.get("epsilon_max ", 1)
        self.epsilon_min = params.get("epsilon_min ", 0.1)
        self.epsilon_decay = params.get("epsilon_decay", 0.995)
        
        self.learning_rate = params.get("learning_rate", 3e-4)
        self.discount = params.get("discount", 0.2)
        self.batch_size = params.get("batch_size", 32 )
        self.device = params.get("device", torch.device("cpu"))
        
        self.memory_capacity = params.get("memory_capacity", 1000)
        self.memory = {} # this is the memory buffer -> setting a limit
        ct = datetime.datetime.now()

        self.time = str(ct).replace(" ", "|")
        self.to_save_model = params.get("save_model", False)
        
        use_metrics = params.get("use_metrics", False)
        if use_metrics:
            self.create_folder("training_results")
            self.save_params(params)
            
        self.metrics = Metrics(self.policy, "training_results", use_metrics, self.time)
        
    def create_network(self, env):
        if self.policy == "CnnPolicy":
            self.create_CNN(env)
        
        if self.policy == "MlpPolicy":
            self.create_MLP_Network(env)
            
        self.update_target_network()
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=1e-3)
    
    def create_CNN(self, env):
        self.num_states = env.observation_space.shape
        self.num_actions = env.action_space.n

        self.q_net = CNN(self.num_states, self.num_actions).to(self.device)
        self.q_target_net = CNN(self.num_states, self.num_actions).to(self.device)
        
    
    def create_MLP_Network(self, env):
        # the lanes
        self.num_states = env.observation_space.shape[1]
        self.num_actions = env.action_space.n

        self.q_net = MLPNetwork(self.num_states, self.num_states, self.num_actions).to(self.device)
        self.q_target_net = MLPNetwork(self.num_states, self.num_states, self.num_actions).to(self.device)
    
    def update_target_network(self):
        # make the weights and biases the same
        self.q_target_net.load_state_dict(self.q_net.state_dict())
    
    def learn(self, env):
        self.create_network(env)
        
        self.memory = ReplayMemory({
            "capacity": self.memory_capacity,
            "device": self.device,
            "n-steps": 0,
        })

        self.prefill_memory(env, self.batch_size)

        print("training...")
        
        for epoch in tqdm(range(self.episode_num), desc="Training Model"):
            state = env.reset()[0]
            
            # True when agent reaches the end states (colliding or passing the time)
            done = False 
            
            # TODO: see how many actions until truncate
            # True when agent takes more than some actions 
            truncated = False
            episode_rewards = []
            episode_loss = []
            episode_len = 0
            while(not done and not truncated):
                # choose best action
                action = self.get_action(state)
                next_state, reward, done, truncated, info = env.step(action)
                self.memory.store(state=state, action=action, next_state=next_state, reward=reward, done=done)
                
                episode_loss.append(self.experience_replay())
                
                self.update_target_network()
                
                state = next_state
                
                episode_rewards.append(reward)
                episode_len += 1
                
            self.metrics.add("rollout/rewards", sum(episode_rewards) / len(episode_rewards), epoch)
            self.metrics.add("rollout/exploration-rate", self.epsilon, epoch)
            self.metrics.add("rollout/episode-length", episode_len, epoch)
            self.metrics.add("train/loss", sum(episode_loss) / len(episode_loss), epoch)
            
            self.decay_epsilon()
            
            
        self.metrics.close()
        
        if self.to_save_model:
            self.save_model()
        
    # either the policies are able to get miltuple actions and into the NN or the input of NN should be able to handle all of these
    # output (one of): {0: 'LANE_LEFT', 1: 'IDLE', 2: 'LANE_RIGHT', 3: 'FASTER', 4: 'SLOWER'}
    def get_action(self, state):
        if random.random() <= self.epsilon: # amount of exploration reduces with the epsilon value
            return random.randrange(self.num_actions)
        
        state = torch.tensor(np.array([state]), dtype=torch.float32).to(self.device)
        actions = self.q_net(state)
        return torch.argmax(actions).item()             

    def experience_replay(self):
        states, actions, next_states, rewards, dones = self.memory.sample(self.batch_size)
        
        q_pred = self.q_net(states)
        # q value of the action taken
        q_pred = q_pred.gather(1, actions.view(-1, 1, 1)) 
        q_pred = q_pred.squeeze(1)
        
        q_target = self.q_target_net(next_states)
        q_target = q_target.gather(1, actions.view(-1, 1, 1))
        q_target = q_target.squeeze(1)
        
        # setting Q(s',a') to 0 when the current state is a terminal state
        q_target[dones] = 0.0
        
        y_j = rewards + (self.discount * q_target)
        y_j = y_j.mean(dim=0, keepdim=True)
        y_j = y_j.view(-1, 1)
        
        # calculate the loss as the mean-squared error of yj and qpred
        self.optimizer.zero_grad()
        loss = F.mse_loss(y_j, q_pred).mean()
        loss.backward()
        self.optimizer.step()
        
        return loss.item()
        
    def prefill_memory(self, env, prefill_num):
        print("prefilling memory...")
        for _ in range(prefill_num):
            done = False
            truncated = False
            state = env.reset()[0]

            while not done and not truncated:
                action = env.action_space.sample()
                next_state, reward, done, truncated, info = env.step(action)
                self.memory.store(state=state, 
                                    action=action, 
                                    next_state=next_state, 
                                    reward=reward, 
                                    done=done)
    
    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)

    def save_model(self):
        folder_name = self.policy + "_save_models"
        self.create_folder(folder_name)
        new_model_num = str(len(os.listdir("./" +folder_name)) + 1)
        file_name = f'{folder_name}/DQN_{new_model_num}_{self.time}.pth'
        state = {'state_dict': self.q_net.state_dict(),
            'optimizer': self.optimizer.state_dict()}
        torch.save(state, file_name)
        
    def load_model(self, env, file_name):
        folder_name = self.policy + "_save_models"
                
        filename = folder_name + "/" + file_name + ".pth"
        self.create_network(env)
        
        models = torch.load(filename, map_location=self.device)
        
        self.q_net.load_state_dict(models['state_dict'])
        self.q_net.load_state_dict(models['state_dict'])
        self.optimizer.load_state_dict(models['optimizer'])
    
    def save_params(self, params):
        folder_name = "hyperparameters"
        self.create_folder(folder_name)
        
        file_name = f'./{folder_name}/{self.policy}_DQN_{self.time}'
        with open(file_name + '.txt', 'w') as file:
            file.write(json.dumps(str(params)))

    def create_folder(self, directory_name):
        try:
            os.mkdir(directory_name)
            print(f"Directory '{directory_name}' created successfully.")
        except FileExistsError:
            return
        except PermissionError:
            print(f"Permission denied: Unable to create '{directory_name}'.")
        except Exception as e:
            print(f"An error occurred: {e}")


In [86]:
config = {}
policy = "CnnPolicy"
# policy = "MlpPolicy"

if policy == "CnnPolicy":
    config={
        "observation": {
            "type": "GrayscaleObservation",
            "observation_shape": (128, 64),
            "stack_size": 4,
            "weights": [0.2989, 0.5870, 0.1140],  # weights for RGB conversion keep this conversion this is in the highway env page
            "scaling": 1.75,
        },
    }
else:
    config = {
        "observation": {
            "type": "Kinematics",
            "vehicles_count": 5,
            "features": ["presence", "x", "y", "vx", "vy", "cos_h", "sin_h"],
            "features_range": {
                "x": [-100, 100],
                "y": [-100, 100],
                "vx": [-20, 20],
                "vy": [-20, 20]
            },
            "absolute": False,
            "order": "sorted"
        }
    }

In [115]:
params = {
    'policy' : policy,
    'episode_num' : 1,
    'discount' : 0.2,
    'batch_size' : 16,
    'device' : torch.device("mps"),
    'memory_capacity' : 5000,
    'use_metrics' : False,
    'save_model': False,
}

dqn_agent = DQNAgent(params)
env = gym.make('highway-fast-v0', render_mode='rgb_array', config=config)

dqn_agent.learn(env)

prefilling memory...
training...


Training Model: 100%|██████████| 1/1 [00:00<00:00,  1.21it/s]


In [None]:
%reload_ext tensorboard

%tensorboard --logdir training_results --host localhost --port 6005

In [39]:
dqn_agent.save_model("highway_dqn_model2")

### Test the Model

In [57]:
import gymnasium as gym
import torch
env = gym.make('highway-v0', render_mode='rgb_array', config=config)
dqn_agent_test = DQNAgent(policy=policy, episode_num=0,epsilon=0.0, discount=1e-2, batch_size=10, device=torch.device("mps"), memory_capacity=5000)
dqn_agent_test.load_model(env, "DQN_2_2024-12-20|14:29:48.867232")

for i in range(10):
    state = env.reset()[0]  
    done = False      
    truncated = False 

    # Agent navigates map until it falls into a hole (terminated), reaches goal (terminated), or has taken 200 actions (truncated).
    while(not done and not truncated):  
        # Select best action   
        action = dqn_agent_test.get_action(state)
        next_state, reward, done, truncated, info = env.step(action)
        env.render()

  models = torch.load(filename, map_location=self.device)


FileNotFoundError: [Errno 2] No such file or directory: 'CnnPolicy_save_models/DQN_2_2024-12-20|14:29:48.867232.pth'

In [8]:
import gymnasium as gym
import torch
from torch.utils.tensorboard import SummaryWriter
import os

# Initialize the TensorBoard writer
result_file_name = "run_results"
new_test = len(os.listdir("./" +result_file_name)) + 1
writer = SummaryWriter(log_dir=result_file_name + '/DQN_' + str(new_test), flush_secs=60)

env = gym.make('highway-v0', render_mode='rgb_array', config=config)
dqn_agent_test = DQNAgent(policy=policy, episode_num=100 ,epsilon=0.0, discount=1e-2, batch_size=10, device=torch.device("mps"), memory_capacity=5000)
dqn_agent_test.load_model(env, "highway_dqn_model")

num_episodes = 10
for episode in range(num_episodes):
    state = env.reset()[0]  
    done = False      
    truncated = False 
    episode_reward = 0.0  # Track cumulative reward for this episode

    while not done and not truncated:  
        # Select best action   
        action = dqn_agent_test.get_action(state)
        next_state, reward, done, truncated, info = env.step(action)
        episode_reward += reward
        
        state = next_state

    writer.add_scalar('Reward', episode_reward, episode)

writer.close()

  models = torch.load(filename, map_location=self.device)


In [None]:
%reload_ext tensorboard

%tensorboard --logdir run_results --host localhost --port 6006

In [7]:
%reload_ext tensorboard

%tensorboard --logdir highway_dqn/