# DQN Implmentation

In [2]:
# https://huggingface.co/blog/deep-rl-dqn

### Running it without Training

In [12]:
import gymnasium as gym
import highway_env
from matplotlib import pyplot as plt
# %matplotlib inline

# env = gym.make('highway-v0', render_mode='rgb_array')
# env.reset()
# for _ in range(5):
# # while True:
#     action = env.unwrapped.action_type.actions_indexes["IDLE"]
#     obs, reward, done, truncated, info = env.step(action)
#     env.render()

# plt.imshow(env.render())
# plt.show()

2024-12-17 16:24:05.748 python[2021:12877892] +[IMKClient subclass]: chose IMKClient_Legacy
2024-12-17 16:24:05.748 python[2021:12877892] +[IMKInputSession subclass]: chose IMKInputSession_Legacy


## Running Their DQN

In [246]:
import gymnasium as gym
import highway_env
from stable_baselines3 import DQN


# Visualization utils
%load_ext tensorboard
import sys
from tqdm.notebook import trange
# !pip install tensorboardx gym pyvirtualdisplay
# doesn't work cause not linux
# !apt-get install -y xvfb ffmpeg
# !git clone https://github.com/Farama-Foundation/HighwayEnv.git 2> /dev/null
# sys.path.insert(0, '/content/HighwayEnv/scripts/')
# from utils import record_videos, show_videos

In [2]:
import torch


### Learning using existing model

In [304]:
# for MLP
config = {
    "observation": {
        "type": "Kinematics",
        "vehicles_count": 15,
        "features": ["presence", "x", "y", "vx", "vy", "cos_h", "sin_h"],
        "features_range": {
            "x": [-100, 100],
            "y": [-100, 100],
            "vx": [-20, 20],
            "vy": [-20, 20]
        },
        "absolute": False,
        "order": "sorted"
    }
}

# # For CNN
# config = {
#     "observation": {
#         "type": "GrayscaleObservation",
#         "observation_shape": (128, 64),
#         "stack_size": 4,
#         "weights": [0.2989, 0.5870, 0.1140],  # weights for RGB conversion
#         "scaling": 1.75,
#     },
#     "policy_frequency": 2
# }

env = gym.make("highway-fast-v0", render_mode='rgb_array', config=config)
print(env.unwrapped.config)
mps_device = torch.device("mps")

model = DQN('MlpPolicy', env,
              policy_kwargs=dict(net_arch=[256, 256]),
              learning_rate=5e-4,
              buffer_size=15000,
              learning_starts=200,
              batch_size=32,
              gamma=0.8,
              device=mps_device,
              train_freq=1,
              gradient_steps=1,
              target_update_interval=50,
              verbose=1,
              tensorboard_log="highway_dqn/")

model.learn(int(1000))

{'observation': {'type': 'Kinematics', 'vehicles_count': 15, 'features': ['presence', 'x', 'y', 'vx', 'vy', 'cos_h', 'sin_h'], 'features_range': {'x': [-100, 100], 'y': [-100, 100], 'vx': [-20, 20], 'vy': [-20, 20]}, 'absolute': False, 'order': 'sorted'}, 'action': {'type': 'DiscreteMetaAction'}, 'simulation_frequency': 5, 'policy_frequency': 1, 'other_vehicles_type': 'highway_env.vehicle.behavior.IDMVehicle', 'screen_width': 600, 'screen_height': 150, 'centering_position': [0.3, 0.5], 'scaling': 5.5, 'show_trajectories': False, 'render_agent': True, 'offscreen_rendering': False, 'manual_control': False, 'real_time_rendering': False, 'lanes_count': 3, 'vehicles_count': 20, 'controlled_vehicles': 1, 'initial_lane_id': None, 'duration': 30, 'ego_spacing': 1.5, 'vehicles_density': 1, 'collision_reward': -1, 'right_lane_reward': 0.1, 'high_speed_reward': 0.4, 'lane_change_reward': 0, 'reward_speed_range': [20, 30], 'normalize_reward': True, 'offroad_terminal': False}
Using mps device
Wrapp

<stable_baselines3.dqn.dqn.DQN at 0x334cc4550>

In [305]:
model.save("highway_dqn/model")

In [307]:
# Load and test saved model
model = DQN.load("highway_dqn/model")

# while True:
for i in range(100):
  done = truncated = False
  obs, info = env.reset()
  while not (done or truncated):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, truncated, info = env.step(action)
    env.render()

KeyboardInterrupt: 


## Own Implementations

### DQN

Chose to first make naive work and then make the normal work


The following is the pesudocode that will be followed when creating the DQN

Useful: https://www.youtube.com/watch?v=RVMpm86equc&list=PL58zEckBH8fCMIVzQCRSZVPUp3ZAVagWi&index=2

https://github.com/saashanair/rl-series/tree/master/dqn

https://github.com/johnnycode8/gym_solutions/blob/main/frozen_lake_dql.py

<img src="DQN.png" style="width: 900px;" align="left"/>




In [308]:
import gymnasium as gym
import highway_env
import numpy as np
import random
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist

# Define model
class MLQNetwork(nn.Module):
    def __init__(self, in_states, h1_nodes, out_actions):
        super(MLQNetwork, self).__init__()

        # Define network layers
        self.fc1 = nn.Linear(in_states, h1_nodes)   # first fully connected layer
        self.out = nn.Linear(h1_nodes, out_actions) # output layer w
        self.flatten = nn.Flatten()
        self.softmax = nn.Softmax(dim=1)
        
        self.optimizer = optim.Adam(self.parameters(), lr=1e-3)

    def forward(self, x):
        x = F.relu(self.fc1(x)) # Apply rectified linear unit (ReLU) activation
        x = self.out(x)         
        # x = self.flatten(x)         
        x = self.softmax(x)
        return x
    
    def save_model(self, filename):
        filename = filename + ".pth"
        torch.save(self.state_dict(), filename)
    
    def load_model(self, filename, device):
        filename = filename + ".pth"
        self.load_state_dict(torch.load(filename, map_location=device))
        
# class CNNNetwork(nn.Module):
#     def __init__(self, in_states, h1_nodes, out_actions):
#         super(MLQNetwork, self).__init__()

#         # Define network layers
#         self.fc1 = nn.Linear(in_states, h1_nodes)   # first fully connected layer
#         self.out = nn.Linear(h1_nodes, out_actions) # output layer w
#         self.flatten = nn.Flatten()
#         self.softmax = nn.Softmax(dim=1)
        
#         self.optimizer = optim.Adam(self.parameters(), lr=1e-3)

#     def forward(self, x):
#         x = F.relu(self.fc1(x)) # Apply rectified linear unit (ReLU) activation
#         x = self.out(x)         
#         # x = self.flatten(x)         
#         x = self.softmax(x)
#         return x
    
#     def save_model(self, filename):
#         filename = filename + ".pth"
#         torch.save(self.state_dict(), filename)
    
#     def load_model(self, filename, device):
#         filename = filename + ".pth"
#         self.load_state_dict(torch.load(filename), map_location=device)
        

    
# Define memory for Experience Replay
class ReplayMemory():
    def __init__(self, capacity, device):
        self.capacity = capacity
        self.buffer_state = []
        self.buffer_action = []
        self.buffer_next_state = []
        self.buffer_reward = []
        self.buffer_done = []
        self.idx = 0
        self.memory = []
        self.device = device
    
    def store(self, state, action, next_state, reward, done):
        if len(self.buffer_state) < self.capacity:
            self.buffer_state.append(state)
            self.buffer_action.append(action)
            self.buffer_next_state.append(next_state)
            self.buffer_reward.append(reward)
            self.buffer_done.append(done)
        else:
            self.buffer_state[self.idx] = state
            self.buffer_action[self.idx] = action
            self.buffer_next_state[self.idx] = next_state
            self.buffer_reward[self.idx] = reward
            self.buffer_done[self.idx] = done

        self.idx = (self.idx+1)%self.capacity # for circular memory


    def sample(self, batch_size):
        if batch_size >  len(self.buffer_state):
            batch_size = len(self.buffer_state)
        
        indices_to_sample = random.sample(range(len(self.buffer_state)), batch_size)
        
        states = torch.from_numpy(np.array(self.buffer_state)[indices_to_sample]).float().to(self.device)
        actions = torch.from_numpy(np.array(self.buffer_action)[indices_to_sample]).to(self.device)
        next_states = torch.from_numpy(np.array(self.buffer_next_state)[indices_to_sample]).float().to(self.device)
        rewards = torch.from_numpy(np.array(self.buffer_reward)[indices_to_sample]).float().to(self.device)
        done = torch.from_numpy(np.array(self.buffer_done)[indices_to_sample]).to(self.device)

        return states, actions, next_states, rewards, done

    def __len__(self):
        return len(self.memory)

class DQNAgent:
    def __init__(self, memory_capacity=5000):
        # weights
        self.w1 = []
        
        self.q_net = {}
        self.w2 = []
        self.q_target_net = self.q_net # the same
        self.policy = {}

        self.epsilon = 0.2
        self.discount = 0.01
        self.batch_size = 2
        
        # the weights of q1 and q2 should be the same
        self.memory_capacity = memory_capacity
        self.memory = {} # this is the memory buffer -> setting a limit
        self.device = torch.device("mps")
    
    def initialize_neural_networks(self, env):
        # the lanes
        self.num_states = env.observation_space.shape[0]
        # print(env.action_space.sample())
        self.num_actions = env.action_space.n

        self.q_net = MLQNetwork(self.num_states, self.num_states, self.num_actions).to(self.device)
        self.q_target_net = MLQNetwork(self.num_states, self.num_states, self.num_actions).to(self.device)
        self.update_target_network()
    
    def update_target_network(self):
        # make the weights and biases the same
        self.q_target_net.load_state_dict(self.q_net.state_dict())
    
    def learn(self, env, policy):
        print(policy)
        self.initialize_neural_networks(env)
        
        self.memory = ReplayMemory(capacity=self.memory_capacity, device=self.device)

        env.reset()
        num_episodes = 2
        prefill_num = 2

        self.prefill_memory(env, prefill_num)

        print("training...")
        
        for _ in range(num_episodes):
            state = env.reset()[0]
            
            # True when agent reaches the end states (colliding or passing the time)
            done = False 
            # True when agent takes more than 100 actions    
            truncated = False  
            while(not done and not truncated):
            # for i in range(2):
                # choose best action
                action = self.get_action(state)
                next_state, reward, done, truncated, info = env.step(action)
                self.memory.store(state=state, action=action, next_state=next_state, reward=reward, done=done)
                self.experience_replay()
                
                self.update_target_network()
                
                state = next_state
                env.render()
        
    # either the policies are able to get miltuple actions and into the NN or the input of NN should be able to handle all of these
    # output (one of): {0: 'LANE_LEFT', 1: 'IDLE', 2: 'LANE_RIGHT', 3: 'FASTER', 4: 'SLOWER'}
    def get_action(self, state):
        if random.random() <= self.epsilon: # amount of exploration reduces with the epsilon value
            return random.randrange(self.num_actions)

        if not torch.is_tensor(state):
            state = torch.tensor(np.array([state]), dtype=torch.float32).to(self.device)[0]

        with torch.no_grad():
        # pick the action with maximum Q-value in the Q-network 
            actions = self.q_net(state)
            actions = actions.mean(dim=1, keepdim=True)
        return torch.argmax(actions).item() 

    def experience_replay(self):
        states, actions, next_states, rewards, dones = self.memory.sample(self.batch_size)
        
        q_pred = self.q_net(states)
        q_pred = q_pred.mean(dim=2, keepdim=True)
        # q value of the action taken
        q_pred = q_pred.gather(1, actions.view(-1, 1, 1)) 
        q_pred = q_pred.squeeze(1)
        
        q_target = self.q_target_net(next_states)
        q_target = q_target.mean(dim=2, keepdim=True)
        q_target = q_target.gather(1, actions.view(-1, 1, 1))
        q_target = q_target.squeeze(1)
        
        # setting Q(s',a') to 0 when the current state is a terminal state
        q_target[dones] = 0.0
        
        y_j = rewards + (self.discount * q_target)
        y_j = y_j.mean(dim=0, keepdim=True)
        y_j = y_j.view(-1, 1)
        
        # calculate the loss as the mean-squared error of yj and qpred
        self.q_net.optimizer.zero_grad()
        loss = F.mse_loss(y_j, q_pred).mean()
        loss.backward()
        self.q_net.optimizer.step()
        
    def prefill_memory(self, env, prefill_num):
        print("prefilling memory...")
        for _ in range(prefill_num):
            done = False
            truncated = False
            state = env.reset()[0]

            while not done and not truncated:
            # for _ in range(2):
                action = env.action_space.sample()
                next_state, reward, done, truncated, info = env.step(action)
                dqn_agent.memory.store(state=state, 
                                    action=action, 
                                    next_state=next_state, 
                                    reward=reward, 
                                    done=done)
    def save_model(self, filename):
        self.q_net.save_model(filename)
        
    def load_model(self, env, filename):
        self.initialize_neural_networks(env)
        self.q_net.load_model(filename, self.device)
        # turn into evaluation model for test
        self.q_net.eval()

In [282]:
config = {}
# policy = "CnnPolicy"
policy = "MlpPolicy"

if policy == "CnnPolicy":
    config = {
        "observation": {
            "type": "GrayscaleObservation",
            "observation_shape": (128, 64),
            "stack_size": 4,
            "weights": [0.2989, 0.5870, 0.1140],  # weights for RGB conversion
            "scaling": 1.75,
        },
        "policy_frequency": 2
    }
else:
    config = {
        "observation": {
            "type": "Kinematics",
            "vehicles_count": 5,
            "features": ["presence", "x", "y", "vx", "vy", "cos_h", "sin_h"],
            "features_range": {
                "x": [-100, 100],
                "y": [-100, 100],
                "vx": [-20, 20],
                "vy": [-20, 20]
            },
            "absolute": False,
            "order": "sorted"
        }
    }
dqn_agent = DQNAgent()
env = gym.make('highway-fast-v0', render_mode='rgb_array')
dqn_agent.learn(env, policy=policy)

MlpPolicy
prefilling memory...
training...


In [None]:
dqn_agent.save_model("highway_dqn/test2")

In [295]:
print("CUDA Available:", torch.cuda.is_available())
print("MPS Available:", torch.backends.mps.is_available())
print("Device Count:", torch.cuda.device_count())

CUDA Available: False
MPS Available: True
Device Count: 0


### Test the Model

In [309]:
env = gym.make('highway-v0', render_mode='rgb_array')
dqn_agent_test = DQNAgent()
dqn_agent_test.load_model(env, "highway_dqn_model")

for i in range(100):
    state = env.reset()[0]  # Initialize to state 0
    done = False      # True when agent falls in hole or reached goal
    truncated = False       # True when agent takes more than 200 actions            

    # Agent navigates map until it falls into a hole (terminated), reaches goal (terminated), or has taken 200 actions (truncated).
    while(not done and not truncated):  
        # Select best action   
        action = dqn_agent_test.get_action(state)
        next_state, reward, done, truncated, info  = env.step(action)
        env.render()


  self.load_state_dict(torch.load(filename, map_location=device))


KeyboardInterrupt: 

## Compare with other model (also their DQN)

