In [13]:
# Imports
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
from copy import deepcopy
import gymnasium as gym


import time
import os
env = gym.make('highway-v0', render_mode='rgb_array')

from IPython.display import clear_output

import matplotlib.pyplot as plt

# Setting up the DQN Class and Agent

In [30]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, state, action, reward, terminated, next_state):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = (state, action, reward, terminated, next_state)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.choices(self.memory, k=batch_size)

    def __len__(self):
        return len(self.memory)

# create instance of replay buffer
#replay_buffer = ReplayBuffer(BUFFER_CAPACITY)
    
class Net(nn.Module):
    """
    Basic neural net.
    """
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x)
    
class DQN_Skeleton: 
    def __init__(self,
                action_space,
                observation_space,
                gamma,
                batch_size,
                buffer_capacity,
                update_target_every, 
                epsilon_start, 
                decrease_epsilon_factor, 
                epsilon_min,
                learning_rate,
                ): 
        self.action_space = action_space
        self.observation_space = observation_space
        self.gamma = gamma
        
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.update_target_every = update_target_every
        
        self.epsilon_start = epsilon_start
        self.decrease_epsilon_factor = decrease_epsilon_factor # larger -> more exploration
        self.epsilon_min = epsilon_min
        
        self.learning_rate = learning_rate
        
        self.reset()
        
    def get_action(self, state, epsilon=None):
        """
            ** Solution ** 
        
            Return action according to an epsilon-greedy exploration policy
        """
        if epsilon is None: 
            epsilon = self.epsilon
            
        if np.random.rand() < epsilon: 
            action = env.action_space.sample()
            #print(f"np.random.rand() < epsilon"," choix de l'action :",action)
            return action
        else: 
            #print("else")
            #print("np.argmax(self.get_q(state))",np.argmax(self.get_q(state)))

            q = self.get_q(state)

            #print("Q :",self.get_q(state))
            #print("argmax Q",np.argmax(self.get_q(state)))

            index_max_sum = np.argmax(np.sum(q, axis=1))

            # Sélectionner la ligne correspondante dans le tableau q
            selected_action = np.argmax(q[index_max_sum])

            #print("selected_action",selected_action)

            return selected_action
            #return np.argmax(self.get_q(state))


            # Sinon, retourner l'action maximisant la valeur Q prédite
            # q_values = self.get_q(state)
            # print("q_values",q_values)
            # # Assurez-vous que les actions valides restent dans les limites de l'espace d'actions
            # valid_actions = np.arange(self.action_space.n)
            # print("valid_action",valid_actions)
            # print("np.argmax(q_values[valid_actions])",np.argmax(q_values[valid_actions]))
            # return np.argmax(q_values[valid_actions])
    
    def update(self, state, action, reward, terminated, next_state):
        #print("UPDATE")
        # add data to replay buffer
        self.buffer.push(torch.tensor(state).unsqueeze(0), 
                        torch.tensor([[action]], dtype=torch.int64), 
                        torch.tensor([reward]), 
                        torch.tensor([terminated], dtype=torch.int64), 
                        torch.tensor(next_state).unsqueeze(0),
                        )

        if len(self.buffer) < self.batch_size:
            return np.inf

        # get batch
        transitions = self.buffer.sample(self.batch_size)

        state_batch, action_batch, reward_batch, terminated_batch, next_state_batch = tuple(
            [torch.cat(data) for data in zip(*transitions)]
        )

        action_batch = action_batch.unsqueeze(1)
        values  = self.q_net.forward(state_batch).gather(1, action_batch)

        # Compute the ideal Q values
        with torch.no_grad():
            #print("terminated_batch",terminated_batch)
            # print("(1 - terminated_batch)",(1 - terminated_batch))
            # print("next_state_batch",next_state_batch)
            # print("self.target_net(next_state_batch)",self.target_net(next_state_batch))
            #print("self.target_net(next_state_batch).max(1)[0][0]",self.target_net(next_state_batch).max(1)[0])
            # print("(1 - terminated_batch) * self.target_net(next_state_batch).max(1)[0]",(1 - terminated_batch) * self.target_net(next_state_batch).max(1)[0])

            target_net_values=torch.max(self.target_net(next_state_batch).max(1)[0], dim=1)
            next_state_values = (1 - terminated_batch) * target_net_values
            targets = next_state_values * self.gamma + reward_batch

        loss = self.loss_function(values, targets)

        # Optimize the model 
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        if not((self.n_steps+1) % self.update_target_every): 
            self.target_net.load_state_dict(self.q_net.state_dict())
            
        self.decrease_epsilon()
            
        self.n_steps += 1
        if terminated: 
            self.n_eps += 1

        return loss.detach().numpy()

    
    def get_q(self, state):
        """
        Compute Q function for a states
        """
        state_tensor = torch.tensor(state).unsqueeze(0)
        with torch.no_grad():
            output = self.q_net.forward(state_tensor) # shape (1,  n_actions)
        return output.numpy()[0]  # shape  (n_actions)
    
    def decrease_epsilon(self):
        self.epsilon = self.epsilon_min + (self.epsilon_start - self.epsilon_min) * (
                        np.exp(-1. * self.n_eps / self.decrease_epsilon_factor ) )
    
    def reset(self):
        hidden_size = 128
        
        obs_size = self.observation_space.shape[0]
        n_actions = self.action_space.n
        
        self.buffer = ReplayBuffer(self.buffer_capacity)
        self.q_net =  Net(obs_size, hidden_size, n_actions)
        self.target_net = Net(obs_size, hidden_size, n_actions)
        
        self.loss_function = nn.MSELoss()
        self.optimizer = optim.Adam(params=self.q_net.parameters(), lr=self.learning_rate)
        
        self.epsilon = self.epsilon_start
        self.n_steps = 0
        self.n_eps = 0

def eval_agent(agent, env, n_sim=5):
    """
    ** Solution **
    
    Monte Carlo evaluation of DQN agent.

    Repeat n_sim times:
        * Run the DQN policy until the environment reaches a terminal state (= one episode)
        * Compute the sum of rewards in this episode
        * Store the sum of rewards in the episode_rewards array.
    """
    env_copy = deepcopy(env)
    episode_rewards = np.zeros(n_sim)
    for i in range(n_sim):
        state, _ = env_copy.reset()
        reward_sum = 0
        done = False
        while not done: 
            action = agent.get_action(state, 0)
            state, reward, terminated, truncated, _ = env_copy.step(action)
            reward_sum += reward
            done = terminated or truncated
        episode_rewards[i] = reward_sum
    return episode_rewards


def run_one_episode(env, agent, display=True):
    display_env = deepcopy(env)
    done = False
    state, _ = display_env.reset()

    rewards = 0

    while not done:
        action = agent.get_action(state, 0)
        print("action",action)
        state, reward, done, _, _ = display_env.step(action)
        rewards += reward
        if display: 
            clear_output(wait=True)
            plt.imshow(display_env.render())
            plt.show()
    if display:
        display_env.close()
    print(f'Episode length {rewards}')

env = gym.make('highway-v0', render_mode='rgb_array')
agent = DQN_Skeleton(action_space=env.action_space,
                     observation_space=env.observation_space,
                     gamma=0.99,
                     batch_size=32,
                     buffer_capacity=10000,
                     update_target_every=1000,
                     epsilon_start=0.9,
                     decrease_epsilon_factor=2000,
                     epsilon_min=0.01,
                     learning_rate=0.001)
#run_one_episode(env, agent, display=True)
#eval_agent(agent,env)

In [31]:
env = gym.make('highway-v0', render_mode='rgb_array')

In [32]:
env.action_space

Discrete(5)

In [33]:
np.argmax(np.array([[ 0.02443908 , 0.01280055, -0.01428717 ,-0.15400532  ,0.18554473],
 [-0.12202579 , 0.01255986, -0.04294205, -0.01470667  ,0.07940094],
 [-0.08228531  ,0.00596284 ,-0.03702209 ,-0.02130748 , 0.07430322],
 [-0.10946059,  0.00109198, -0.04108299 ,-0.06560317 , 0.14844404],
 [-0.07317315 ,-0.00265739 ,-0.03074778 ,-0.07818615 , 0.14204383]]))

4

In [34]:
import matplotlib.pyplot as plt

def train(env, agent, N_episodes, eval_every=2, reward_threshold=300):
    total_time = 0
    state, _ = env.reset()
    losses = []
    for ep in range(N_episodes):
        print(f"ep = {ep}")
        done = False
        state, _ = env.reset()
        while not done: 
            action = agent.get_action(state)

            next_state, reward, terminated, truncated, _ = env.step(action)
            #print("ligne avant update")
            loss_val = agent.update(state, action, reward, terminated, next_state)
            #print("ligne apres update")
            state = next_state
            losses.append(loss_val)

            done = terminated or truncated
            total_time += 1

        if ((ep+1)% eval_every == 0):
            #print("Eval agent")
            rewards = eval_agent(agent, env)
            #print("episode =", ep+1, ", reward = ", np.mean(rewards))
            if np.mean(rewards) >= reward_threshold:
                break

    return losses

# Initialisation de l'environnement
env = gym.make('highway-v0')
env.reset()

# Initialisation de l'agent
action_space = env.action_space
observation_space = env.observation_space

gamma = 0.99
batch_size = 8
buffer_capacity = 10_000
update_target_every = 8

epsilon_start = 0.9
decrease_epsilon_factor = 1000
epsilon_min = 0.05

learning_rate = 1e-1

arguments = (action_space,
            observation_space,
            gamma,
            batch_size,
            buffer_capacity,
            update_target_every, 
            epsilon_start, 
            decrease_epsilon_factor, 
            epsilon_min,
            learning_rate,
        )

N_episodes = 300

agent = DQN_Skeleton(*arguments)

# Exécution de l'apprentissage
losses = train(env, agent, N_episodes)

# Affichage des résultats
plt.plot(losses)
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.show()

# Évaluation de la politique finale
rewards = eval_agent(agent, env, 20)
print("")
print("mean reward after training = ", np.mean(rewards))


ep = 0
ep = 1
terminated_batch tensor([0, 0, 0, 0, 0, 0, 0, 0])
self.target_net(next_state_batch).max(1)[0][0] tensor([[ 0.1909,  0.0996,  0.0348,  0.2110,  0.0480],
        [ 0.1765,  0.0915,  0.0248,  0.2278,  0.0214],
        [ 0.1768,  0.0913,  0.0225,  0.2277,  0.0181],
        [ 0.1795,  0.0996,  0.0324,  0.2160,  0.0441],
        [ 0.1909,  0.0996,  0.0348,  0.2110,  0.0480],
        [ 0.1942,  0.0937,  0.0283,  0.1988,  0.0349],
        [ 0.1765,  0.0915,  0.0248,  0.2278,  0.0214],
        [ 0.1899,  0.0843,  0.0124,  0.2250, -0.0049]])


RuntimeError: The size of tensor a (8) must match the size of tensor b (5) at non-singleton dimension 0

In [37]:
tensor_original = torch.tensor([
    [0.1909, 0.0996, 0.0348, 0.2110, 0.0480],
    [0.1765, 0.0915, 0.0248, 0.2278, 0.0214],
    [0.1768, 0.0913, 0.0225, 0.2277, 0.0181],
    [0.1795, 0.0996, 0.0324, 0.2160, 0.0441],
    [0.1909, 0.0996, 0.0348, 0.2110, 0.0480],
    [0.1942, 0.0937, 0.0283, 0.1988, 0.0349],
    [0.1765, 0.0915, 0.0248, 0.2278, 0.0214],
    [0.1899, 0.0843, 0.0124, 0.2250, -0.0049]
])

# Trouver la valeur maximale de chaque ligne
max_values_per_row, _ = torch.max(tensor_original, dim=1)

print(max_values_per_row)

tensor([0.2110, 0.2278, 0.2277, 0.2160, 0.2110, 0.1988, 0.2278, 0.2250])
