# Deep Q-Learning

# 1. Deep Q-Network (I)

In [127]:
import torch
import torch.nn as nn

import copy

import numpy as np

from replay_buffer import ReplayBuffer

from torch.optim import AdamW

import torch.nn.functional as F

import gym

class DeepQNetwork(nn.Module):
    """Multi-layer perceptron that maps a state to one Q-value per action.

    Architecture: dim_states -> 64 -> 64 -> dim_actions, with a ReLU after
    each of the two hidden layers and a linear (identity) output.
    """

    def __init__(self, dim_states, dim_actions):
        super().__init__()
        hidden_units = 64
        # The modules are created in network order and stored under
        # ``self.layers`` so state-dict keys stay ``layers.<index>.*``.
        stack = [
            nn.Linear(dim_states, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, dim_actions),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, input):
        """Return the Q-values computed by the MLP for ``input``.

        ``input`` is handed to the layers unchanged, so it must already be a
        tensor whose last dimension is ``dim_states``.
        """
        return self.layers(input)

class DeepQNetworkAgent:
    """Deep Q-learning agent: epsilon-greedy policy, replay buffer, target network.

    The online network (``_deep_qnetwork``) is trained by :meth:`update` and is
    the one used to choose actions. The target network
    (``_target_deepq_network``) only supplies bootstrap values and is
    synchronised on demand through :meth:`replace_target_network`.
    """

    def __init__(self, dim_states, dim_actions, lr, gamma, epsilon, nb_training_steps, replay_buffer_size, batch_size):
        """Build the networks, the optimizer and the replay buffer.

        Args:
            dim_states: Size of a state vector (network input width).
            dim_actions: Number of discrete actions (network output width).
            lr: Learning rate given to the AdamW optimizer.
            gamma: Discount factor used in the TD target.
            epsilon: Initial exploration probability.
            nb_training_steps: Epsilon is decayed so that it reaches zero
                after ``nb_training_steps / 2`` non-greedy action selections.
            replay_buffer_size: Passed to the replay buffer as ``max_size``.
            batch_size: Passed to the replay buffer as ``sample_size``.
        """
        self._learning_rate = lr
        self._gamma = gamma
        self._epsilon = epsilon

        self._epsilon_min = 0
        self._epsilon_decay = self._epsilon / (nb_training_steps / 2.)

        self._dim_states = dim_states
        self._dim_actions = dim_actions

        self.replay_buffer = ReplayBuffer(dim_states=self._dim_states,
                                          dim_actions=self._dim_actions,
                                          max_size=replay_buffer_size,
                                          sample_size=batch_size)

        # The target network starts as an exact copy of the online network.
        self._deep_qnetwork = DeepQNetwork(self._dim_states, self._dim_actions)
        self._target_deepq_network = copy.deepcopy(self._deep_qnetwork).eval()

        # AdamW optimizer; only the online network is trained.
        self._optimizer = AdamW(self._deep_qnetwork.parameters(), lr=self._learning_rate)

    def store_transition(self, s_t, a_t, r_t, s_t1, done_t):
        """Forward one transition to the replay buffer."""
        self.replay_buffer.store_transition(s_t, a_t, r_t, s_t1, done_t)

    def replace_target_network(self):
        """Copy the online network's weights into the target network."""
        self._target_deepq_network.load_state_dict(self._deep_qnetwork.state_dict())

    def select_action(self, observation, greedy=False):
        """Pick an action with an epsilon-greedy policy.

        Args:
            observation: A single state, as a numpy array or a tensor. A 1-D
                input gets a leading batch dimension before the forward pass.
            greedy: When True, always exploit and leave epsilon untouched.

        Returns:
            The index of the selected action as a Python ``int``.
        """
        if np.random.random() > self._epsilon or greedy:
            # Exploit: the network in this cell expects a float tensor, so
            # convert here instead of relying on the caller.
            obs = torch.as_tensor(observation, dtype=torch.float32)
            if obs.dim() == 1:
                obs = obs.unsqueeze(0)

            # Act with the online network: it is the one being trained,
            # whereas the target network lags behind between synchronisations.
            with torch.no_grad():
                qa = self._deep_qnetwork(obs)

            # Action with the highest Q-value.
            action = qa.argmax().item()
        else:
            # Explore: uniform over every available action.
            action = np.random.randint(int(self._dim_actions))

        if not greedy:
            # Linear decay, clamped so epsilon never drops below its floor.
            self._epsilon = max(self._epsilon_min, self._epsilon - self._epsilon_decay)

        return action

    def update(self):
        """Run one gradient step on a batch sampled from the replay buffer.

        The loss is the mean squared error between Q(s_t, a_t) from the online
        network and the TD target
        ``r_t + (1 - done_t) * gamma * max_a Q_target(s_t1, a)``.
        """
        s_t, a_t, r_t, s_t1, done_t = self.replay_buffer.sample_transitions()

        # The sampled batch is converted to tensors; per-sample scalars are
        # reshaped to columns so they line up with gather()/max(keepdim=True).
        states = torch.as_tensor(s_t, dtype=torch.float32)
        next_states = torch.as_tensor(s_t1, dtype=torch.float32)
        actions = torch.as_tensor(a_t, dtype=torch.int64).reshape(-1, 1)
        rewards = torch.as_tensor(r_t, dtype=torch.float32).reshape(-1, 1)
        not_done = 1.0 - torch.as_tensor(done_t, dtype=torch.float32).reshape(-1, 1)

        # Q-value of the action that was actually taken in each transition.
        qsa_b = self._deep_qnetwork(states).gather(1, actions)

        # The target is a constant for the optimizer: no gradient flows into it.
        with torch.no_grad():
            next_qsa_b = self._target_deepq_network(next_states)
            next_qsa_b = torch.max(next_qsa_b, dim=-1, keepdim=True)[0]
            target_b = rewards + not_done * self._gamma * next_qsa_b

        loss = F.mse_loss(qsa_b, target_b)
        self._optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()


In [128]:
# Two CartPole-v1 environments are created: `env` and `eval_env`.
env = gym.make('CartPole-v1')
eval_env = gym.make('CartPole-v1')

# Actions are discrete. Keep the count as a plain int (not a 0-d array) so it
# can be used directly as a layer width or as a bound for random sampling.
dim_actions = int(env.action_space.n)

# States are continuous
dim_states = env.observation_space.shape[0]

print(dim_states)
print(dim_actions)

4
2


In [129]:
# Instantiate a Q-network sized for the environment and display its layers.
deep_qnetwork = DeepQNetwork(dim_states, dim_actions)
deep_qnetwork

DeepQNetwork(
  (layers): Sequential(
    (0): Linear(in_features=4, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [130]:
# Reset the environment to obtain an initial observation.
# NOTE(review): assumes env.reset() returns only the observation array — confirm against the installed gym version.
observation=env.reset()
observation

array([-0.04055779,  0.00673627, -0.00092249, -0.01014175], dtype=float32)

In [131]:
# Convert the numpy observation to a float tensor with a leading batch dimension.
observation = torch.from_numpy(observation).unsqueeze(dim=0).float()
observation

tensor([[-0.0406,  0.0067, -0.0009, -0.0101]])

In [132]:
# Stack the same observation twice along dim 0 to build a batch of two states.
tensor1=observation
tensor2=observation
full=torch.cat([tensor1, tensor2], dim=0)


In [133]:
# Forward pass on the batch: one row of Q-values per state.
qsa=deep_qnetwork(full)
qsa

tensor([[ 0.0161, -0.0858],
        [ 0.0161, -0.0858]], grad_fn=<AddmmBackward>)

In [134]:
# Draw a random action index (0 or 1).
action=np.random.randint(2)
action

0

In [135]:
# Keep, for every row of the batch, only the Q-value of the chosen action.
qsa=qsa[:,action]
qsa

tensor([0.0161, 0.0161], grad_fn=<SelectBackward>)

# 2. Replay Buffer

In [136]:
import gym
import numpy as np

In [139]:
class ReplayBuffer:
    """Fixed-size circular buffer of (s_t, a_t, r_t, s_t1, done_t) transitions.

    Transitions are written at a rolling index; once ``max_size`` entries
    have been stored, new ones overwrite the oldest.
    """

    def __init__(self, dim_states, dim_actions, max_size, sample_size):
        """Allocate the storage arrays.

        Args:
            dim_states: Length of a state vector.
            dim_actions: Accepted for interface compatibility; not used here
                (each action is stored as a single scalar).
            max_size: Capacity of the buffer.
            sample_size: Number of transitions returned by each sample;
                must be strictly smaller than ``max_size``.
        """
        assert sample_size < max_size, "Sample size cannot be greater than buffer size"

        self._buffer_idx     = 0
        self._exps_stored    = 0
        self._buffer_size    = max_size
        self._sample_size    = sample_size

        self._s_t_array      = np.zeros((max_size, dim_states))
        self._a_t_array      = np.zeros((max_size))
        self._r_t_array      = np.zeros((max_size,))
        self._s_t1_array     = np.zeros((max_size, dim_states))
        self._term_t_array   = np.zeros((max_size,))

    def store_transition(self, s_t, a_t, r_t, s_t1, done_t):
        """Write one transition at the current index and advance it."""
        slot = self._buffer_idx
        self._s_t_array[slot] = s_t
        self._a_t_array[slot] = a_t
        self._r_t_array[slot] = r_t
        self._s_t1_array[slot] = s_t1
        self._term_t_array[slot] = done_t

        # Advance the write index, wrapping to 0 once capacity is exceeded.
        self._buffer_idx = (self._buffer_idx + 1) % self._buffer_size
        self._exps_stored += 1

    def sample_transitions(self):
        """Return ``sample_size`` distinct stored transitions, chosen uniformly.

        Returns:
            A tuple ``(s_t, a_t, r_t, s_t1, done_t)`` of numpy arrays whose
            first dimension is ``sample_size``.

        Raises:
            AssertionError: If fewer than ``sample_size`` transitions have
                been stored so far.
        """
        assert self._exps_stored + 1 > self._sample_size, "Not enough samples have been stored to start sampling"

        # Until the buffer has wrapped around, only the first `_exps_stored`
        # slots hold real data; the rest are still zero-initialised and must
        # not be returned as if they were transitions.
        filled = min(self._exps_stored, self._buffer_size)
        sample_idxs = np.random.choice(filled, size=self._sample_size, replace=False)

        return (self._s_t_array[sample_idxs],
                self._a_t_array[sample_idxs],
                self._r_t_array[sample_idxs],
                self._s_t1_array[sample_idxs],
                self._term_t_array[sample_idxs])
        
 

In [140]:
# Environment: CartPole (two instances are created: `env` and `eval_env`).
env = gym.make('CartPole-v1')
eval_env = gym.make('CartPole-v1')

# Actions are discrete. Keep the count as a plain int (not a 0-d array) so it
# can be used directly as a size or as a bound for random sampling.
dim_actions = int(env.action_space.n)

# States are continuous
dim_states = env.observation_space.shape[0]

print(dim_states)
print(dim_actions)

4
2


In [141]:
# Initialise the replay buffer: it holds at most 5 transitions and samples 3 at a time.
max_size=5
sample_size=3
memory=ReplayBuffer(dim_states, dim_actions, max_size, sample_size)

In [142]:
# Simulate 6 transitions with random actions. This is one more than the
# buffer capacity (5), so the last transition overwrites the first slot.
# NOTE(review): env.reset() is used as a bare observation and env.step() is
# unpacked into 4 values — confirm this matches the installed gym version.
s_t=env.reset()

for i in range(6):
   
    a_t=np.random.randint(2)
    s_t1, r_t, done_t, _ = env.step(a_t)
    print(s_t1, r_t, done_t)

    # Store the transition
    memory.store_transition(s_t, a_t, r_t, s_t1, done_t)

    # The next state becomes the current state of the following step.
    s_t = s_t1

[-0.00939226  0.21154283 -0.01240931 -0.28007838] 1.0 False
[-0.0051614   0.40683958 -0.01801088 -0.5766492 ] 1.0 False
[ 0.00297539  0.21197465 -0.02954386 -0.28969413] 1.0 False
[ 0.00721488  0.01728617 -0.03533775 -0.00647347] 1.0 False
[ 0.0075606   0.21289663 -0.03546721 -0.3100931 ] 1.0 False
[ 0.01181854  0.01829748 -0.04166908 -0.02880314] 1.0 False


In [143]:
# Stored data: the array of next states currently held in the buffer
memory._s_t1_array

array([[ 0.01181854,  0.01829748, -0.04166908, -0.02880314],
       [-0.0051614 ,  0.40683958, -0.01801088, -0.57664919],
       [ 0.00297539,  0.21197465, -0.02954386, -0.28969413],
       [ 0.00721488,  0.01728617, -0.03533775, -0.00647347],
       [ 0.0075606 ,  0.21289663, -0.03546721, -0.3100931 ]])

**Se observa cómo el sexto estado de la secuencia de transiciones coincide con el primer elemento del conjunto de estados almacenados. Esto se debe a que el buffer es circular: una vez alcanzada su capacidad máxima, los registros nuevos sobrescriben a los más antiguos.**

In [144]:
# Only 5 rows are kept, which is the buffer's max_size
memory._s_t1_array.shape

(5, 4)

In [145]:
# Draw one random sample: a tuple (s_t, a_t, r_t, s_t1, done_t) of arrays
memory.sample_transitions()

(array([[ 0.00721488,  0.01728617, -0.03533775, -0.00647347],
        [ 0.00297539,  0.21197465, -0.02954386, -0.28969413],
        [-0.00939226,  0.21154283, -0.01240931, -0.28007838]]),
 array([1., 0., 1.]),
 array([1., 1., 1.]),
 array([[ 0.0075606 ,  0.21289663, -0.03546721, -0.3100931 ],
        [ 0.00721488,  0.01728617, -0.03533775, -0.00647347],
        [-0.0051614 ,  0.40683958, -0.01801088, -0.57664919]]),
 array([0., 0., 0.]))

In [146]:
# The sampled batch of states contains 3 elements (the sample_size)
len(memory.sample_transitions()[0])

3

# 3. Deep Q-Network (II)

In [None]:
import torch
import torch.nn as nn

import copy

import numpy as np

from replay_buffer import ReplayBuffer

from torch.optim import AdamW

import torch.nn.functional as F

class DeepQNetwork(nn.Module):
    """Multi-layer perceptron that maps a state to one Q-value per action.

    Architecture: dim_states -> 64 -> 64 -> dim_actions, with a ReLU after
    each of the two hidden layers and a linear (identity) output.
    """

    def __init__(self, dim_states, dim_actions):
        super().__init__()
        hidden_units = 64
        # The modules are created in network order and stored under
        # ``self.layers`` so state-dict keys stay ``layers.<index>.*``.
        stack = [
            nn.Linear(dim_states, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, dim_actions),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, input):
        """Return the Q-values for a numpy ``input``.

        The array is converted to a float32 tensor and always receives one
        extra leading dimension, so a 1-D state of length ``dim_states``
        yields an output of shape ``(1, dim_actions)``.
        """
        as_tensor = torch.from_numpy(input)
        batched = as_tensor.float().unsqueeze(0)
        return self.layers(batched)

class DeepQNetworkAgent:
    """Deep Q-learning agent: epsilon-greedy policy, replay buffer, target network.

    The online network (``_deep_qnetwork``) is trained by :meth:`update` and is
    the one used to choose actions. The target network
    (``_target_deepq_network``) only supplies bootstrap values and is
    synchronised on demand through :meth:`replace_target_network`.
    """

    def __init__(self, dim_states, dim_actions, lr, gamma, epsilon, nb_training_steps, replay_buffer_size, batch_size):
        """Build the networks, the optimizer and the replay buffer.

        Args:
            dim_states: Size of a state vector (network input width).
            dim_actions: Number of discrete actions (network output width).
            lr: Learning rate given to the AdamW optimizer.
            gamma: Discount factor used in the TD target.
            epsilon: Initial exploration probability.
            nb_training_steps: Epsilon is decayed so that it reaches zero
                after ``nb_training_steps / 2`` non-greedy action selections.
            replay_buffer_size: Passed to the replay buffer as ``max_size``.
            batch_size: Passed to the replay buffer as ``sample_size``.
        """
        self._learning_rate = lr
        self._gamma = gamma
        self._epsilon = epsilon

        self._epsilon_min = 0
        self._epsilon_decay = self._epsilon / (nb_training_steps / 2.)

        self._dim_states = dim_states
        self._dim_actions = dim_actions

        self.replay_buffer = ReplayBuffer(dim_states=self._dim_states,
                                          dim_actions=self._dim_actions,
                                          max_size=replay_buffer_size,
                                          sample_size=batch_size)

        # The target network starts as an exact copy of the online network.
        self._deep_qnetwork = DeepQNetwork(self._dim_states, self._dim_actions)
        self._target_deepq_network = copy.deepcopy(self._deep_qnetwork).eval()

        # AdamW optimizer; only the online network is trained.
        self._optimizer = AdamW(self._deep_qnetwork.parameters(), lr=self._learning_rate)

    def store_transition(self, s_t, a_t, r_t, s_t1, done_t):
        """Forward one transition to the replay buffer."""
        self.replay_buffer.store_transition(s_t, a_t, r_t, s_t1, done_t)

    def replace_target_network(self):
        """Copy the online network's weights into the target network."""
        self._target_deepq_network.load_state_dict(self._deep_qnetwork.state_dict())

    def select_action(self, observation, greedy=False):
        """Pick an action with an epsilon-greedy policy.

        Args:
            observation: A single state, handed to the network unchanged (the
                network in this cell converts numpy input itself).
            greedy: When True, always exploit and leave epsilon untouched.

        Returns:
            The index of the selected action as a Python ``int``.
        """
        if np.random.random() > self._epsilon or greedy:
            # Act with the online network: it is the one being trained,
            # whereas the target network lags behind between synchronisations.
            with torch.no_grad():
                qa = self._deep_qnetwork(observation)

            # Action with the highest Q-value.
            action = qa.argmax().item()
        else:
            # Explore: uniform over every available action.
            action = np.random.randint(int(self._dim_actions))

        if not greedy:
            # Linear decay, clamped so epsilon never drops below its floor.
            self._epsilon = max(self._epsilon_min, self._epsilon - self._epsilon_decay)

        return action

    def update(self):
        """Run one gradient step on a batch sampled from the replay buffer.

        The loss is the mean squared error between Q(s_t, a_t) from the online
        network and the TD target
        ``r_t + (1 - done_t) * gamma * max_a Q_target(s_t1, a)``.
        """
        s_t, a_t, r_t, s_t1, done_t = self.replay_buffer.sample_transitions()

        # Per-sample scalars become column tensors so they line up with
        # gather()/max(keepdim=True). States stay as sampled because the
        # network in this cell performs the numpy-to-tensor conversion.
        actions = torch.as_tensor(a_t, dtype=torch.int64).reshape(-1, 1)
        rewards = torch.as_tensor(r_t, dtype=torch.float32).reshape(-1, 1)
        not_done = 1.0 - torch.as_tensor(done_t, dtype=torch.float32).reshape(-1, 1)

        # The network's forward adds a leading dimension to whatever it gets;
        # flatten back to (batch, n_actions) before indexing by action.
        q_all = self._deep_qnetwork(s_t)
        q_all = q_all.reshape(-1, q_all.shape[-1])
        qsa_b = q_all.gather(1, actions)

        # The target is a constant for the optimizer: no gradient flows into it.
        with torch.no_grad():
            next_qsa_b = self._target_deepq_network(s_t1)
            next_qsa_b = next_qsa_b.reshape(-1, next_qsa_b.shape[-1])
            next_qsa_b = torch.max(next_qsa_b, dim=-1, keepdim=True)[0]
            target_b = rewards + not_done * self._gamma * next_qsa_b

        loss = F.mse_loss(qsa_b, target_b)
        self._optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()
