# Deep Q-Learning

# 1. y 3. Deep Q-Network (I)

In [2]:
import torch
import torch.nn as nn

import copy

import numpy as np

from replay_buffer import ReplayBuffer

from torch.optim import AdamW

import torch.nn.functional as F

import torch
import torch.nn as nn

import copy

import numpy as np

from replay_buffer import ReplayBuffer

from torch.optim import AdamW

import torch.nn.functional as F

In [3]:
class DeepQNetwork(nn.Module):
    """Fully connected Q-network mapping a state to one Q-value per action.

    Architecture: ``dim_states -> 64 -> 64 -> dim_actions`` with ReLU after
    the two hidden layers and a linear (identity) output.

    Every weight and bias of the linear layers is re-initialised from a
    uniform distribution on [0, 1].
    NOTE(review): all parameters therefore start non-negative, which can
    produce very large initial Q-values -- confirm this is intended.
    """

    @staticmethod
    def _uniform_init(module):
        """Draw the weight and bias of an ``nn.Linear`` from U(0, 1)."""
        if not isinstance(module, nn.Linear):
            return
        nn.init.uniform_(module.weight, a=0.0, b=1.0)
        nn.init.uniform_(module.bias, a=0.0, b=1.0)

    def __init__(self, dim_states, dim_actions):
        super().__init__()

        hidden_units = 64
        self.layers = nn.Sequential(
            nn.Linear(dim_states, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, dim_actions),
        )

        # ``apply`` visits every sub-module; only the Linear ones are touched.
        self.layers.apply(self._uniform_init)

    def forward(self, input):
        """Return the Q-values for ``input``.

        A ``torch.Tensor`` is fed to the layers unchanged. Anything else is
        passed to ``torch.from_numpy``, given a leading batch dimension and
        cast to float32 first.
        """
        if not isinstance(input, torch.Tensor):
            input = torch.from_numpy(input).unsqueeze(dim=0).float()

        return self.layers(input)

class DeepQNetworkAgent:
    """Deep Q-learning agent with replay buffer, target network and
    linearly decaying epsilon-greedy exploration.

    Args:
        dim_states: size of the observation vector.
        dim_actions: number of discrete actions.
        lr: learning rate handed to the AdamW optimizer.
        gamma: discount factor used in the TD target.
        epsilon: initial exploration probability.
        nb_training_steps: epsilon is decreased by
            ``epsilon / (nb_training_steps / 2)`` on every non-greedy call to
            ``select_action`` until it reaches 0.
        replay_buffer_size: capacity of the replay buffer.
        batch_size: number of transitions sampled per ``update`` call.
    """

    def __init__(self, dim_states, dim_actions, lr, gamma, epsilon, nb_training_steps, replay_buffer_size, batch_size):

        self._learning_rate = lr
        self._gamma = gamma
        self._epsilon = epsilon

        # Linear schedule: epsilon hits the floor after nb_training_steps / 2
        # non-greedy action selections.
        self._epsilon_min = 0.0
        self._epsilon_decay = self._epsilon / (nb_training_steps / 2.)

        self._dim_states = dim_states
        self._dim_actions = dim_actions

        self.replay_buffer = ReplayBuffer(dim_states=self._dim_states,
                                          dim_actions=self._dim_actions,
                                          max_size=replay_buffer_size,
                                          sample_size=batch_size)

        # Online network (trained) and a frozen copy used for the TD target.
        self._deep_qnetwork = DeepQNetwork(self._dim_states, self._dim_actions)
        self._target_deepq_network = copy.deepcopy(self._deep_qnetwork).eval()

        # Only the online network's parameters are optimised.
        self._optimizer = AdamW(self._deep_qnetwork.parameters(), lr=self._learning_rate)

    def store_transition(self, s_t, a_t, r_t, s_t1, done_t):
        """Forward one transition to the replay buffer."""
        self.replay_buffer.store_transition(s_t, a_t, r_t, s_t1, done_t)

    def replace_target_network(self):
        """Copy the online network's weights into the target network."""
        self._target_deepq_network.load_state_dict(self._deep_qnetwork.state_dict())

    def select_action(self, observation, greedy=False):
        """Pick an action epsilon-greedily and advance the epsilon schedule.

        Args:
            observation: state handed to the online Q-network.
            greedy: when True always take the arg-max action and leave
                epsilon untouched (evaluation mode).

        Returns:
            The index of the selected action as a Python int.
        """
        if np.random.random() > self._epsilon or greedy:
            # Exploit: no gradient is needed just to pick an action.
            with torch.no_grad():
                q_values = self._deep_qnetwork(observation)
            action = q_values.argmax().item()
        else:
            # Explore uniformly over *all* actions (was hard-coded to 2).
            action = np.random.randint(int(self._dim_actions))

        if not greedy:
            # Linear decay, clamped so epsilon never drops below the floor.
            self._epsilon = max(self._epsilon_min,
                                self._epsilon - self._epsilon_decay)

        return action

    def update(self):
        """Run one gradient step on a batch sampled from the replay buffer.

        Minimises the MSE between Q(s_t, a_t) of the online network and the
        target ``r_t + gamma * max_a Q_target(s_t1, a)``, where the bootstrap
        term is dropped for terminal transitions.
        """
        s_t, a_t, r_t, s_t1, done = self.replay_buffer.sample_transitions()

        states = torch.as_tensor(s_t, dtype=torch.float32)
        next_states = torch.as_tensor(s_t1, dtype=torch.float32)
        rewards = torch.as_tensor(r_t, dtype=torch.float32).view(-1, 1)
        actions = torch.as_tensor(a_t, dtype=torch.int64).view(-1, 1)
        # 1.0 for non-terminal transitions, 0.0 for terminal ones.
        not_done = (~torch.as_tensor(done, dtype=torch.bool)).view(-1, 1).float()

        # Q-value of the action actually taken in each sampled state.
        q_taken = self._deep_qnetwork(states).gather(dim=1, index=actions)

        # The target is a constant for this step: keep it out of the graph so
        # backward() does not write gradients into the target network.
        with torch.no_grad():
            next_q = self._target_deepq_network(next_states)
            max_next_q = next_q.max(dim=-1, keepdim=True)[0]
            td_target = rewards + not_done * self._gamma * max_next_q

        loss = F.mse_loss(q_taken, td_target)
        self._optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()




In [4]:
# Build a training and an evaluation CartPole environment and read the
# problem dimensions from them.
import gym
env = gym.make('CartPole-v1')
eval_env = gym.make('CartPole-v1')

# Actions are discrete; the count is wrapped in a 0-d NumPy array
dim_actions = np.array(env.action_space.n)

# States are continuous; take the length of the observation vector
dim_states = env.observation_space.shape[0]

print(dim_states)
print(dim_actions)

4
2


In [5]:
# Instantiate the online Q-network and display its layer structure
deep_qnetwork = DeepQNetwork(dim_states, dim_actions)
deep_qnetwork

DeepQNetwork(
  (layers): Sequential(
    (0): Linear(in_features=4, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [6]:
# Target network: an independent deep copy of the online network, in eval mode
target_deepq_network = copy.deepcopy(deep_qnetwork).eval()
target_deepq_network

DeepQNetwork(
  (layers): Sequential(
    (0): Linear(in_features=4, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [7]:
# Reset the environment to get an initial observation.
# NOTE(review): assumes env.reset() returns only the observation array
# (older gym API) -- confirm against the installed gym version.
observation=env.reset()
observation

array([ 0.00950862,  0.02172191, -0.02052333, -0.00567765], dtype=float32)

In [8]:
# Forward pass with the raw NumPy observation; DeepQNetwork.forward converts
# non-tensor input to a float tensor with a leading batch dimension.
qsa=deep_qnetwork(observation)
qsa

tensor([[579.1338, 609.3038]], grad_fn=<AddmmBackward>)

In [9]:
# Manual conversion: NumPy observation -> float tensor with a leading batch dimension
observation = torch.from_numpy(observation).unsqueeze(dim=0).float()
observation

tensor([[ 0.0095,  0.0217, -0.0205, -0.0057]])

In [10]:
# Forward pass with the tensor observation; tensors are passed to the layers as-is
qsa=deep_qnetwork(observation)
qsa

tensor([[579.1338, 609.3038]], grad_fn=<AddmmBackward>)

# 2. Replay Buffer

In [11]:
import gym
import numpy as np

In [12]:
class ReplayBuffer:
    """Fixed-capacity circular buffer of (s_t, a_t, r_t, s_t1, done) transitions.

    Once ``max_size`` transitions have been stored, new transitions overwrite
    the oldest ones.

    Args:
        dim_states: length of a state vector.
        dim_actions: accepted for interface compatibility; not used here.
        max_size: capacity of the buffer.
        sample_size: number of transitions returned by ``sample_transitions``.

    Raises:
        AssertionError: if ``sample_size`` is greater than ``max_size``.
    """

    def __init__(self, dim_states, dim_actions, max_size, sample_size):

        # A batch as large as the whole buffer is valid, matching the message.
        assert sample_size <= max_size, "Sample size cannot be greater than buffer size"

        self._buffer_idx     = 0   # next slot to write
        self._exps_stored    = 0   # total transitions ever stored
        self._buffer_size    = max_size
        self._sample_size    = sample_size

        self._s_t_array      = np.zeros((max_size, dim_states))
        self._a_t_array      = np.zeros((max_size))
        self._r_t_array      = np.zeros((max_size,))
        self._s_t1_array     = np.zeros((max_size, dim_states))
        self._term_t_array   = np.zeros((max_size,), dtype=bool)

    def store_transition(self, s_t, a_t, r_t, s_t1, done_t):
        """Write one transition into the current slot and advance the index."""
        idx = self._buffer_idx
        self._s_t_array[idx] = s_t
        self._a_t_array[idx] = a_t
        self._r_t_array[idx] = r_t
        self._s_t1_array[idx] = s_t1
        self._term_t_array[idx] = done_t

        # Advance the write index, wrapping to 0 once capacity is exceeded.
        self._buffer_idx = (self._buffer_idx + 1) % self._buffer_size
        self._exps_stored += 1

    def sample_transitions(self):
        """Return ``sample_size`` distinct stored transitions, chosen uniformly.

        Returns:
            Tuple ``(s_t, a_t, r_t, s_t1, done)`` of NumPy arrays whose first
            dimension is ``sample_size``.

        Raises:
            AssertionError: if fewer than ``sample_size`` transitions have
                been stored so far.
        """
        assert self._exps_stored + 1 > self._sample_size, "Not enough samples have been stored to start sampling"

        # Only slots that were actually written may be sampled; before the
        # buffer is full the remaining slots still hold the initial zeros.
        n_valid = min(self._exps_stored, self._buffer_size)
        sample_idxs = np.random.choice(n_valid, size=self._sample_size, replace=False)

        return (self._s_t_array[sample_idxs],
                self._a_t_array[sample_idxs],
                self._r_t_array[sample_idxs],
                self._s_t1_array[sample_idxs],
                self._term_t_array[sample_idxs])
        

In [13]:
# Environment: CartPole (one instance for training, one for evaluation)
env = gym.make('CartPole-v1')
eval_env = gym.make('CartPole-v1')

# Actions are discrete; the count is wrapped in a 0-d NumPy array
dim_actions = np.array(env.action_space.n)

# States are continuous; take the length of the observation vector
dim_states = env.observation_space.shape[0]

print(dim_states)
print(dim_actions)

4
2


In [15]:
# Initialise the replay buffer: capacity 5, batches of 3 transitions
max_size=5
sample_size=3
memory=ReplayBuffer(dim_states, dim_actions, max_size, sample_size)

In [16]:
# Simulate 6 transitions with random actions -- one more than the buffer
# capacity, so the last one overwrites the first slot.
s_t=env.reset()

for i in range(6):
   
    # Random action among the two CartPole actions
    a_t=np.random.randint(2)
    s_t1, r_t, done_t, _ = env.step(a_t)
    print(s_t1, r_t, done_t)

    # Store the transition in the replay buffer
    memory.store_transition(s_t, a_t, r_t, s_t1, done_t)

    # The next state becomes the current state (no reset on done_t here)
    s_t = s_t1

[-0.04151405  0.16197139 -0.02788865 -0.289783  ] 1.0 False
[-0.03827462 -0.03274201 -0.03368431 -0.00602468] 1.0 False
[-0.03892946 -0.22736509 -0.03380481  0.27584302] 1.0 False
[-0.04347676 -0.03177756 -0.02828795 -0.02730738] 1.0 False
[-0.04411231 -0.22648266 -0.0288341   0.25631788] 1.0 False
[-0.04864197 -0.03096115 -0.02370774 -0.04531853] 1.0 False


In [17]:
# Stored data: the array of next states currently held in the buffer
memory._s_t1_array

array([[-0.04864197, -0.03096115, -0.02370774, -0.04531853],
       [-0.03827462, -0.03274201, -0.03368431, -0.00602468],
       [-0.03892946, -0.22736509, -0.03380481,  0.27584302],
       [-0.04347676, -0.03177756, -0.02828795, -0.02730738],
       [-0.04411231, -0.22648266, -0.0288341 ,  0.25631788]])

**Se observa cómo el sexto elemento de la transición de estados coincide con el primer elemento del conjunto de estados almacenados. Esto se debe a que, al superar la capacidad del buffer (max_size=5), los nuevos registros sobrescriben los datos más antiguos.**

In [18]:
# Only 5 rows are kept, i.e. the buffer's max_size
memory._s_t1_array.shape

(5, 4)

In [19]:
# Draw one random batch: (s_t, a_t, r_t, s_t1, done)
memory.sample_transitions()

(array([[-0.04151405,  0.16197139, -0.02788865, -0.289783  ],
        [-0.03827462, -0.03274201, -0.03368431, -0.00602468],
        [-0.03892946, -0.22736509, -0.03380481,  0.27584302]]),
 array([0., 0., 1.]),
 array([1., 1., 1.]),
 array([[-0.03827462, -0.03274201, -0.03368431, -0.00602468],
        [-0.03892946, -0.22736509, -0.03380481,  0.27584302],
        [-0.04347676, -0.03177756, -0.02828795, -0.02730738]]),
 array([False, False, False]))

In [20]:
# The sampled batch of states holds 3 elements (sample_size)
len(memory.sample_transitions()[0])

3

-----

# Prueba de la dinámica RN y Replay Buffer

In [21]:
# Sample a batch and unpack it into its five component arrays
s_t,a_t,r_t,s_t1,done=memory.sample_transitions()

In [22]:
# States -> float tensor; unsqueeze adds a leading dimension of size 1
s_t=torch.from_numpy(s_t).unsqueeze(dim=0).float()
s_t

tensor([[[-0.0389, -0.2274, -0.0338,  0.2758],
         [-0.0441, -0.2265, -0.0288,  0.2563],
         [-0.0415,  0.1620, -0.0279, -0.2898]]])

In [23]:
# Next states -> float tensor with the same leading dimension of size 1
s_t1=torch.from_numpy(s_t1).unsqueeze(dim=0).float()
s_t1

tensor([[[-0.0435, -0.0318, -0.0283, -0.0273],
         [-0.0486, -0.0310, -0.0237, -0.0453],
         [-0.0383, -0.0327, -0.0337, -0.0060]]])

In [24]:
# Rewards -> float column vector (one row per sampled transition)
r_t = torch.tensor(r_t).view(-1, 1).float()
r_t

tensor([[1.],
        [1.],
        [1.]])

In [25]:
# Terminal flags -> boolean column vector
done = torch.tensor(done).view(-1, 1)
done

tensor([[False],
        [False],
        [False]])

In [26]:
# Actions -> int64 column vector, the index dtype torch.gather requires
a_t=torch.tensor(a_t).view(-1, 1).type(torch.int64)
a_t

tensor([[1],
        [1],
        [0]])

In [27]:
# Predict the Q-values of the current states with the online network
qsa_predict=deep_qnetwork(s_t)
print(qsa_predict)
# [0] drops the leading size-1 dimension; gather then picks, per row, the
# Q-value of the action that was actually taken
qsa_actions=torch.gather(input=qsa_predict[0], dim=1,index = a_t)
print(qsa_actions)

tensor([[[552.4636, 581.1867],
         [542.0415, 570.2692],
         [477.6007, 502.9378]]], grad_fn=<AddBackward0>)
tensor([[581.1867],
        [570.2692],
        [477.6007]], grad_fn=<GatherBackward>)


In [28]:
# One selected Q-value per sampled transition
qsa_actions.shape

torch.Size([3, 1])

In [29]:
# Compute the Q-values of the next states with the target network
# (first step towards the TD target)
next_qsa_predict=target_deepq_network(s_t1)
next_qsa_predict

tensor([[[502.1877, 528.6159],
         [492.4728, 518.4399],
         [513.3165, 540.2734]]], grad_fn=<AddBackward0>)

In [30]:
# Max over actions: the first [0] selects the values from torch.max's
# (values, indices) result, the second [0] drops the leading size-1 dimension
max_next_qsa_predict=torch.max(next_qsa_predict, dim=-1, keepdim=True)[0][0]
max_next_qsa_predict

tensor([[528.6159],
        [518.4399],
        [540.2734]], grad_fn=<SelectBackward>)

In [31]:
# Discount factor used in the TD target below
gamma=0.99

In [32]:
# TD target: reward plus discounted max next Q-value; ~done zeroes the
# bootstrap term for terminal transitions
target_qsa=r_t+~done*gamma*max_next_qsa_predict
target_qsa

tensor([[524.3298],
        [514.2556],
        [535.8707]], grad_fn=<AddBackward0>)

In [33]:
# Must match the shape of qsa_actions for the element-wise MSE loss
target_qsa.shape

torch.Size([3, 1])

In [34]:
# AdamW over the online network's parameters only
optimizer = AdamW(deep_qnetwork.parameters(), lr=0.001)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0.01
)

In [35]:
# One gradient step on the online network: MSE between predicted and target Q-values.
# NOTE(review): target_qsa still carries a grad_fn (see its printed value), so
# backward() also propagates into target_deepq_network -- consider detaching it.
loss = F.mse_loss(qsa_actions, target_qsa)
deep_qnetwork.zero_grad()
loss.backward()
optimizer.step()

---------

# Read métricas 

In [36]:
import pandas as pd

In [38]:
# Load the metrics CSVs for experiments 11, 12 and 13, five files each.
# NOTE(review): presumably the last index is the run/seed number -- confirm.
exp_11_1=pd.read_csv("exp_11_1.csv")
exp_11_2=pd.read_csv("exp_11_2.csv")
exp_11_3=pd.read_csv("exp_11_3.csv")
exp_11_4=pd.read_csv("exp_11_4.csv")
exp_11_5=pd.read_csv("exp_11_5.csv")

exp_12_1=pd.read_csv("exp_12_1.csv")
exp_12_2=pd.read_csv("exp_12_2.csv")
exp_12_3=pd.read_csv("exp_12_3.csv")
exp_12_4=pd.read_csv("exp_12_4.csv")
exp_12_5=pd.read_csv("exp_12_5.csv")

exp_13_1=pd.read_csv("exp_13_1.csv")
exp_13_2=pd.read_csv("exp_13_2.csv")
exp_13_3=pd.read_csv("exp_13_3.csv")
exp_13_4=pd.read_csv("exp_13_4.csv")
exp_13_5=pd.read_csv("exp_13_5.csv")


In [None]:
# Load the metrics CSVs for experiments 21, 22 and 23, five files each.
# NOTE(review): presumably the last index is the run/seed number -- confirm.
exp_21_1=pd.read_csv("exp_21_1.csv")
exp_21_2=pd.read_csv("exp_21_2.csv")
exp_21_3=pd.read_csv("exp_21_3.csv")
exp_21_4=pd.read_csv("exp_21_4.csv")
exp_21_5=pd.read_csv("exp_21_5.csv")

exp_22_1=pd.read_csv("exp_22_1.csv")
exp_22_2=pd.read_csv("exp_22_2.csv")
exp_22_3=pd.read_csv("exp_22_3.csv")
exp_22_4=pd.read_csv("exp_22_4.csv")
exp_22_5=pd.read_csv("exp_22_5.csv")

exp_23_1=pd.read_csv("exp_23_1.csv")
exp_23_2=pd.read_csv("exp_23_2.csv")
exp_23_3=pd.read_csv("exp_23_3.csv")
exp_23_4=pd.read_csv("exp_23_4.csv")
exp_23_5=pd.read_csv("exp_23_5.csv")

In [None]:
# Load the metrics CSVs for experiments 31, 32 and 33, five files each.
# NOTE(review): presumably the last index is the run/seed number -- confirm.
exp_31_1=pd.read_csv("exp_31_1.csv")
exp_31_2=pd.read_csv("exp_31_2.csv")
exp_31_3=pd.read_csv("exp_31_3.csv")
exp_31_4=pd.read_csv("exp_31_4.csv")
exp_31_5=pd.read_csv("exp_31_5.csv")

exp_32_1=pd.read_csv("exp_32_1.csv")
exp_32_2=pd.read_csv("exp_32_2.csv")
exp_32_3=pd.read_csv("exp_32_3.csv")
exp_32_4=pd.read_csv("exp_32_4.csv")
exp_32_5=pd.read_csv("exp_32_5.csv")

exp_33_1=pd.read_csv("exp_33_1.csv")
exp_33_2=pd.read_csv("exp_33_2.csv")
exp_33_3=pd.read_csv("exp_33_3.csv")
exp_33_4=pd.read_csv("exp_33_4.csv")
exp_33_5=pd.read_csv("exp_33_5.csv")