# T3

# policy gradient

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch 
import torch.nn as nn
import numpy as np
import gym
from torch.optim import AdamW

**1. Parametrización de política**

In [3]:
class Policy(nn.Module):
    
    def __init__(self, dim_states, dim_actions, continuous_control):
        super(Policy, self).__init__()
        # MLP, fully connected layers, ReLU activations, linear ouput activation
        # dim_states -> 64 -> 64 -> dim_actions

        self.layers = nn.Sequential(
            nn.Linear(dim_states, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, dim_actions)
        )
        
        if continuous_control:
            # trainable parameter
            self.log_std = nn.Parameter(torch.zeros(1, dim_actions))


    def forward(self, input):

        # tensor format
        if isinstance(input, torch.Tensor):
            input=input
            
        else:
            input = torch.from_numpy(input).unsqueeze(dim=0).float()
            
        value = self.layers(input)
        
        return value


In [4]:
env = gym.make('Pendulum-v1')
dim_states = env.observation_space.shape[0]
continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n
print(dim_states, dim_actions,continuous_control)

3 1 True


In [5]:
env.action_space.sample()

array([1.6022688], dtype=float32)

In [6]:
RN_policy= Policy(dim_states, dim_actions,continuous_control)
RN_policy

Policy(
  (layers): Sequential(
    (0): Linear(in_features=3, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [7]:
RN_policy.log_std

Parameter containing:
tensor([[0.]], requires_grad=True)

In [8]:
s_t=env.reset()
s_t

array([ 0.41107753, -0.9116004 , -0.29097787], dtype=float32)

In [9]:
action=RN_policy(s_t)
action

tensor([[-0.0529]], grad_fn=<AddmmBackward>)

In [10]:
class PolicyGradients:
    
    def __init__(self, dim_states, dim_actions, lr, gamma, 
                 continuous_control=False, reward_to_go=False, use_baseline=False):
        
        self._learning_rate = lr
        self._gamma = gamma
        
        self._dim_states = dim_states
        self._dim_actions = dim_actions

        self._continuous_control = continuous_control
        self._use_reward_to_go = reward_to_go
        self._use_baseline = use_baseline

        self._policy = Policy(self._dim_states, self._dim_actions, self._continuous_control)
        # Adam optimizer
        self._optimizer = AdamW(self._policy.parameters(), lr=self._learning_rate)

        self._select_action = self._select_action_continuous if self._continuous_control else self._select_action_discrete
        self._compute_loss = self._compute_loss_continuous if self._continuous_control else self._compute_loss_discrete


    def select_action(self, observation):
        return self._select_action(observation)
        

    def _select_action_discrete(self, observation):
        # sample from categorical distribution
        RN_policy=self._policy 
        logits=RN_policy(observation)

        # Probabilidad de cada acción
        probs = torch.softmax(logits, dim=-1)

        # Distribución de probabilidad categorica
        dist = torch.distributions.Categorical(probs)

        # Sample de acción
        action = dist.sample()#.item()
     
        return action


    def _select_action_continuous(self, observation):
        # sample from normal distribution
        # use the log std trainable parameter

        # RN
        RN_policy=self._policy

        # Parametro log std de la RN
        log_std=RN_policy.log_std
        std = torch.exp(log_std)

        # Politica dada la observación (Representa el promedio de la distribución normal que muestrea acciones)
        policy=RN_policy(observation)
        
        # Distribución normal de parametros mean y std, esta se utiliza para muestrear acciones de modo de tal de explorar el espacio de acciones
        dist = torch.distributions.Normal(policy, std)

        # sample de acción
        action = dist.sample()
        
        # Asegurarse de que las acciones están dentro del rango [-1, 1]
        #action = torch.tanh(action)
        
        return action
            

    def update(self, observation_batch, action_batch, advantage_batch):
        # update the policy here
        # you should use self._compute_loss 

        pass
    

    def _compute_loss_discrete(self, observation_batch, action_batch, advantage_batch):
        # use negative logprobs * advantages
        pass


    def _compute_loss_continuous(self, observation_batch, action_batch, advantage_batch):
        # use negative logprobs * advantages
        pass

    
    def estimate_returns(self, rollouts_rew):
        estimated_returns = []
        for rollout_rew in rollouts_rew:
                
            if self._use_reward_to_go:
                # only for part 2
                estimated_return = None
            else:
                estimated_return = None
            
            estimated_returns = np.concatenate([estimated_returns, estimated_return])

        if self._use_baseline:
            # only for part 2
            average_return_baseline = None
            # Use the baseline:
            #estimated_returns -= average_return_baseline

        return np.array(estimated_returns, dtype=np.float32)


    # It may be useful to discount the rewards using an auxiliary function [optional]
    def _discount_rewards(self, rewards):
        pass


**2. Muestreo de trayectorias**

In [74]:
import gym
import time
import datetime
import csv

import numpy as np

import matplotlib.pyplot as plt

#from policy_gradients import PolicyGradients


def perform_single_rollout(env, agent, episode_nb, render=False):

    # Modify this function to return a tuple of numpy arrays containing (observations, actions, rewards).
    # (np.array(obs), np.array(acs), np.array(rws))
    # np.array(obs) -> shape: (time_steps, nb_obs)
    # np.array(acs) -> shape: (time_steps, nb_acs) if actions are continuous, (time_steps,) if actions are discrete
    # np.array(rws) -> shape: (time_steps,)

    obs_list = []
    action_list = []
    reward_list = []

    ob_t = env.reset()
    
    done = False
    episode_reward = 0
    nb_steps = 0

    while not done:
        
        if render:
            env.render()
            time.sleep(1. / 60)

        #action = agent.select_action(ob_t)
        
        action = agent.select_action(ob_t)
    
        #print(action)

        try:    
            ob_t1, reward, done, _ = env.step(action)

        except:
            ob_t1, reward, done, _ = env.step(action.item())


        obs_list.append(ob_t1)
        action_list.append(action)
        reward_list.append(reward)

        ob_t = np.squeeze(ob_t1) # <-- may not be needed depending on gym version

        episode_reward += reward
        
        nb_steps += 1

        if done:
            print(f"Largo del episodio {nb_steps}")
            obs_array = np.array(obs_list)
            action_array = np.array(action_list)
            reward_array = np.array(reward_list)

            return obs_array, action_array, reward_array
    #return None

def sample_rollouts(env, agent, training_iter, min_batch_steps):

    sampled_rollouts = []
    total_nb_steps = 0
    episode_nb = 0
    
    while total_nb_steps < min_batch_steps:

        episode_nb += 1
        #render = training_iter%10 == 0 and len(sampled_rollouts) == 0 # Change training_iter%10 to any number you want
        render=False
        # Use perform_single_rollout to get data 
        # Uncomment once perform_single_rollout works.
        # Return sampled_rollouts
       
        sample_rollout = perform_single_rollout(env, agent, episode_nb, render=render)
        total_nb_steps += len(sample_rollout[0])

        sampled_rollouts.append(sample_rollout)
        
    return sampled_rollouts




In [75]:
env = gym.make('Pendulum-v1')

dim_states = env.observation_space.shape[0]

continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

policy_gradients_agent = PolicyGradients(dim_states=dim_states, 
                                             dim_actions=dim_actions, 
                                             lr=0.005,
                                             gamma=0.99,
                                             continuous_control=continuous_control,
                                             reward_to_go=False,
                                             use_baseline=False)



In [76]:
# Rollout
x1=perform_single_rollout(env, policy_gradients_agent, 1000, render=False)
print(x1[0].shape)
print(x1[1].shape)
print(x1[2].shape)

Largo del episodio 200
(200, 3)
(200,)
(200,)


**El número de filas de las observaciones es igual con el largo del episodio, por lo que se concluye el correcto funcionamiento de la función.**

In [77]:
# Sample rollouts
x2=sample_rollouts(env, policy_gradients_agent, 1000, 5000)

Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200


In [78]:
sampled_obs = np.concatenate([x2[i][0] for i in range(len(x2))])
sampled_action = np.concatenate([x2[i][1] for i in range(len(x2))])
sampled_reward = np.concatenate([x2[i][2] for i in range(len(x2))])
print(sampled_obs.shape)
print(sampled_action.shape)
print(sampled_reward.shape)

(5000, 3)
(5000,)
(5000,)


**El largo del registro de sample rollout es al menos el número de sample mini batch, se concluye que la función funciona.**

In [79]:
env = gym.make('CartPole-v1')

dim_states = env.observation_space.shape[0]

continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

policy_gradients_agent = PolicyGradients(dim_states=dim_states, 
                                             dim_actions=dim_actions, 
                                             lr=0.005,
                                             gamma=0.99,
                                             continuous_control=continuous_control,
                                             reward_to_go=False,
                                             use_baseline=False)


In [80]:
# Rollout
x1=perform_single_rollout(env, policy_gradients_agent, 1000, render=False)
print(x1[0].shape)
print(x1[1].shape)
print(x1[2].shape)

Largo del episodio 11
(11, 4)
(11,)
(11,)


**El número de filas de las observaciones es igual con el largo del episodio, por lo que se concluye el correcto funcionamiento de la función.**

In [81]:
# Sample rollouts
x2=sample_rollouts(env, policy_gradients_agent, 1000, 5000)

Largo del episodio 18
Largo del episodio 50
Largo del episodio 16
Largo del episodio 11
Largo del episodio 20
Largo del episodio 19
Largo del episodio 14
Largo del episodio 21
Largo del episodio 14
Largo del episodio 18
Largo del episodio 20
Largo del episodio 13
Largo del episodio 13
Largo del episodio 22
Largo del episodio 14
Largo del episodio 23
Largo del episodio 40
Largo del episodio 27
Largo del episodio 30
Largo del episodio 21
Largo del episodio 14
Largo del episodio 35
Largo del episodio 14
Largo del episodio 13
Largo del episodio 11
Largo del episodio 22
Largo del episodio 17
Largo del episodio 14
Largo del episodio 17
Largo del episodio 12
Largo del episodio 12
Largo del episodio 15
Largo del episodio 19
Largo del episodio 22
Largo del episodio 37
Largo del episodio 21
Largo del episodio 24
Largo del episodio 15
Largo del episodio 14
Largo del episodio 10
Largo del episodio 19
Largo del episodio 37
Largo del episodio 41
Largo del episodio 14
Largo del episodio 36
Largo del 

In [82]:
sampled_obs = np.concatenate([x2[i][0] for i in range(len(x2))])
sampled_action = np.concatenate([x2[i][1] for i in range(len(x2))])
sampled_reward = np.concatenate([x2[i][2] for i in range(len(x2))])
print(sampled_obs.shape)
print(sampled_action.shape)
print(sampled_reward.shape)

(5010, 4)
(5010,)
(5010,)


**El largo del registro de sample rollout es al menos el número de sample mini batch, se concluye que la función funciona.**

**3. Estimación de retornos**

In [83]:
def estimate_returns( rollouts_rew):
        estimated_returns = []
        for rollout_rew in rollouts_rew:

            # Largo del episodio (largo del reward)
            n_steps = len(rollout_rew[2])
            estimated_return = np.zeros(n_steps)

            if _use_reward_to_go:
            
                estimated_return = None
            else:
                estimated_return = np.zeros(n_steps)

                vec_gammas=np.array([_gamma**j for j in range(n_steps)])

                sum_descount=np.sum(vec_gammas*rollout_rew[2])

                for t in range(n_steps):
                    
                    estimated_return[t] = sum_descount
                    
             
            estimated_returns = np.concatenate([estimated_returns, estimated_return])

        if _use_baseline:
            pass
            #average_return_baseline = np.mean(estimated_returns)
            #estimated_returns -= average_return_baseline

        return np.array(estimated_returns, dtype=np.float32)


In [84]:
_gamma=0.99
_use_reward_to_go=False
_use_baseline=False

In [85]:
env = gym.make('CartPole-v1')

dim_states = env.observation_space.shape[0]

continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

policy_gradients_agent = PolicyGradients(dim_states=dim_states, 
                                             dim_actions=dim_actions, 
                                             lr=0.005,
                                             gamma=0.99,
                                             continuous_control=continuous_control,
                                             reward_to_go=False,
                                             use_baseline=False)

# Sample rollouts (2 episodios): Ejecutar hasta que se generen solo 2 episodios!!
x2=sample_rollouts(env, policy_gradients_agent, 1000, 22)
print("")
print("Vector de retorno")
print(estimate_returns(x2))
print("")
retorno=0
for t,reward in enumerate(x2[0][2]):
    retorno=retorno+(_gamma**t)*reward
print("Retorno Ep 1")
print(retorno)
print("")
retorno=0
for t,reward in enumerate(x2[1][2]):
    retorno=retorno+(_gamma**t)*reward
print("Retorno Ep 2")
print(retorno)
print("")

Largo del episodio 20
Largo del episodio 12

Vector de retorno
[18.209307 18.209307 18.209307 18.209307 18.209307 18.209307 18.209307
 18.209307 18.209307 18.209307 18.209307 18.209307 18.209307 18.209307
 18.209307 18.209307 18.209307 18.209307 18.209307 18.209307 11.361513
 11.361513 11.361513 11.361513 11.361513 11.361513 11.361513 11.361513
 11.361513 11.361513 11.361513 11.361513]

Retorno Ep 1
18.20930624027691

Retorno Ep 2
11.361512828387072



In [86]:
env=gym.make('Pendulum-v1')

dim_states = env.observation_space.shape[0]

continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

policy_gradients_agent = PolicyGradients(dim_states=dim_states, 
                                             dim_actions=dim_actions, 
                                             lr=0.005,
                                             gamma=0.99,
                                             continuous_control=continuous_control,
                                             reward_to_go=False,
                                             use_baseline=False)

# Sample rollouts (2 episodios): Ejecutar hasta que se generen solo 2 episodios!!
x2=sample_rollouts(env, policy_gradients_agent, 1000, 220)

index_episodio_1=x2[0][1].shape[0]

print("")
print("Muestra vector de retorno ep 1")
print(estimate_returns(x2)[index_episodio_1-1])
print("")
print("")
print("Muestra vector de retorno ep 2")
print(estimate_returns(x2)[index_episodio_1+1])
print("")

retorno=0
for t,reward in enumerate(x2[0][2]):
    retorno=retorno+(_gamma**t)*reward
print("Retorno Ep 1")
print(retorno)
print("")

retorno=0
for t,reward in enumerate(x2[1][2]):
    retorno=retorno+(_gamma**t)*reward
print("Retorno Ep 2")
print(retorno)
print("")

Largo del episodio 200
Largo del episodio 200

Muestra vector de retorno ep 1
-436.4161


Muestra vector de retorno ep 2
-493.84952

Retorno Ep 1
-436.4161057424834

Retorno Ep 2
-493.8495181936171



**Se observa que que para ambos ambientes y en cada episodio, los retornos coinciden los calculos obtenidos.**

**4. Policy gradients**

In [121]:
#env = gym.make('CartPole-v1')
env=gym.make('Pendulum-v1')
env.reset()
dim_states = env.observation_space.shape[0]

continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

policy_gradients_agent = PolicyGradients(dim_states=dim_states, 
                                             dim_actions=dim_actions, 
                                             lr=0.005,
                                             gamma=0.99,
                                             continuous_control=continuous_control,
                                             reward_to_go=False,
                                             use_baseline=False)

In [122]:
# Sample rollouts (2 episodios): Ejecutar hasta que se generen solo 2 episodios!!
x2=sample_rollouts(env, policy_gradients_agent, 1000, 220)

Largo del episodio 200
Largo del episodio 200


In [123]:
sampled_obs = np.concatenate([x2[i][0] for i in range(len(x2))])
sampled_action = np.concatenate([x2[i][1] for i in range(len(x2))])
sampled_reward = np.concatenate([x2[i][2] for i in range(len(x2))])
print(sampled_obs.shape)
print(sampled_action.shape)
print(sampled_reward.shape)

(400, 3)
(400,)
(400,)


In [117]:

#policy_gradients_agent._select_action(sampled_obs)

In [None]:
# sample from categorical distribution
RN_policy=self._policy 
logits=RN_policy(observation)

# Probabilidad de cada acción
probs = torch.softmax(logits, dim=-1)

# Distribución de probabilidad categorica
dist = torch.distributions.Categorical(probs)

# Sample de acción
action = dist.sample()#.item()

In [None]:

# Sample de acción
action = dist.sample()#.item()

In [135]:
# Probabilidad de cada acción
probs = torch.softmax(logits, dim=-1)

# Distribución de probabilidad categorica
dist = torch.distributions.Categorical(probs)

dist

Categorical(probs: torch.Size([1, 400, 1]))

In [140]:
dist

Categorical(probs: torch.Size([1, 400, 1]))

In [125]:
torch.log(prob)

tensor([[[nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
         [nan],
        

In [134]:
RN_policy= Policy(dim_states, dim_actions,continuous_control)
logits=RN_policy(sampled_obs)
logits

tensor([[[-1.2509e-01],
         [-1.1723e-01],
         [-1.0329e-01],
         [-1.0381e-01],
         [-1.0271e-01],
         [-9.0547e-02],
         [-8.1581e-02],
         [-8.0820e-02],
         [-7.9781e-02],
         [-7.5685e-02],
         [-6.8269e-02],
         [-5.9142e-02],
         [-5.4989e-02],
         [-4.0590e-02],
         [-2.1496e-02],
         [-3.2188e-04],
         [ 1.5735e-02],
         [ 2.9477e-02],
         [ 3.5354e-02],
         [ 2.8382e-02],
         [ 2.1068e-02],
         [ 1.2018e-02],
         [ 5.9673e-04],
         [-1.7896e-02],
         [-3.6093e-02],
         [-4.8893e-02],
         [-5.5155e-02],
         [-7.1200e-02],
         [-7.1679e-02],
         [-7.1871e-02],
         [-7.1771e-02],
         [-7.0678e-02],
         [-7.0732e-02],
         [-7.1825e-02],
         [-6.9350e-02],
         [-6.9915e-02],
         [-6.4453e-02],
         [-5.7539e-02],
         [-5.4893e-02],
         [-4.5302e-02],
         [-4.0996e-02],
         [-2.810

**5. Reducción de varianza**

In [27]:
def estimate_returns( rollouts_rew):
        estimated_returns = []
        for rollout_rew in rollouts_rew:

            # Largo del episodio (largo del reward)
            n_steps = len(rollout_rew[2])
            estimated_return = np.zeros(n_steps)

            if _use_reward_to_go:
                
                vec_gammas=np.array([_gamma**j for j in range(n_steps)])

                for t in range(n_steps):

                    sum_descount=np.sum(vec_gammas[t:]*rollout_rew[2][t:])
                    estimated_return[t] = sum_descount
    
            else:

                estimated_return = np.zeros(n_steps)

                vec_gammas=np.array([_gamma**j for j in range(n_steps)])

                sum_descount=np.sum(vec_gammas*rollout_rew[2])

                for t in range(n_steps):
                    
                    estimated_return[t] = sum_descount
                     
            estimated_returns = np.concatenate([estimated_returns, estimated_return])

        if _use_baseline:
            
            average_return_baseline = np.mean(estimated_returns)
            estimated_returns -= average_return_baseline

        return np.array(estimated_returns, dtype=np.float32)


In [28]:
env = gym.make('CartPole-v1')

dim_states = env.observation_space.shape[0]

continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

policy_gradients_agent = PolicyGradients(dim_states=dim_states, 
                                             dim_actions=dim_actions, 
                                             lr=0.005,
                                             gamma=0.99,
                                             continuous_control=continuous_control,
                                             reward_to_go=False,
                                             use_baseline=False)

# Sample rollouts (2 episodios): Ejecutar hasta que se generen solo 2 episodios!!
x2=sample_rollouts(env, policy_gradients_agent, 1000, 22)

Largo del episodio 55


In [29]:
# Caso base
_gamma=0.99
_use_reward_to_go=False
_use_baseline=False
test=estimate_returns(x2)
test

array([42.464523, 42.464523, 42.464523, 42.464523, 42.464523, 42.464523,
       42.464523, 42.464523, 42.464523, 42.464523, 42.464523, 42.464523,
       42.464523, 42.464523, 42.464523, 42.464523, 42.464523, 42.464523,
       42.464523, 42.464523, 42.464523, 42.464523, 42.464523, 42.464523,
       42.464523, 42.464523, 42.464523, 42.464523, 42.464523, 42.464523,
       42.464523, 42.464523, 42.464523, 42.464523, 42.464523, 42.464523,
       42.464523, 42.464523, 42.464523, 42.464523, 42.464523, 42.464523,
       42.464523, 42.464523, 42.464523, 42.464523, 42.464523, 42.464523,
       42.464523, 42.464523, 42.464523, 42.464523, 42.464523, 42.464523,
       42.464523], dtype=float32)

In [33]:
# Reward to go
_gamma=0.99
_use_reward_to_go=True
_use_baseline=False
test=estimate_returns(x2)
test

array([42.464523 , 41.464523 , 40.474525 , 39.494427 , 38.524124 ,
       37.56353  , 36.61254  , 35.67106  , 34.738995 , 33.81625  ,
       32.902733 , 31.99835  , 31.103012 , 30.216627 , 29.339106 ,
       28.47036  , 27.610302 , 26.758844 , 25.915901 , 25.081387 ,
       24.255219 , 23.437311 , 22.627584 , 21.825953 , 21.03234  ,
       20.24666  , 19.46884  , 18.698797 , 17.936453 , 17.181734 ,
       16.434563 , 15.694862 , 14.962559 , 14.237578 , 13.519848 ,
       12.809295 , 12.105846 , 11.409433 , 10.719984 , 10.03743  ,
        9.361701 ,  8.692729 ,  8.030447 ,  7.374788 ,  6.725685 ,
        6.0830736,  5.446888 ,  4.8170643,  4.193539 ,  3.576249 ,
        2.9651318,  2.3601255,  1.7611697,  1.1682032,  0.5811664],
      dtype=float32)

In [34]:
# Baseline
_gamma=0.99
_use_reward_to_go=False
_use_baseline=True
test=estimate_returns(x2)
test

array([-2.1316282e-14, -2.1316282e-14, -2.1316282e-14, -2.1316282e-14,
       -2.1316282e-14, -2.1316282e-14, -2.1316282e-14, -2.1316282e-14,
       -2.1316282e-14, -2.1316282e-14, -2.1316282e-14, -2.1316282e-14,
       -2.1316282e-14, -2.1316282e-14, -2.1316282e-14, -2.1316282e-14,
       -2.1316282e-14, -2.1316282e-14, -2.1316282e-14, -2.1316282e-14,
       -2.1316282e-14, -2.1316282e-14, -2.1316282e-14, -2.1316282e-14,
       -2.1316282e-14, -2.1316282e-14, -2.1316282e-14, -2.1316282e-14,
       -2.1316282e-14, -2.1316282e-14, -2.1316282e-14, -2.1316282e-14,
       -2.1316282e-14, -2.1316282e-14, -2.1316282e-14, -2.1316282e-14,
       -2.1316282e-14, -2.1316282e-14, -2.1316282e-14, -2.1316282e-14,
       -2.1316282e-14, -2.1316282e-14, -2.1316282e-14, -2.1316282e-14,
       -2.1316282e-14, -2.1316282e-14, -2.1316282e-14, -2.1316282e-14,
       -2.1316282e-14, -2.1316282e-14, -2.1316282e-14, -2.1316282e-14,
       -2.1316282e-14, -2.1316282e-14, -2.1316282e-14], dtype=float32)

In [35]:
# Baseline & reward-to-go
_gamma=0.99
_use_reward_to_go=True
_use_baseline=True
test=estimate_returns(x2)
test

array([ 22.791773  ,  21.791773  ,  20.801773  ,  19.821672  ,
        18.851374  ,  17.890778  ,  16.939787  ,  15.998307  ,
        15.066242  ,  14.143497  ,  13.22998   ,  12.325598  ,
        11.43026   ,  10.543875  ,   9.666354  ,   8.797608  ,
         7.93755   ,   7.086092  ,   6.243149  ,   5.408635  ,
         4.5824666 ,   3.7645595 ,   2.9548316 ,   2.153201  ,
         1.3595868 ,   0.5739087 ,  -0.20391269,  -0.97395587,
        -1.7362986 ,  -2.4910178 ,  -3.23819   ,  -3.9778903 ,
        -4.7101936 ,  -5.435174  ,  -6.1529045 ,  -6.8634577 ,
        -7.5669055 ,  -8.263319  ,  -8.952767  ,  -9.635323  ,
       -10.311051  , -10.980023  , -11.642305  , -12.297964  ,
       -12.947067  , -13.589679  , -14.225864  , -14.855688  ,
       -15.479213  , -16.096504  , -16.70762   , -17.312626  ,
       -17.911583  , -18.504549  , -19.091585  ], dtype=float32)

**6. Evaluación del algoritmo**