# T3

# policy gradient

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch 
import torch.nn as nn
import numpy as np
import gym
from torch.optim import AdamW

**1. Parametrización de política**

In [4]:
class Policy(nn.Module):
    
    def __init__(self, dim_states, dim_actions, continuous_control):
        super(Policy, self).__init__()
        # MLP, fully connected layers, ReLU activations, linear ouput activation
        # dim_states -> 64 -> 64 -> dim_actions

        self.layers = nn.Sequential(
            nn.Linear(dim_states, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, dim_actions)
        )
        
        if continuous_control:
            # trainable parameter
            self.log_std = nn.Parameter(torch.zeros(1, dim_actions))


    def forward(self, input):

        # tensor format
        if isinstance(input, torch.Tensor):
            input=input
            
        else:
            input = torch.from_numpy(input).unsqueeze(dim=0).float()
            
        value = self.layers(input)
        
        return value


In [5]:
env = gym.make('Pendulum-v1')
dim_states = env.observation_space.shape[0]
continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n
print(dim_states, dim_actions,continuous_control)

3 1 True


In [6]:
env.action_space.sample()

array([-0.18902148], dtype=float32)

In [7]:
RN_policy= Policy(dim_states, dim_actions,continuous_control)
RN_policy

Policy(
  (layers): Sequential(
    (0): Linear(in_features=3, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [8]:
RN_policy.log_std

Parameter containing:
tensor([[0.]], requires_grad=True)

In [9]:
s_t=env.reset()
s_t

array([-0.94356245,  0.33119458, -0.5466567 ], dtype=float32)

In [10]:
action=RN_policy(s_t)
action

tensor([[-0.0746]], grad_fn=<AddmmBackward>)

In [12]:
class PolicyGradients:
    
    def __init__(self, dim_states, dim_actions, lr, gamma, 
                 continuous_control=False, reward_to_go=False, use_baseline=False):
        
        self._learning_rate = lr
        self._gamma = gamma
        
        self._dim_states = dim_states
        self._dim_actions = dim_actions

        self._continuous_control = continuous_control
        self._use_reward_to_go = reward_to_go
        self._use_baseline = use_baseline

        self._policy = Policy(self._dim_states, self._dim_actions, self._continuous_control)
        # Adam optimizer
        self._optimizer = AdamW(self._policy.parameters(), lr=self._learning_rate)

        self._select_action = self._select_action_continuous if self._continuous_control else self._select_action_discrete
        self._compute_loss = self._compute_loss_continuous if self._continuous_control else self._compute_loss_discrete


    def select_action(self, observation):
        return self._select_action(observation)
        

    def _select_action_discrete(self, observation):
        # sample from categorical distribution
        RN_policy=self._policy 
        logits=RN_policy(observation)

        # Probabilidad de cada acción
        probs = torch.softmax(logits, dim=-1)

        # Distribución de probabilidad categorica
        dist = torch.distributions.Categorical(probs)

        # Sample de acción
        action = dist.sample().item()
     
        return action


    def _select_action_continuous(self, observation):
        # sample from normal distribution
        # use the log std trainable parameter

        # RN
        RN_policy=self._policy

        # Parametro log std de la RN
        log_std=RN_policy.log_std
        std = torch.exp(log_std)

        # Politica dada la observación (Representa el promedio de la distribución normal que muestrea acciones)
        policy=RN_policy(observation)
        
        # Distribución normal de parametros mean y std, esta se utiliza para muestrear acciones de modo de tal de explorar el espacio de acciones
        dist = torch.distributions.Normal(policy, std)

        # sample de acción
        action = dist.sample()
        
        # Asegurarse de que las acciones están dentro del rango [-1, 1]
        action = torch.tanh(action)
        
        return action
            

    def update(self, observation_batch, action_batch, advantage_batch):
        # update the policy here
        # you should use self._compute_loss 

        pass
    

    def _compute_loss_discrete(self, observation_batch, action_batch, advantage_batch):
        # use negative logprobs * advantages
        pass


    def _compute_loss_continuous(self, observation_batch, action_batch, advantage_batch):
        # use negative logprobs * advantages
        pass

    
    def estimate_returns(self, rollouts_rew):
        estimated_returns = []
        for rollout_rew in rollouts_rew:
                
            if self._use_reward_to_go:
                # only for part 2
                estimated_return = None
            else:
                estimated_return = None
            
            estimated_returns = np.concatenate([estimated_returns, estimated_return])

        if self._use_baseline:
            # only for part 2
            average_return_baseline = None
            # Use the baseline:
            #estimated_returns -= average_return_baseline

        return np.array(estimated_returns, dtype=np.float32)


    # It may be useful to discount the rewards using an auxiliary function [optional]
    def _discount_rewards(self, rewards):
        pass


**2. Muestreo de trayectorias**

In [13]:
import gym
import time
import datetime
import csv

import numpy as np

import matplotlib.pyplot as plt

#from policy_gradients import PolicyGradients


def perform_single_rollout(env, agent, episode_nb, render=False):

    # Modify this function to return a tuple of numpy arrays containing (observations, actions, rewards).
    # (np.array(obs), np.array(acs), np.array(rws))
    # np.array(obs) -> shape: (time_steps, nb_obs)
    # np.array(acs) -> shape: (time_steps, nb_acs) if actions are continuous, (time_steps,) if actions are discrete
    # np.array(rws) -> shape: (time_steps,)

    obs_list = []
    action_list = []
    reward_list = []

    ob_t = env.reset()
    
    done = False
    episode_reward = 0
    nb_steps = 0

    while not done:
        
        if render:
            env.render()
            time.sleep(1. / 60)

        action = agent.select_action(ob_t)
        
        ob_t1, reward, done, _ = env.step(action)

        obs_list.append(ob_t1)
        action_list.append(action)
        reward_list.append(reward)

        ob_t = np.squeeze(ob_t1) # <-- may not be needed depending on gym version

        episode_reward += reward
        
        nb_steps += 1

        if done:
            print(f"Largo del episodio {nb_steps}")
            obs_array = np.array(obs_list)
            action_array = np.array(action_list)
            reward_array = np.array(reward_list)

            return obs_array, action_array, reward_array
    #return None

def sample_rollouts(env, agent, training_iter, min_batch_steps):

    sampled_rollouts = []
    total_nb_steps = 0
    episode_nb = 0
    
    while total_nb_steps < min_batch_steps:

        episode_nb += 1
        #render = training_iter%10 == 0 and len(sampled_rollouts) == 0 # Change training_iter%10 to any number you want
        render=False
        # Use perform_single_rollout to get data 
        # Uncomment once perform_single_rollout works.
        # Return sampled_rollouts
       
        sample_rollout = perform_single_rollout(env, agent, episode_nb, render=render)
        total_nb_steps += len(sample_rollout[0])

        sampled_rollouts.append(sample_rollout)
        
    return sampled_rollouts




In [14]:
env = gym.make('Pendulum-v1')

dim_states = env.observation_space.shape[0]

continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

policy_gradients_agent = PolicyGradients(dim_states=dim_states, 
                                             dim_actions=dim_actions, 
                                             lr=0.005,
                                             gamma=0.99,
                                             continuous_control=continuous_control,
                                             reward_to_go=False,
                                             use_baseline=False)



In [15]:
# Rollout
x1=perform_single_rollout(env, policy_gradients_agent, 1000, render=False)
print(x1[0].shape)
print(x1[1].shape)
print(x1[2].shape)

Largo del episodio 200
(200, 3)
(200,)
(200,)


**El número de filas de las observaciones es igual con el largo del episodio, por lo que se concluye el correcto funcionamiento de la función.**

In [16]:
# Sample rollouts
x2=sample_rollouts(env, policy_gradients_agent, 1000, 5000)

Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200
Largo del episodio 200


In [17]:
sampled_obs = np.concatenate([x2[i][0] for i in range(len(x2))])
sampled_action = np.concatenate([x2[i][1] for i in range(len(x2))])
sampled_reward = np.concatenate([x2[i][2] for i in range(len(x2))])
print(sampled_obs.shape)
print(sampled_action.shape)
print(sampled_reward.shape)

(5000, 3)
(5000,)
(5000,)


**El largo del registro de sample rollout es al menos el número de sample mini batch, se concluye que la función funciona.**

In [18]:
env = gym.make('CartPole-v1')

dim_states = env.observation_space.shape[0]

continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

policy_gradients_agent = PolicyGradients(dim_states=dim_states, 
                                             dim_actions=dim_actions, 
                                             lr=0.005,
                                             gamma=0.99,
                                             continuous_control=continuous_control,
                                             reward_to_go=False,
                                             use_baseline=False)


In [19]:
# Rollout
x1=perform_single_rollout(env, policy_gradients_agent, 1000, render=False)
print(x1[0].shape)
print(x1[1].shape)
print(x1[2].shape)

Largo del episodio 20
(20, 4)
(20,)
(20,)


**El número de filas de las observaciones es igual con el largo del episodio, por lo que se concluye el correcto funcionamiento de la función.**

In [20]:
# Sample rollouts
x2=sample_rollouts(env, policy_gradients_agent, 1000, 5000)

Largo del episodio 12
Largo del episodio 15
Largo del episodio 20
Largo del episodio 44
Largo del episodio 24
Largo del episodio 12
Largo del episodio 45
Largo del episodio 14
Largo del episodio 16
Largo del episodio 22
Largo del episodio 26
Largo del episodio 14
Largo del episodio 25
Largo del episodio 15
Largo del episodio 18
Largo del episodio 12
Largo del episodio 9
Largo del episodio 40
Largo del episodio 24
Largo del episodio 25
Largo del episodio 14
Largo del episodio 28
Largo del episodio 24
Largo del episodio 13
Largo del episodio 39
Largo del episodio 13
Largo del episodio 9
Largo del episodio 13
Largo del episodio 12
Largo del episodio 13
Largo del episodio 9
Largo del episodio 16
Largo del episodio 18
Largo del episodio 9
Largo del episodio 20
Largo del episodio 32
Largo del episodio 12
Largo del episodio 13
Largo del episodio 10
Largo del episodio 30
Largo del episodio 21
Largo del episodio 11
Largo del episodio 31
Largo del episodio 14
Largo del episodio 21
Largo del epis

In [21]:
sampled_obs = np.concatenate([x2[i][0] for i in range(len(x2))])
sampled_action = np.concatenate([x2[i][1] for i in range(len(x2))])
sampled_reward = np.concatenate([x2[i][2] for i in range(len(x2))])
print(sampled_obs.shape)
print(sampled_action.shape)
print(sampled_reward.shape)

(5004, 4)
(5004,)
(5004,)


**El largo del registro de sample rollout es al menos el número de sample mini batch, se concluye que la función funciona.**

**3. Estimación de retornos**

In [22]:
_gamma=0.99
_use_reward_to_go=False
_use_baseline=False

In [180]:
y=np.array([i for i in range(10)])*np.array([_gamma**t for t in range(10)])

In [182]:
t=3
y[t:].shape

(7,)

In [185]:
len(y[t:])

7

In [187]:
range(t,len(y))

range(3, 10)

In [188]:
np.array([_gamma**t for t in range(t,len(y))])

array([0.970299  , 0.96059601, 0.95099005, 0.94148015, 0.93206535,
       0.92274469, 0.91351725])

In [193]:
def estimate_returns( rollouts_rew):
        
        estimated_returns = []

        for rollout_rew in rollouts_rew:
            
            # Largo del episodio (largo del reward)
            n_steps = len(rollout_rew[2])
            
            if _use_reward_to_go:
                
                estimated_return = np.zeros(n_steps)

                # Se itera sobre cada interacción agente-ambiente
                for t in range(n_steps):

                    # rollout_rew[2] es el conjunto de reward asociado al episodio
                    rewards_from_t = rollout_rew[2][t:]

                    # Vector de gammas elevado a step
                    vec_gammas=np.array([_gamma**p for p in range(t,len(rollout_rew[2]))])
                  
                    estimated_return[t] = np.sum(rewards_from_t*vec_gammas)
                    
            else:
                # Método de recompensa-acumulada o acumulative-reward
                estimated_return = np.zeros(n_steps)
                
                discounted_reward_sum = 0
                for t in reversed(range(n_steps)):
                    discounted_reward_sum = _gamma * discounted_reward_sum + rollout_rew[2][t]
                    estimated_return[t] = discounted_reward_sum
            
            estimated_returns = np.concatenate([estimated_returns, estimated_return])

        if _use_baseline:
            
            average_return_baseline = np.mean(estimated_returns)
            estimated_returns -= average_return_baseline

        return np.array(estimated_returns, dtype=np.float32)

**validación**

env = gym.make('Pendulum-v1')

dim_states = env.observation_space.shape[0]

continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

policy_gradients_agent = PolicyGradients(dim_states=dim_states, 
                                             dim_actions=dim_actions, 
                                             lr=0.005,
                                             gamma=0.99,
                                             continuous_control=continuous_control,
                                             reward_to_go=False,
                                             use_baseline=False)


**CartPole**

In [233]:
env = gym.make('CartPole-v1')

dim_states = env.observation_space.shape[0]

continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

policy_gradients_agent = PolicyGradients(dim_states=dim_states, 
                                             dim_actions=dim_actions, 
                                             lr=0.005,
                                             gamma=0.99,
                                             continuous_control=continuous_control,
                                             reward_to_go=False,
                                             use_baseline=False)

In [234]:
# Sample rollouts (2 episodios): Ejecutar hasta que se generen solo 2 episodios
x2=sample_rollouts(env, policy_gradients_agent, 1000, 21)

Largo del episodio 15
Largo del episodio 17


In [283]:
index_episodio_1=x2[0][1].shape[0]
index_episodio_1

15

In [284]:
index_episodio_2=x2[1][1].shape[0]
index_episodio_2

17

In [285]:
_gamma=0.99
_use_reward_to_go=False
_use_baseline=False

In [286]:
test=estimate_returns(x2)
test

array([13.994164 , 13.125419 , 12.247898 , 11.361513 , 10.466174 ,
        9.561792 ,  8.648275 ,  7.7255306,  6.793465 ,  5.851985 ,
        4.900995 ,  3.940399 ,  2.9701   ,  1.99     ,  1.       ,
       15.705681 , 14.854223 , 13.994164 , 13.125419 , 12.247898 ,
       11.361513 , 10.466174 ,  9.561792 ,  8.648275 ,  7.7255306,
        6.793465 ,  5.851985 ,  4.900995 ,  3.940399 ,  2.9701   ,
        1.99     ,  1.       ], dtype=float32)

**La primera trayectoria tiene 19 steps, por lo que dentro del array test, el rango [0:18] corresponde al retorno esperado desde ese step. Para el episodio 2, corresponde desde el elemento 19 en adelante ([19:]).**

**Episodio 1**

In [287]:
test[0:index_episodio_1]

array([13.994164 , 13.125419 , 12.247898 , 11.361513 , 10.466174 ,
        9.561792 ,  8.648275 ,  7.7255306,  6.793465 ,  5.851985 ,
        4.900995 ,  3.940399 ,  2.9701   ,  1.99     ,  1.       ],
      dtype=float32)

In [288]:
retorno=0
for t,reward in enumerate(x2[0][2]):
    retorno=retorno+(_gamma**t)*reward
    print(retorno)

1.0
1.99
2.9701
3.940399
4.90099501
5.8519850599
6.793465209301
7.72553055720799
8.64827525163591
9.561792499119552
10.466174574128356
11.361512828387072
12.247897700103202
13.12541872310217
13.994164535871148


In [289]:
retorno=np.sum(test[0:index_episodio_1])
retorno

114.577705

In [290]:
baseline=np.mean(test[0:index_episodio_1])
baseline

7.6385136

**Episodio 2**

In [291]:
test[index_episodio_1:]

array([15.705681 , 14.854223 , 13.994164 , 13.125419 , 12.247898 ,
       11.361513 , 10.466174 ,  9.561792 ,  8.648275 ,  7.7255306,
        6.793465 ,  5.851985 ,  4.900995 ,  3.940399 ,  2.9701   ,
        1.99     ,  1.       ], dtype=float32)

In [292]:
retorno=0
for t,reward in enumerate(x2[1][2]):
    retorno=retorno+(_gamma**t)*reward
    print(retorno)

1.0
1.99
2.9701
3.940399
4.90099501
5.8519850599
6.793465209301
7.72553055720799
8.64827525163591
9.561792499119552
10.466174574128356
11.361512828387072
12.247897700103202
13.12541872310217
13.994164535871148
14.854222890512437
15.705680661607312


In [293]:
retorno=np.sum(test[index_episodio_1:])
retorno

145.13762

In [294]:
baseline=np.mean(test[index_episodio_1:])
baseline

8.537507

**4. Policy gradients**

**5. Reducción de varianza**

**6. Evaluación del algoritmo**