[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/MaxMitre/DeepLearning/blob/main/Semana09_PolicyGradient.ipynb)

# Dependencias


In [None]:
%%capture
!pip install pyglet==1.3.2
!pip install box2d-py
!pip install gym pyvirtualdisplay
!apt-get install -y python-opengl ffmpeg
!pip install tensorflow==2.3.*
!pip install --upgrade tensorflow-probability

In [None]:
!sudo apt-get install xvfb
!pip install xvfbwrapper

In [None]:
import gym
from gym.wrappers.record_video import RecordVideo
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import random
import glob
import io
import time
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900));
display.start();

# Auxiliar para ver el desempeño del agente

In [None]:
def show_video():
  """Display the most recent recording found in the ``video/`` directory.

  Searches ``video/*.mp4`` relative to the current working directory, picks
  the file with the latest modification time and embeds it in the notebook
  output as a base64-encoded HTML5 ``<video>`` element. When no recording is
  found, a message is printed instead and nothing is displayed.
  """
  import os  # Local import: only needed here to read file modification times.

  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("No se encontró el video")
    return

  # glob returns matches in arbitrary order, so taking the first entry could
  # show a stale recording; select the newest file explicitly.
  latest_mp4 = max(mp4list, key=os.path.getmtime)
  # Read-only binary mode inside a context manager so the handle is closed.
  with open(latest_mp4, 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

def wrap_env(env):
  """Return `env` wrapped in a `RecordVideo` that writes to './video'."""
  return RecordVideo(env, './video')

In [None]:
# Report how many GPUs TensorFlow can see. Uses the stable
# `tf.config.list_physical_devices` API rather than its `experimental` alias.
gpu_list = tf.config.list_physical_devices('GPU')
print('Número de GPUs disponibles: {}'.format(len(gpu_list)))

# Iniciar el ambiente y construir el agente Policy Gradient

In [None]:
# Create the Acrobot environment wrapped for video recording, then read from
# its observation and action spaces the sizes used to build the policy network.
env = wrap_env(gym.make('Acrobot-v1'))
num_features = env.observation_space.shape[0]  # First dimension of the observation shape.
num_actions = env.action_space.n  # Size of the action space; used as the network's output width.
print('Número de características del estado: {}'.format(num_features))
print('Número de acciones posibles: {}'.format(num_actions))

$$\pi : A \rightarrow \mathbb{R}$$

Por definición tenemos que:

$$J(\pi_{\theta}) = \underset{\tau \sim \pi_{\theta}}{E}  [{R(\tau)}] = \underset{\tau \sim \pi_{\theta}}{E} [\sum_{t=0}^{T} r(s_t, a_t)] = \int \pi_\theta(\tau) R(\tau) d\tau $$

Lo que se quiere lograr es que se actualice el parámetro $\theta$ de la función de tal modo que:

$$\theta_{k+1} = \theta_k + \alpha \left. \nabla_{\theta} J(\pi_{\theta}) \right|_{\theta_k}$$

Recordemos un pequeño truco para calcular el gradiente de una función:

$$\nabla_{\theta} \pi_{\theta} (\tau) =  \pi_{\theta} (\tau) \dfrac{\nabla_{\theta} \pi_{\theta} (\tau)}{\pi_{\theta} (\tau)} =  \pi_{\theta} (\tau) \nabla_{\theta} \log \pi_{\theta} (\tau)$$

Con todo lo anterior, podemos obtener:

\begin{align*}
\nabla_{\theta} J(\pi_{\theta}) &= \nabla_{\theta} \underset{\tau \sim \pi_{\theta}}{E} \left[ R(\tau) \right] & \\
&= \nabla_{\theta} \int_{\tau} P(\tau|\theta) R(\tau) \, d\tau & \text{Expand expectation} \\
&= \int_{\tau} \nabla_{\theta} P(\tau|\theta) R(\tau) \, d\tau & \text{Bring gradient under integral} \\
&= \int_{\tau} P(\tau|\theta) \nabla_{\theta} \log P(\tau|\theta) R(\tau) \, d\tau & \text{Log-derivative trick} \\
&= \underset{\tau \sim \pi_{\theta}}{E} \left[ \nabla_{\theta} \log P(\tau|\theta) R(\tau) \right] & \text{Return to expectation form} \\
\therefore \nabla_{\theta} J(\pi_{\theta}) &= \underset{\tau \sim \pi_{\theta}}{E} \left[ \sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(\tau) \right] & \text{Expression for grad-log-prob}
\end{align*}

In [None]:
# Policy network used by the policy-gradient agent.
class Network(tf.keras.Model):
  """Small MLP policy: one 32-unit ReLU layer followed by an action-logit layer.

  The output layer width is taken from the notebook-level `num_actions`, so
  that variable must be defined before the class is instantiated.
  """

  def __init__(self):
    super().__init__()
    self.dense1 = tf.keras.layers.Dense(32, activation='relu')
    self.out = tf.keras.layers.Dense(num_actions)
    # Distribution class (not an instance); it is built from logits in `call`.
    self.dist = tfp.distributions.Categorical

  def call(self, x):
    """Run the policy on `x` and sample an action from it.

    Returns:
      A tuple `(logits, action, probs, log_probs)`: the raw output-layer
      values, a sample drawn from the categorical distribution defined by
      those logits, and the softmax / log-softmax of the logits over the
      last axis.
    """
    hidden = self.dense1(x)
    logits = self.out(hidden)
    policy = self.dist(logits=logits)
    action = policy.sample()
    return (logits,
            action,
            tf.nn.softmax(logits, axis=-1),
            tf.nn.log_softmax(logits, axis=-1))

# Single policy network instance and the Adam optimizer (learning rate 0.02)
# that the training step uses to update its weights.
net = Network()
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-2)

# Función para hacer un paso de entrenamiento

In [None]:
@tf.function
def train_step(batch_states, batch_actions, batch_returns):
  """Apply one policy-gradient update to `net` and return the loss.

  The loss is the negated mean, over the batch, of each return multiplied by
  the log-probability that the network assigns to the action that was taken.

  Args:
    batch_states: States fed to `net` (presumably one row per collected step).
    batch_actions: Integer indices of the actions taken; one-hot encoded with
      depth `num_actions`.
    batch_returns: Weight applied to each step's log-probability.

  Returns:
    The scalar loss computed before the gradient step.
  """
  with tf.GradientTape() as tape:
    _, _, _, log_probs = net(batch_states)
    # Zero out every log-prob except the one of the action actually taken.
    selected = tf.one_hot(batch_actions, num_actions) * log_probs
    taken_log_probs = tf.reduce_sum(selected, axis=-1)
    loss = -tf.reduce_mean(batch_returns * taken_log_probs)
  variables = net.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return loss

# Ver el algoritmo y ver su aprendizaje

In [None]:
# Training hyperparameters (the trailing `@param` markers are Colab form fields).
# Number of episodes played by the training loop.
num_episodes = 100 # @param {type:"integer"}
# The recorded environment is closed and re-created every this many episodes.
viz_update_freq = 50 # @param {type: "integer"}
# Minimum number of collected steps before a training step is run.
steps_per_train_step = 5000 # @param {type: "integer"}

In [None]:
%%time

last_100_ep_ret, text = [], ''
batch_states, batch_actions, batch_returns = [], [], []
for episode in range(num_episodes):
  if episode % viz_update_freq == 0: # Needed for updating the visualization.
    env.close()
    env = wrap_env(gym.make('Acrobot-v1'))

  # Start a new episode and reset the environment.
  state = env.reset()
  done, ep_rew = False, []
  while not done:
    state_in = np.expand_dims(state, 0)
    # Sample action from policy and take that action in the env.
    logits, action, probs, log_probs = net(state_in)
    next_state, reward, done, info = env.step(action[0].numpy())
    batch_states.append(state)
    batch_actions.append(action[0])
    ep_rew.append(reward)
    state = next_state

  # Create episode returns for policy gradient step.
  episode_ret = sum(ep_rew)
  episode_len = len(ep_rew)
  batch_returns += [episode_ret] * episode_len

  # Keep collecting experience with the current policy.
  if len(batch_states) >= steps_per_train_step:
    # Now that we have enough experience for this policy, train it on-policy.
    loss = train_step(np.array(batch_states), np.array(batch_actions),
                      np.array(batch_returns, dtype=np.float32))
    # Print the performance of the policy.
    ipythondisplay.clear_output()
    text += f"Episode: {episode}, Loss: {loss:.2f}, "\
            f"Return: {np.mean(batch_returns):.2f}\n"
    print(text)
    print('Current agent performance:')
    show_video()
    batch_states, batch_actions, batch_returns = [], [], []

# Ver desempeño después de entrenarlo

### Correr múltiples veces para jugar el juego una y otra vez.

In [None]:
# Play one recorded episode with the trained policy, sampling actions from it,
# then print the accumulated reward and show the video.
env = wrap_env(gym.make('Acrobot-v1'))
state = env.reset()
ret = 0
done = False
while not done:
  env.render()
  # The network expects a leading batch dimension.
  state = tf.expand_dims(state, axis=0)
  logits, action, probs, log_probs = net(state)
  state, reward, done, info = env.step(action[0].numpy())
  ret += reward
env.close()
print('Return on this episode: {}'.format(ret))
show_video()

# Referencias

- [Medium:Introducción al aprendizaje por refuerzo](https://markelsanz14.medium.com/introducci%C3%B3n-al-aprendizaje-por-refuerzo-parte-5-pol%C3%ADticas-de-gradiente-8e92725e9c8f)

- [Intro a policy Optimization](https://spinningup.openai.com/en/latest/spinningup/rl_intro3.html)

# Convergencia en iterar políticas vs cambiar con función de valor: Teoremas