<a href="https://colab.research.google.com/github/Lego514/practice-for-class/blob/main/LunarLander_v2_ACER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install system packages with apt-get and Python packages with pip
!apt-get install swig cmake libopenmpi-dev zlib1g-dev xvfb x11-utils ffmpeg -qq #remove -qq for full output
!pip install stable-baselines[mpi] box2d box2d-kengz pyvirtualdisplay pyglet==1.3.1 --quiet #remove --quiet for full output 
# Stable Baselines only supports TensorFlow 1.x for now, so select it explicitly
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [None]:
# Import the dependencies required to run & train our model and record a video
import gym
import imageio
import numpy as np
import base64
import IPython
import PIL.Image
import pyvirtualdisplay

# Video stuff 
from pathlib import Path
from IPython import display as ipythondisplay

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import VecVideoRecorder, SubprocVecEnv, DummyVecEnv
from stable_baselines import ACER
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines.ddpg import AdaptiveParamNoiseSpec
from stable_baselines import results_plotter

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [None]:
# Notebook-wide settings used by the code below
env_id = 'LunarLander-v2'
video_folder = '/videos'
video_length = 500


def _make_env():
  """Create one instance of the environment named by ``env_id``."""
  return gym.make(env_id)


# Initial environment: a single Gym env wrapped in a DummyVecEnv
env = DummyVecEnv([_make_env])
obs = env.reset()

In [None]:
# Show the observation shape and the number of available actions
print('State shape:', env.observation_space.shape, sep='  ')
print('Number of actions:', env.action_space.n, sep='  ')

State shape:  (8,)
Number of actions:  4


In [None]:
# Evaluation Function
def evaluate(model, num_steps=1000):
  """
  Evaluate a RL agent on the notebook-level vectorised ``env``.

  ``env.step`` on the vectorised env returns rewards and done flags as
  arrays with one entry per wrapped environment. Only the first entry is
  read, so every per-episode total stays a plain float and the final
  ``np.mean`` always receives a homogeneous list.

  :param model: (BaseRLModel object) the RL Agent
  :param num_steps: (int) number of timesteps to evaluate it
  :return: (float) Mean reward for the last 100 episodes; the list also
      holds the episode still in progress when the step budget runs out
  """
  episode_rewards = [0.0]
  obs = env.reset()
  for _ in range(num_steps):
    # _states are only useful when using LSTM policies
    action, _states = model.predict(obs)

    obs, rewards, dones, _info = env.step(action)

    # Stats: accumulate a scalar, not a length-1 array
    episode_rewards[-1] += float(rewards[0])
    if dones[0]:
      obs = env.reset()
      episode_rewards.append(0.0)
  # Compute mean reward for the last 100 episodes
  mean_100ep_reward = round(float(np.mean(episode_rewards[-100:])), 1)
  print("Mean reward:", mean_100ep_reward, "Num episodes:", len(episode_rewards))

  return mean_100ep_reward

In [None]:
# Make video
# A fake display is required; without it rendering will fail on Colab
import os

virtual_display = ':1'
# Start Xvfb in the background, then point DISPLAY at it
os.system("Xvfb {} -screen 0 1024x768x24 &".format(virtual_display))
os.environ['DISPLAY'] = virtual_display

In [None]:
# Record video
def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
  """
  Run ``model`` in a freshly created environment and record the rollout.

  :param env_id: (str) Gym id of the environment to create and record
  :param model: (RL model) agent whose ``predict`` picks the actions
  :param video_length: (int) number of steps to record
  :param prefix: (str) name prefix passed to the video recorder
  :param video_folder: (str) folder passed to the video recorder
  """
  # Use a dedicated environment built from env_id, so that closing the
  # recorder below never closes the notebook-level training environment.
  eval_env = DummyVecEnv([lambda: gym.make(env_id)])
  # Start the video at step=0 and record video_length steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  try:
    obs = eval_env.reset()
    for _ in range(video_length):
      action, _ = model.predict(obs)
      obs, _, _, _ = eval_env.step(action)
  finally:
    # Close the video recorder even if the rollout raises
    eval_env.close()

In [None]:
# Display video
def show_videos(video_path='', prefix=''):
  """
  Embed every ``<prefix>*.mp4`` file found in ``video_path`` in the cell output.

  :param video_path: (str) directory that is searched for mp4 files
  :param prefix: (str) only files whose name starts with this are shown
  """
  # Same markup as one multi-line literal, spelled out line by line
  video_tag = (
      '<video alt="{}" autoplay \n'
      '                    loop controls style="height: 400px;">\n'
      '                    <source src="data:video/mp4;base64,{}" type="video/mp4" />\n'
      '                </video>'
  )
  pattern = "{}*.mp4".format(prefix)
  snippets = [
      video_tag.format(clip, base64.b64encode(clip.read_bytes()).decode('ascii'))
      for clip in Path(video_path).glob(pattern)
  ]
  ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(snippets)))

In [None]:
# Create the ACER agent with an MLP policy; no training has happened yet
model = ACER(policy=MlpPolicy, env=env, verbose=1)





Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.




Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor




In [None]:
# Video #0: record and display the agent before any training
baseline_prefix = 'ramdom'
record_video('LunarLander-v2', model, video_length=1000, prefix=baseline_prefix)
show_videos('videos', prefix=baseline_prefix)
# Reference score to compare against after training
mean_reward_before_train = evaluate(model, num_steps=1000)

Saving video to  /content/videos/ramdom-step-0-to-step-1000.mp4


Mean reward: -162.5 Num episodes: 12


In [None]:
# Define the model
# The defaults work as-is; add and tweak hyperparameters here, measure the output and iterate
model = ACER(policy='MlpPolicy', env=env, verbose=1)

In [None]:
# Train the model for a fixed step budget, then save it under a name carrying that budget
timesteps = 50000
model.learn(total_timesteps=timesteps)
model.save("ACER-LunarLander-v2-{}".format(timesteps))

-----------------------------------
| avg_norm_adj        | 18.4      |
| avg_norm_g          | 74.8      |
| avg_norm_grads_f    | 64.8      |
| avg_norm_k          | 2         |
| avg_norm_k_dot_g    | 74.8      |
| entropy             | 29.1      |
| explained_variance  | -6.69e-05 |
| fps                 | 0         |
| loss                | 83        |
| loss_bc             | -0        |
| loss_f              | -25.9     |
| loss_policy         | -25.9     |
| loss_q              | 218       |
| mean_episode_length | 0         |
| mean_episode_reward | 0         |
| norm_grads          | 23.1      |
| norm_grads_policy   | 16.5      |
| norm_grads_q        | 16.1      |
| total_timesteps     | 0         |
-----------------------------------
----------------------------------
| avg_norm_adj        | 12.4     |
| avg_norm_g          | 50.5     |
| avg_norm_grads_f    | 43.7     |
| avg_norm_k          | 2        |
| avg_norm_k_dot_g    | 50.9     |
| entropy             | 29.1     |

In [None]:
# Record & show a video of the agent after the 50000-step run
run_name = 'LunarLander-v2-50000'
record_video('LunarLander-v2', model, video_length=1500, prefix=run_name)
show_videos('videos', prefix=run_name)
# Score the agent after training
mean_reward_after_train = evaluate(model, num_steps=1500)

Saving video to  /content/videos/LunarLander-v2-50000-step-0-to-step-1500.mp4


Mean reward: -28.1 Num episodes: 3


In [None]:
# Call learn again on the same model, then save it under a name carrying this budget
timesteps = 200000
model.learn(total_timesteps=timesteps)
model.save("ACER-LunarLander-v2-{}".format(timesteps))

----------------------------------
| avg_norm_adj        | 0.702    |
| avg_norm_g          | 4.86     |
| avg_norm_grads_f    | 4.42     |
| avg_norm_k          | 1.89     |
| avg_norm_k_dot_g    | 4.91     |
| entropy             | 4.55     |
| explained_variance  | -0.103   |
| fps                 | 0        |
| loss                | 2.26     |
| loss_bc             | -0       |
| loss_f              | -0.779   |
| loss_policy         | -0.779   |
| loss_q              | 6.16     |
| mean_episode_length | 0        |
| mean_episode_reward | 0        |
| norm_grads          | 9.32     |
| norm_grads_policy   | 8.74     |
| norm_grads_q        | 3.24     |
| total_timesteps     | 0        |
----------------------------------
----------------------------------
| avg_norm_adj        | 2.81     |
| avg_norm_g          | 24.2     |
| avg_norm_grads_f    | 21.9     |
| avg_norm_k          | 3.26     |
| avg_norm_k_dot_g    | 32.5     |
| entropy             | 7.99     |
| explained_variance

In [None]:
# Record & show video 
record_video('LunarLander-v2', model, video_length=1500, prefix='LunarLander-v2-200000')
show_videos('videos', prefix='LunarLander-v2-200000')
# Random Agent, after training
mean_reward_after_train = evaluate(model, num_steps=10000)

Saving video to  /content/videos/LunarLander-v2-200000-step-0-to-step-1500.mp4


Mean reward: 181.2 Num episodes: 24
