# RLLAB setup scripts for Google Colab
Install packages with compatible versions

In [0]:
# System packages for headless rendering: xvfb and python-opengl.
# All output of this command, including errors, is discarded.
!apt-get -qq install -y xvfb python-opengl > /dev/null 2>&1
# Make the unversioned libnvrtc-builtins.so name point at the 8.0 library
# (-s symbolic link, -n do not dereference an existing link, -f replace it).
# NOTE(review): presumably required by the GPU toolchain of the Colab image this was written for -- confirm.
!ln -snf /usr/lib/x86_64-linux-gnu/libnvrtc-builtins.so.8.0 /usr/lib/x86_64-linux-gnu/libnvrtc-builtins.so
# freeglut3-dev and ffmpeg; xvfb is requested a second time here.
# Only stdout is silenced, so error messages stay visible.
!apt-get -qq -y install xvfb freeglut3-dev ffmpeg > /dev/null

In [0]:
# Python dependencies. gym, theano and Lasagne are pinned (Lasagne to an exact
# commit of the neocxi fork); the remaining packages are installed unpinned.
!pip install -q path.py
!pip install -q pyprind
!pip install -q cached_property
!pip install -q gym==0.7.4
!pip install -q theano==0.8.2
!pip install -q git+https://github.com/neocxi/Lasagne.git@484866cf8b38d878e92d521be445968531646bb8#egg=Lasagne
  
# Rendering helpers.
# NOTE(review): `piglet` looks like a typo of `pyglet` (which is also listed) -- confirm it is needed.
!pip install -q PyOpenGL piglet pyglet pyvirtualdisplay

In [15]:
# Extra packages; only mako is pinned. JSAnimation and imageio are imported by
# the video playback cell at the end of this notebook.
# NOTE(review): box2d-py is presumably the physics backend of the LunarLander environment -- confirm.
!pip install box2d-py mako==1.0.7 Pygame JSAnimation imageio



In [4]:
# Clone the kekim/rllab repository into ./rllab-git.
!git clone https://github.com/kekim/rllab.git rllab-git

Cloning into 'rllab-git'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects:   2% (1/45)   [Kremote: Counting objects:   4% (2/45)   [Kremote: Counting objects:   6% (3/45)   [Kremote: Counting objects:   8% (4/45)   [Kremote: Counting objects:  11% (5/45)   [Kremote: Counting objects:  13% (6/45)   [Kremote: Counting objects:  15% (7/45)   [Kremote: Counting objects:  17% (8/45)   [Kremote: Counting objects:  20% (9/45)   [Kremote: Counting objects:  22% (10/45)   [Kremote: Counting objects:  24% (11/45)   [Kremote: Counting objects:  26% (12/45)   [Kremote: Counting objects:  28% (13/45)   [Kremote: Counting objects:  31% (14/45)   [Kremote: Counting objects:  33% (15/45)   [Kremote: Counting objects:  35% (16/45)   [Kremote: Counting objects:  37% (17/45)   [Kremote: Counting objects:  40% (18/45)   [Kremote: Counting objects:  42% (19/45)   [Kremote: Counting objects:  44% (20/45)   [Kremote: Counting objects:  46% (21/45) 

In [0]:
# Copy the contents of the clone into the current directory (-a keeps attributes
# and recurses). The `*` glob does not match hidden entries such as .git.
!cp -a ./rllab-git/* .

# REINFORCE on LunarLanderContinuous-v2 (OpenAI Gym version)
**Important!**
Before running the following cell, make sure rllab is set up properly in your current runtime by executing the code in the RLLAB setup scripts section above.

**1. Implement REINFORCE Algorithm**

- Import necessary packages
(Execute **once again** if you encounter an error)

In [0]:
from rllab.envs.gym_env import GymEnv
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.policies.categorical_mlp_policy import CategoricalMLPPolicy
from rllab.envs.normalized_env import normalize
import numpy as np
import theano
import theano.tensor as TT
from lasagne.updates import adam
from rllab.misc.instrument import run_experiment_lite
import rllab.misc.logger as logger

########## REINFORCE_GYM_Lunar_v2_With_RUN_EXP_LITE ##########


- Implement REINFORCE algorithm

In [12]:
def REINFORCE(*_):
    """Train a Gaussian MLP policy on LunarLanderContinuous-v2 with REINFORCE.

    Each epoch collects N rollouts of at most T steps with the current policy,
    computes the discounted return-to-go for every time step, and takes one
    Adam step on the surrogate loss
    ``-mean(log pi(a_t | s_t) * R_t)``.

    Per-epoch statistics of the undiscounted episode returns are recorded
    through ``rllab.misc.logger`` under the keys 'Epoch', 'Steps',
    'AverageReturn', 'StdReturn', 'MaxReturn' and 'MinReturn'.

    Args:
        *_: Positional arguments are accepted and ignored, so the function can
            be handed to ``run_experiment_lite`` as the experiment entry point.
    """
    # normalize() makes sure that the actions for the environment lies
    # within the range [-1, 1] (only works for environments with continuous actions)

    env = normalize(GymEnv(env_name = "LunarLanderContinuous-v2", force_reset=True, record_video=True))
    # env = normalize(GymEnv(env_name="CartPole-v0", force_reset=True))

    # Initialize a neural network policy with two hidden layers of 64 units each

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64,64))
    # policy = CategoricalMLPPolicy(env.spec, hidden_sizes=(64, 64))

    # We will collect 3 trajectories per iteration
    N = 3
    # Each trajectory will have at most 400 time steps
    T = 400
    # Number of iterations
    n_itr = 1000
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.001

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However,
    # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data
    # type for the variable. For instance, for an environment with discrete observations, we might want to use integer
    # types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1
    )
    actions_var = env.action_space.new_tensor_variable(
        'actions',
        extra_dims=1
    )
    returns_var = TT.vector('returns')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
    # distribution of the actions. For a Gaussian policy, it contains the mean and the logarithm of the standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
    # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
    # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
    # rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Exp. #1, Prob.2: Define a proper loss function for REINFORCE
    ###########################################################################
    # Note that we negate the objective, since most optimizers assume a minimization problem
    surr = - TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * returns_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, returns_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True
    )

    for epoch in range(n_itr):
        ##################################################################
        logger.push_prefix('Epoch #%d | ' % (epoch))
        logger.log("Training started")
        ##################################################################
        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values. The second one returns a dictionary, whose values contains
                # sufficient statistics for the action distribution. It should at least contain entries that would be
                # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym().
                # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it is
                # not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
                # case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break


            # Exp. #1, Prob.1: Calculate returns.
            ###########################################################################
            # We need to compute the empirical return for each time step along the
            # trajectory (return to go): R_t = r_t + discount * R_{t+1}, walking
            # backwards from the final step of the rollout.
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)

            # The returns are stored backwards in time, so we need to revert it
            returns = returns[::-1]

            paths.append(dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
                returns=np.array(returns)
            ))

        #### Dim. of observation: [Sum of length of all traj, Dim(observation)] ####
        observations = np.concatenate([p["observations"] for p in paths])

        #### Dim. of actions    : [Sum of length of all traj, Dim(action)]      ####
        actions = np.concatenate([p["actions"] for p in paths])

        #### Dim. of returns    : [Sum of length of all traj, ]                 ####
        returns = np.concatenate([p["returns"] for p in paths])

        f_train(observations, actions, returns)

        # Undiscounted total reward of each rollout; this is the quantity that is
        # printed and written to the tabular log, so both always agree.
        episode_returns = [sum(p["rewards"]) for p in paths]
        print('Average Return:', np.mean(episode_returns))
        ############################################################################
        logger.log("Training finished")
        logger.save_itr_params(epoch, params)

        # Record this epoch's statistics BEFORE dumping the table, so each row is
        # written in the epoch it belongs to and the final epoch is not dropped.
        logger.record_tabular('Epoch', epoch)
        # Nominal step budget used before this epoch (rollouts may end early).
        logger.record_tabular('Steps', epoch*N*T)
        logger.record_tabular('AverageReturn', np.mean(episode_returns))
        logger.record_tabular('StdReturn', np.std(episode_returns))
        logger.record_tabular('MaxReturn', np.max(episode_returns))
        logger.record_tabular('MinReturn', np.min(episode_returns))

        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()

        #############################################################################
        
##### Creating & Running a task #####

# Experiment directory, passed to run_experiment_lite as log_dir. The plotting
# and video cells further down read `mypath` as well.
mypath = './log/vpg_reinforce/'

run_experiment_lite(
    # Function that is executed as the experiment
    REINFORCE,
    # Directory handed to rllab as the experiment's log directory
    log_dir=mypath,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    # plot=True,
)

python /content/scripts/run_experiment_lite.py  --n_parallel '1'  --snapshot_mode 'last'  --seed '1'  --exp_name 'experiment_2019_01_16_07_39_30_0003'  --log_dir './log/vpg_reinforce/'  --use_cloudpickle 'True'  --args_data 'gASVkgoAAAAAAACMF2Nsb3VkcGlja2xlLmNsb3VkcGlja2xllIwOX2ZpbGxfZnVuY3Rpb26Uk5QoaACMD19tYWtlX3NrZWxfZnVuY5STlGgAjA1fYnVpbHRpbl90eXBllJOUjAhDb2RlVHlwZZSFlFKUKEsASwBLG0sJS0dCiAIAAHQAdAFkAWQCZAJkA40DgwF9AXQCfAFqA2QlZAWNAn0CZAZ9A2QHfQRkCH0FZAl9BmQKfQd8AWoEagVkC2QMZA2NAn0IfAFqBmoFZA5kDGQNjQJ9CXQHaghkD4MBfQp8AmoJfAiDAX0LfAJqCn0MfAJqC2QCZBCNAX0NdAxqDXQOfA2DAn0OdAxqD3wIfAl8CmcDZAB0EHwOfA18B2QRjQNkAmQSjQR9D5ABeNh0EXwFgwFEAJABXcp9EHQSahNkE3wQFgCDAQEAdBJqFGQUgwEBAGcAfRF4tHQRfAODAUQAXah9AGcAfRJnAH0TZwB9FHwBahWDAH0VeFx0EXwEgwFEAF1QfQB8AmoWfBWDAVwCfRZ9AHwBahd8FoMBXAR9F30YfRl9AHwSahh8FYMBAQB8E2oYfBaDAQEAfBRqGHwYgwEBAHwXfRV8GZABcgJQAJABcQJXAGcAfRp8EWoYdBl0GmobfBKDAXQaaht8E4MBdBpqG3wUgwF0GmobfBqDAWQVjQSDAQEAceBXAHQaahxkFmQXhAB8EUQAgwGDAX0SdBpqHGQYZBeEAHwRRACDAYMBfRN0GmocZBlkF4QAfBFEAIM

**2. Execute Your Algorithm**

- Activate a virtual display

In [0]:
import os
from pyvirtualdisplay import Display

# Start an invisible (visible=0) 1400x900 virtual display so rendering works
# without a physical screen.
display = Display(visible=0, size=(1400, 900))
display.start()

# Export it as DISPLAY=":<display>.<screen>" for anything started from this kernel.
os.environ["DISPLAY"] = ":{}.{}".format(str(display.display), str(display.screen))

- Create & run an RL task for LunarLanderContinuous-v2

python /content/scripts/run_experiment_lite.py  --n_parallel '1'  --snapshot_mode 'last'  --seed '1'  --exp_name 'experiment_2019_01_16_07_39_30_0001'  --log_dir './log/vpg_reinforce/'  --use_cloudpickle 'True'  --args_data 'gASVgAoAAAAAAACMF2Nsb3VkcGlja2xlLmNsb3VkcGlja2xllIwOX2ZpbGxfZnVuY3Rpb26Uk5QoaACMD19tYWtlX3NrZWxfZnVuY5STlGgAjA1fYnVpbHRpbl90eXBllJOUjAhDb2RlVHlwZZSFlFKUKEsASwBLG0sJS0dChgIAAHQAdAFkAWQCZAONAoMBfQF0AnwBagNkJWQFjQJ9AmQGfQNkB30EZAh9BWQJfQZkCn0HfAFqBGoFZAtkDGQNjQJ9CHwBagZqBWQOZAxkDY0CfQl0B2oIZA+DAX0KfAJqCXwIgwF9C3wCagp9DHwCagtkAmQQjQF9DXQMag10DnwNgwJ9DnQMag98CHwJfApnA2QAdBB8DnwNfAdkEY0DZAJkEo0EfQ+QAXjYdBF8BYMBRACQAV3KfRB0EmoTZBN8EBYAgwEBAHQSahRkFIMBAQBnAH0ReLR0EXwDgwFEAF2ofQBnAH0SZwB9E2cAfRR8AWoVgwB9FXhcdBF8BIMBRABdUH0AfAJqFnwVgwFcAn0WfQB8AWoXfBaDAVwEfRd9GH0ZfQB8EmoYfBWDAQEAfBNqGHwWgwEBAHwUahh8GIMBAQB8F30VfBmQAXIAUACQAXEAVwBnAH0afBFqGHQZdBpqG3wSgwF0GmobfBODAXQaaht8FIMBdBpqG3wagwFkFY0EgwEBAHHeVwB0GmocZBZkF4QAfBFEAIMBgwF9EnQaahxkGGQXhAB8EUQAgwGDAX0TdBpqHGQZZBeEAHwRRACDAYM

**3. Average Reward Plotting**

- You can evaluate how your agent is being trained with reward it gets in every iteration. 
- Whenever you execute the code 'run_experiment_lite', it will generate an experiment directory.
- (/content/log/vpg_reinforce/)
- Please update the value of '**mypath**' and specify your new experiment directory name. 

In [0]:
import os.path as osp
import numpy as np
import csv
import matplotlib.pyplot as plt
import json
import joblib
from glob import glob
import os

# mypath = './log/vpg_reinforce/'

# Collect the 'AverageReturn' column of progress.csv, skipping rows where the
# field is empty, and draw it as a single labelled curve.
progress_path = osp.join(mypath, 'progress.csv')
with open(progress_path, 'rt') as csvfile:
    returns = np.array([
        float(row['AverageReturn'])
        for row in csv.DictReader(csvfile)
        if row['AverageReturn']
    ])

plots = [plt.plot(returns)[0]]
legends = ['AverageReturn']
plt.legend(plots, legends)
plt.show()

**4. Play Videos of your Agent Behavior**

- You can watch how your agent's behavior improves.
- If you haven't updated 'mypath' in the code above, you need to update it here.
- (/content/log/vpg_reinforce/)

In [0]:
from IPython import display as pythondisplay
# from pyvirtualdisplay import Display

# from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt
from matplotlib import animation
from JSAnimation import IPython_display
from IPython.display import HTML

import imageio

from os import listdir
from os.path import isfile, join

def plot_movie_js(image_array, filename):
    """Show a sequence of frames as an inline JavaScript animation.

    Args:
        image_array: Indexable sequence of image arrays. The first frame's
            shape determines the figure size (one figure pixel per image pixel).
        filename: Text used as the figure title.
    """
    dots_per_inch = 10.0
    first_frame = image_array[0]
    frame_shape = first_frame.shape
    n_rows, n_cols = frame_shape[0], frame_shape[1]

    # figsize is (width, height) in inches: columns first, then rows.
    fig = plt.figure(
        figsize=(n_cols / dots_per_inch, n_rows / dots_per_inch),
        dpi=dots_per_inch,
    )
    fig.suptitle(filename, fontsize=160)
    frame_artist = plt.figimage(first_frame)

    def show_frame(index):
        # Swap the pixel data of the single figure image for frame `index`.
        frame_artist.set_array(image_array[index])
        return (frame_artist,)

    movie = animation.FuncAnimation(fig, show_frame, frames=len(image_array))
    pythondisplay.display(IPython_display.display_animation(movie))

# mypath = './log/vpg_reinforce/'
# The recorded videos are read from the 'gym_log/' sub-directory of the
# experiment directory (presumably written because the environment was created
# with record_video=True -- confirm). The path is kept in its own variable
# instead of being appended to `mypath`, so this cell and the plotting cell can
# be re-run in any order without corrupting the path.
video_dir = join(mypath, 'gym_log/')
mp4files = sorted(f for f in listdir(video_dir) if f.endswith(".mp4"))

for filename in mp4files:
    # Use the reader as a context manager so it is closed after reading, and
    # iterate over it instead of indexing with range(len(vid)): the reported
    # length of an ffmpeg stream is not guaranteed to be a finite integer.
    with imageio.get_reader(join(video_dir, filename), 'ffmpeg') as vid:
        screenlist = [image for image in vid]

    if not screenlist:
        # Nothing to animate; plot_movie_js needs at least one frame.
        print('Skipping empty video:', filename)
        continue

    plot_movie_js(screenlist, filename)
