# BipedalWalker-v2 with stable_baselines library 
The aim of this super quick tutorial is to give a brief insight into stable_baselines library that offers many RL models ready to use within gym environments. It's very useful for model comparison. 

We need to find optimal policy for BipedalWalker-v2, which requires models capable of handling:
* continous state space, 
* continous action space.

One of such models is PPO2 (kind of Proximal Policy Optimization), which will be used for this example. Because stable_baselines gives us PPO2 implementation as a black-box, we can only play with hiperparameters to choose the best combination. A cell below trains 32 different agents and saves mean rewards from 100 test episodes to result table.

In [29]:
import time
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

import gym
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

from sklearn.model_selection import ParameterGrid

# Create the environment
env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run


params = {
    "gammas": [0.99, 0.999],
    "n_steps":  [64, 128],
    "learning_rate" :  [0.00025, 0.002],
    "max_grad_norm" : [0.5, 1],
    "nminibatches":  [4, 8]
}
# Define the model

param_grid = ParameterGrid(params)

df = pd.DataFrame({'gammas':[], 
                   'n_steps': [], 
                   'learning_rate': [], 
                   'max_grad_norm': [], 
                   'nminibatches': [], 
                   'rewards': []})

for param_id in range(len(param_grid)):
    gamma = param_grid[param_id]["gammas"]
    n_steps = param_grid[param_id]["n_steps"]
    learning_rate = param_grid[param_id]["learning_rate"]
    max_grad_norm = param_grid[param_id]["max_grad_norm"]
    nminibatches = param_grid[param_id]["nminibatches"]
    
    

    model = PPO2(MlpPolicy, 
                 env, 
                 verbose=0, 
                 gamma=gamma, 
                 n_steps=n_steps, 
                 ent_coef=0.01, 
                 learning_rate=learning_rate, 
                 vf_coef=0.5, 
                 max_grad_norm=max_grad_norm, 
                 lam=0.95, 
                 nminibatches=nminibatches, 
                 noptepochs=4, 
                 cliprange=0.2)

    n_epochs = 25000
    train_start = time.time()
    model.learn(total_timesteps=n_epochs)
    print('train time: {}s'.format(time.time() - train_start))

    test_start = time.time()
    reward_history = []
    for episode in range(100):
        obs = env.reset()
        done = False
        rewards_in_episode = 0
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            rewards_in_episode += reward[0]
        reward_history.append(rewards_in_episode)
    mean_reward = sum(reward_history)/len(reward_history)
    
    _params = {
    "gammas": [gamma],
    "n_steps": [n_steps],
    "learning_rate": [learning_rate],
    "max_grad_norm": [max_grad_norm],
    "nminibatches": [nminibatches],
    "rewards": [mean_reward]
    }
    
    df = df.append(pd.DataFrame.from_dict(_params))
    print('test time: {}s'.format(time.time() - test_start))
    print(mean_reward)
    
df = df.reset_index()

train time: 28.499265432357788s
test time: 7.15754508972168s
-113.71032981138535
train time: 34.18337059020996s
test time: 79.4946768283844s
-85.901246414206
train time: 22.919082403182983s
test time: 29.26411747932434s
-99.96409903066568
train time: 26.913942575454712s
test time: 42.41841101646423s
-106.04775242702682
train time: 27.160589933395386s
test time: 74.47736740112305s
-99.01373604460521
train time: 34.53701186180115s
test time: 7.246541738510132s
-106.27053628453444
train time: 23.31827998161316s
test time: 67.67654037475586s
-94.01656457732297
train time: 26.832749605178833s
test time: 21.82031273841858s
-115.6099921328768
train time: 26.974185466766357s
test time: 99.27304792404175s
-134.76463289532037
train time: 32.98280644416809s
test time: 4.908333778381348s
-104.90583229940498
train time: 22.78860330581665s
test time: 61.065229654312134s
-118.11756724076089
train time: 25.998509645462036s
test time: 92.72901725769043s
-101.6604349305459
train time: 26.26438021659851s

### Select the best hiperparameter set based on maximum reward criterion.

In [40]:
best_setup = df[df.rewards == df.rewards.max()]

In [41]:
best_setup

Unnamed: 0,index,gammas,n_steps,learning_rate,max_grad_norm,nminibatches,rewards
1,0,0.99,64.0,0.00025,0.5,8.0,-85.901246


### Train Agent with the best hiperparameters set
This time we will set number of training epochs to 500000 to obtain finest result.

In [55]:
env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run

model = PPO2(MlpPolicy, 
                 env, 
                 verbose=0, 
                 gamma=best_setup.gammas.values[0], 
                 n_steps=int(best_setup.n_steps.values[0]), 
                 ent_coef=0.01, 
                 learning_rate=best_setup.learning_rate.values[0], 
                 vf_coef=0.5, 
                 max_grad_norm=best_setup.max_grad_norm.values[0], 
                 lam=0.95, 
                 nminibatches=int(best_setup.nminibatches.values[0]), 
                 noptepochs=4, 
                 cliprange=0.2)

n_epochs = 500000
train_start = time.time()
model.learn(total_timesteps=n_epochs)
print('train time: {}s'.format(time.time() - train_start))

train time: 648.748372554779s


### Let's see how it walks!

In [52]:
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    env.render()

### Record rendered trials and save it to MP4 file.

In [54]:
from stable_baselines.common.vec_env import VecVideoRecorder

env_id = 'BipedalWalker-v2'
video_folder = 'Pictures/'
video_length = 1003
env = DummyVecEnv([lambda: gym.make(env_id)])
env = VecVideoRecorder(env, video_folder,
                      record_video_trigger=lambda x: x == 0, video_length=video_length,
                      name_prefix="random-agent-{}".format(env_id))

obs = env.reset()
rewards_in_episode = 0
for i in range(video_length+1):
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)     
    rewards_in_episode += reward[0]
    
    if done:
        print('Sum of rewards in one episode: {}'.format(rewards_in_episode))
        rewards_in_episode = 0
        
    env.render()
env.close()

Sum of rewards in one episode: -103.02607073885156
Sum of rewards in one episode: -123.62441200384637
Sum of rewards in one episode: -81.0249242716236
Sum of rewards in one episode: -92.34792243220727
Sum of rewards in one episode: -113.7854693334084
Sum of rewards in one episode: -102.65251336753136
Sum of rewards in one episode: -87.37850658874959
Saving video to  /home/waldemar/Pictures/random-agent-BipedalWalker-v2-step-0-to-step-1003.mp4
