# Deep RL Home Work 1

Author: Kay Ke (kayke@uw.edu)
Update: Sep 12, 2017

In [1]:
# The typical imports
%matplotlib inline

# DotDict
class DotDict(dict):
    def __getattr__(self, name):
        return self[name]
args = DotDict({
    'envname':'Hopper-v1',
    'expert_policy_file' : 'experts/Hopper-v1.pkl',
    'num_rollouts' : 10,
    'render' : True,
    'max_timesteps' : 400,
    'verbose' : 0,
    'dagger' : True
})

In [2]:
#!/usr/bin/env python

"""
Code to load an expert policy and generate roll-out data for behavioral cloning.
Example usage:
    python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \
            --num_rollouts 20

Author of this script and included expert policies: Jonathan Ho (hoj@openai.com)

- The structures (functions, class design) are borrowed from
    https://github.com/EbTech/CS294/blob/master/hw1/run_expert.py
    just to get familiar with TF + Keras
- Kay, 2017 Sep 11.
"""

import argparse
import gym
import load_policy
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
import pickle
import tensorflow as tf
import tf_util

# Suppress instruction set warning on mac
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

# Immitation Learning: learn a mapping from observations to actions.
class Agent:
    def __init__(self, dim):
        self.model = Sequential();
        self.model.add(Dense(64, input_dim=dim[0], activation='relu'))
        self.model.add(Dense(units=dim[1]))
        self.model.compile(loss='mse', optimizer='sgd')

    def train(self, training_data, batch_size, epochs, verbose):
        print("Train the agent with %i training data, batch_size %i, epochs %i" % (training_data[0].shape[0], batch_size, epochs))
        self.history = self.model.fit(training_data[0], training_data[1],
                    batch_size, epochs, verbose)
        print("Training loss at last epoch %f" % np.mean(self.history.history['loss'][-1]))

    def save_training_history(self):
        print(self.history.history.keys())

        import matplotlib.pyplot as plt
        fig, ax = plt.subplots( nrows=1, ncols=2 )
        ax.plot(self.history.history['loss'])
        #ax.plot(self.history.history['val_loss'])
        ax.set_title('model loss')
        ax.set_ylabel('loss')
        ax.set_xlabel('epoch')
        #ax.legend(['train', 'test'], loc='upper left')
        fig.savefig('training_report.png')   # save the figure to file
        plt.close(fig)    # close the figure

    def act(self, obs):
        X = np.expand_dims(obs, 0)
        return self.model.predict(X, batch_size=200, verbose=0)

# Simple wrapper around policy function to have an act function
class Expert:
    def __init__(self, expert_policy_file):
        print('loading and building expert policy')
        self.policy_fn = load_policy.load_policy(expert_policy_file)
        print('loaded and built')

    def act(self, obs):
        return self.policy_fn(obs[None,:])
    
class Stimulator:
    def __init__(self, envname):
        self.init(envname)
        self.envname = envname
        
    def init(self, envname):
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        self.env = gym.make(envname)
    
    def stimulate(self, agent, max_steps, num_rollouts, render, verbose):
        with self.session.as_default():
            returns = []
            observations = []
            actions = []
            for i in range(num_rollouts):
                if verbose > 0:
                    print('iter', i)
                obs = self.env.reset()
                done = False
                totalr = 0.
                steps = 0
                while not done:
                    action = agent.act(obs)
                    observations.append(obs)
                    actions.append(np.squeeze(action))
                    obs, r, done, _ = self.env.step(action)
                    totalr += r
                    steps += 1
                    if render:
                        self.env.render()
                    if steps % 100 == 0 and verbose >=2:
                        print("%i/%i"%(steps, max_steps))
                    if steps >= max_steps:
                        break
                if verbose >= 3 and steps < max_steps:
                    print('Died prematurely at step %i' % steps)
                if verbose >= 1:
                    print('rollout %i/%i return=%f' % (i+1, num_rollouts, totalr))
                returns.append(totalr)
            if verbose > 0:
                print('Return summary: mean=%f, std=%f' % (np.mean(returns), np.std(returns)))

            return (np.array(observations), np.array(actions))
    
    def label_obs(self, expert, obs):
        with self.session.as_default():
            return expert.policy_fn(stimulated_env)
    
    def close(self):
        self.session.close()
        
    def reset(self):
        self.close()
        self.init(self.envname)

# Parsing argument to main
def parse_argument():
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int, default=500)
    parser.add_argument('--num_rollouts', type=int, default=1,
                        help='Number of expert roll outs')
    parser.add_argument('--dagger', action='store_true') 

    # Verbose Level
    parser.add_argument('-v', '--verbose', type=int, choices=[0, 1, 2], default=1)

    return parser.parse_args()

Using TensorFlow backend.


In [3]:
# Load Expert
expert = Expert(args.expert_policy_file)

loading and building expert policy
obs (1, 11) (1, 11)
loaded and built


In [4]:
# Generate training data
stimulator = Stimulator(args.envname)
training_data = stimulator.stimulate(expert, max_steps=300, num_rollouts=20, render=False, verbose=0)
dim = (training_data[0].shape[-1], training_data[1].shape[-1])

[2017-09-13 01:28:41,600] Making new env: Hopper-v1


In [5]:
agent = Agent(dim)
agent.train(training_data, batch_size=512, epochs=80, verbose=0)
agent_dagger = Agent(dim)
agent_dagger.train(training_data, batch_size=512, epochs=80, verbose=0)

Train the agent with 6000 training data, batch_size 512, epochs 80
Training loss at last epoch 0.131083
Train the agent with 6000 training data, batch_size 512, epochs 80
Training loss at last epoch 0.126318


In [9]:
stimulator.reset()
for i in range(0, 10):
    # Behavior Cloning
    print("iter %i" % i)
    training_data = stimulator.stimulate(expert, max_steps=args.max_timesteps, num_rollouts=args.num_rollouts, render=False, verbose=0)
    agent.train(training_data, batch_size=512, epochs=50, verbose=0)

[2017-09-13 01:30:06,591] Making new env: Hopper-v1


iter 0
Train the agent with 6000 training data, batch_size 512, epochs 50
Training loss at last epoch 0.036458
iter 1
Train the agent with 6000 training data, batch_size 512, epochs 50
Training loss at last epoch 0.033599
iter 2
Train the agent with 6000 training data, batch_size 512, epochs 50
Training loss at last epoch 0.033191
iter 3
Train the agent with 6000 training data, batch_size 512, epochs 50
Training loss at last epoch 0.031302
iter 4
Train the agent with 6000 training data, batch_size 512, epochs 50
Training loss at last epoch 0.030127
iter 5
Train the agent with 6000 training data, batch_size 512, epochs 50
Training loss at last epoch 0.029688
iter 6
Train the agent with 6000 training data, batch_size 512, epochs 50
Training loss at last epoch 0.027383
iter 7
Train the agent with 6000 training data, batch_size 512, epochs 50
Training loss at last epoch 0.026560
iter 8
Train the agent with 6000 training data, batch_size 512, epochs 50
Training loss at last epoch 0.025161
i

In [10]:
stimulator.reset()
for i in range(0, 20):
    # DAgger
    print("iter %i" % i)
    (stimulated_env, _) = stimulator.stimulate(agent_dagger, max_steps=args.max_timesteps, num_rollouts=args.num_rollouts, render=False, verbose=0)
    labels = stimulator.label_obs(expert, stimulated_env)
    agent_dagger.train((stimulated_env, labels), batch_size=512, epochs=50, verbose=0)


[2017-09-13 01:31:00,903] Making new env: Hopper-v1


iter 0
Train the agent with 240 training data, batch_size 512, epochs 50
Training loss at last epoch 0.077217
iter 1
Train the agent with 240 training data, batch_size 512, epochs 50
Training loss at last epoch 0.021364
iter 2
Train the agent with 240 training data, batch_size 512, epochs 50
Training loss at last epoch 0.013072
iter 3
Train the agent with 240 training data, batch_size 512, epochs 50
Training loss at last epoch 0.009990
iter 4
Train the agent with 240 training data, batch_size 512, epochs 50
Training loss at last epoch 0.009654
iter 5
Train the agent with 240 training data, batch_size 512, epochs 50
Training loss at last epoch 0.008960
iter 6
Train the agent with 240 training data, batch_size 512, epochs 50
Training loss at last epoch 0.008367
iter 7
Train the agent with 240 training data, batch_size 512, epochs 50
Training loss at last epoch 0.008445
iter 8
Train the agent with 240 training data, batch_size 512, epochs 50
Training loss at last epoch 0.006585
iter 9
Tra

In [11]:
_ = stimulator.stimulate(agent, args.max_timesteps, args.num_rollouts, args.render, verbose=1)
_ = stimulator.stimulate(agent_dagger, args.max_timesteps, args.num_rollouts, args.render, verbose=1)

iter 0
rollout 1/20 return=7.539343
iter 1
rollout 2/20 return=9.260624
iter 2
rollout 3/20 return=7.927104
iter 3
rollout 4/20 return=9.921639
iter 4
rollout 5/20 return=7.869126
iter 5
rollout 6/20 return=10.394251
iter 6
rollout 7/20 return=9.751810
iter 7
rollout 8/20 return=8.731466
iter 8
rollout 9/20 return=8.789494
iter 9
rollout 10/20 return=7.467604
iter 10
rollout 11/20 return=8.146308
iter 11
rollout 12/20 return=7.500677
iter 12
rollout 13/20 return=9.004497
iter 13
rollout 14/20 return=8.860869
iter 14
rollout 15/20 return=8.144460
iter 15
rollout 16/20 return=9.336771
iter 16
rollout 17/20 return=7.071375
iter 17
rollout 18/20 return=8.285126
iter 18
rollout 19/20 return=8.519834
iter 19
rollout 20/20 return=8.030722
Return summary: mean=8.527655, std=0.872773
iter 0
rollout 1/20 return=6.994500
iter 1
rollout 2/20 return=6.896234
iter 2
rollout 3/20 return=7.242112
iter 3
rollout 4/20 return=7.006553
iter 4
rollout 5/20 return=7.208704
iter 5
rollout 6/20 return=7.08404