In [1]:
from ilqr import iLQR
import gym
import numpy as np
import pandas as pd
import time

from aprl.agents import MujocoFiniteDiffDynamicsBasic, MujocoFiniteDiffDynamicsPerformance, MujocoFiniteDiffCost
from aprl.envs.mujoco_costs import ReacherCost, InvertedPendulumCost



Logging to /tmp/openai-2019-01-28-19-51-30-126014
Choosing the latest nvidia driver: /usr/lib/nvidia-396, among ['/usr/lib/nvidia-375', '/usr/lib/nvidia-396']
Choosing the latest nvidia driver: /usr/lib/nvidia-396, among ['/usr/lib/nvidia-375', '/usr/lib/nvidia-396']


In [2]:
# Environment setup
# Work on the unwrapped environment object so its attributes (e.g. frame_skip)
# can be set directly and the same object can be handed to the dynamics/cost
# helpers in later cells.
env = gym.make('InvertedPendulum-v2').unwrapped
# NOTE(review): presumably makes each env.step() advance a single simulator
# step instead of the environment's default frame skip -- confirm.
env.frame_skip = 1
# Seed before the first reset so the initial state drawn by reset() is repeatable.
env.seed(42)
_obs = env.reset()
# Cost object for this task; it is re-created in the "Analytic cost" cell before use.
cost = InvertedPendulumCost()

  handle_disconnected(elem)
  handle_disconnected(rval[i])


In [3]:
# Planning setup
N = 100  # planning horizon

# Initial control sequence: one uniformly random action per planning step,
# shape (N,) + action shape.  The actions are drawn from a dedicated seeded
# generator rather than env.action_space.sample(), because env.seed() does not
# seed the action space's sampler; without this, us_init (and every result that
# depends on it) changes on each kernel restart.
# Assumes a bounded Box action space (finite low/high).
init_rng = np.random.RandomState(42)
us_init = init_rng.uniform(
    env.action_space.low,
    env.action_space.high,
    size=(N,) + env.action_space.shape,
)

dynamics = {
    # Uses env.step(u) directly and finite difference on qpos and qvel directly.
    'my_basic': MujocoFiniteDiffDynamicsBasic(env),
    # Sets ctrl to u directly then uses MuJoCo's forwardSkip to compute qacc.
    # Computes finite difference on qacc, then estimates derivative of qpos and qvel.
    'my_performance': MujocoFiniteDiffDynamicsPerformance(env),
}
# Initial state as reported by each dynamics model, keyed like `dynamics`.
x0s = {k: dyn.get_state() for k, dyn in dynamics.items()}

# Finite difference cost

In [None]:
# Finite-difference cost shared by all solvers below.
finite_cost = MujocoFiniteDiffCost(env)
# iLQR takes a single dynamics model, not the whole `dynamics` dict, so build
# one solver per model (same pattern as the analytic-cost solvers).
finite_ilqrs = {k: iLQR(dyn, finite_cost, N) for k, dyn in dynamics.items()}
# Keep the original single-solver name for the fit cell.
# NOTE(review): 'my_basic' is picked as the default model here -- confirm.
finite_ilqr = finite_ilqrs['my_basic']

In [None]:
def log_finite_iteration(iteration_count, xs, us, J_opt, accepted, converged):
    """Print one progress line per iLQR iteration of the finite-difference fit.

    Defined in this cell so the fit does not depend on a callback that is only
    created further down the notebook (which fails on Restart & Run All).
    """
    info = "converged" if converged else ("accepted" if accepted else "failed")
    print("iteration", iteration_count, info, J_opt, xs[-1])


# Take the start state from x0s instead of a bare `x0`, which is only assigned
# inside a later cell's loop.
# NOTE(review): the key must match the dynamics model finite_ilqr was built
# with -- confirm 'my_basic' is the intended one.
finite_xs, finite_us = finite_ilqr.fit(
    x0s['my_basic'], us_init, n_iterations=100, on_iteration=log_finite_iteration)

# Analytic cost

In [23]:
def on_iteration(iteration_count, xs, us, J_opt, accepted, converged):
    """Progress callback: print a one-line summary of an iLQR iteration.

    The line contains the iteration number, a status word ("converged",
    "accepted" or "failed", checked in that order of precedence), the value
    J_opt, and the last entry of xs.  `us` is accepted but not used.
    """
    if converged:
        status = "converged"
    elif accepted:
        status = "accepted"
    else:
        status = "failed"
    final_state = xs[-1]
    print("iteration", iteration_count, status, J_opt, final_state)

# Build one iLQR solver per dynamics model, all sharing the same cost object,
# then fit each from its own initial state and report the wall-clock time.
cost = InvertedPendulumCost()
ilqrs = {k: iLQR(dyn, cost, N) for k, dyn in dynamics.items()}
# Solved state / control trajectories, keyed like `dynamics`.
xs, us = {}, {}
print(ilqrs.keys())
for k in ilqrs:
    ilqr = ilqrs[k]
    start = time.time()
    print('*** Fitting {} ***'.format(k))
    x0 = x0s[k]
    xs[k], us[k] = ilqr.fit(
        x0, us_init, n_iterations=100, on_iteration=on_iteration)
    end = time.time()
    print('*** Fitted {} in {}s ***'.format(k, end - start))

dict_keys(['my_basic', 'my_performance'])
*** Fitting my_basic ***
iteration -1 converged 1254.1825846967397 [-9.86996600e-01 -1.57327650e+00 -5.51750950e-02 -3.54690729e-05]
iteration 0 accepted 852.9627858033095 [-1.01370599e+00 -1.57327351e+00  2.38054025e-01 -3.55693692e-05]
iteration 1 accepted 809.9983625007201 [-8.27336981e-01 -1.57327626e+00 -1.66667273e+00 -3.46451641e-08]
iteration 2 accepted 596.2592602954812 [ 9.87487650e-01 -1.57327638e+00 -1.60471163e-02 -9.50567926e-08]
iteration 3 accepted 540.2789667693905 [ 9.90841826e-01 -1.57327637e+00 -7.96357494e-04 -9.53590082e-09]
iteration 4 accepted 519.6621392227048 [ 9.88080675e-01 -1.57327637e+00 -2.53225247e-03 -3.06157464e-08]
iteration 5 accepted 499.91228428923984 [ 9.89757586e-01 -1.57327637e+00 -5.16232834e-03 -6.21905170e-08]
iteration 6 accepted 490.10483502879066 [ 9.89323730e-01 -1.57327636e+00 -1.21622744e-02 -1.43293788e-07]
iteration 7 accepted 465.5953423942555 [ 9.89820828e-01 -1.57327635e+00 -2.67697362e-02 

# Receding horizon

In [None]:
from ilqr.controller import RecedingHorizonController

def receding(underlying):
    """Run receding-horizon control on top of an already-built iLQR solver.

    The run is registered under the key 'receding_' + underlying in the
    notebook-level dicts `dynamics`, `x0s`, `xs` and `us`, so that later cells
    iterating over those dicts pick it up.  At most N control steps are executed.

    Args:
        underlying: key into `dynamics`, `x0s` and `ilqrs` selecting the model
            and solver to wrap.

    Returns:
        The list of rewards returned by env.step(), one per executed step.
        (Previously these were collected and then discarded.)
    """
    k = 'receding_' + underlying
    dynamics[k] = dynamics[underlying]
    x0s[k] = x0s[underlying]
    controller = RecedingHorizonController(x0s[k], ilqrs[underlying])
    rew = []
    xs[k] = []
    us[k] = []
    for x, u in controller.control(us_init, subsequent_n_iterations=10):
        # The dynamics models wrap this same `env` and are evaluated during
        # planning, so the simulator is not guaranteed to be in the
        # controller's current state when we get here.  Put it back before
        # stepping, otherwise the reward is measured from an arbitrary state.
        # NOTE(review): assumes the controller yields the planned state segment
        # whose first row is the state the action is applied from (a plain 1-D
        # state is also handled) -- confirm against the ilqr documentation.
        dynamics[k].set_state(np.atleast_2d(x)[0])
        _, r, _, _ = env.step(u)
        xs[k].append(x)
        us[k].append(u)
        rew.append(r)
        print('iteration', len(rew), r, x, u)
        if len(rew) == N:
            break
    return rew

In [None]:
# Run receding-horizon control once per underlying dynamics model.
for underlying in ('my_basic', 'my_performance'):
    receding(underlying)

# Rollouts

In [24]:
import time

def rollout(env, dynamics, x0, us, render=False):
    """Replay a control sequence in the environment and record the outcome.

    The state is first set to x0 through `dynamics`, then each control in `us`
    is applied with env.step().  If a step reports done, a warning is printed
    and the replay stops; that step's reward and state are not recorded.

    Args:
        env: environment providing step() and, when rendering, render().
        dynamics: object providing set_state() and get_state().
        x0: state to start the replay from.
        us: iterable of controls to apply in order.
        render: when true, draw the initial frame and every recorded step,
            pausing 0.02 s after each step's frame.

    Returns:
        (rewards, states): two lists with one entry per recorded step.
    """
    dynamics.set_state(x0)
    rewards, visited = [], []
    if render:
        env.render()
    for action in us:
        _, reward, finished, _ = env.step(action)
        if not finished:
            rewards.append(reward)
            visited.append(dynamics.get_state())
            if render:
                env.render()
                time.sleep(0.02)
            continue
        print('warning: early termination! (assuming zero-reward from now on)')
        break
    return rewards, visited

In [25]:
# Replay every solved control sequence in the environment (with rendering),
# then summarise total reward and number of recorded steps per run.
rews, actual_xs = {}, {}
for k in us:
    solved_us = us[k]
    print(k)
    rews[k], actual_xs[k] = rollout(
        env.unwrapped, dynamics[k], x0s[k], solved_us, render=True)
rewards = {k: sum(rews[k]) for k in rews}
lengths = {k: len(rews[k]) for k in rews}
pd.DataFrame({'rewards': rewards, 'lengths': lengths})

my_basic
my_performance


Unnamed: 0,rewards,lengths
my_basic,100.0,100
my_performance,100.0,100
