In [1]:
from ilqr import iLQR
import gym
import numpy as np
import pandas as pd

from aprl.agents import MujocoFiniteDiffDynamics, MujocoFiniteDiffCost
from aprl.agents.mujoco_control import MujocoFiniteDiffDynamicsLowLevel, MujocoFiniteDiffDynamicsWarmstart



Logging to /tmp/openai-2019-01-25-21-18-45-125526
Choosing the latest nvidia driver: /usr/lib/nvidia-396, among ['/usr/lib/nvidia-375', '/usr/lib/nvidia-396']
Choosing the latest nvidia driver: /usr/lib/nvidia-396, among ['/usr/lib/nvidia-375', '/usr/lib/nvidia-396']


In [2]:
# Environment setup
env = gym.make('Reacher-v2').unwrapped
env.seed(42)
_obs = env.reset()

In [3]:
# Planning setup
N = 100  # planning horizon
us_init = np.array([env.action_space.sample() for _ in range(N)])

dynamics = {
    # Uses mujoco_py's MjSimState. This saves time, qpos, qvel, act and udd_state.
    #'mujoco_py': MujocoFiniteDiffDynamics(env),
    # Uses my MujocoRelevantSimState, which contains all fields that MuJoCo's derivative.cpp copies.
    #'my_all': MujocoFiniteDiffDynamicsLowLevel(env, kind='all'),
    # All fields I think matter; excludes qfrc and xfrc_applied
    #'my_recommended': MujocoFiniteDiffDynamicsLowLevel(env, kind='recommended'),
    # qpos, qvel and qacc; no warmstart.
    #'my_basic_plus': MujocoFiniteDiffDynamicsLowLevel(env, kind='basic_plus'),
    # As above, but restricts to fields qpos & qvel. 
    # I expect this to match mujoco_py, since time does not matter, and act and udd_state are blank for Reacher.
    'my_basic': MujocoFiniteDiffDynamicsLowLevel(env, kind='basic'),
    # Like my_basic, but saving qacc_warmstart.
    'my_warmstart': MujocoFiniteDiffDynamicsWarmstart(env),
}
x0s = {k: dyn.get_state() for k, dyn in dynamics.items()}

# Finite difference cost

In [None]:
finite_cost = MujocoFiniteDiffCost(env)
finite_ilqr = iLQR(dynamics, finite_cost, N)

In [None]:
finite_xs, finite_us = finite_ilqr.fit(x0, us_init, n_iterations=100, on_iteration=on_iteration)

# Analytic cost

In [4]:
import theano
from theano import tensor as T
from ilqr.cost import AutoDiffCost

# Reacher, Gym observation:
# obs[0:1]: xs; np.cos(qpos[0:2]) (qpos[0] is joint0, qpos[1] is joint1)
# obs[2:3]: ys; np.sin(qpos[0:2]);
# obs[4:5]: goal x and y; qpos[2:]; (target_x and target_y)
# obs[6:7]: theta dot
# obs[8:9]: xy of fingertip - target

def make_reacher_cost(kind, control_weight=1.0):
    # qpos[0:3]: theta of joint 0, theta of joint 1; target x and y.
    qpos_inputs = [T.dscalar('theta'), T.dscalar('phi'), T.dscalar('targetx'), T.dscalar('targety')]
    # qvel: derivatives of the above; note target x and y are constant so have derivative zero.
    qvel_inputs = [T.dscalar('thetadot'), T.dscalar('phidot'), T.dscalar('_zero1'), T.dscalar('_zero2')]
    # qacc: second derivatives of qpos. We don't actually use these in the cost.
    qacc_inputs = [T.dscalar('_acc{}'.format(i)) for i in range(len(qpos_inputs))]
    # qacc_warmstart: same shape as qacc
    qacc_warmstart_inputs = [T.dscalar('_accwarm{}'.format(i)) for i in range(len(qpos_inputs))]
    # qfrc_applied: same shape as qacc
    qfrc_applied_inputs = [T.dscalar('_qfrc_applied{}'.format(i)) for i in range(len(qpos_inputs))]
    # xfrc_applied: (5,6)
    xfrc_applied_inputs = [T.dscalar('_xfrc_applied{}'.format(i)) for i in range(5 * 6)]
    if kind == 'mujoco_py':
        # Reacher, MJSimState.flatten():
        # obs[0]: time step, obs[1:4]: qpos[0:3]; obs[5:8]: qvel[0:3]
        # In general might include action and udd_state, but not for Reacher.
        x_inputs = [T.dscalar('_time')] + qpos_inputs + qvel_inputs
    elif kind == 'my_all':
        # Reacher, MujocoRelevantState.flatten()
        x_inputs = qpos_inputs + qvel_inputs + qacc_inputs + qacc_warmstart_inputs + qfrc_applied_inputs + xfrc_applied_inputs
    elif kind == 'my_recommended':
        x_inputs = qpos_inputs + qvel_inputs + qacc_inputs + qacc_warmstart_inputs
    elif kind == 'my_basic_plus':
        x_inputs = qpos_inputs + qvel_inputs + qacc_inputs
    elif kind in ['my_basic', 'my_warmstart']:
        x_inputs = qpos_inputs + qvel_inputs
    else:
        raise ValueError("Unrecognised kind: '{}'".format(kind))
    u_inputs = [T.dscalar('thetadotdot'), T.dscalar('phidotdot')]
    qpos = T.stack(qpos_inputs)
    u = T.stack(u_inputs)
    
    control_cost = T.dot(u, u)
    target_xpos = qpos[2:4]
    body1_xpos = 0.1 * T.stack([T.cos(qpos[0]), T.sin(qpos[0])])
    fingertip_xpos_delta = 0.11 * T.stack([T.cos(qpos[1]), T.sin(qpos[1])])
    fingertip_xpos = body1_xpos + fingertip_xpos_delta
    delta = fingertip_xpos - target_xpos
    state_cost = T.sqrt(T.dot(delta, delta))
    l = state_cost + control_weight * control_cost
    l_terminal = T.zeros(())
    return AutoDiffCost(l, l_terminal, x_inputs, u_inputs)

In [5]:
def on_iteration(iteration_count, xs, us, J_opt, accepted, converged):
    info = "converged" if converged else ("accepted" if accepted else "failed")
    print("iteration", iteration_count, info, J_opt, xs[-1])

costs = {k: make_reacher_cost(k) for k in dynamics.keys()}
ilqrs = {k: iLQR(dyn, costs[k], N) for k, dyn in dynamics.items()}
xs = {}
us = {}
print(ilqrs.keys())
for k, ilqr in ilqrs.items():
    print('*** Fitting {} ***'.format(k))
    x0 = x0s[k]
    xs[k], us[k] = ilqr.fit(x0, us_init, n_iterations=100, on_iteration=on_iteration)

dict_keys(['my_basic', 'my_warmstart'])
*** Fitting my_basic ***
iteration -1 converged 75.56507394094349 [ 7.77702008  1.21356644  0.02243766  0.07369059  3.36928471 -8.26249354
  0.          0.        ]
iteration 0 accepted 24.703256282665553 [ 7.76242431  1.35041403  0.02243766  0.07369059  3.60363846 -8.19340603
  0.          0.        ]
iteration 1 accepted 18.55274583274787 [ 7.42913402  1.4810794   0.02243766  0.07369059  3.8427397  -8.08958063
  0.          0.        ]
iteration 2 accepted 15.279133090029 [ 7.03478229  1.74744537  0.02243766  0.07369059  4.07205907 -7.59386489
  0.          0.        ]
iteration 3 accepted 15.239257588663513 [ 7.21736301  1.78609219  0.02243766  0.07369059  4.15196273 -7.25084597
  0.          0.        ]
iteration 4 accepted 14.684800849193042 [ 7.04007353  2.73728913  0.02243766  0.07369059 -2.32950265 -1.83821299
  0.          0.        ]
iteration 5 accepted 14.269744785140134 [ 5.55968126  2.93255467  0.02243766  0.07369059 -5.49310446 -1.

iteration 59 accepted 11.240245950742368 [6.38352597 2.49819734 0.02243766 0.07369059 0.68103235 1.0381164
 0.         0.        ]
iteration 60 accepted 11.236903946056715 [6.40119186 2.50963456 0.02243766 0.07369059 0.75301441 1.05955418
 0.         0.        ]
iteration 61 failed 11.23690394605672 [6.40119186 2.50963456 0.02243766 0.07369059 0.75301441 1.05955418
 0.         0.        ]
iteration 62 failed 11.23690394605672 [6.40119186 2.50963456 0.02243766 0.07369059 0.75301441 1.05955418
 0.         0.        ]
iteration 63 failed 11.23690394605672 [6.40119186 2.50963456 0.02243766 0.07369059 0.75301441 1.05955418
 0.         0.        ]
iteration 64 accepted 11.219803260915253 [6.38422498 2.49685761 0.02243766 0.07369059 0.76231462 1.06222308
 0.         0.        ]
iteration 65 accepted 11.219783020218237 [6.38479641 2.49580295 0.02243766 0.07369059 0.76300924 1.06177546
 0.         0.        ]
iteration 66 converged 11.219777048828897 [6.38796744 2.49602055 0.02243766 0.07369059

iteration 52 failed 11.243411616266048 [6.3809456  2.49475621 0.02243766 0.07369059 0.64109733 1.02809222
 0.         0.        ]
iteration 53 accepted 11.241486607912092 [6.38403333 2.50404076 0.02243766 0.07369059 0.65579549 1.03733036
 0.         0.        ]
iteration 54 failed 11.241486607912083 [6.38403333 2.50404076 0.02243766 0.07369059 0.65579549 1.03733036
 0.         0.        ]
iteration 55 accepted 11.240825958885125 [6.39342956 2.51170036 0.02243766 0.07369059 0.67039936 1.04236941
 0.         0.        ]
iteration 56 failed 11.240825958885127 [6.39342956 2.51170036 0.02243766 0.07369059 0.67039936 1.04236941
 0.         0.        ]
iteration 57 failed 11.240825958885127 [6.39342956 2.51170036 0.02243766 0.07369059 0.67039936 1.04236941
 0.         0.        ]
iteration 58 accepted 11.240517910017063 [6.3819835  2.49702439 0.02243766 0.07369059 0.67911918 1.03651265
 0.         0.        ]
iteration 59 accepted 11.24024594731029 [6.38352598 2.49819734 0.02243766 0.07369059

# Receding horizon

In [6]:
from ilqr.controller import RecedingHorizonController

def receding(underlying):
    k = 'receding_' + underlying
    dynamics[k] = dynamics[underlying]
    x0s[k] = x0s[underlying]
    controller = RecedingHorizonController(x0s[k], ilqrs[underlying])
    rew = []
    xs[k] = []
    us[k] = []
    for x, u in controller.control(us_init, subsequent_n_iterations=10):
        ob, r, done, info = env.step(u)
        xs[k].append(x)
        us[k].append(u)
        rew.append(r)
        print('iteration', len(rew), r, x, u)
        if len(rew) == N:
            break

In [7]:
receding('my_basic')

iteration 1 -0.5342466440375117 [[-2.51713244e-02 -3.13228967e-03  2.24376640e-02  7.36905865e-02
  -3.59075061e-03 -1.20677992e-03  0.00000000e+00  0.00000000e+00]
 [-2.44457471e-02  2.53625943e-02  2.24376640e-02  7.36905865e-02
   7.58877078e-02  2.84122219e+00  0.00000000e+00  0.00000000e+00]] [[0.02035164 0.71786165]]
iteration 2 -0.44724372876823926 [[-0.02444575  0.02536259  0.02243766  0.07369059  0.07588771  2.84122219
   0.          0.        ]
 [-0.02214726  0.1077051   0.02243766  0.07369059  0.15370759  5.38455189
   0.          0.        ]] [[0.02029876 0.65653563]]
iteration 3 -0.3357802246549365 [[-0.02214726  0.1077051   0.02243766  0.07369059  0.15370759  5.38455189
   0.          0.        ]
 [-0.0183502   0.23618477  0.02243766  0.07369059  0.2257704   7.45650989
   0.          0.        ]] [[0.0191779  0.55020601]]
iteration 4 -0.27915353514153274 [[-0.0183502   0.23618477  0.02243766  0.07369059  0.2257704   7.45650989
   0.          0.        ]
 [-0.01304661  0.4

iteration 32 -0.06137798855952696 [[ 0.38677499  2.32050322  0.02243766  0.07369059  1.48713376 -6.38933449
   0.          0.        ]
 [ 0.41883805  2.18736651  0.02243766  0.07369059  1.71840465 -6.92256152
   0.          0.        ]] [[ 0.06583285 -0.1666098 ]]
iteration 33 -0.04441794640285346 [[ 0.41883805  2.18736651  0.02243766  0.07369059  1.71840465 -6.92256152
   0.          0.        ]
 [ 0.45540921  2.0437468   0.02243766  0.07369059  1.93798354 -7.4376935
   0.          0.        ]] [[ 0.0640312 -0.1647038]]
iteration 34 -0.06638011439725525 [[ 0.45540921  2.0437468   0.02243766  0.07369059  1.93798354 -7.4376935
   0.          0.        ]
 [ 0.49625479  1.89016625  0.02243766  0.07369059  2.14588467 -7.91875989
   0.          0.        ]] [[ 0.06217489 -0.15867373]]
iteration 35 -0.0474326006812158 [[ 0.49625479  1.89016625  0.02243766  0.07369059  2.14588467 -7.91875989
   0.          0.        ]
 [ 0.54132235  1.72760369  0.02243766  0.07369059  2.36015859 -8.33610796
 

iteration 62 -0.10152010437624491 [[ 0.89887567 -2.93118756  0.02243766  0.07369059  0.49717177  3.87791665
   0.          0.        ]
 [ 0.91300933 -2.84611733  0.02243766  0.07369059  0.91479725  4.62661111
   0.          0.        ]] [[0.10796618 0.20846933]]
iteration 63 -0.09879767027341924 [[ 0.91300933 -2.84611733  0.02243766  0.07369059  0.91479725  4.62661111
   0.          0.        ]
 [ 0.93526797 -2.74677714  0.02243766  0.07369059  1.30974342  5.30514589
   0.          0.        ]] [[0.10433563 0.19449435]]
iteration 64 -0.09362414807149824 [[ 0.93526797 -2.74677714  0.02243766  0.07369059  1.30974342  5.30514589
   0.          0.        ]
 [ 0.9652698  -2.63383428  0.02243766  0.07369059  1.68916512  5.98686822
   0.          0.        ]] [[0.10240257 0.19869241]]
iteration 65 -0.0808648111775787 [[ 0.9652698  -2.63383428  0.02243766  0.07369059  1.68916512  5.98686822
   0.          0.        ]
 [ 1.00314829 -2.50855369  0.02243766  0.07369059  2.09730989  6.53934864
   

iteration 94 -0.1911318768535486 [[4.56087084 1.40659392 0.02243766 0.07369059 8.48627399 4.42042857
  0.         0.        ]
 [4.72798451 1.49339086 0.02243766 0.07369059 8.22595389 4.25980201
  0.         0.        ]] [[-0.02345668 -0.01840149]]
iteration 95 -0.17215104774319445 [[4.72798451 1.49339086 0.02243766 0.07369059 8.22595389 4.25980201
  0.         0.        ]
 [4.88993281 1.57646198 0.02243766 0.07369059 7.96972241 4.04801779
  0.         0.        ]] [[-0.0237175  -0.03212732]]
iteration 96 -0.18256840550314948 [[4.88993281 1.57646198 0.02243766 0.07369059 7.96972241 4.04801779
  0.         0.        ]
 [5.04587126 1.65574466 0.02243766 0.07369059 7.62526236 3.88081095
  0.         0.        ]] [[-0.04727903 -0.02193585]]
iteration 97 -0.18480631666830943 [[5.04587126 1.65574466 0.02243766 0.07369059 7.62526236 3.88081095
  0.         0.        ]
 [5.1940471  1.7314861  0.02243766 0.07369059 7.19375045 3.69395896
  0.         0.        ]] [[-0.0709852  -0.02774186]]
itera

# Rollouts

In [8]:
import time

def rollout(env, dynamics, x0, us, render=False):
    dynamics.set_state(x0)
    if render:
        env.render()
    rew = []
    actual_xs = []
    for u in us:
        _obs, r, done, info = env.step(u)
        rew.append(r)
        actual_xs.append(dynamics.get_state())
        assert not done
        if render:
            env.render()
            time.sleep(0.02)
    return rew, actual_xs

In [13]:
rews = {}
actual_xs = {}
for k, solved_us in us.items():
    print(k)
    rews[k], actual_xs[k] = rollout(env.unwrapped, dynamics[k], x0s[k], solved_us, render=True)
rewards = {k: sum(r) for k, r in rews.items()}
lengths = {k: len(r) for k, r in rews.items()}
pd.DataFrame({'rewards': rewards, 'lengths': lengths})

my_basic
my_warmstart
receding_my_basic


Unnamed: 0,rewards,lengths
my_basic,-15.842267,100
my_warmstart,-15.842267,100
receding_my_basic,-15.70971,100
