In [1]:
from ilqr import iLQR
import gym
import numpy as np
import pandas as pd

from aprl.agents import MujocoFiniteDiffDynamics, MujocoFiniteDiffCost
from aprl.agents.mujoco_control import MujocoFiniteDiffDynamicsLowLevel, MujocoFiniteDiffDynamicsWarmstart



Logging to /tmp/openai-2019-01-26-14-32-14-962475
Choosing the latest nvidia driver: /usr/lib/nvidia-396, among ['/usr/lib/nvidia-375', '/usr/lib/nvidia-396']
Choosing the latest nvidia driver: /usr/lib/nvidia-396, among ['/usr/lib/nvidia-375', '/usr/lib/nvidia-396']


In [2]:
# Environment setup
env = gym.make('Reacher-v2').unwrapped
env.seed(21)
_obs = env.reset()

In [3]:
# Planning setup
N = 100  # planning horizon
us_init = np.array([env.action_space.sample() for _ in range(N)])

dynamics = {
    # Uses mujoco_py's MjSimState. This saves time, qpos, qvel, act and udd_state.
    #'mujoco_py': MujocoFiniteDiffDynamics(env),
    # Uses my MujocoRelevantSimState, which contains all fields that MuJoCo's derivative.cpp copies.
    #'my_all': MujocoFiniteDiffDynamicsLowLevel(env, kind='all'),
    # All fields I think matter; excludes qfrc and xfrc_applied
    #'my_recommended': MujocoFiniteDiffDynamicsLowLevel(env, kind='recommended'),
    # qpos, qvel and qacc; no warmstart.
    #'my_basic_plus': MujocoFiniteDiffDynamicsLowLevel(env, kind='basic_plus'),
    # As above, but restricts to fields qpos & qvel. 
    # I expect this to match mujoco_py, since time does not matter, and act and udd_state are blank for Reacher.
    'my_basic': MujocoFiniteDiffDynamicsLowLevel(env, kind='basic'),
    # Like my_basic, but saving qacc_warmstart.
    'my_warmstart': MujocoFiniteDiffDynamicsWarmstart(env),
}
x0s = {k: dyn.get_state() for k, dyn in dynamics.items()}

# Finite difference cost

In [None]:
finite_cost = MujocoFiniteDiffCost(env)
finite_ilqr = iLQR(dynamics, finite_cost, N)

In [None]:
finite_xs, finite_us = finite_ilqr.fit(x0, us_init, n_iterations=100, on_iteration=on_iteration)

# Analytic cost

In [4]:
import theano
from theano import tensor as T
from ilqr.cost import AutoDiffCost

# Reacher, Gym observation:
# obs[0:1]: xs; np.cos(qpos[0:2]) (qpos[0] is joint0, qpos[1] is joint1)
# obs[2:3]: ys; np.sin(qpos[0:2]);
# obs[4:5]: goal x and y; qpos[2:]; (target_x and target_y)
# obs[6:7]: theta dot
# obs[8:9]: xy of fingertip - target

def make_reacher_cost(kind, control_weight=1.0):
    # qpos[0:3]: theta of joint 0, theta of joint 1; target x and y.
    qpos_inputs = [T.dscalar('theta'), T.dscalar('phi'), T.dscalar('targetx'), T.dscalar('targety')]
    # qvel: derivatives of the above; note target x and y are constant so have derivative zero.
    qvel_inputs = [T.dscalar('thetadot'), T.dscalar('phidot'), T.dscalar('_zero1'), T.dscalar('_zero2')]
    # qacc: second derivatives of qpos. We don't actually use these in the cost.
    qacc_inputs = [T.dscalar('_acc{}'.format(i)) for i in range(len(qpos_inputs))]
    # qacc_warmstart: same shape as qacc
    qacc_warmstart_inputs = [T.dscalar('_accwarm{}'.format(i)) for i in range(len(qpos_inputs))]
    # qfrc_applied: same shape as qacc
    qfrc_applied_inputs = [T.dscalar('_qfrc_applied{}'.format(i)) for i in range(len(qpos_inputs))]
    # xfrc_applied: (5,6)
    xfrc_applied_inputs = [T.dscalar('_xfrc_applied{}'.format(i)) for i in range(5 * 6)]
    if kind == 'mujoco_py':
        # Reacher, MJSimState.flatten():
        # obs[0]: time step, obs[1:4]: qpos[0:3]; obs[5:8]: qvel[0:3]
        # In general might include action and udd_state, but not for Reacher.
        x_inputs = [T.dscalar('_time')] + qpos_inputs + qvel_inputs
    elif kind == 'my_all':
        # Reacher, MujocoRelevantState.flatten()
        x_inputs = qpos_inputs + qvel_inputs + qacc_inputs + qacc_warmstart_inputs + qfrc_applied_inputs + xfrc_applied_inputs
    elif kind == 'my_recommended':
        x_inputs = qpos_inputs + qvel_inputs + qacc_inputs + qacc_warmstart_inputs
    elif kind == 'my_basic_plus':
        x_inputs = qpos_inputs + qvel_inputs + qacc_inputs
    elif kind in ['my_basic', 'my_warmstart']:
        x_inputs = qpos_inputs + qvel_inputs
    else:
        raise ValueError("Unrecognised kind: '{}'".format(kind))
    u_inputs = [T.dscalar('thetadotdot'), T.dscalar('phidotdot')]
    qpos = T.stack(qpos_inputs)
    u = T.stack(u_inputs)
    
    control_cost = T.dot(u, u)
    target_xpos = qpos[2:4]
    body1_xpos = 0.1 * T.stack([T.cos(qpos[0]), T.sin(qpos[0])])
    fingertip_xpos_delta = 0.11 * T.stack([T.cos(qpos[1]), T.sin(qpos[1])])
    fingertip_xpos = body1_xpos + fingertip_xpos_delta
    delta = fingertip_xpos - target_xpos
    state_cost = T.sqrt(T.dot(delta, delta))
    l = state_cost + control_weight * control_cost
    l_terminal = T.zeros(())
    return AutoDiffCost(l, l_terminal, x_inputs, u_inputs)

In [5]:
def on_iteration(iteration_count, xs, us, J_opt, accepted, converged):
    info = "converged" if converged else ("accepted" if accepted else "failed")
    print("iteration", iteration_count, info, J_opt, xs[-1])

costs = {k: make_reacher_cost(k) for k in dynamics.keys()}
ilqrs = {k: iLQR(dyn, costs[k], N) for k, dyn in dynamics.items()}
xs = {}
us = {}
print(ilqrs.keys())
for k, ilqr in ilqrs.items():
    print('*** Fitting {} ***'.format(k))
    x0 = x0s[k]
    xs[k], us[k] = ilqr.fit(x0, us_init, n_iterations=100, on_iteration=on_iteration)

dict_keys(['my_basic', 'my_warmstart'])
*** Fitting my_basic ***
iteration -1 converged 84.27510736564943 [ 7.79198693  1.98838794 -0.08007569  0.15573813  3.36959087 -6.12580201
  0.          0.        ]
iteration 0 accepted 32.78745332055802 [ 9.79664724  2.10611239 -0.08007569  0.15573813  3.58715141 -6.0514958
  0.          0.        ]
iteration 1 accepted 25.409100892170347 [ 7.68950636  2.12747417 -0.08007569  0.15573813  3.84096417 -5.95721632
  0.          0.        ]
iteration 2 accepted 24.847739716151878 [ 9.54534789  2.04409452 -0.08007569  0.15573813  4.78573992 -6.08036618
  0.          0.        ]
iteration 3 accepted 23.99286131200891 [ 5.41964153  1.68061858 -0.08007569  0.15573813  4.04113738 -6.27247195
  0.          0.        ]
iteration 4 accepted 23.91893008593514 [ 3.30241624  1.58951725 -0.08007569  0.15573813 -6.46371275 -5.43425168
  0.          0.        ]
iteration 5 failed 23.918930085935145 [ 3.30241624  1.58951725 -0.08007569  0.15573813 -6.46371275 -5.43

iteration 1 accepted 25.409100850874957 [ 7.68950637  2.12747418 -0.08007569  0.15573813  3.84096421 -5.95721634
  0.          0.        ]
iteration 2 accepted 24.847739297587115 [ 9.54534775  2.04409426 -0.08007569  0.15573813  4.78573989 -6.08036724
  0.          0.        ]
iteration 3 accepted 23.99285771863283 [ 5.41964212  1.68062636 -0.08007569  0.15573813  4.04113664 -6.27245247
  0.          0.        ]
iteration 4 accepted 23.918928826026 [ 3.30242729  1.58951852 -0.08007569  0.15573813 -6.46366063 -5.43424621
  0.          0.        ]
iteration 5 failed 23.918928826026 [ 3.30242729  1.58951852 -0.08007569  0.15573813 -6.46366063 -5.43424621
  0.          0.        ]
iteration 6 failed 23.918928826026 [ 3.30242729  1.58951852 -0.08007569  0.15573813 -6.46366063 -5.43424621
  0.          0.        ]
iteration 7 failed 23.918928826026 [ 3.30242729  1.58951852 -0.08007569  0.15573813 -6.46366063 -5.43424621
  0.          0.        ]
iteration 8 accepted 23.4767854419953 [ 2.9365

iteration 61 failed 18.557423086389587 [ 9.69184323  1.1736284  -0.08007569  0.15573813  7.3599066  -3.39248044
  0.          0.        ]
iteration 62 failed 18.557423086389587 [ 9.69184323  1.1736284  -0.08007569  0.15573813  7.3599066  -3.39248044
  0.          0.        ]
iteration 63 accepted 18.553331467980076 [ 9.69626954  1.17855602 -0.08007569  0.15573813  7.32406967 -3.38888065
  0.          0.        ]
iteration 64 failed 18.553331467980076 [ 9.69626954  1.17855602 -0.08007569  0.15573813  7.32406967 -3.38888065
  0.          0.        ]
iteration 65 accepted 18.55331282360215 [ 9.69640518  1.16435858 -0.08007569  0.15573813  7.32307342 -3.40654365
  0.          0.        ]
iteration 66 accepted 18.5531690905916 [ 9.69654174  1.1764953  -0.08007569  0.15573813  7.32285535 -3.39112126
  0.          0.        ]
iteration 67 failed 18.553169090591606 [ 9.69654174  1.1764953  -0.08007569  0.15573813  7.32285535 -3.39112126
  0.          0.        ]
iteration 68 failed 18.55316909

# Receding horizon

In [8]:
from ilqr.controller import RecedingHorizonController

def receding(underlying):
    k = 'receding_' + underlying
    dynamics[k] = dynamics[underlying]
    x0s[k] = x0s[underlying]
    controller = RecedingHorizonController(x0s[k], ilqrs[underlying])
    rew = []
    xs[k] = []
    us[k] = []
    for x, u in controller.control(us_init, subsequent_n_iterations=10):
        ob, r, done, info = env.step(u)
        xs[k].append(x)
        us[k].append(u)
        rew.append(r)
        print('iteration', len(rew), r, x, u)
        if len(rew) == N:
            break

In [9]:
receding('my_basic')

iteration 1 -0.7017488979482824 [[-0.01698329  0.03921564 -0.08007569  0.15573813  0.00389678  0.00462729
   0.          0.        ]
 [-0.00720982  0.06430297 -0.08007569  0.15573813  0.97023672  2.49580404
   0.          0.        ]] [[0.24454251 0.62926835]]
iteration 2 -0.6408562860343747 [[-0.00720982  0.06430297 -0.08007569  0.15573813  0.97023672  2.49580404
   0.          0.        ]
 [ 0.02052554  0.13644192 -0.08007569  0.15573813  1.80054138  4.71071016
   0.          0.        ]] [[0.21495578 0.57193837]]
iteration 3 -0.5731476356984957 [[ 0.02052554  0.13644192 -0.08007569  0.15573813  1.80054138  4.71071016
   0.          0.        ]
 [ 0.06360841  0.25025534 -0.08007569  0.15573813  2.50541435  6.66412203
   0.          0.        ]] [[0.18736204 0.51696011]]
iteration 4 -0.5292948741919571 [[ 0.06360841  0.25025534 -0.08007569  0.15573813  2.50541435  6.66412203
   0.          0.        ]
 [ 0.11968261  0.40052677 -0.08007569  0.15573813  3.1000531   8.35737867
   0.     

iteration 33 -0.18848850824050475 [[ 2.17251535  1.94201803 -0.08007569  0.15573813  3.47861767 -3.4825319
   0.          0.        ]
 [ 2.24067032  1.87447422 -0.08007569  0.15573813  3.33735102 -3.27254811
   0.          0.        ]] [[-0.01827964  0.03562751]]
iteration 34 -0.23773231648539542 [[ 2.24067032  1.87447422 -0.08007569  0.15573813  3.33735102 -3.27254811
   0.          0.        ]
 [ 2.30538698  1.81161208 -0.08007569  0.15573813  3.13499063 -3.01452431
   0.          0.        ]] [[-0.03442112  0.04880683]]
iteration 35 -0.2179487868143441 [[ 2.30538698  1.81161208 -0.08007569  0.15573813  3.13499063 -3.01452431
   0.          0.        ]
 [ 2.36578271  1.75408138 -0.08007569  0.15573813  2.90534806 -2.73946076
   0.          0.        ]] [[-0.04232715  0.05439758]]
iteration 36 -0.2172548578882569 [[ 2.36578271  1.75408138 -0.08007569  0.15573813  2.90534806 -2.73946076
   0.          0.        ]
 [ 2.42131661  1.70214899 -0.08007569  0.15573813  2.64889778 -2.45472622

iteration 65 -0.3168873714257109 [[ 3.10252189  1.56654364 -0.08007569  0.15573813  5.70651405  1.30922856
   0.          0.        ]
 [ 3.22455524  1.59492777 -0.08007569  0.15573813  6.49420226  1.52844835
   0.          0.        ]] [[0.22755214 0.0619849 ]]
iteration 66 -0.3252290789420915 [[ 3.22455524  1.59492777 -0.08007569  0.15573813  6.49420226  1.52844835
   0.          0.        ]
 [ 3.36234155  1.62796263 -0.08007569  0.15573813  7.28181293  1.77421225
   0.          0.        ]] [[0.23146235 0.06979459]]
iteration 67 -0.3384779303342012 [[ 3.36234155  1.62796263 -0.08007569  0.15573813  7.28181293  1.77421225
   0.          0.        ]
 [ 3.51559078  1.66578376 -0.08007569  0.15573813  8.04058967  2.00711694
   0.          0.        ]] [[0.22810324 0.06778531]]
iteration 68 -0.343692331827644 [[ 3.51559078  1.66578376 -0.08007569  0.15573813  8.04058967  2.00711694
   0.          0.        ]
 [ 3.68344135  1.70812018 -0.08007569  0.15573813  8.74213902  2.22578981
   0.  

iteration 97 -0.17988801930221487 [[ 8.66819848  1.80676037 -0.08007569  0.15573813  3.24225253 -2.94197201
   0.          0.        ]
 [ 8.7282849   1.75011751 -0.08007569  0.15573813  2.76797286 -2.72304078
   0.          0.        ]] [[-0.10360191  0.04057608]]
iteration 98 -0.17081124785865467 [[ 8.7282849   1.75011751 -0.08007569  0.15573813  2.76797286 -2.72304078
   0.          0.        ]
 [ 8.77959449  1.69781492 -0.08007569  0.15573813  2.36433293 -2.50793338
   0.          0.        ]] [[-0.0881301   0.04070355]]
iteration 99 -0.16072364413412438 [[ 8.77959449  1.69781492 -0.08007569  0.15573813  2.36433293 -2.50793338
   0.          0.        ]
 [ 8.82508769  1.64816518 -0.08007569  0.15573813  2.18558397 -2.4572082
   0.          0.        ]] [[-0.03333495  0.00026996]]
iteration 100 -0.1751267741925953 [[ 8.82508769  1.64816518 -0.08007569  0.15573813  2.18558397 -2.4572082
   0.          0.        ]
 [ 8.86439834  1.6028697  -0.08007569  0.15573813  1.74694466 -2.0736171

# Rollouts

In [6]:
import time

def rollout(env, dynamics, x0, us, render=False):
    dynamics.set_state(x0)
    if render:
        env.render()
    rew = []
    actual_xs = []
    for u in us:
        _obs, r, done, info = env.step(u)
        rew.append(r)
        actual_xs.append(dynamics.get_state())
        assert not done
        if render:
            env.render()
            time.sleep(0.02)
    return rew, actual_xs

In [10]:
rews = {}
actual_xs = {}
for k, solved_us in us.items():
    print(k)
    rews[k], actual_xs[k] = rollout(env.unwrapped, dynamics[k], x0s[k], solved_us, render=True)
rewards = {k: sum(r) for k, r in rews.items()}
lengths = {k: len(r) for k, r in rews.items()}
pd.DataFrame({'rewards': rewards, 'lengths': lengths})

my_basic
my_warmstart
receding_my_basic


Unnamed: 0,rewards,lengths
my_basic,-21.774164,100
my_warmstart,-23.890482,100
receding_my_basic,-22.676585,100
