In [3]:
from ilqr import iLQR
import gym
import numpy as np

from aprl.agents import MujocoFiniteDiffDynamics, MujocoFiniteDiffCost

In [4]:
def on_iteration(iteration_count, xs, us, J_opt, accepted, converged):
    info = "converged" if converged else ("accepted" if accepted else "failed")
    print("iteration", iteration_count, info, J_opt, xs[-1])

In [6]:
# Environment setup
env = gym.make('Reacher-v2').unwrapped
env.reset()
dynamics = MujocoFiniteDiffDynamics(env)
x0 = dynamics.get_state()
N = 100  # planning horizon
us_init = np.array([env.action_space.sample() for _ in range(N)])

# Finite difference cost

In [None]:
finite_cost = MujocoFiniteDiffCost(env)
finite_ilqr = iLQR(dynamics, finite_cost, N)

In [None]:
finite_xs, finite_us = finite_ilqr.fit(x0, us_init, n_iterations=100, on_iteration=on_iteration)

# Analytic cost

In [16]:
import theano
from theano import tensor as T
from ilqr.cost import AutoDiffCost

def make_reacher_cost(idx=1, control_weight=1.0):
    x_inputs = [T.dscalar('_x1'), T.dscalar('theta'), T.dscalar('phi'), T.dscalar('targetx'), T.dscalar('targety'),
                T.dscalar('_x5'), T.dscalar('_x6'), T.dscalar('_x7'), T.dscalar('_x8')]
    u_inputs = [T.dscalar('thetadotdot'), T.dscalar('phidotdot')]
    x = T.stack(x_inputs)
    u = T.stack(u_inputs)
    
    control_cost = T.dot(u, u)
    target_xpos = x[idx + 2:idx + 4]
    body1_xpos = 0.1 * T.stack([T.cos(x[idx]), T.sin(x[idx])])
    fingertip_xpos_delta = 0.11 * T.stack([T.cos(x[idx + 1]), T.sin(x[idx + 1])])
    fingertip_xpos = body1_xpos + fingertip_xpos_delta
    delta = fingertip_xpos - target_xpos
    state_cost = T.sqrt(T.dot(delta, delta))
    l = state_cost + control_weight * control_cost
    l_terminal = T.zeros(())
    return AutoDiffCost(l, l_terminal, x_inputs, u_inputs)

In [15]:
analytic_cost = make_reacher_cost()
analytic_ilqr = iLQR(dynamics, analytic_cost, N)

In [8]:
analytic_xs, analytic_us = analytic_ilqr.fit(x0, us_init, n_iterations=100, on_iteration=on_iteration)

iteration 0 accepted 33.4698504269303 [ 2.         -5.09353249 -0.74258911  0.02911302  0.09613908  3.24488738
  7.76164312  0.          0.        ]
iteration 1 accepted 25.033259680453238 [ 2.         -4.39817071 -1.08032056  0.02911302  0.09613908  3.27963159
  7.82319857  0.          0.        ]
iteration 2 accepted 24.915132847913195 [ 2.         -4.52497897 -1.13612821  0.02911302  0.09613908  3.25764047
  7.61786103  0.          0.        ]
iteration 3 failed 24.915132847913185 [ 2.         -4.52497897 -1.13612821  0.02911302  0.09613908  3.25764047
  7.61786103  0.          0.        ]
iteration 4 accepted 23.910246633603272 [ 2.         -5.10145115 -2.01388726  0.02911302  0.09613908  2.69600267
  5.7191289   0.          0.        ]
iteration 5 accepted 17.343974310182524 [ 2.         -4.3189089  -0.33449095  0.02911302  0.09613908  2.61041562
  5.6297516   0.          0.        ]
iteration 6 accepted 17.324230489024952 [ 2.         -4.3866409  -0.32131299  0.02911302  0.096139

# Alternative FD

In [17]:
from aprl.agents.mujoco_control import MujocoFiniteDiffDynamicsLowLevel
better_dynamics = MujocoFiniteDiffDynamicsLowLevel(env)
better_x0 = better_dynamics.get_state()
better_analytic_cost = make_reacher_cost(idx=0)
better_analytic_ilqr = iLQR(better_dynamics, better_analytic_cost, N)
better_analytic_xs, better_analytic_us = analytic_ilqr.fit(x0, us_init, n_iterations=100, on_iteration=on_iteration)

iteration 0 accepted 33.4698504269303 [ 2.         -5.09353249 -0.74258911  0.02911302  0.09613908  3.24488738
  7.76164312  0.          0.        ]
iteration 1 accepted 25.033259680453238 [ 2.         -4.39817071 -1.08032056  0.02911302  0.09613908  3.27963159
  7.82319857  0.          0.        ]
iteration 2 accepted 24.915132847913195 [ 2.         -4.52497897 -1.13612821  0.02911302  0.09613908  3.25764047
  7.61786103  0.          0.        ]
iteration 3 failed 24.915132847913185 [ 2.         -4.52497897 -1.13612821  0.02911302  0.09613908  3.25764047
  7.61786103  0.          0.        ]
iteration 4 accepted 23.910246633603272 [ 2.         -5.10145115 -2.01388726  0.02911302  0.09613908  2.69600267
  5.7191289   0.          0.        ]
iteration 5 accepted 17.343974310182524 [ 2.         -4.3189089  -0.33449095  0.02911302  0.09613908  2.61041562
  5.6297516   0.          0.        ]
iteration 6 accepted 17.324230489024952 [ 2.         -4.3866409  -0.32131299  0.02911302  0.096139

# Receding horizon

In [None]:
from ilqr.controller import RecedingHorizonController

controller = RecedingHorizonController(x0, analytic_ilqr)  # can also use finite_ilqr
rew = []
receding_xs = []
receding_us = []
for x, u in controller.control(us_init, subsequent_n_iterations=10):
    ob, r, done, info = env.step(u)
    receding_xs.append(x)
    receding_us.append(u)
    rew.append(r)
    print('iteration', len(rew), r, x, u)
    if len(rew) == 50:
        break

# Rollouts

In [9]:
import time

def rollout(env, dynamics, x0, us, render=False):
    dynamics.set_state(x0)
    if render:
        env.render()
    rew = []
    actual_xs = []
    for u in us:
        _obs, r, done, info = env.step(u)
        rew.append(r)
        actual_xs.append(dynamics.get_state())
        assert not done
        if render:
            env.render()
            time.sleep(0.05)
    return rew, actual_xs

In [19]:
rew, actual_xs = rollout(env.unwrapped, dynamics, x0, better_analytic_us, render=True)