# Gym Environment Interface: minimal example

In [1]:
import robotic as ry
import gymnasium as gym
import numpy as np
# Print the robotic package version and its compile-time stamp so the
# notebook output records which build produced the results below.
print('ry version:', ry.__version__, ry.compiled())

ry version: 0.1.7 compile time: Feb 13 2024 12:22:55


In [2]:
# A basic configuration, where the orange box is to be pushed to the target with the stick

C = ry.Config()
C.addFile(ry.raiPath('scenarios/pandaSingle.g'))
C.view(False)

# Orange box: the object that has to be pushed.
(
    C.addFrame('box')
    .setShape(ry.ST.ssBox, size=[.1, .1, .1, .005])
    .setColor([1, .5, 0])
    .setPosition([.1, .35, .9])
    .setMass(.1)
)

# Stick: a capsule frame created under 'l_gripper', offset along its z-axis.
(
    C.addFrame('stick', 'l_gripper')
    .setShape(ry.ST.capsule, size=[.3, .02])
    .setColor([.5, 1, 0])
    .setRelativePosition([0, 0, -.13])
)

# Target: a green marker showing where the box should end up.
(
    C.addFrame('target')
    .setShape(ry.ST.marker, size=[.1])
    .setColor([0, 1, 0])
    .setPosition([.5, .0, .7])
)

# Cosmetic joint tweaks only (they do not define the task).
C.setJointState([.0], ['l_panda_joint2'])
C.setJointState([.02], ['l_panda_finger_joint1'])

# Snapshot of the initial joint and frame state, reused when resetting.
q0 = C.getJointState()
X0 = C.getFrameState()

C.view()

0

In [3]:
# Generic gym environment, instantiating a PhysX multibody simulation with velocity control
# the arguments C, time_limit, and reward_fct define the problem

class RaiGym(gym.Env):
    """Gymnasium environment around a `ry.Simulation` driven by joint-velocity actions.

    The problem is fully defined by the constructor arguments:

    Args:
        C: the `ry.Config` to simulate. Its joint and frame state at
            construction time is stored and restored on every `reset()`.
        time_limit: episode length in simulated seconds; `step()` reports
            `terminated=True` once the accumulated time reaches it.
        reward_fct: callable taking the configuration and returning the
            scalar reward for the current state.
        render_mode: `None`, `"human"` or `"rgb_array"`.

    Observations are the joint state of the configuration; actions are joint
    velocities with one entry per joint, each in [-1, 1].
    """

    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}
    tau = .05   # simulated seconds advanced per call to step()
    time = 0.   # simulated seconds elapsed in the current episode

    def __init__(self, C, time_limit, reward_fct, render_mode=None):
        # Validate before storing anything, so an invalid mode fails early.
        assert render_mode is None or render_mode in self.metadata["render_modes"]

        self.C = C
        self.time_limit = time_limit
        self.reward_fct = reward_fct
        self.render_mode = render_mode
        self.time = 0.

        #self.limits = self.C.getJointLimits()
        self.limits = [-10., 10.]

        # Initial state of *this* environment's configuration, restored by reset().
        self.q0 = self.C.getJointState()
        self.X0 = self.C.getFrameState()

        self.observation_space = gym.spaces.box.Box(self.limits[0], self.limits[1], shape=(self.q0.size,), dtype=np.float32)
        self.action_space = gym.spaces.box.Box(low=-1., high=1., shape=(self.q0.size,), dtype=np.float32)

        self.sim = ry.Simulation(self.C, ry.SimulationEngine.physx, 0)

    def __del__(self):
        # Drop the simulation before the configuration it was built on.
        # The guards keep this safe when __init__ failed part-way through.
        for attr in ("sim", "C"):
            if attr in self.__dict__:
                delattr(self, attr)

    def _observation(self):
        """Return the current joint state, cast to the dtype declared by `observation_space`."""
        return np.asarray(self.C.getJointState(), dtype=np.float32)

    def step(self, action):
        """Advance the simulation by `tau` seconds using `action` as joint velocities.

        Returns:
            The Gymnasium 5-tuple `(observation, reward, terminated, truncated, info)`.
            `terminated` becomes True when the time limit is reached;
            `truncated` is always False.
        """
        self.sim.step(action, self.tau, ry.ControlMode.velocity)
        self.time += self.tau

        observation = self._observation()
        # Evaluate the reward on this environment's own configuration,
        # not on whatever global happens to be named `C`.
        reward = self.reward_fct(self.C)
        # NOTE: a pure time limit is conceptually a truncation in Gymnasium;
        # it is reported as `terminated` here to keep existing callers working.
        terminated = (self.time >= self.time_limit)
        info = {"no": "additional info"}

        return observation, reward, terminated, False, info

    def reset(self, seed=None, options=None):
        """Restore the initial frame/joint state and restart the episode clock.

        Returns:
            `(observation, info)` as required by the Gymnasium API.
        """
        super().reset(seed=seed)

        self.time = 0.
        # Use the state captured in __init__ rather than notebook globals.
        self.sim.setState(self.X0, self.q0)
        self.sim.resetSplineRef()

        observation = self._observation()
        info = {"no": "additional info"}

        if self.render_mode == "human":
            self.C.view(False)

        return observation, info

    def render(self):
        """Refresh the viewer; in `"rgb_array"` mode also return the viewer's RGB image."""
        self.C.view(False, f'RaiGym time {self.time} / {self.time_limit}')
        if self.render_mode == "rgb_array":
            return self.C.view_getRgb()


In [4]:
# reward function

def reward_function(C):
    """Reward for the pushing task, evaluated on configuration `C`.

    It is the first component of the `negDistance` feature between "stick"
    and "box", minus the Euclidean norm of the `positionDiff` feature
    between "box" and "target".
    """
    stick_box_value, _jacobian = C.eval(ry.FS.negDistance, ["stick", "box"])
    box_target_value, _jacobian = C.eval(ry.FS.positionDiff, ["box", "target"])
    return stick_box_value[0] - np.linalg.norm(box_target_value)

In [5]:
# Environment for the pushing scene, with 10 s of simulated time per episode.
g = RaiGym(C, time_limit=10., reward_fct=reward_function)

In [6]:
# basic test

g.reset()

# Constant action: unit negative velocity on the first joint, zero elsewhere.
v = np.zeros(g.q0.size)
v[0] = -1.
print(v)

t = 0
while True:
    t += 1
    ret = g.step(v)
    observation, reward, terminated, truncated, info = ret
    if terminated:
        break
    print("reward: ", reward)
    # refresh the viewer every 10th step only
    if t % 10 == 0:
        g.render()

[-1.  0.  0.  0.  0.  0.  0.]
reward:  -0.5893210723155677
reward:  -0.5747704604290415
reward:  -0.5607111276733189
reward:  -0.5630630200485639
reward:  -0.5524492814971189
reward:  -0.5490595054403244
reward:  -0.5450311473737579
reward:  -0.5405927227372814
reward:  -0.5357799838867628
reward:  -0.5307265738687247
reward:  -0.5266044574494736
reward:  -0.5220843731557211
reward:  -0.5187751552671973
reward:  -0.5146539024644406
reward:  -0.5106346294164319
reward:  -0.5069171041641415
reward:  -0.5028811562461459
reward:  -0.49920249292239577
reward:  -0.4956529159405116
reward:  -0.4919866111875385
reward:  -0.4882585681405122
reward:  -0.4846028546796664
reward:  -0.48103014149541995
reward:  -0.47766487000996394
reward:  -0.4744564425913193
reward:  -0.47114841059972473
reward:  -0.4680937740787613
reward:  -0.4647754586856758
reward:  -0.4618417588663975
reward:  -0.45899672438976946
reward:  -0.45594942627170015
reward:  -0.4533128286247184
reward:  -0.4504513890345941
reward:

In [7]:
# train a Stable-Baselines3 agent

from stable_baselines3 import A2C, SAC

In [8]:
# Soft Actor-Critic with the default MLP policy, acting on the RaiGym env.
# (alternative algorithm: A2C("MlpPolicy", g, verbose=1))
model = SAC(policy="MlpPolicy", env=g, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [9]:
# Short demo run only; raise this for an actually useful policy.
TOTAL_TIMESTEPS = 1_000
model.learn(total_timesteps=TOTAL_TIMESTEPS)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -117     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 68       |
|    time_elapsed    | 11       |
|    total_timesteps | 800      |
| train/             |          |
|    actor_loss      | -16.7    |
|    critic_loss     | 0.127    |
|    ent_coef        | 0.811    |
|    ent_coef_loss   | -2.47    |
|    learning_rate   | 0.0003   |
|    n_updates       | 699      |
---------------------------------


<stable_baselines3.sac.sac.SAC at 0x7f577b7b60a0>

In [10]:
# play the policy

obs, info = g.reset()
for t in range(100):
    action, _state = model.predict(obs, deterministic=True)
    # Feed the *new* observation back into the policy; otherwise every
    # action would be predicted from the initial observation.
    obs, reward, terminated, truncated, info = g.step(action)
    if not (t%10):
        g.render()
    # Stop once the environment reports the end of the episode.
    if terminated or truncated:
        break

In [11]:
# Release the learner, the environment, and finally the configuration (in that order).
del model, g, C