In [1]:
!mkdir -p videos
%load_ext tensorboard

In [1]:
import os
os.system("Xvfb :1 -screen 0 1024x768x24 &")
os.environ['DISPLAY'] = ':1'

import gym
from gym.wrappers.monitoring.video_recorder import VideoRecorder
import pybullet
import pybulletgym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env, SubprocVecEnv

def record_video(env, model, name, video_length=1000):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    """
    video_recorder = VideoRecorder(env, path="./videos/{}.mp4".format(name))
    obs = env.reset()
    steps = 0
    episodes = 0
    epi_length = 0
    epi_length_history = []
    total_reward = 0.
    total_reward_history = []

    while True:
        steps += 1
        epi_length += 1
        env.render(mode='human')
        video_recorder.capture_frame()
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            print("Episode: {0}\tLength: {1}\tScore: {2}"
                  .format(episodes, epi_length, total_reward))
            episodes += 1
            epi_length_history.append(epi_length)
            epi_length = 0
            total_reward_history.append(total_reward)
            total_reward = 0.
            env.reset()
        if steps >= video_length:
            if episodes == 0:
                episodes = 1
                epi_length_history.append(epi_length)
                total_reward_history.append(total_reward)
            break
    print()
    print("Average reward: {0}".format(sum(total_reward_history) / len(total_reward_history)))
    print("Average episode length: {0}".format(sum(epi_length_history) / len(epi_length_history)))

    video_recorder.close()

import base64
from pathlib import Path

from IPython import display as ipythondisplay

def show_videos(video_path='', prefix=''):
    """
    Taken from https://github.com/eleurent/highway-env
    
    :param video_path: (str) Path to the folder containing videos
    :param prefix: (str) Filter the video, showing only the only starting with this prefix
    """
    html = []
    for mp4 in Path(video_path).glob("{}*.mp4".format(prefix)):
        video_b64 = base64.b64encode(mp4.read_bytes())
        html.append('''<video alt="{}" autoplay 
                      loop controls style="height: 400px;">
                      <source src="data:video/mp4;base64,{}" type="video/mp4" />
                  </video>'''.format(mp4, video_b64.decode('ascii')))
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))

def train(cls, name, video_length=500, total_timesteps=100000, n_envs=2, resume_from=None, render_before=True):
    env = make_vec_env(cls, n_envs=n_envs, vec_env_cls=SubprocVecEnv)
    # env = cls()
    if resume_from:
        model = PPO.load(resume_from, env)
    else:
        model = PPO(MlpPolicy, env, verbose=0, n_steps=256, tensorboard_log='./tensorboard')
    # Use a separate environement for evaluation
    with cls(render=True) as eval_env:
        # Random Agent, before training
        if render_before:
            record_video(eval_env, model, name="{}-before".format(name))
        model.learn(total_timesteps=total_timesteps)
        record_video(eval_env, model, name="{}-after".format(name))
        model.save(path="{}-trained.zip".format(name))


def view_model(cls, path, name, video_length):
    eval_env = cls(render=True)
    model = PPO.load(path)
    record_video(eval_env, model, name=name, video_length=video_length)


In [109]:
from pybulletgym.envs.mujoco.envs.locomotion.ant_env import AntMuJoCoEnv

class AntMuJoCoCustomTargetEnv(AntMuJoCoEnv):
    def __init__(self, render=False):
        AntMuJoCoEnv.__init__(self, render)
        self.robot.parts

    def step(self, a):
        self.robot.apply_action(a)
        self.scene.global_step()
        state = self.robot.calc_state()
        
#         xposbefore = self.get_body_com("torso")[0]
#         self.do_simulation(a, self.frame_skip)
#         xposafter = self.get_body_com("torso")[0]
#         forward_reward = (xposafter - xposbefore)/self.dt
#         ctrl_cost = .5 * np.square(a).sum()
#         contact_cost = 0.5 * 1e-3 * np.sum(np.square(np.clip(self.sim.data.cfrc_ext, -1, 1)))
#         survive_reward = 1.0
#         reward = forward_reward - ctrl_cost - contact_cost + survive_reward
        
#         notdone = np.isfinite(state).all() and state[2] >= 0.2 and state[2] <= 1.0
#         done = not notdone

        z0=state[0]
        x01=state[1]
        x04=state[4]
        x07=state[7]
        x10=state[10]
        xx=(x01+x04+x07+x10)/4.0 - 3.0
        x13=state[13]
        x14=state[14]
        # reward = np.exp(-(z0 - 1.8)**2) * ( -np.abs(xx) - np.abs(yy))
        reward =  np.exp(-(z0 - 2.0)**2) * 1/(np.abs(xx)) * np.abs(x13) * np.abs(x14)
        done = not np.isfinite(state).all() or z0 < 0.26 or np.abs(xx) < 1.0
        
        return state, reward, done, {}



300000 프레임을 학습하는데 10분정도 걸립니다.

In [110]:
train(AntMuJoCoCustomTargetEnv,
      name="Ant1", total_timesteps=300000, render_before=False)



WalkerBase::__init__

Average reward: 0.24194597725837605
Average episode length: 1000.0


In [111]:
show_videos(video_path='./videos', prefix="Ant1")