In [None]:
# Clone repository
import os, sys

import yaml

os.chdir("/content")
if not os.path.isdir("RL_DEMO"):
  !git clone https://github.com/Kyu3224/RL_DEMO.git
else:
  print("Cloned Directory already exists")

os.chdir("/content/RL_DEMO")
print("Current Directory: ", os.getcwd())

sys.path.insert(0, "/content/RL_DEMO")
os.environ["MUJOCO_GL"] = "egl"

In [None]:
# Install dependencies
!pip install torch numpy tensorboard gymnasium==0.29.1 protobuf==4.25.3 stable-baselines3==2.3.0 mujoco==3.1.5 imageio

In [None]:
import numpy as np
import importlib

import os
import gc
import time
import imageio

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from IPython.display import Video, display
from pathlib import Path

import src.go1_mujoco_env as go1_env
# Camera settings kept at notebook level. Nothing in the visible cells reads
# this dict; it is retained in case other code refers to it.
DEFAULT_CAMERA_CONFIG = dict(
    azimuth=90.0,
    distance=3.0,
    elevation=-25.0,
    lookat=np.array([0., 0., 0.]),
    fixedcamid=0,
    trackbodyid=-1,
    type=2,
)

# Training hyper-parameters (n_envs, seed, eval_freq, total_timestep are the
# keys read by the training cell) come from the repository's params.yaml.
policy_cfg_path = Path("/content/RL_DEMO/params.yaml")
policy_cfg = yaml.safe_load(policy_cfg_path.read_text(encoding="utf-8"))

In [None]:
# Pick up any edits made to the environment module since it was first imported.
importlib.reload(go1_env)

# Train
MODEL_DIR = "models"
LOG_DIR = "logs"

os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

# Parallel training environments, one subprocess per environment.
vec_env = make_vec_env(
    go1_env.Go1MujocoEnv,
    env_kwargs={"prj_path": "/content/RL_DEMO"},
    n_envs=policy_cfg["n_envs"],
    seed=policy_cfg["seed"],
    vec_env_cls=SubprocVecEnv,
)
eval_env = None

# Each run is identified by its start timestamp; the same name is used for the
# model directory and the TensorBoard log.
train_time = time.strftime("%Y-%m-%d_%H-%M-%S")
run_name = f"{train_time}"

model_path = f"{MODEL_DIR}/{run_name}"
print(
    f"Training on {policy_cfg['n_envs']} parallel training environments and saving models to '{model_path}'"
)

try:
    # Evaluation must run on its own environment: evaluating on the training
    # env resets and steps it in the middle of training, so the learner would
    # resume from observations that no longer match the env state.
    # SubprocVecEnv is used so the eval env has the same type as the training env.
    eval_env = make_vec_env(
        go1_env.Go1MujocoEnv,
        env_kwargs={"prj_path": "/content/RL_DEMO"},
        n_envs=1,
        seed=policy_cfg["seed"],
        vec_env_cls=SubprocVecEnv,
    )

    # Evaluate the model every eval_frequency for 5 episodes and save
    # it if it's improved over the previous best model.
    # NOTE(review): EvalCallback counts calls to the vectorized env's step, so
    # with n_envs > 1 an evaluation happens every eval_freq * n_envs
    # environment steps - confirm params.yaml's eval_freq accounts for this.
    eval_callback = EvalCallback(
        eval_env,
        best_model_save_path=model_path,
        log_path=LOG_DIR,
        eval_freq=policy_cfg["eval_freq"],
        n_eval_episodes=5,
        deterministic=True,
        render=False,
    )

    # Seed the algorithm with the same configured seed as the environments so
    # that policy initialisation and action sampling are reproducible too.
    model = PPO(
        "MlpPolicy",
        vec_env,
        verbose=1,
        tensorboard_log=LOG_DIR,
        seed=policy_cfg["seed"],
    )

    model.learn(
        total_timesteps=policy_cfg["total_timestep"],
        reset_num_timesteps=False,
        progress_bar=True,
        tb_log_name=run_name,
        callback=eval_callback,
    )
    # Save final model
    model.save(f"{model_path}/final_model")
finally:
    # Always shut the worker processes down, including when training is
    # interrupted or raises, so they do not outlive this cell.
    vec_env.close()
    if eval_env is not None:
        eval_env.close()

# Drop the references created above before asking the garbage collector to run,
# so memory is released before the test cell.
del model
del eval_callback
del vec_env
del eval_env

gc.collect()

In [None]:
# Test
# Pick up any edits made to the environment module since it was first imported.
importlib.reload(go1_env)
# This checkpoint path is under src/models/pretrained, not the
# models/<timestamp> directory the training cell writes to.
model_path = "/content/RL_DEMO/src/models/pretrained/final_model"

# Single environment rendering off-screen frames for the video.
env = go1_env.Go1MujocoEnv(
    prj_path="/content/RL_DEMO",
    render_mode="rgb_array",
    camera_name="tracking",
    width=540,
    height=360,
)
# Not read anywhere in this cell; kept in case other code refers to it.
inter_frame_sleep = 0.0

num_episodes = 1
total_reward = 0
total_length = 0

video_path = "/content/rollout.mp4"

# Every rendered frame of every episode, in order; written out as one video.
frames = []

try:
    model = PPO.load(path=model_path, env=env, verbose=1)

    for ep in range(num_episodes):
        obs, _ = env.reset()
        done = False
        ep_reward = 0
        ep_len = 0

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)

            frame = env.render()     # returns an rgb_array frame
            frames.append(frame)

            ep_reward += reward
            ep_len += 1
            # An episode ends on either a terminal state or a truncation.
            done = terminated or truncated

        # Fold the episode into the running totals declared above.
        total_reward += ep_reward
        total_length += ep_len

        print(f"[Episode {ep}] len={ep_len}, reward={ep_reward}")
finally:
    # Release the environment (and its renderer) even if the rollout fails.
    env.close()

print(f"[Total] episodes={num_episodes}, len={total_length}, reward={total_reward}")

imageio.mimsave(video_path, frames, fps=30)
print("Saved video to:", video_path)

# Embed the saved video in the notebook output.
display(
    Video(
        video_path,
        embed=True,
        html_attributes="controls autoplay loop"
    )
)