In [None]:
# Experiment steps and the reward terms each one enables:
#   step_001: no control at all
#   step_002: stand up reward and parallel feet penalty
#   step_003: step_002 plus forward reward
#   step_004: step_003 plus control penalty, knee flex reward and foot up reward
#   step_007: stand up reward only
#   step_008: step_007 plus forward reward

# Step selected for this run; later cells use it as a folder name.
current_step = "step_007"

In [None]:
!apt install swig cmake ffmpeg xvfb python3-opengl
!pip install pyvirtualdisplay imageio[ffmpeg]

In [None]:
import os

NVIDIA_ICD_CONFIG_PATH = '/usr/share/glvnd/egl_vendor.d/10_nvidia.json'
if not os.path.exists(NVIDIA_ICD_CONFIG_PATH):
  with open(NVIDIA_ICD_CONFIG_PATH, 'w') as f:
    f.write("""{
    "file_format_version" : "1.0.0",
    "ICD" : {
        "library_path" : "libEGL_nvidia.so.0"
    }
}
""")

%env MUJOCO_GL=egl

from pyvirtualdisplay import Display
virtual_display = Display(visible=0, size=(1920, 1080))
virtual_display.start()

In [None]:
# Prepare to load data from google drive
import datetime
import os  # imported here too so the cell does not rely on an earlier cell's import

from google.colab import drive

# CONNECT TO GOOGLE DRIVE
gdrive_path = '/content/drive'
drive.mount(gdrive_path)

# DEFINE WORK DIRECTORY (one folder per experiment step)
workDir = os.path.join(gdrive_path, 'My Drive', 'OP3_TRAINNING', current_step)
print('WorkDir:', workDir)

# Each execution of this cell gets its own timestamped log folder.
log_dir = os.path.join(workDir, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
print('LogDir:', log_dir)

# Create the folder if it doesn't exist. exist_ok=True avoids the
# check-then-create race and still raises if a non-directory is in the way.
os.makedirs(log_dir, exist_ok=True)

tf_log_dir = os.path.join(log_dir, 'tensorboard_logs')
print('TfLogDir:', tf_log_dir)
os.makedirs(tf_log_dir, exist_ok=True)

# best models: one sub-folder per algorithm under <workDir>/best
to_load_path = os.path.join(workDir, 'best')
print('to_load_path:', to_load_path)
algos = ["a2c", "ddpg", "ppo", "sac", "td3"]
for algo in algos:
  algo_load_path = os.path.join(to_load_path, algo)
  print('AlgoLoadPath:', algo_load_path)
  os.makedirs(algo_load_path, exist_ok=True)

In [None]:
# clean content folder
import os
import shutil

# location
location = "/content"

# directories (relative to `location`) to delete
dirs = ["sample_data"]

# The loop variable is deliberately not called `dir`: in a notebook it lives in
# the global namespace and would shadow the builtin dir() for the whole session.
for dir_name in dirs:
    path = os.path.join(location, dir_name)
    try:
        shutil.rmtree(path)
    except OSError as e:
        # Best effort: report the failure (e.g. folder already gone) and carry on.
        print("Error: %s : %s" % (path, e.strerror))

In [None]:
# Install Darwin Model
model_path = '/content/op3'
%cd /

if os.path.isdir(model_path):
  print(f"The directory '{model_path}' exists - git pull")
  %cd {model_path}
  !git pull
else:
  print(f"The directory '{model_path}' does not exist - git clone")
  !git clone https://github.com/Gianzanti/op3_model.git {model_path}


In [None]:
%cd /
!pip install -e {model_path}

In [None]:
# Install RL Zoo
trainner_path = '/content/rl-zoo'
%cd /

if os.path.isdir(trainner_path):
  print(f"The directory '{trainner_path}' exists - git pull")
  %cd {trainner_path}
  !git pull
else:
  print(f"The directory '{trainner_path}' does not exist - git clone")
  !git clone https://github.com/Gianzanti/rl-zoo.git {trainner_path}


In [None]:
%cd /
!pip install -e {trainner_path}

In [17]:
%cd {trainner_path}

algos = {
  "a2c": {
    "lr": 7e-4, "dev": "cpu"
  },
  'ddpg': {
    'lr': 1e-3, 'dev': 'cuda'
  },
  'ppo': {
    'lr': 3e-4, 'dev': 'cpu'
  },
  'sac': {
    'lr': 3e-4, 'dev': 'cuda'
  },
  'td3': {
    'lr': 1e-3, 'dev': 'cuda'
  }
}

n_timestep = 3_000_000
save_freq = min(100_000, int(n_timestep / 10))
eval_freq = min(200_000, int(n_timestep / 10))
max_episode_steps = 1000
wrapper = [{"gymnasium.wrappers.TimeLimit": {"max_episode_steps": max_episode_steps}}]
n_envs = 20
n_eval_envs = 3
eval_episodes = 30

# weights
keep_alive_weight = 1.0
control_weight = 0.00 #1e-3
target_distance = 100.0
velocity_weight = 0.00 #3.0
reach_target_reward = 100.0
knee_flex_weight = 0.00 #1e-3
feet_up_weight = 0.00 #1e-3
feet_misalign_weight = 0.00 #0.5
max_timestep = 800

for algo, value in algos.items():
  print('Training:', algo)
  config = f'research_config/{algo}.yml'
  best = os.path.join(to_load_path, algo, 'best_model.zip')
  if not os.path.exists(best):
    best = ''
  else:
    best = f'-i "{best}"'

  train_cmd = f'python3 train.py --algo {algo} --env DarwinOp3-v3 -conf {config} \
-f "{log_dir}" --tensorboard-log "{tf_log_dir}" --save-freq {save_freq} \
--vec-env subproc --eval-freq {eval_freq} --n-eval-envs {n_eval_envs} --eval-episodes {eval_episodes} \
--env-kwargs keep_alive_weight:{keep_alive_weight} control_weight:{control_weight} \
target_distance:{target_distance} velocity_weight:{velocity_weight} \
reach_target_reward:{reach_target_reward} knee_flex_weight:{knee_flex_weight} \
feet_up_weight:{feet_up_weight} feet_misalign_weight:{feet_misalign_weight} \
max_timestep:{max_timestep} --hyperparams n_envs:{n_envs} learning_rate:{value["lr"]} \
n_timesteps:{n_timestep} env_wrapper:"{wrapper}" --device {value["dev"]} {best} -P'

  print(train_cmd)
  !{train_cmd}

  video_cmd = f'python3 -m rl_zoo3.record_video --algo {algo} \
--env DarwinOp3-v3 -n 3000 --load-best -o "{log_dir}" -f "{log_dir}"'
  print(video_cmd)
  !{video_cmd}


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| time/                     |          |
|    total_timesteps        | 2800000  |
| train/                    |          |
|    actor_loss             | -2.68    |
|    critic_loss            | 0.0166   |
|    learning_rate          | 0.001    |
|    n_updates              | 139499   |
----------------------------------------
[2KNew best mean reward!
[2K----------------------------------------
| mean_episode/             |          |
|    _pos_x                 | -0.0239  |
|    _pos_y                 | -0.0589  |
|    _pos_z                 | 0.28     |
|    _vel_x                 | 0.000399 |
|    _vel_y                 | -0.00455 |
|    _vel_z                 | 0.00218  |
|    info_control           | 135      |
|    info_dst_org           | 0.182    |
|    info_feet_height       | 0.0617   |
|    info_feet_misalignment | 0.133    |
|    info_knee_angvel       | 1.09     |
|    info_timestep          | 444      |
|  