In [None]:
!git clone https://github.com/microsoft/MoCapAct.git
%cd MoCapAct
!pip install -e .
%cd /content/


## Import Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import copy, cv2, pickle, torch, random
from tqdm import tqdm
from dm_control.locomotion.tasks.reference_pose import types
from dm_control.locomotion.tasks.reference_pose import utils as _utils
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from mocapact import observables
from mocapact.sb3 import utils
from mocapact.envs import tracking

from stable_baselines3 import TD3, SAC
# from sac_modified import SAC

from stable_baselines3.common.logger import configure
from stable_baselines3.common.utils import get_device
from torch.utils.tensorboard import SummaryWriter
from typing import Optional

from fastdtw import fastdtw
from scipy.spatial.distance import chebyshev

from math import *
import matplotlib.pyplot as plt

# dynamic gpt model
from mingpt.model import GPT
from mingpt.trainer import Trainer
from mingpt.utils import set_seed, setup_logging, CfgNode as CN

# vqvae model
from vqvae import VQVAE
from quantizer import get_quantizer

# stpos model
from stpos import StPosAE


  for plugin in metadata.entry_points().get(entry_point, []):


pygame 2.1.3 (SDL 2.26.5, Python 3.10.10)
Hello from the pygame community. https://www.pygame.org/contribute.html


  if not hasattr(tensorboard, "__version__") or LooseVersion(
  ) < LooseVersion("1.15"):


## GPT Dynamic Model

In [2]:
def get_config():
    """Assemble the base configuration tree for the GPT dynamic model.

    Returns:
        A ``CN`` node with four entries, in this order: ``device``,
        ``system`` (seed and work directory), ``model`` (GPT defaults with
        ``model_type`` set to ``'gpt-nano'``) and ``trainer`` (Trainer
        defaults with the learning rate set to ``5e-4``).
    """
    # system settings
    system_cfg = CN()
    system_cfg.seed = 3407
    system_cfg.work_dir = 'E:/MoCAP/MCDH/model_based_planner/Experiment_Inverse_Reward_without_priorotized_replay/out/dynamic'

    # model settings
    model_cfg = GPT.get_default_config()
    model_cfg.model_type = 'gpt-nano'

    # trainer settings
    trainer_cfg = Trainer.get_default_config()
    # the model we're using is so small that we can go a bit faster
    trainer_cfg.learning_rate = 5e-4

    # attach the sections in a fixed order (device, system, model, trainer)
    root_cfg = CN()
    root_cfg.device = 'cuda'
    root_cfg.system = system_cfg
    root_cfg.model = model_cfg
    root_cfg.trainer = trainer_cfg
    return root_cfg

# Build the configuration, then override the defaults from get_config() for this experiment.
root_dir = "E:/MoCAP/MCDH/model_based_planner/Experiment_Inverse_Reward_without_priorotized_replay"
config = get_config()
config.trainer.device='cuda'
# config.trainer.batch_size = 2
# model_type is cleared so that the explicit n_layer / n_head / n_embd below are used
# (presumably the GPT config expects either a preset name or explicit sizes - TODO confirm).
config.model.model_type = None
config.model.n_layer = 2
config.model.n_head = 2
config.model.n_embd = 512
config.trainer.learning_rate = 1e-4
# config.trainer.num_workers = 0 # windows only
# config.merge_from_args(sys.argv[1:])
setup_logging(config)
set_seed(config.system.seed)

# load the VQ-VAE model
# NOTE: a failed load is only reported; the freshly initialised model is kept (best effort).
vqvae_model = VQVAE(input_dim=1, hidden_dim=44, num_embeddings=200, embedding_dim=44)
try:
    vqvae_model.load_state_dict(torch.load(f"{root_dir}/vqvae_check_point_44_200.pt"))
    print('[VQ-VAE Loading] : Successful.')
except Exception as e:
    print(f'[VQ-VAE Loading] : Failed, {e}')

# load StPos model
stpos_model = StPosAE(input_ch=1, hidden_ch=150, x_dim=206, z_dim=149)
try:
    # Report success only after the weights have actually been loaded; printing first
    # would announce "Successful." and then "Failed" when the load raises.
    stpos_model.load_state_dict(torch.load(f"{root_dir}/stpos_150_206_149.pt"))
    print(f"[StPos Loading] : Successful.")
except Exception as e:
    print(f"[StPos Loading] : Failed, {e}")


# construct the training dataset
dynamic_actions = 5

# construct the model
# NOTE(review): 200 matches the VQ-VAE num_embeddings above; the meaning of the second
# 200 and the extra token is not visible here - confirm against the tokenizer.
config.model.vocab_size = 200 + 200 + 1
# NOTE(review): 56 equals ACTION_SIZE; the origin of 24 is not visible here - confirm.
config.model.block_size = dynamic_actions * (24 + 56)

# print dynamic configs
print(config)

# load model's weights using checkpoint
gpt_model = GPT(config.model)
try:
    gpt_model.load_state_dict(torch.load(f"{root_dir}/dynamic_gpt_2_2_512_400_401.pt"))
    print(f'[GPT loading] : Successful.')
except Exception as e:
    print(f"[GPT loading] : Faild. {e}")


[VQ-VAE Loading] : Successful.
[StPos Loading] : Successful.
device: cuda
system:
    seed: 3407
    work_dir: E:/MoCAP/MCDH/model_based_planner/Experiment_Inverse_Reward_without_priorotized_replay/out/dynamic
model:
    model_type: None
    n_layer: 2
    n_head: 2
    n_embd: 512
    vocab_size: 401
    block_size: 400
    embd_pdrop: 0.1
    resid_pdrop: 0.1
    attn_pdrop: 0.1
trainer:
    device: cuda
    num_workers: 4
    max_iters: None
    batch_size: 2
    learning_rate: 0.0001
    betas: (0.9, 0.95)
    weight_decay: 0.1
    grad_norm_clip: 1.0

number of parameters: 6.72M
[GPT loading] : Successful.


## Environment Wrapper

In [3]:
# region Dynamic Model Parameters
# ps_mode : state=[real_state, pos_state]
# DYNAMIC : STATE_SIZE,25*ACTION_SIZE -> 25*POS_SIZE
# Length of the flat observation vector (shape of DynaBasedEnv.observation_space).
STATE_SIZE = 206
# Width of one reference/pose row: 93 body-position values + 56 joint values
# (see how target_reference is concatenated); also the mask width.
POS_SIZE = 149
# Length of the action vector (shape of DynaBasedEnv.action_space).
ACTION_SIZE = 56
# NOTE(review): not referenced in the visible cells; presumably the "25" in the
# DYNAMIC comment above - confirm.
NUM_OF_CELLS = 25
# endregion

import gymnasium
from gymnasium import spaces

class DynaBasedEnv(gymnasium.Env):
    """Gymnasium adapter that flattens the dict observations of a wrapped environment.

    The wrapped ``main_env`` is driven through the 4-tuple ``step`` / single-value
    ``reset`` API; this class converts its results to the Gymnasium 5-tuple /
    2-tuple form and concatenates the observation entries listed in ``keys``
    into one flat vector.

    Args:
        main_env: Wrapped environment. ``step(action)`` must return
            ``(obs_dict, reward, done, info)`` and ``reset()`` must return ``obs_dict``.
        expert_traj: Reference trajectory data; stored but not used by this class.
        keys: Ordered observation names to concatenate. Required before the
            first ``reset``/``step`` call.
        observation_space: Mapping from observation name to a space with a
            ``shape``; used only to size the NaN filler for keys that are absent
            from an observation dict.
    """
    metadata = {"render_modes": ["human"], "render_fps": 30}

    def __init__(
            self,
            main_env,
            expert_traj,
            keys = None,
            observation_space = None,
            ):
        super().__init__()

        self.main_env = main_env
        self.expert_traj = expert_traj
        self.action_space = spaces.Box(low=float('-1'), high=float('1'), shape=(ACTION_SIZE,), dtype=np.float32)
        self.observation_space = spaces.Box(low=float('-inf'), high=float('inf'), shape=(STATE_SIZE,), dtype=np.float32)
        # Last raw (dict) observation returned by the wrapped environment.
        self.core_state = None
        self._keys = keys
        self._observation_space = observation_space

    def transform_observation(self, observations):
        """Concatenate the entries named in ``keys`` along the last axis.

        A key that is missing from ``observations`` is replaced by a NaN block
        whose leading dimensions follow the first available entry and whose last
        dimension comes from the per-key observation space.

        Returns:
            A NumPy array with all selected entries joined on the last axis.
        """
        parts = []
        for k in self._keys:
            if k in observations:
                parts.append(observations[k])
            else:
                template = next(iter(observations.values()))
                shape = list(np.shape(template))
                shape[-1] = self._observation_space[k].shape[0]
                # The result is fed to np.concatenate, so the filler must be a NumPy
                # array; the previous torch.full(..., device=template.device) raised
                # AttributeError for NumPy observations.
                parts.append(np.full(shape, np.nan, dtype=np.float32))
        return np.concatenate(parts, axis=-1)

    def step(self, action):
        """Advance the wrapped environment and return the Gymnasium 5-tuple.

        ``truncated`` is always reported as ``False`` and the wrapped
        environment's info dict is discarded.
        """
        self.core_state, rew, done, _ = self.main_env.step(action)
        return self.transform_observation(self.core_state), rew, done, False, {"TimeLimit.truncated":None}

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment; ``seed`` and ``options`` are ignored."""
        self.core_state = self.main_env.reset()
        return self.transform_observation(self.core_state), {"TimeLimit.truncated":None}

    def render(self, mode='human', close=False):
        """Render through the wrapped environment, always requesting ``"rgb_array"``."""
        return self.main_env.render("rgb_array")

    def close(self):
        """Close the wrapped environment."""
        self.main_env.close()

## Initialize Models

In [4]:
# region Important Notes
# most important parameters to set
# training rate 3e-4
# depth and width of the value and policy networks 3*1024
# entropy coefficient 0.7
# make the buffer size 100K
import warnings
warnings.filterwarnings('ignore')
# endregion

# region control flags
# training initialization or storing
INITIALIZE_TRAINING_INSTANCE = False
load_networks = True
training_mode = True
training_warm_up_phase = False
load_expert_policy = False
# Both branches of the former conditional were 0, so the value never depended on the flag.
logger_start_state = 0
# endregion

# region file path
# path information
# Raw string: the former literal relied on the invalid escape sequence "\M", which
# triggers a warning. The resulting path value is unchanged.
root_folder                 = r"E:\MoCAP\MCDH\root_1"
reference_trajectory_path   = f"{root_folder}/traj_info.np"
feature_extractor_info      = f"{root_folder}/feature_extractor_info.pkl"
# endregion

# Single clip CMU_075_09, steps 0..194.
dataset = types.ClipCollection(ids=['CMU_075_09'], start_steps=[0], end_steps=[194])

# reference trajectory information
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(reference_trajectory_path, "rb") as f: reference_info = pickle.load(f)
with open(feature_extractor_info   , "rb") as f: fe_info = pickle.load(f)

# environment

# policy model
policy_kwargs = dict(
    net_arch=dict(pi=3*[512], qf=3*[512]),
    activation_fn=torch.nn.LeakyReLU,
)

lr_schedule = 1e-4
format_strings = ['csv', 'tensorboard', 'stdout']


one_tensor = torch.tensor([1.])
# Reference rows: 93 body-position values followed by ACTION_SIZE joint values per step.
target_bodies = reference_info['body_positions'].reshape(-1,93)
target_joints = reference_info['joints'].reshape(-1,ACTION_SIZE)
target_reference = np.concatenate((target_bodies,target_joints), axis=1)

# region algorithm functions
def r_to_i(x,y):
    """Return the flat indices ``3*x .. 3*y - 1`` as an integer array.

    These are the positions of items ``x`` (inclusive) to ``y`` (exclusive) when
    every item occupies three consecutive slots. An empty range yields an empty
    integer array, so the result can always be used as an index.
    """
    return np.arange(3*x, 3*y)

def r_norm(x,y):
    """Masked, smoothed L1 distance between two pose rows.

    The element-wise smoothed absolute difference ``sqrt((x - y)**2 + 1e-6)`` is
    weighted by each row of the module-level ``mask``. Every row except the last
    contributes the mean of its weighted differences, and these means are summed;
    the last row contributes the mean of its weighted differences.
    """
    smooth_abs = np.sqrt(np.square(x - y) + 1e-6)
    leading_rows, last_row = mask[:-1], mask[-1]
    leading_term = (smooth_abs * leading_rows).mean(axis=1).sum()
    last_term = (smooth_abs * last_row).mean()
    return leading_term + last_term
        
def traj_dist(epS, start_step,result_index=-1, traj_length=25, result_array=None, multi_thread=False) :
    """DTW distance between a predicted trajectory and the reference clip.

    Args:
        epS: Tensor that can be reshaped to ``(traj_length, POS_SIZE)``.
        start_step: First row of ``target_reference`` to compare against.
        result_index, result_array, multi_thread: Accepted but unused.
        traj_length: Number of steps in the predicted trajectory and in the
            reference window.

    Returns:
        The distance reported by ``fastdtw`` with ``r_norm`` as the point metric.
    """
    predicted = epS.reshape(traj_length, POS_SIZE).cpu().detach().numpy()
    reference_window = target_reference[start_step:start_step + traj_length, :]
    distance, _warp_path = fastdtw(predicted, reference_window, dist=r_norm)
    return distance


# endregion

# region algorithm parameters

# region main loop params
NUM_TRAJS = 25
NUM_ITERS = 1000
number_of_predicted_trajs = 100
# endregion


# Weight for the position entries; the angle row below uses its own weight.
pos_coef = 0.5
# _pos_indeces = np.array([list(range(3*x,3*x+3,1)) for x in [2,3,7,8,19,22,26,29,10,16]]).reshape(-1)
# Angle columns sit after the 93 body-position columns, hence the offset.
_ang_indeces = 93+np.array([1,2,3,4,5,6,8,9,10,11,12,13,33,34,38,45,46,50]).reshape(-1)

# One weight row per term of r_norm: rows 0-5 weight position columns, row 6 angle columns.
mask = np.zeros((7,POS_SIZE))
# Rows 0-3: the third component (offset 2) of two 3-value groups each.
for _row, _pair in enumerate(((1, 6), (4, 9), (2, 7), (3, 8))):
    mask[_row, [3 * _idx + 2 for _idx in _pair]] = pos_coef
# Rows 4-5: all three components of one 3-value group each.
mask[4, [36, 37, 38]] = pos_coef
mask[5, [45, 46, 47]] = pos_coef
mask[6, _ang_indeces] = .3
del _row, _pair




  root_folder                 = "E:\MoCAP\MCDH\\root_1"


## Policy Wrapper

In [9]:
import torch.nn as nn

class policy_net(nn.Module):
    """Expert policy that picks actions by sampling trajectories from a dynamics model.

    For each input state, the dynamics model rolls out several perturbed action
    sequences. Each rollout is scored against the reference clip with
    ``traj_dist``, and the first action of the lowest-distance rollout is returned.

    Args:
        policy_model: Actor network passed to the dynamics model as ``actor_network``.
        dynamic_model: Model exposing ``generate(...)`` that returns a 3-tuple whose
            second and third items are the sampled actions and states.
        vqvae_model: Passed to ``generate`` as ``vqvae``.
        stpos_model: Module applied to the generated states before scoring.
        action_encoder, action_decoder: Callables forwarded to ``generate``.
        device: Device the four modules are moved to.
    """

    def __init__(self, policy_model, dynamic_model, vqvae_model, stpos_model, action_encoder, action_decoder, device=torch.device("cpu")):
        super().__init__()
        # Sub-modules are registered in this order.
        self.policy = policy_model.to(device)
        self.dynamic = dynamic_model.to(device)
        self.vqvae = vqvae_model.to(device)
        self.transform = stpos_model.to(device)
        # Plain callables, not modules.
        self.act_en = action_encoder
        self.act_dc = action_decoder
        self.device = device

    def forward(self, x):
        """Run ``predictive_sampling`` with default settings and gradients disabled."""
        with torch.no_grad():
            best_actions = self.predictive_sampling(x)
        return best_actions

    def predictive_sampling(self,S0,n_actions=10, n_trajs=10, sigma=0.125):
        """Return, per input state, the first action of the best sampled rollout.

        Args:
            S0: Batch of states; the value at column 205 of each row is scaled by
                194 and rounded to obtain the reference start step.
            n_actions: Rollout length passed to the dynamics model.
            n_trajs: Number of perturbed rollouts per state.
            sigma: Perturbation scale passed to the dynamics model.

        Returns:
            ``trial_actions[:, 0, :]`` indexed at the lowest-distance rollout of
            every state.
        """
        # Side effect: the policy is switched to eval mode.
        self.policy.eval()
        _unused, trial_actions, trial_states = self.dynamic.generate(
            idx=S0,
            n_actions=n_actions,
            actor_network=self.policy,
            vqvae=self.vqvae,
            act_encoder=self.act_en,
            act_decoder=self.act_dc,
            perturbed_trajs=n_trajs,
            mean=0.0,
            sigma=sigma,
            device=self.device,
        )

        # Feed every generated state through the transform as a (N, 1, state_dim) batch.
        flat_states = trial_states.reshape(-1, trial_states.shape[-1]).unsqueeze(1)
        transformed_states = self.transform(flat_states).reshape((-1, n_actions, 149))

        # Score each rollout; rollout `row` belongs to state `row // n_trajs`.
        traj_distances = [
            traj_dist(
                transformed_states.reshape(S0.shape[0] * n_trajs, -1)[row],
                start_step=int(round(194 * S0[row // n_trajs, 206 - 1].item())),
                result_index=-1,
                traj_length=n_actions,
            )
            for row in range(transformed_states.shape[0])
        ]

        # Lowest distance within each group of n_trajs rollouts, mapped back to a
        # flat rollout index.
        best_in_group = torch.tensor(traj_distances).reshape(-1, n_trajs).argmin(1)
        first_actions = trial_actions[:, 0, :]
        group_offsets = n_trajs * torch.arange(best_in_group.shape[0])
        return first_actions[best_in_group + group_offsets]

## DAgger Algorithm

In [None]:
import numpy as np
# from imitation.util.util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv
import tempfile
from imitation.algorithms import bc
from imitation.algorithms.dagger import SimpleDAggerTrainer
from stable_baselines3 import DDPG
from stable_baselines3.ddpg.policies import MlpPolicy

# reference trajectory information
# Raw string: the former literal relied on the invalid escape sequence "\M", which
# triggers a warning. The resulting path value is unchanged.
root_folder                = r"E:\MoCAP\MCDH\root_1"
reference_trajectory_path  = f"{root_folder}/traj_info.np"
feature_extractor_info     = f"{root_folder}/feature_extractor_info.pkl"
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(reference_trajectory_path, "rb") as f: reference_info = pickle.load(f)
with open(feature_extractor_info   , "rb") as f: fe_info = pickle.load(f)

# Tracking environment wrapped so that it exposes flat Gymnasium observations.
# fe_info[0] supplies the observation keys and fe_info[1] the per-key spaces.
env  = tracking.MocapTrackingGymEnv(dataset, task_kwargs=dict(ghost_offset=np.array([0., 0., 0.])),)
denv = DynaBasedEnv(env, reference_info, keys=fe_info[0], observation_space=fe_info[1])
# The factory returns the same env instance each time it is called.
vec_env = DummyVecEnv([lambda: denv for _ in range(1)])

# Expert = DDPG actor combined with the learned dynamics, VQ-VAE and StPos models.
expert = DDPG(MlpPolicy, denv, verbose=1, device=torch.device("cuda"))
expert_policy = policy_net(
    policy_model=expert.policy,
    dynamic_model=gpt_model,
    vqvae_model=vqvae_model,
    stpos_model=stpos_model,
    # encoder/decoder are inverse affine maps; the encoder floors to whole numbers
    action_encoder=lambda x: (100*x+300)//1,
    action_decoder=lambda z: (z-300)/100,
    device=torch.device("cuda")
)

# Smoke test: query the expert on a batch of 10 random states.
next_action = expert_policy(
    torch.rand((10, 206)).to(torch.device("cuda"))
)


# bc_trainer = bc.BC(
#     observation_space=vec_env.observation_space,
#     action_space=vec_env.action_space,
#     rng=np.random.default_rng(),
#     device=torch.device("cpu"),
# )
# with tempfile.TemporaryDirectory(prefix="dagger_example_") as tmpdir:
#     print(tmpdir)
#     dagger_trainer = SimpleDAggerTrainer(
#         venv=vec_env,
#         scratch_dir=tmpdir,
#         expert_policy=expert,
#         bc_trainer=bc_trainer,
#         rng=np.random.default_rng(),
#     )

#     dagger_trainer.train(2000, rollout_round_min_timesteps=20)