## Import Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import copy, cv2, pickle, torch, gym, random
from tqdm import tqdm
from dm_control.locomotion.tasks.reference_pose import types
from dm_control.locomotion.tasks.reference_pose import utils as _utils
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from mocapact import observables
from mocapact.sb3 import utils
from mocapact.envs import tracking

from stable_baselines3 import TD3, SAC
# from sac_modified import SAC

from stable_baselines3.common.logger import configure
from stable_baselines3.common.utils import get_device
from torch.utils.tensorboard import SummaryWriter
from gym import spaces
from typing import Optional
# Raw string: the Windows path contains backslashes that are not valid escape
# sequences (\M, \d, \e, ...), which newer Python versions warn about.
expert_path = r"E:\MoCAP\MCDH\data\experts\CMU_075_09-0-203\eval_rsi\model"
# Expert policy used later to generate demonstration actions.
expert = utils.load_policy(expert_path, observables.TIME_INDEX_OBSERVABLES)
from fastdtw import fastdtw
from scipy.spatial.distance import chebyshev


pygame 2.1.3 (SDL 2.26.5, Python 3.10.10)
Hello from the pygame community. https://www.pygame.org/contribute.html


  if not hasattr(tensorboard, "__version__") or LooseVersion(
  ) < LooseVersion("1.15"):


## Modified Environment

In [2]:
# region Dynamic Model Parameters
# ps_mode : state=[real_state, pos_state]
# DYNAMIC : STATE_SIZE,25*ACTION_SIZE -> 25*POS_SIZE
# Width of the flattened observation vector (shape of DynaBasedEnv.observation_space).
STATE_SIZE = 206
# Width of the pose vector: 93 body-position values + ACTION_SIZE joint values.
POS_SIZE = 149
# Width of the action vector; also the per-frame width of the reference 'joints' array.
ACTION_SIZE = 56
# Number of chained one-step cells built by MultiStepDynamics.
NUM_OF_CELLS = 25
# endregion


class DynaBasedEnv(gym.Env):
    """Gym wrapper around a tracking environment that exposes flat observations.

    The wrapped ``main_env`` returns dict observations; this class concatenates
    the entries listed in ``keys`` into one flat vector. In ``ps_mode`` it
    additionally returns a "pose state": flattened body positions followed by
    the mocap joint positions read from the physics.

    Args:
        main_env: wrapped environment; must expose ``step``, ``reset``,
            ``render``, ``close`` and ``dm_env`` (physics + task walker).
        expert_traj: mapping with ``'body_positions'`` and ``'joints'`` arrays
            describing the reference trajectory.
        reward_estimator: optional learned reward model (stored, currently not
            used inside ``step``).
        keys: ordered observation keys to concatenate; required by
            ``transform_observation``.
        observation_space: mapping from key to a space, used to size the NaN
            padding for keys missing from an observation dict.
        render_mode: stored as-is.
    """

    metadata = {"render_modes": ["human"], "render_fps": 30}

    def __init__(
            self,
            main_env,
            expert_traj,
            reward_estimator = None,
            keys = None,
            observation_space = None,
            render_mode: Optional[str] = None,
            ):
        super().__init__()
        # parent environment
        self.main_env = main_env
        # reference trajectory: one row per frame, 93 body values + ACTION_SIZE joints
        self.expert_traj = expert_traj
        target_bodies = self.expert_traj['body_positions'].reshape(-1, 93)
        target_joints = self.expert_traj['joints'].reshape(-1, ACTION_SIZE)
        self.target_state = np.concatenate((target_bodies, target_joints), axis=-1)
        # modified spaces
        self.action_space = spaces.Box(low=float('-1'), high=float('1'), shape=(ACTION_SIZE,), dtype=np.float32)
        self.observation_space = spaces.Box(low=float('-inf'), high=float('inf'), shape=(STATE_SIZE,), dtype=np.float32)
        self.render_mode = render_mode
        self.reward_estimator = reward_estimator
        # core environment information
        self.core_state = None
        self._keys = keys
        self._observation_space = observation_space

    def _pose_state(self):
        """Return the current pose: flattened body positions + mocap joint qpos."""
        dm_env = self.main_env.dm_env
        walker = dm_env._task._walker
        features = _utils.get_features(physics=dm_env.physics, walker=walker, props=[])
        body_positions = features['body_positions'].reshape(-1)
        joint_positions = np.array(dm_env.physics.bind(walker.mocap_joints).qpos)
        return np.concatenate((body_positions, joint_positions), axis=-1)

    def transform_observation(self, observations):
        """Concatenate the dict observation into one flat array, in ``keys`` order.

        Keys missing from ``observations`` are replaced by a NaN block whose
        last dimension comes from the stored per-key observation space.
        """
        obs = []
        for k in self._keys:
            if k in observations:
                obs.append(observations[k])
                continue
            # Missing key: copy the leading shape of any present entry.
            reference = next(iter(observations.values()))
            shape = list(reference.shape)
            shape[-1] = self._observation_space[k].shape[0]
            if isinstance(reference, torch.Tensor):
                obs.append(torch.full(shape, torch.nan, device=reference.device))
            else:
                # NumPy observations: older ndarray versions have no `.device`.
                obs.append(np.full(shape, np.nan, dtype=np.float32))
        return np.concatenate(obs, axis=-1)

    def step(self, action, ps_mode=False):
        """Advance the wrapped environment by one step.

        Returns:
            ``(observation, reward, done, False, info)``. With ``ps_mode=True``
            the observation is the tuple ``(flat_observation, pose_state)``.
        """
        self.core_state, rew, done, _ = self.main_env.step(action)
        info = {"TimeLimit.truncated": None}

        if ps_mode:
            # Pose features are only needed (and only computed) in ps_mode.
            pose = self._pose_state()
            return (self.transform_observation(self.core_state), pose), rew, done, False, info

        # The environment reward is passed through unchanged; shaping through
        # reward_estimator / the reference trajectory is currently disabled.
        return self.transform_observation(self.core_state), rew, done, False, info

    def reset(self, seed=None, options=None, ps_mode=False):
        """Reset the wrapped environment.

        Returns:
            ``(observation, done)`` with ``done`` always ``False``. With
            ``ps_mode=True`` the observation is ``[flat_observation, pose_state]``.
        """
        self.core_state, done = self.main_env.reset(), False
        observation = self.transform_observation(self.core_state)
        if ps_mode:
            return [observation, self._pose_state()], done
        return observation, done

    def render(self, mode='human', close=False):
        """Return an RGB frame from the wrapped environment (arguments are ignored)."""
        return self.main_env.render("rgb_array")

    def close(self):
        """Close the wrapped environment."""
        self.main_env.close()

## Body Markers

In [40]:
# Marker coordinates visualised in the next cell: 31 rows of (x, y, z).
# NOTE(review): presumably one frame of walker body positions (31 * 3 = 93 values,
# the same width as the reference 'body_positions' rows) — confirm the source.
points = np.array([
       [ 0.16101618, -0.40060299,  1.08903083],
       [ 0.29373521, -0.4193849 ,  0.97654074],
       [ 0.43153917, -0.52788541,  0.52336356],
       [ 0.51678509, -0.3457796 ,  0.06685116],
       [ 0.58474457, -0.51940714,  0.0323095 ],
       [ 0.16101618, -0.40060299,  1.08903083],
       [ 0.05369995, -0.46572327,  0.96711407],
       [-0.00865177, -0.63607209,  0.51630879],
       [-0.11421121, -0.40321956,  0.08797496],
       [-0.12202077, -0.58772649,  0.04491287],
       [ 0.16101618, -0.40060299,  1.08903083],
       [ 0.15931813, -0.41555789,  1.22439721],
       [ 0.16280991, -0.43823503,  1.35872984],
       [ 0.17017723, -0.4625986 ,  1.49319125],
       [ 0.16344592, -0.49144109,  1.59850858],
       [ 0.18221923, -0.49999375,  1.75323714],
       [ 0.30875609, -0.55831932,  1.76740717],
       [ 0.17017723, -0.4625986 ,  1.49319125],
       [ 0.37878143, -0.40217638,  1.59958084],
       [ 0.41094637, -0.34053994,  1.27472563],
       [ 0.54416648, -0.42036814,  1.12196594],
       [ 0.61077624, -0.46028207,  1.04558644],
       [ 0.66472814, -0.5038426 ,  0.99001949],
       [ 0.61855088, -0.47893582,  1.06775833],
       [ 0.17017723, -0.4625986 ,  1.49319125],
       [-0.02078486, -0.57638904,  1.58843049],
       [-0.14163758, -0.60054813,  1.27992522],
       [-0.21789441, -0.74652476,  1.13733707],
       [-0.25602266, -0.81951275,  1.06604331],
       [-0.2897067 , -0.84907655,  0.98918458],
       [-0.25383986, -0.84774668,  1.07594689]])

# points[4][2] += 0.5

In [41]:
import numpy as np
import plotly.graph_objects as go

# `points` is the (31, 3) marker array from the previous cell; split it per axis.
x, y, z = points[:, 0], points[:, 1], points[:, 2]

# One text label per marker: its index, with indices 10-14 prefixed by ">>>"
# so they stand out in the plot.
labels = [f">>>{i}" if 10 <= i < 15 else str(i) for i in range(len(x))]

# 3D scatter of the markers, coloured by index.
scatter = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    text=labels,
    mode='markers+text',
    marker=dict(size=5, color=np.arange(len(x)), colorscale='Viridis', opacity=0.8),
    textposition='top center',
)
fig = go.Figure(data=[scatter])

# Axis titles and a margin-free layout.
fig.update_layout(
    scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z'),
    margin=dict(l=0, r=0, b=0, t=0),
)

fig.show()
# 2,7,12,20,27
# 3x,3x+2
# [2,7,19,26,10,16]

In [None]:
# def r_to_i(x,y): return np.array([z for z in range(3*x,3*y)])
# # np.max(d[1:5])
# # np.max(d[6:10])
# # np.max(d[18:24])
# # np.max(d[25:31])
# # np.max(d[10:15])

## Sequential Dynamics Model

In [3]:
# region Dynamic Model Functions
class CustomDataset(Dataset):
    """In-memory dataset of (input, target) pairs held as float32 tensors on ``device``."""

    def __init__(self, X, y, device=torch.device("cpu")):
        # Both sequences go through the same conversion: ndarray -> float tensor -> device.
        self.X = self._to_float_tensor(X, device)
        self.y = self._to_float_tensor(y, device)

    @staticmethod
    def _to_float_tensor(values, device):
        """Convert a sequence of samples into a float32 tensor placed on ``device``."""
        return torch.tensor(np.array(values), dtype=torch.float32).to(device)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Returns the (input, target) pair stored at position ``idx``.
        return self.X[idx], self.y[idx]

class T_Hat_(nn.Module):
    """One-step transition cell: a three-layer tanh MLP with STATE_SIZE outputs.

    Each hidden width is ``const + frac * input_size`` truncated to an int and
    then rounded down to a multiple of 10, with ``frac`` taken from
    ``hidden_layer_frac[0]`` and ``hidden_layer_frac[1]``.
    """

    def __init__(self, input_size, hidden_layer_frac, const=100, dropout_= 0.2):
        super(T_Hat_, self).__init__()
        self.input_size = input_size

        def _hidden_width(frac):
            # truncate, then round down to the nearest multiple of 10
            return 10 * (int(const + frac * input_size) // 10)

        first_width = _hidden_width(hidden_layer_frac[0])
        second_width = _hidden_width(hidden_layer_frac[1])
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.fc1 = nn.Linear(input_size, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, STATE_SIZE)
        self.dropout = nn.Dropout(dropout_)
        self.float()

    def forward(self, x):
        # Dropout follows each hidden activation; the final tanh keeps every
        # output component inside (-1, 1).
        hidden = self.dropout(torch.tanh(self.fc1(x)))
        hidden = self.dropout(torch.tanh(self.fc2(hidden)))
        return torch.tanh(self.fc3(hidden))
    
class MultiStepDynamics(nn.Module):
    """Chain of one-step cells that unrolls a pose trajectory from a start state.

    Cell 0 consumes ``STATE_SIZE + ACTION_SIZE`` inputs (full start state and
    first action); every later cell consumes ``POS_SIZE + ACTION_SIZE`` inputs
    (previous decoded pose and the next action). Every cell output is mapped
    to a ``POS_SIZE`` pose by the shared linear ``decoder``.

    Args:
        num_of_cells: number of chained cells (prediction horizon).
        hidden_layer_frac: two fractions forwarded to each ``T_Hat_`` cell.
        dropout: dropout probability of every cell.
    """

    def __init__(self, num_of_cells, hidden_layer_frac, dropout=0.4):
        super(MultiStepDynamics, self).__init__()
        self.cells = nn.ModuleList().float()
        self.input_size = [STATE_SIZE+ACTION_SIZE]+[POS_SIZE+ACTION_SIZE for _ in range(num_of_cells-1)]
        for i in range(num_of_cells):
            # The original call passed `dropout` positionally, which landed in
            # T_Hat_'s `const` slot: the dropout rate was silently ignored and
            # the hidden widths were computed with const=dropout. `dropout_` is
            # now passed explicitly; `const=dropout` is kept on purpose so the
            # layer widths stay identical to previously saved checkpoints.
            self.cells.append(
                T_Hat_(self.input_size[i], hidden_layer_frac, const=dropout, dropout_=dropout)
            )
        self.decoder = nn.Linear(STATE_SIZE,POS_SIZE)

    def forward(self, x):
        """Unroll all cells.

        Args:
            x: tensor of shape (batch, STATE_SIZE + num_cells * ACTION_SIZE):
               the start state followed by one action per cell.

        Returns:
            Tensor of shape (batch, num_cells * POS_SIZE): decoded poses of all
            steps concatenated along dim 1.
        """
        outputs = []
        # Same semantics as the former torch.tensor(x[...]) (detached copy),
        # without the copy-construct warning torch emits for tensor inputs.
        states = x[:, :STATE_SIZE].detach().clone()
        for i, sub_net in enumerate(self.cells):
            action_start = STATE_SIZE + i * ACTION_SIZE
            step_actions = x[:, action_start:action_start + ACTION_SIZE]
            next_state = self.decoder(sub_net(torch.cat((states, step_actions), dim=1)))
            # The decoded pose (POS_SIZE wide) is the state input of the next cell.
            states = next_state
            outputs.append(next_state)

        return torch.cat(outputs, dim=1)

    def Nthforward(self, x, N):
        """Run only cell ``N`` (no decoder).

        ``x`` must be STATE_SIZE + ACTION_SIZE wide for N == 0 and
        POS_SIZE + ACTION_SIZE wide otherwise; the result is STATE_SIZE wide.
        """
        return self.cells[N](x)
# endregion

# region Dynamic Model Testing
# Build an untrained dynamics model and print its architecture as a sanity check.
d1 = MultiStepDynamics(NUM_OF_CELLS,[0.6,0.5], dropout=0.2)
# Raw string: the previous literal relied on invalid escape sequences (\M, \d)
# and triggered a warning; the resulting path is unchanged.
# NOTE(review): this writes freshly initialised weights to d1_init.pt, the same
# file the training cells save to and load from — re-running this cell replaces
# a trained checkpoint.
torch.save(d1.state_dict(), r"E:\MoCAP\MCDH\root_1\d1_init.pt")

# for net in d1.cells:
print(d1)
# endregion

MultiStepDynamics(
  (cells): ModuleList(
    (0): T_Hat_(
      (fc1): Linear(in_features=262, out_features=150, bias=True)
      (fc2): Linear(in_features=150, out_features=130, bias=True)
      (fc3): Linear(in_features=130, out_features=206, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (1-24): 24 x T_Hat_(
      (fc1): Linear(in_features=205, out_features=120, bias=True)
      (fc2): Linear(in_features=120, out_features=100, bias=True)
      (fc3): Linear(in_features=100, out_features=206, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
  )
  (decoder): Linear(in_features=206, out_features=149, bias=True)
)


  torch.save(d1.state_dict(), "E:\MoCAP\MCDH\\root_1\d1_init.pt")


In [4]:
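# NOTE: originally labelled "transformers dynamic with special tokens and attention head", but this cell
# re-declares the same MLP-based T_Hat_ / MultiStepDynamics as the previous cell; no attention layers are defined here.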
class T_Hat_(nn.Module):
    """One-step transition cell: a three-layer tanh MLP with STATE_SIZE outputs.

    Each hidden width is ``const + frac * input_size`` truncated to an int and
    then rounded down to a multiple of 10, with ``frac`` taken from
    ``hidden_layer_frac[0]`` and ``hidden_layer_frac[1]``.
    """

    def __init__(self, input_size, hidden_layer_frac, const=100, dropout_= 0.2):
        super(T_Hat_, self).__init__()
        self.input_size = input_size

        def _hidden_width(frac):
            # truncate, then round down to the nearest multiple of 10
            return 10 * (int(const + frac * input_size) // 10)

        first_width = _hidden_width(hidden_layer_frac[0])
        second_width = _hidden_width(hidden_layer_frac[1])
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.fc1 = nn.Linear(input_size, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, STATE_SIZE)
        self.dropout = nn.Dropout(dropout_)
        self.float()

    def forward(self, x):
        # Dropout follows each hidden activation; the final tanh keeps every
        # output component inside (-1, 1).
        hidden = self.dropout(torch.tanh(self.fc1(x)))
        hidden = self.dropout(torch.tanh(self.fc2(hidden)))
        return torch.tanh(self.fc3(hidden))
    
class MultiStepDynamics(nn.Module):
    """Chain of one-step cells that unrolls a pose trajectory from a start state.

    Cell 0 consumes ``STATE_SIZE + ACTION_SIZE`` inputs (full start state and
    first action); every later cell consumes ``POS_SIZE + ACTION_SIZE`` inputs
    (previous decoded pose and the next action). Every cell output is mapped
    to a ``POS_SIZE`` pose by the shared linear ``decoder``.

    Args:
        num_of_cells: number of chained cells (prediction horizon).
        hidden_layer_frac: two fractions forwarded to each ``T_Hat_`` cell.
        dropout: dropout probability of every cell.
    """

    def __init__(self, num_of_cells, hidden_layer_frac, dropout=0.4):
        super(MultiStepDynamics, self).__init__()
        self.cells = nn.ModuleList().float()
        self.input_size = [STATE_SIZE+ACTION_SIZE]+[POS_SIZE+ACTION_SIZE for _ in range(num_of_cells-1)]
        for i in range(num_of_cells):
            # The original call passed `dropout` positionally, which landed in
            # T_Hat_'s `const` slot: the dropout rate was silently ignored and
            # the hidden widths were computed with const=dropout. `dropout_` is
            # now passed explicitly; `const=dropout` is kept on purpose so the
            # layer widths stay identical to previously saved checkpoints.
            self.cells.append(
                T_Hat_(self.input_size[i], hidden_layer_frac, const=dropout, dropout_=dropout)
            )
        self.decoder = nn.Linear(STATE_SIZE,POS_SIZE)

    def forward(self, x):
        """Unroll all cells.

        Args:
            x: tensor of shape (batch, STATE_SIZE + num_cells * ACTION_SIZE):
               the start state followed by one action per cell.

        Returns:
            Tensor of shape (batch, num_cells * POS_SIZE): decoded poses of all
            steps concatenated along dim 1.
        """
        outputs = []
        # Same semantics as the former torch.tensor(x[...]) (detached copy),
        # without the copy-construct warning torch emits for tensor inputs.
        states = x[:, :STATE_SIZE].detach().clone()
        for i, sub_net in enumerate(self.cells):
            action_start = STATE_SIZE + i * ACTION_SIZE
            step_actions = x[:, action_start:action_start + ACTION_SIZE]
            next_state = self.decoder(sub_net(torch.cat((states, step_actions), dim=1)))
            # The decoded pose (POS_SIZE wide) is the state input of the next cell.
            states = next_state
            outputs.append(next_state)

        return torch.cat(outputs, dim=1)

    def Nthforward(self, x, N):
        """Run only cell ``N`` (no decoder).

        ``x`` must be STATE_SIZE + ACTION_SIZE wide for N == 0 and
        POS_SIZE + ACTION_SIZE wide otherwise; the result is STATE_SIZE wide.
        """
        return self.cells[N](x)
# endregion

# region Dynamic Model Testing
# Build an untrained dynamics model and print its architecture as a sanity check.
d1 = MultiStepDynamics(NUM_OF_CELLS,[0.6,0.5], dropout=0.2)
# Raw string: the previous literal relied on invalid escape sequences (\M, \d)
# and triggered a warning; the resulting path is unchanged.
# NOTE(review): this writes freshly initialised weights to d1_init.pt, the same
# file the training cells save to and load from — re-running this cell replaces
# a trained checkpoint.
torch.save(d1.state_dict(), r"E:\MoCAP\MCDH\root_1\d1_init.pt")

# for net in d1.cells:
print(d1)
# endregion

MultiStepDynamics(
  (cells): ModuleList(
    (0): T_Hat_(
      (fc1): Linear(in_features=262, out_features=150, bias=True)
      (fc2): Linear(in_features=150, out_features=130, bias=True)
      (fc3): Linear(in_features=130, out_features=206, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (1-24): 24 x T_Hat_(
      (fc1): Linear(in_features=205, out_features=120, bias=True)
      (fc2): Linear(in_features=120, out_features=100, bias=True)
      (fc3): Linear(in_features=100, out_features=206, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
  )
  (decoder): Linear(in_features=206, out_features=149, bias=True)
)


  torch.save(d1.state_dict(), "E:\MoCAP\MCDH\\root_1\d1_init.pt")


## Train Dynamics

In [None]:
# first run main training cell!
# This cell uses names created in the "Train Model Along Sequential Dynamics" cell
# (denv, Z, writer, num_epochs, dynamic_start_state, training_function) and the
# expert policy loaded in the import cell.
Train_Dynamics        = True
Collect_Expert_Trajs  = True
TRAIN_DYNAMICS_INITIALIZATION_ONLY = True
d1 = MultiStepDynamics(NUM_OF_CELLS, hidden_layer_frac=[0.6,0.5], dropout=0.2)
# d1.load_state_dict(torch.load("E:/MoCAP/MCDH/root_1/d1_init.pt"))
# with open("E:/MoCAP/MCDH/root_1/dyna_replay_buffer_backup.pt", "rb") as f:
#     dynamic_dataset = pickle.load(f)
# dynamic_dataset[0]: model inputs (start state + 25 actions); dynamic_dataset[1]: targets (25 poses)
dynamic_dataset=[[],[]]

_sub_episode_num = 0
_episode_num_bias = 0

HORIZON = 25            # window length: actions per sample (Z holds 25 zero actions for padding)
DATASET_CAP = 2000      # newest samples kept between rounds / drawn per training pass
SUB_EPISODES = 20       # training passes per collection round

for episode_num in range(0,40*30,40):
    # region Collect Trajectories
    episode_num += _episode_num_bias
    # truncate the dataset
    dynamic_dataset[0]=dynamic_dataset[0][-DATASET_CAP:]
    dynamic_dataset[1]=dynamic_dataset[1][-DATASET_CAP:]

    if Collect_Expert_Trajs:
        for traj_num in tqdm(range(100)):

            # recording video of robot performance
            # NOTE(review): frames are collected but never written anywhere in this cell.
            _current_eps_frames = []

            n_state, done = denv.reset(ps_mode=True)
            real_states, pos_states, actions = [], [], []
            episode_start = copy.deepcopy(denv.main_env.dm_env._task._time_step)
            while not done:
                # capture current frame
                _current_eps_frames += [denv.render()]
                action = expert.predict(denv.core_state, deterministic=True)[0]
                real_states += [n_state[0]]
                pos_states += [n_state[1]]
                actions += [action]
                n_state, reward, done, _, _ = denv.step(action, ps_mode=True)

            # Update Dynamics Dataset
            # After appending the terminal state there is one more state than actions.
            real_states += [n_state[0]]
            pos_states += [n_state[1]]
            if len(actions) >= HORIZON:
                # Full windows only: start state + HORIZON actions -> the next HORIZON poses.
                for start in range(len(actions)-HORIZON+1):
                    dynamic_dataset[0] += [ np.concatenate([real_states[start]]+actions[start:start+HORIZON], axis=-1) ]
                    dynamic_dataset[1] += [ np.concatenate(pos_states[start+1:start+1+HORIZON], axis=-1) ]
            else:
                # Short episode (fewer than HORIZON actions; previously a 24-action
                # episode produced no samples at all): pad actions with zeros and
                # repeat the last pose.
                for start in range(len(real_states)):
                    dynamic_dataset[0] += [ np.concatenate([real_states[start]]+actions[start:]+Z[:HORIZON-len(actions[start:])], axis=-1) ]
                    dynamic_dataset[1] += [ np.concatenate(pos_states[start+1:]+[pos_states[-1] for _ in range(HORIZON-len(pos_states[start+1:]))], axis=-1) ]
    # endregion

    # region Train Dynamics
    if Train_Dynamics:
        # random.sample raises when asked for more items than exist, so cap the draw.
        sample_size = min(DATASET_CAP, len(dynamic_dataset[1]))
        if sample_size == 0:
            print("[WARN] : Dynamics dataset is empty; skipping dynamics training for this round.")
        else:
            for sub_episode_num in range(SUB_EPISODES):
                # Create an instance of the custom dataset
                indeces = random.sample(range(len(dynamic_dataset[1])), sample_size)
                dataset = CustomDataset([dynamic_dataset[0][__i] for __i in indeces], [dynamic_dataset[1][__i] for __i in indeces], device=torch.device("cuda"))
                # Train Dynamics
                training_function(dataset, 1024, num_epochs, d1, writer, dynamic_start_state, _sub_episode_num+sub_episode_num, device=torch.device("cuda"))
                torch.save(d1.state_dict(), "E:/MoCAP/MCDH/root_1/d1_init.pt")
                print(f"{sub_episode_num}: [INFO] : Dynamics Model Stored Successfully.")
            # Advance by the number of passes just run (the old code added the last
            # loop index, 19, so consecutive rounds reused one logging step).
            _sub_episode_num += SUB_EPISODES
    # endregion


## Learn Reward Via Inverse Reinforcement Learning

In [5]:
# Define the IRL dataset
class IRLDataset(Dataset):
    """Index-aligned pairing of expert and agent (state, action) samples."""

    def __init__(self, expert_states, expert_actions, agent_states, agent_actions):
        self.expert_states, self.expert_actions = expert_states, expert_actions
        self.agent_states, self.agent_actions = agent_states, agent_actions

    def __len__(self):
        # Length is the smaller number of state rows, so every index is valid
        # for both the expert and the agent arrays.
        expert_count = self.expert_states.shape[0]
        agent_count = self.agent_states.shape[0]
        return agent_count if agent_count < expert_count else expert_count

    def __getitem__(self, idx):
        # Order: expert state, expert action, agent state, agent action.
        sources = (self.expert_states, self.expert_actions, self.agent_states, self.agent_actions)
        return tuple(source[idx] for source in sources)

# Define discriminator network
class Discriminator(nn.Module):
    """Two-layer discriminator scoring (state, action) pairs in (0, 1).

    Outputs near 1 mean "expert-like", outputs near 0 mean "agent-like".

    Args:
        state_dim: width of the state vector.
        action_dim: width of the action vector.
        hidden_dim: width of the single hidden layer.
        writer: optional object with ``add_scalar(tag, value, step)`` used to
            log the discriminator loss.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=64, writer=None):
        super(Discriminator, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, hidden_dim, dtype=torch.float32)
        self.fc2 = nn.Linear(hidden_dim, 1, dtype=torch.float32)
        self.float()
        self.writer = writer
        self.steps = 0  # number of loss values logged so far

    def forward(self, state, action):
        return torch.sigmoid(self.fc2(torch.relu(self.fc1(torch.cat([state, action], dim=-1)))))

    def train(self, expert_data=True, agent_data=None, num_epochs=32, batch_size=32, threshold=None, lr=1e-3, mode=None):
        """Fit the discriminator, or switch training mode like ``nn.Module.train``.

        This method shadows ``nn.Module.train(mode)``, which ``eval()`` and
        parent modules call with a single boolean. To keep both uses working:

        * ``train(expert_data, agent_data, ...)`` runs the optimisation
          (same call as before; see ``fit``) and returns ``None``.
        * ``train()``, ``train(True/False)`` or ``train(mode=...)`` only set
          the module's training flag and return ``self``.

        Raises:
            TypeError: if ``agent_data`` is missing and the first argument is
                not a boolean.
        """
        if mode is not None:
            return super(Discriminator, self).train(mode)
        if agent_data is None:
            if isinstance(expert_data, bool):
                return super(Discriminator, self).train(expert_data)
            raise TypeError("train() requires agent_data when expert_data is not a boolean training mode")
        return self.fit(expert_data, agent_data, num_epochs=num_epochs, batch_size=batch_size, threshold=threshold, lr=lr)

    def fit(self, expert_data, agent_data, num_epochs=32, batch_size=32, threshold=None, lr=1e-3):
        """Optimise the discriminator on CPU with Adam.

        Args:
            expert_data: ``(states, actions)`` of the expert.
            agent_data: ``(states, actions)`` of the agent.
            num_epochs: passes over the paired dataset.
            batch_size: mini-batch size.
            threshold: if given, stop as soon as a batch loss falls below it.
            lr: Adam learning rate.
        """
        dataset = IRLDataset(expert_data[0], expert_data[1], agent_data[0], agent_data[1])
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        disc_optimizer = optim.Adam(self.parameters(), lr=lr)
        self.cpu()
        bce = nn.functional.binary_cross_entropy
        for _ in tqdm(list(range(num_epochs))):
            for batch in dataloader:
                expert_state, expert_action, agent_state, agent_action = (part.squeeze().cpu() for part in batch)
                disc_optimizer.zero_grad()
                expert_output = self(expert_state, expert_action)
                agent_output  = self(agent_state, agent_action)
                # Same objective as -(log D(expert) + log(1 - D(agent))).mean(),
                # written with BCE because it clamps the log terms and therefore
                # stays finite when the sigmoid saturates at exactly 0 or 1.
                disc_loss = bce(expert_output, torch.ones_like(expert_output)) + bce(agent_output, torch.zeros_like(agent_output))
                disc_loss.backward()
                disc_optimizer.step()
                if self.writer is not None:
                    self.writer.add_scalar("reward discriminator loss", disc_loss.item(), self.steps)
                    self.steps += 1
                if threshold is not None:
                    if disc_loss.item() < threshold: return

## Train Model Along Sequential Dynamics

In [6]:
# The main training cell sets NUM_ITERS to 0 while this flag is True; reset it
# here so the full number of iterations is used.
TRAIN_DYNAMICS_INITIALIZATION_ONLY = False

In [7]:
# region Important Notes
# most important parameters to set
# training rate 3e-4
# depth and width of the value and policy networks 3*1024
# entropy coefficient 0.7
# make the buffer size 100K
import warnings
warnings.filterwarnings('ignore')
# endregion

# region control flags
# training initialization or storing
INITIALIZE_TRAINING_INSTANCE = False

# what to restore from disk
load_replay_buffer         = True
load_physics               = True
load_reward                = False
load_networks              = True
load_dynamic_replay_buffer = False
load_expert_policy         = False

# what to train
training_mode     = True
training_dynamic  = True
training_reward   = False

# warm-up phases
training_warm_up_phase      = False
discriminator_warm_up_phase = False

# logging offsets: both fresh and resumed runs currently start at step 0
logger_start_state  = 0
dynamic_start_state = 0
# endregion

# region file path
# path information
# Every backslash is written as an explicit "\\": the previous literals mixed
# valid escapes with invalid ones (\M, \d, \l), which Python warns about.
# The resulting strings are unchanged.
root_folder                 = "E:\\MoCAP\\MCDH\\root_1"
policy_model_path           = f"{root_folder}\\sac_model_3 - Copy.zip"
dynamic_model_path          = f"{root_folder}\\d1_init.pt"
reward_model_path           = f"{root_folder}\\r1_64.pt"
# NOTE(review): the "backup" path points at the same file as dynamic_model_path.
dynamic_model_backup_path   = f"{root_folder}\\d1_init.pt"
reward_model_backup_path    = f"{root_folder}\\r1_64_backup.pt"
replay_buffer_path          = f"{root_folder}\\replay_buffer.pt"
dynamic_model_replay_buffer = f"{root_folder}\\dyna_replay_buffer_backup.pt"
logger_path                 = f"{root_folder}\\logs"
logger2_path                = logger_path
reference_trajectory_path   = f"{root_folder}/traj_info.np"
feature_extractor_info      = f"{root_folder}/feature_extractor_info.pkl"
# endregion

# region algorithm models
# Builds every object the training loop needs: logging writer, reference clip,
# wrapped environment, TD3 policy, dynamics model and reward discriminator.

# tensorboard summary writer
writer = SummaryWriter(logger2_path)

# expert info
# NOTE(review): `dataset` is also used as the name of the CustomDataset in the
# dynamics-training cell; the later assignment replaces this ClipCollection.
dataset = types.ClipCollection(ids=['CMU_075_09'], start_steps=[0], end_steps=[194])

# reference trajectory information
# NOTE(review): pickle.load executes arbitrary code from the file; only load files you created.
with open(reference_trajectory_path, "rb") as f: reference_info = pickle.load(f)
with open(feature_extractor_info   , "rb") as f: fe_info = pickle.load(f)

# environment
# fe_info[0] is passed as the observation keys and fe_info[1] as the per-key observation space.
env  = tracking.MocapTrackingGymEnv(dataset, task_kwargs=dict(ghost_offset=np.array([0., 0., 0.])),)
denv = DynaBasedEnv(env, reference_info, None, keys=fe_info[0], observation_space=fe_info[1])

# policy model
# Actor (pi) and critic (qf) each use three hidden layers of 1024 units.
policy_kwargs = dict(
    net_arch=dict(pi=3*[1024], qf=3*[1024]),
    activation_fn=torch.nn.LeakyReLU,
)

lr_schedule = 1e-4
format_strings = ['csv', 'tensorboard', 'stdout']

# Fresh TD3 agent; replaced below by a loaded checkpoint when load_networks is True.
model = TD3("MlpPolicy", 
            denv,
            policy_kwargs=policy_kwargs,
            gamma=0.95,
            batch_size=4096,
            learning_rate=lr_schedule,
            verbose=1,
            device=get_device("cuda"),)

# dynamic model
d1 = MultiStepDynamics(NUM_OF_CELLS, hidden_layer_frac=[0.6,0.5], dropout=0.2)

# reward model
# Built with hidden_dim=512 here; the load section re-creates it with the
# default hidden size when load_reward is False.
r1 = Discriminator(action_dim=denv.action_space.shape[0], state_dim=denv.observation_space.shape[0], hidden_dim=512, writer=writer)
# endregion

# region load models
# load models
# Each section either restores an object from disk or (re)initialises it,
# depending on the control flags set at the top of this cell.
if load_reward:
    # NOTE(review): r1 was built with hidden_dim=512 above; confirm the file at
    # reward_model_path was saved from a discriminator of the same size.
    r1.load_state_dict(torch.load(reward_model_path))
    print(f"[INFO] : Reward Loaded Successfully.")
else:
    # Fresh discriminator with the default hidden size and no writer yet
    # (the writer is attached at the end of this cell).
    r1 = Discriminator(action_dim=ACTION_SIZE, state_dim=STATE_SIZE, writer=None)
    print(f"[INFO] : Reward Initialized Successfully.")
if load_networks : 
    # custom_objects is handed to TD3.load together with the checkpoint; note
    # the batch size given here (8000) differs from the 4096 used for the fresh model.
    custom_objects = { 'learning_rate': lr_schedule, 'batch_size': 8000}
    model = TD3.load(policy_model_path, env=denv, custom_objects=custom_objects)
    print(f"[INFO] : Model Loaded Successfully.")
if load_replay_buffer : 
    model.load_replay_buffer(replay_buffer_path)
    print(f"[INFO] : Buffer Loaded Successfully.")
    print(f"[INFO] : Buffer Size IS {model.replay_buffer.pos}")
else : 
    model.replay_buffer.reset()
    print(f"[INFO] : Buffer Reset Was Successfull.")
if load_physics : 
    # Restores the dynamics weights from dynamic_model_backup_path.
    d1.load_state_dict(torch.load(dynamic_model_backup_path))
    print(f"[INFO] : Physics Loaded Successfully.")
if load_dynamic_replay_buffer:
    # NOTE(review): pickle.load executes arbitrary code from the file; only load files you created.
    with open(dynamic_model_replay_buffer, "rb") as f:
        dynamic_dataset = pickle.load(f)
    print(f"[INFO] : Physics Buffer Loaded Successfully.")
    print(f"[INFO] : Physics Buffer Size IS {len(dynamic_dataset[0])}")
else: 
    # Two parallel lists: [inputs, targets].
    dynamic_dataset = [[], []]
    print(f"[INFO] : Physics Buffer Initialized Successfully.")
# endregion

# region initialization
# Attach the logger to the policy and precompute constants used by the planner.
logger = configure(logger_path, format_strings)
model.set_logger(logger)
# Constant returned next to each DTW distance by traj_dist.
one_tensor = torch.tensor([1.])
# _pos_indeces = np.array(list(range(30)))

# Discount factors 0.95**n for n = 0..240, converted to float64 after construction.
gamma = torch.tensor([0.95**n for n in range(241)]).double()
# First 25 discount factors, each repeated ACTION_SIZE times and flattened.
action_gamma = torch.tensor([[x]*ACTION_SIZE for x in gamma[:25]]).reshape(-1)
# Reference trajectory as one row per frame: 93 body values followed by ACTION_SIZE joint values.
target_bodies = reference_info['body_positions'].reshape(-1,93)
target_joints = reference_info['joints'].reshape(-1,ACTION_SIZE)
target_reference = np.concatenate((target_bodies,target_joints), axis=1)

# _pos_target_trajs = torch.tensor(np.concatenate((target_bodies,target_joints), axis=-1))[:,_pos_indeces]
# _ang_target_trajs = torch.tensor(np.concatenate((target_bodies,target_joints), axis=-1))[:,_ang_indeces]
# target_trajs = torch.tensor(np.concatenate((target_bodies,target_joints), axis=-1))
# endregion

# region algorithm functions
def r_to_i(x,y):
    """Return the flat indices 3*x .. 3*y-1 (three consecutive entries per item in [x, y))."""
    flat_indices = list(range(3 * x, 3 * y))
    return np.array(flat_indices)

def r_norm(x,y): 
    """Masked, smoothed absolute distance between two pose vectors.

    The elementwise error is ``sqrt((x - y)**2 + 1e-6)``. Every row of the
    global ``mask`` except the last weights the error and is averaged; those
    row means are summed. The last mask row is applied the same way and its
    mean is added on top.
    """
    smooth_abs_error = np.sqrt(np.square(x - y)+1e-6)
    # rows 0..n-2 of the mask: one weighted mean per row, summed
    weighted_rows = np.einsum("i,ji->ji", smooth_abs_error, mask[:-1])
    grouped_term = np.sum(np.mean(weighted_rows, axis=1))
    # last mask row: a single weighted mean
    weighted_last = np.einsum("i,i->i", smooth_abs_error, mask[-1])
    last_term = np.mean(weighted_last)
    return grouped_term + last_term
        
def traj_dist(epS, start_step,result_index=-1, traj_length=25, result_array=None, multi_thread=False) : 
    """DTW distance between a predicted pose trajectory and the reference clip.

    ``epS`` is reshaped to (traj_length, POS_SIZE) and compared, using
    ``r_norm`` as the point distance, with ``target_reference`` rows
    ``start_step .. start_step + traj_length - 1``.

    Returns ``[distance, one_tensor]`` when ``multi_thread`` is False.
    Otherwise writes ``distance`` and ``1.0`` into
    ``result_array[result_index]`` and returns ``None``.
    """
    predicted = epS.reshape(traj_length,POS_SIZE).detach().numpy()
    reference = target_reference[start_step:start_step+traj_length,:]
    dtw_distance = fastdtw(predicted, reference, dist=r_norm)[0]

    if multi_thread:
        slot = result_array[result_index]
        slot[0] = dtw_distance
        slot[1] = 1.0
        return None
    return [dtw_distance, one_tensor]

def predictive_sampling(S0, start_step, predictive_trials=100, temperature=0.1):
    """Pick the first action of the best of several perturbed model-based rollouts.

    1. Roll the current policy through the dynamics cells, starting from ``S0``,
       to get a nominal action sequence (one action per cell).
    2. Create ``predictive_trials`` noisy copies of that sequence (Gaussian
       noise, std 0.125, clamped to [-1, 1]).
    3. Predict the pose trajectory of every copy with ``d1``, starting from
       ``S0``, and score it against the reference clip with ``traj_dist``.

    Args:
        S0: current flat observation (STATE_SIZE values).
        start_step: index of the reference frame the rollout is compared from.
        predictive_trials: number of perturbed action sequences to evaluate.
        temperature: unused; kept for interface compatibility.

    Returns:
        ``(first action of the best sequence, its DTW distance)``.
    """
    model.policy.eval()
    # Planning is inference only: switch dropout off and skip autograd, then
    # put the dynamics model back into whatever mode it was in.
    # NOTE(review): tensors are created on CPU; d1 must be on CPU when this runs.
    dynamics_was_training = d1.training
    d1.eval()
    try:
        with torch.no_grad():
            horizon = len(d1.cells)
            start_state = torch.as_tensor(S0, dtype=torch.float32)

            # estimate main trajectory
            action_steps = []
            current_state = start_state
            for i in range(horizon):
                policy_action = model.predict(current_state.detach().numpy(), deterministic=False)[0]
                next_action = torch.clamp(torch.as_tensor(policy_action, dtype=torch.float32), min=-1.0, max=+1.0)
                action_steps.append(next_action)
                # cell 0 takes the full state, later cells take the decoded pose
                cell_input = current_state if i == 0 else d1.decoder(current_state)
                current_state = d1.Nthforward(x=torch.cat((cell_input, next_action), dim=-1), N=i)
            actions = torch.cat(action_steps, dim=-1)

            # disturb main trajectory
            noise = torch.normal(torch.zeros((predictive_trials, int(actions.shape[-1]))), 0.125)
            disturbed_actions = torch.clamp(actions + noise, min=-1.0, max=+1.0)
            # Every perturbed rollout starts from S0. The previous code tiled
            # `current_state`, which by this point held the *last* predicted
            # state of the nominal rollout.
            disturbed_trajs = torch.cat((torch.tile(start_state, dims=(predictive_trials, 1)), disturbed_actions), dim=-1)

            # feed trajectories to dynamic model
            predicted_trajs = d1(disturbed_trajs)

            # Score every predicted trajectory (one per trial; the old loop used
            # the global number_of_predicted_trajs instead of predictive_trials).
            traj_distances = torch.tensor([
                traj_dist(predicted_trajs[trial, :], start_step, result_index=-1, traj_length=horizon, multi_thread=False)[0]
                for trial in range(predicted_trajs.shape[0])
            ])

            best_trial = torch.argmin(traj_distances)
            return disturbed_actions[best_trial, :][:ACTION_SIZE], torch.min(traj_distances)
    finally:
        d1.train(dynamics_was_training)

# training loop
def training_loop(dataloader, num_epochs, optimizer, writer, partial_loss_counter, ci, device, dynamics=None):
    """Train the dynamics model on the predictions of cells 0..ci.

    Args:
        dataloader: yields ``(inputs, targets)`` batches; targets hold one
            POS_SIZE pose per cell, concatenated along dim 1.
        num_epochs: passes over ``dataloader``.
        optimizer: optimizer over the parameters to update.
        writer: object with ``add_scalar(tag, value, step)``.
        partial_loss_counter: logging step of the first epoch.
        ci: index of the last cell included in the loss.
        device: device the batches are moved to.
        dynamics: model to train; defaults to the global ``d1`` (the behaviour
            before this parameter existed).
    """
    net = d1 if dynamics is None else dynamics
    criterion = nn.MSELoss()
    # Model outputs and targets are POS_SIZE wide per cell, so the first ci+1
    # cells occupy (ci+1)*POS_SIZE columns. (Slicing by STATE_SIZE pulled
    # columns of later cells into this cell's loss.)
    target_width = (ci + 1) * POS_SIZE
    for counter in tqdm(range(num_epochs), desc=f"cell[{ci}]"):
        last_loss = None
        for batch_x, batch_y in dataloader:
            optimizer.zero_grad()

            # Forward pass
            outputs = net(batch_x.to(device))

            # individual loss
            loss = criterion(outputs[:, :target_width], batch_y[:, :target_width].to(device))
            loss.backward()
            optimizer.step()
            last_loss = loss.item()
            partial_running_loss[ci] += last_loss
        # LOG THE Partial Cell Loss (last batch of the epoch); nothing to log for an empty loader
        if last_loss is not None:
            writer.add_scalar(f"Partial Dynamic Loss/cell[{ci}]", last_loss, partial_loss_counter+counter)

# main dynamic training funtion
def training_function(dataset, batch_size, num_epochs, d1, writer, dynamic_start_state, episode_num, device=torch.device("cpu")):
    """Train ``d1`` cell by cell: stage ``ci`` optimises cells 0..ci together.

    Args:
        dataset: dataset of ``(inputs, targets)`` pairs.
        batch_size: mini-batch size.
        num_epochs: epochs per stage.
        d1: dynamics model exposing ``cells``.
        writer: object with ``add_scalar(tag, value, step)``.
        dynamic_start_state: base logging step.
        episode_num: index of this call; logging steps advance by 25 per call.
        device: device used for training.
    """
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    d1.to(device)
    d1.train()  # make sure dropout is active while fitting
    # NOTE(review): only the cells' parameters are optimised; d1.decoder is used
    # in the forward pass but never passed to an optimizer — confirm this is intended.
    for ci in range(len(d1.cells)):
        optimizer = optim.Adam(d1.cells[:ci+1].parameters(), lr=1e-5)
        training_loop(dataloader, num_epochs, optimizer, writer, dynamic_start_state+episode_num*25, ci, device, dynamics=d1)

# endregion

# region algorithm parameters

# region main loop params
# NUM_TRAJS trajectories are collected per outer iteration and NUM_ITERS outer
# iterations are run (none when only the dynamics initialization is trained).
# number_of_predicted_trajs is the number of candidate action sequences passed
# to predictive_sampling; __total_timesteps is the budget of each model.learn call.
NUM_TRAJS = 25
NUM_ITERS = 0 if TRAIN_DYNAMICS_INITIALIZATION_ONLY else 1000
number_of_predicted_trajs = 100
counter = 0
__total_timesteps = 1000
# endregion

# region model params
entropy_coef = 0.7
# one accumulated-loss slot per dynamics cell (written by training_loop)
partial_running_loss = [0.0] * 25
batch_size = 1200
# constant `done` flag stored with every replay-buffer transition
zero_array = np.zeros(1, dtype=int)
# 25 independent zero action vectors, used to pad short action windows
Z = [np.zeros(ACTION_SIZE) for _ in range(25)]
replay_buffer_store_period = 5
model_store_period = 1
device = torch.device("cpu")
# endregion

# region dynamic params
dynamic_batch_size = 1240
dynamic_train_iterations = 2
dynamic_writer_counter = 0
num_epochs = 2
dynamic_training_period = 1
# endregion

# region reward params
# expert_buffer[0] collects states, expert_buffer[1] the matching actions
expert_buffer = [[], []]
reward_train_epoch_size = 1024
reward_train_batch_size = 512
reward_train_num_of_epochs = 5
reward_train_learning_rate = 1e-4
reward_train_iterations = 20

# _pos_indeces = np.array([list(range(3*x,3*x+3,1)) for x in [2,3,7,8,19,22,26,29,10,16]]).reshape(-1)
_ang_indeces = 93 + np.array([1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 33, 34, 38, 45, 46, 50])

# Weight mask with 7 rows over the POS_SIZE columns: rows 0-5 weight a few
# selected columns with pos_coef, row 6 weights the _ang_indeces columns with 0.3.
mask = np.zeros((7, POS_SIZE))
pos_coef = 0.5
mask[0, [3 * 1 + 2, 3 * 6 + 2]] = pos_coef
mask[1, [3 * 4 + 2, 3 * 9 + 2]] = pos_coef
mask[2, [3 * 2 + 2, 3 * 7 + 2]] = pos_coef
mask[3, [3 * 3 + 2, 3 * 8 + 2]] = pos_coef
mask[4, [36, 37, 38]] = pos_coef
mask[5, [45, 46, 47]] = pos_coef
mask[6, _ang_indeces] = .3

# a freshly created reward estimator gets the shared writer attached
if not load_reward:
    r1.writer = writer
denv.reward_estimator = r1
print("[INFO] : Environment Reward Estimator Was Set Successfully.")
# endregion

# endregion

# Deliberate hard stop: nothing after this line in the cell is executed.
raise Exception("STOP HERE")


# region algorithm main loop
# Optional reward-estimator warm-up: 20 rounds, each sampling 4096 transitions
# from the (pre-loaded) replay buffer and training r1 on them.
if discriminator_warm_up_phase:
    assert load_replay_buffer is True, "[WARNING] : Make Sure Replay Buffer Is Initiated In Warming Reward Phase"
    for _ in tqdm(range(20)):
        expert_data, agent_data = model.replay_buffer.sample(
            batch_size=4096,
            reward_training=True,
        )
        r1.train(
            expert_data,
            agent_data,
            num_epochs=10,
            batch_size=512,
            lr=1e-4,
        )

# Optional policy warm-up: one learn() call before the main loop starts.
if training_warm_up_phase:
    print("[INFO] : Warmup Phase Activated...")
    # model.train(gradient_steps=15,batch_size=4096)
    model.learn(total_timesteps=__total_timesteps, reset_num_timesteps=False)

# Main loop. Every iteration:
#   1. collects NUM_TRAJS trajectories with the predictive-sampling planner,
#      feeding the policy replay buffer, the expert buffer and the dynamics dataset;
#   2. retrains the dynamics model (if training_dynamic);
#   3. runs the RL learner (if training_mode);
#   4. retrains the reward estimator (if training_reward);
#   5. checkpoints models and buffers.
for episode_num in range(NUM_ITERS):

    # Collect Trajectory
    reward_stats, length_stats = [], []
    dtw_stats, mse_stats= [], []
    for traj_num in tqdm(range(NUM_TRAJS)):

        # frames rendered during this trajectory (written to a video below)
        _current_eps_frames = []

        n_state, done = denv.reset(ps_mode=True)
        real_states, pos_states, actions = [], [], []
        episode_start = copy.deepcopy(denv.main_env.dm_env._task._time_step)
        traj_info = []
        while not done:
            # capture current frame
            _current_eps_frames += [denv.render()]
            _current_time = denv.main_env.dm_env._task._time_step
            action, traj_error = predictive_sampling(n_state[0], _current_time,predictive_trials=number_of_predicted_trajs, temperature=0.9)
            # action, traj_error = torch.clamp(torch.tensor(expert.predict(denv.core_state, deterministic=False)[0]), min=-1.0, max=+1.0), 0.1

            # real state before the step; becomes `obs` of the stored transition
            state = np.copy(n_state[0])
            real_states += [n_state[0]]
            pos_states += [n_state[1]]
            actions += [action]
            n_state, reward, done, _, _ = denv.step(action, ps_mode=True)

            # numpy row view of the action, shared by both buffers below
            _action_row = action.detach().numpy().reshape((1,-1))

            # store to train expert reward estimator
            # NOTE(review): this pairs the state reached AFTER the step with the
            # action that led to it, while the agent batches used for reward
            # training take index 0/1 of a replay-buffer sample -- confirm that
            # both sides feed the estimator the same kind of (state, action) pair.
            expert_buffer[0] += [n_state[0].reshape((1,-1))]
            # kept as numpy (not a torch tensor) because the buffer is later
            # stacked with np.array(...)
            expert_buffer[1] += [_action_row]

            # store to train
            traj_info += [
                (
                    _action_row,
                    # NOTE(review): `done` is stored as 0 for every transition,
                    # including the last one of the trajectory -- confirm intended.
                    zero_array,
                    n_state[0].reshape((1,-1)),
                    # `state` already is the full pre-step state vector, so it is
                    # reshaped as a whole (indexing it with [0] would keep only
                    # its first element).
                    state.reshape((1,-1)),
                    reward.reshape((1,-1)),
                    [{"TimeLimit.truncated" : False}],
                )
            ]

        # push the trajectory to the policy replay buffer; every step reward is
        # shifted down by 1/len(trajectory)
        for step_info in traj_info:
            model.replay_buffer.add(
                action   =step_info[0],
                done     =step_info[1],
                next_obs =step_info[2],
                obs      =step_info[3],
                reward   =step_info[4]-1/len(traj_info),
                infos    =step_info[5],
            )

        # truncate the expert reward estimator buffer
        expert_buffer[0] = expert_buffer[0][-4000:]
        expert_buffer[1] = expert_buffer[1][-4000:]


        # Update Dynamics Dataset
        # After appending the final state, real_states/pos_states hold one entry
        # more than `actions`.
        real_states += [n_state[0]]
        pos_states += [n_state[1]]
        if len(actions) >= 25:
            # enough steps for full windows: (state_t, 25 actions) -> next 25 positions
            for start in range(len(actions)-25+1):
                dynamic_dataset[0] += [ np.concatenate([real_states[start]]+actions[start:start+25], axis=-1) ]
                dynamic_dataset[1] += [ np.concatenate(pos_states[start+1:start+1+25], axis=-1) ]
        else:
            # fewer than 25 actions: pad actions with zero vectors and positions
            # with the final position. The test is on len(actions) so that a
            # trajectory of exactly 24 steps is padded too instead of
            # contributing no sample at all.
            for start in range(len(real_states)):
                dynamic_dataset[0] += [ np.concatenate([real_states[start]]+actions[start:]+Z[:25-len(actions[start:])], axis=-1) ]
                dynamic_dataset[1] += [ np.concatenate(pos_states[start+1:]+[pos_states[-1] for _ in range(25-len(pos_states[start+1:]))], axis=-1) ]

        # update episode reward stats
        best_traj_error = traj_dist(epS=torch.tensor(pos_states), result_index=-1, start_step=episode_start, traj_length=len(pos_states))
        dtw_stats += [ best_traj_error[0] ]
        length_stats += [ len(pos_states) ]

        # store performance video; the writer is released even if a frame
        # conversion/write fails
        out = cv2.VideoWriter(f'{logger2_path}\\videos\\episode-[{episode_num}]-[{traj_num}].mp4',cv2.VideoWriter_fourcc(*'DIVX'),15, (640, 480))
        try:
            for _frame in _current_eps_frames:
                rgb_img = cv2.cvtColor(_frame, cv2.COLOR_RGB2BGR)
                out.write(rgb_img)
        finally:
            out.release()


    # Sanity Checks
    print(f"[INFO] : Model Buffer Current POS IS {model.replay_buffer.pos}")
    print(f"[INFO] : Dynamic Buffer Size IS {len(dynamic_dataset[0])}")


    # Train The Dynamic Model
    if training_dynamic:
        # keep only the 4000 most recent samples (done once; it does not change
        # between training iterations)
        dynamic_dataset[0], dynamic_dataset[1] = dynamic_dataset[0][-4000:], dynamic_dataset[1][-4000:]
        for _ in tqdm(range(dynamic_train_iterations)):
            indeces = random.sample(range(len(dynamic_dataset[1])), min(len(dynamic_dataset[1]), dynamic_batch_size))
            # NOTE(review): the dataset is created for CUDA while
            # training_function is called with its default CPU device -- confirm.
            dataset = CustomDataset([dynamic_dataset[0][__i] for __i in indeces], [dynamic_dataset[1][__i] for __i in indeces], device=torch.device("cuda"))
            training_function(dataset, batch_size, num_epochs, d1, writer, dynamic_start_state, dynamic_writer_counter)
            dynamic_writer_counter += 1


    # Collect Extra Steps For Algorithm Itself
    if training_mode:
        model.learn( total_timesteps=int(__total_timesteps), reset_num_timesteps=False, log_interval=10)


    # Train The Reward Model
    if training_reward:
        for _ in tqdm(list(range(reward_train_iterations))):
            _agent_data = model.replay_buffer.sample(batch_size=reward_train_epoch_size)
            agent_data = [_agent_data[0].detach().cpu().numpy(), _agent_data[1].detach().cpu().numpy()]
            indeces = random.sample(range(len(expert_buffer[0])), min(reward_train_epoch_size,len(expert_buffer[0])))
            expert_data = [np.array(expert_buffer[0]).squeeze()[indeces], np.array(expert_buffer[1]).squeeze()[indeces]]
            r1.train(expert_data, agent_data, num_epochs=reward_train_num_of_epochs, batch_size=reward_train_batch_size, lr=reward_train_learning_rate)

    # checkpoint the models every `model_store_period` iterations (iteration 0 included)
    if episode_num % model_store_period == 0:

        # backup model
        if training_mode:
            model.save(policy_model_path)
            print(f"[INFO] : Model Stored Successfully.")

        # dynamic model
        if training_dynamic:
            torch.save(d1.state_dict(), dynamic_model_backup_path)
            print(f"[INFO] : Dynamics Model Stored Successfully.")

        # reward model
        if training_reward:
            torch.save(r1.state_dict(), reward_model_backup_path)
            print(f"[INFO] : Reward Model Stored Successfully.")


    # store replay buffers every `replay_buffer_store_period` iterations
    # (iteration 0 is skipped unless the period is 1)
    if (episode_num%replay_buffer_store_period==0) and ((episode_num>0) or (replay_buffer_store_period==1)) and training_mode:

        # model replay buffer
        model.save_replay_buffer(replay_buffer_path)
        print(f"[INFO] : Model Replay Buffer Stored Successfully.")

        # dynamic replay buffer
        with open(dynamic_model_replay_buffer, "wb") as f:
            pickle.dump(file=f, obj=dynamic_dataset)
            print(f"[INFO] : Dynamics Replay Buffer Stored Successfully.")
# endregion


  root_folder                 = "E:\MoCAP\MCDH\\root_1"
  dynamic_model_replay_buffer = f"{root_folder}\dyna_replay_buffer_backup.pt"
  logger_path                 = f"{root_folder}\logs"


[INFO] : meta-information updated successfully.
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
[INFO] : Reward Initialized Successfully.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
[INFO] : Model Loaded Successfully.
[INFO] : Buffer Loaded Successfully.
[INFO] : Buffer Size IS 171417
[INFO] : Physics Loaded Successfully.
[INFO] : Physics Buffer Initialized Successfully.
Logging to E:\MoCAP\MCDH\root_1\logs
[INFO] : Environment Reward Estimator Was Set Successfully.


Exception: STOP HERE

## Collect trajectories to train the GPT model

In [8]:
# Collect Trajectory
# state_buffer gathers every ps_mode observation (state together with its
# corresponding position) seen while rolling out a noisy expert.
# (A per-trajectory sequence_buffer of the form
#  [[S1],[A1],...,[ST-1],[AT-1],[ST]] with state[206]/action[56] entries
#  was collected here previously; that part is disabled.)
state_buffer = []
for traj_num in tqdm(range(10000)):

    n_state, done = denv.reset(ps_mode=True)
    while not done:
        # deterministic expert action, perturbed with uniform noise drawn
        # between -0.75 and 0.75 and clipped back into [-1, 1]
        # action = torch.clamp(torch.tensor(expert.predict(denv.core_state, deterministic=False)[0]), min=-1.0, max=+1.0)
        _expert_action = torch.tensor(expert.predict(denv.core_state, deterministic=True)[0])
        _action_noise = torch.empty(ACTION_SIZE).uniform_(-0.75, 0.75)
        action = torch.clamp(_expert_action + _action_noise, min=-1.0, max=+1.0)

        # record the observation the action is taken from
        state_buffer.append(n_state)

        n_state, reward, done, _, _ = denv.step(action, ps_mode=True)

        # the terminal observation is recorded as well
        if done:
            state_buffer.append(n_state)
    

100%|██████████| 10000/10000 [46:32<00:00,  3.58it/s] 


In [11]:
# Number of observations stored in state_buffer.
len(state_buffer)

115705

In [9]:
import pickle

# Write the collected state_buffer to disk.
# (disabled export of the sequence buffer:)
# with open("E:/MoCAP/MCDH/minGPT/test_trajs.pkl", "wb") as f: pickle.dump(sequence_buffer, f)
with open("E:/MoCAP/MCDH/StPosAE/train_stpos.pkl", "wb") as stpos_file:
    pickle.dump(state_buffer, stpos_file)