In [1]:
#!cat ~/.bashrc

In [2]:
#!mv ../drone_dataset.pkl .

In [3]:
#!pip3 install --upgrade protobuf==3.20.0 

In [4]:
#!pip3 install transformers==4.5.1
#!pip3 install -U tokenizers
# The command below resolves many problems.
#!pip3 uninstall tokenizers -y

In [5]:
from torch.utils.tensorboard import SummaryWriter
import argparse
import pickle
import random
import time
import gym
import d4rl
import torch
import numpy as np

import utils
from replay_buffer import ReplayBuffer
from lamb import Lamb
from stable_baselines3.common.vec_env import SubprocVecEnv
from pathlib import Path
from data import create_dataloader
from decision_transformer.models.decision_transformer import DecisionTransformer
from evaluation import create_vec_eval_episodes_fn, vec_evaluate_episode_rtg
from trainer import SequenceTrainer
from logger import Logger

from env import make_pytorch_env

# Maximum number of environment steps per episode: assigned to `env.max_step`
# on every environment built in this notebook, passed to the model as
# `max_ep_len`, and used as the rollout length limit during online finetuning.
# Warning: there is a similar variable in data.py! (presumably the two must be
# kept in sync -- TODO confirm)
MAX_EPISODE_LEN = 4000

pybullet build time: May 20 2022 19:44:17


In [6]:
import sys

# Inside a notebook sys.argv holds the kernel's own arguments; blank it so that
# parse_args() below simply returns the defaults declared in this cell.
sys.argv = ['']


def _str2bool(value):
    """Convert a command-line token to a bool.

    `type=bool` must not be used with argparse: bool("False") is True, so any
    non-empty string would enable the flag. This converter accepts the usual
    spellings instead and rejects everything else.

    Raises:
        argparse.ArgumentTypeError: if `value` is not a recognised boolean token.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in {"true", "t", "yes", "y", "1"}:
        return True
    if lowered in {"false", "f", "no", "n", "0"}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=10)
# Also used as the dataset file stem (./data/<env>.pkl).
# Alternative previously used: "antmaze-large-diverse-v2".
parser.add_argument("--env", type=str, default="drone_dataset")

# model options
# K is the training context length (model max_length / dataloader max_len).
# Values previously tried: 20, 80, 200.
parser.add_argument("--K", type=int, default=40)
parser.add_argument("--embed_dim", type=int, default=512)
parser.add_argument("--n_layer", type=int, default=4)
parser.add_argument("--n_head", type=int, default=4)
parser.add_argument("--activation_function", type=str, default="relu")
parser.add_argument("--dropout", type=float, default=0.1)
# Values previously tried: 5, 20, 50.
parser.add_argument("--eval_context_length", type=int, default=10)
# 0: no pos embedding others: absolute ordering
parser.add_argument("--ordering", type=int, default=1)

# shared evaluation options
parser.add_argument("--eval_rtg", type=int, default=3600)
parser.add_argument("--num_eval_episodes", type=int, default=10)

# shared training options
parser.add_argument("--init_temperature", type=float, default=0.1)
# Values previously tried: 32, 256.
parser.add_argument("--batch_size", type=int, default=128)
parser.add_argument("--learning_rate", "-lr", type=float, default=1e-4)
parser.add_argument("--weight_decay", "-wd", type=float, default=5e-4)
parser.add_argument("--warmup_steps", type=int, default=10000)

# pretraining options
parser.add_argument("--max_pretrain_iters", type=int, default=1)
parser.add_argument("--num_updates_per_pretrain_iter", type=int, default=5000)

# finetuning options
parser.add_argument("--max_online_iters", type=int, default=1500)
parser.add_argument("--online_rtg", type=int, default=7200)
parser.add_argument("--num_online_rollouts", type=int, default=1)
parser.add_argument("--replay_size", type=int, default=1000)
parser.add_argument("--num_updates_per_online_iter", type=int, default=300)
parser.add_argument("--eval_interval", type=int, default=10)

# environment options
parser.add_argument("--device", type=str, default="cuda")
# Parsed with _str2bool so that "--log_to_tb False" really disables TensorBoard.
parser.add_argument("--log_to_tb", "-w", type=_str2bool, default=True)
parser.add_argument("--save_dir", type=str, default="./exp")
parser.add_argument("--exp_name", type=str, default="default")

args = parser.parse_args()

In [7]:
class Experiment:
    """Offline pretraining followed by online finetuning of a DecisionTransformer.

    The constructor reads the environment spec, loads the offline dataset from
    ``./data/<env>.pkl``, seeds the replay buffer with it, and builds the model,
    optimizers, LR scheduler and logger. Calling the instance runs the
    pretraining loop and then the online finetuning loop, as enabled by
    ``max_pretrain_iters`` / ``max_online_iters`` in ``variant``.

    All configuration is taken from the ``variant`` dict given to ``__init__``;
    the class does not read the notebook-global ``args``.
    """

    def __init__(self, variant):
        """Build every component of the experiment from ``variant``.

        Args:
            variant: dict of hyper-parameters (the notebook passes ``vars(args)``).
        """
        self.state_dim, self.act_dim, self.action_range = self._get_env_spec(variant)
        self.offline_trajs, self.state_mean, self.state_std = self._load_dataset(
            variant["env"]
        )
        # initialize by offline trajs
        self.replay_buffer = ReplayBuffer(variant["replay_size"], self.offline_trajs)

        # Every trajectory collected online is also accumulated here.
        self.aug_trajs = []

        self.device = variant.get("device", "cuda")
        self.target_entropy = -self.act_dim
        self.model = DecisionTransformer(
            state_dim=self.state_dim,
            act_dim=self.act_dim,
            action_range=self.action_range,
            max_length=variant["K"],
            eval_context_length=variant["eval_context_length"],
            max_ep_len=MAX_EPISODE_LEN,
            hidden_size=variant["embed_dim"],
            n_layer=variant["n_layer"],
            n_head=variant["n_head"],
            n_inner=4 * variant["embed_dim"],
            activation_function=variant["activation_function"],
            n_positions=1024,
            resid_pdrop=variant["dropout"],
            attn_pdrop=variant["dropout"],
            stochastic_policy=True,
            ordering=variant["ordering"],
            init_temperature=variant["init_temperature"],
            target_entropy=self.target_entropy,
        ).to(device=self.device)

        self.optimizer = Lamb(
            self.model.parameters(),
            lr=variant["learning_rate"],
            weight_decay=variant["weight_decay"],
            eps=1e-8,
        )
        # Linear warm-up of the LR multiplier from ~0 to 1 over `warmup_steps`.
        self.scheduler = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer, lambda steps: min((steps + 1) / variant["warmup_steps"], 1)
        )

        # Separate optimizer for the model's learnable log-temperature.
        self.log_temperature_optimizer = torch.optim.Adam(
            [self.model.log_temperature],
            lr=1e-4,
            betas=[0.9, 0.999],
        )

        # track the training progress and
        # training/evaluation/online performance in all the iterations
        self.pretrain_iter = 0
        self.online_iter = 0
        self.total_transitions_sampled = 0
        self.variant = variant
        self.reward_scale = 1.0 if "antmaze" in variant["env"] else 0.001
        self.logger = Logger(variant)
        # Reset in __call__; set here too so pretrain()/online_tuning() can be
        # invoked directly without hitting an AttributeError on "time/total".
        self.start_time = time.time()

    def _get_env_spec(self, variant):
        """Create a throw-away environment and read its dimensions.

        Args:
            variant: hyper-parameter dict; it is wrapped in an
                ``argparse.Namespace`` because ``make_pytorch_env`` is called
                with an args-style object elsewhere in this file.

        Returns:
            ``(state_dim, act_dim, action_range)`` where ``action_range`` is the
            ``[low, high]`` of the action space shrunk by 1e-6 on both sides.
        """
        #####env = gym.make(variant["env"])
        # Built from `variant` (not the global `args`) so the spec always
        # matches the configuration this Experiment was constructed with.
        env = make_pytorch_env(argparse.Namespace(**variant))
        try:
            env.max_step = MAX_EPISODE_LEN
            state_dim = env.observation_space.shape[0]
            act_dim = env.action_space.shape[0]

            # Stay strictly inside the bounds (a fixed alternative previously
            # used here was [-0.999999, 0.999999]).
            action_range = [
                float(env.action_space.low.min()) + 1e-6,
                float(env.action_space.high.max()) - 1e-6,
            ]

            print("action_range: {}".format(action_range))
        finally:
            # Release the environment even if reading the spec fails.
            env.close()
        return state_dim, act_dim, action_range

    def _save_model(self, path_prefix, is_pretrain_model=False):
        """Write a full training checkpoint to ``<path_prefix>/model.pt``.

        The checkpoint holds model/optimizer/scheduler state, iteration
        counters, the variant, and the NumPy / Python / PyTorch (CPU) RNG states.

        Args:
            path_prefix: directory in which the checkpoint is written.
            is_pretrain_model: if True, also write the same checkpoint to
                ``<path_prefix>/pretrain_model.pt``.
        """
        to_save = {
            "model_state_dict": self.model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "scheduler_state_dict": self.scheduler.state_dict(),
            "pretrain_iter": self.pretrain_iter,
            "online_iter": self.online_iter,
            "args": self.variant,
            "total_transitions_sampled": self.total_transitions_sampled,
            "np": np.random.get_state(),
            "python": random.getstate(),
            "pytorch": torch.get_rng_state(),
            "log_temperature_optimizer_state_dict": self.log_temperature_optimizer.state_dict(),
        }

        with open(f"{path_prefix}/model.pt", "wb") as f:
            torch.save(to_save, f)
        print(f"\nModel saved at {path_prefix}/model.pt")

        if is_pretrain_model:
            with open(f"{path_prefix}/pretrain_model.pt", "wb") as f:
                torch.save(to_save, f)
            print(f"Model saved at {path_prefix}/pretrain_model.pt")

    def _load_model(self, path_prefix):
        """Restore the state written by ``_save_model`` if a checkpoint exists.

        Does nothing (silently) when ``<path_prefix>/model.pt`` is missing.
        """
        if Path(f"{path_prefix}/model.pt").exists():
            with open(f"{path_prefix}/model.pt", "rb") as f:
                checkpoint = torch.load(f)
            self.model.load_state_dict(checkpoint["model_state_dict"])
            self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
            self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
            self.log_temperature_optimizer.load_state_dict(
                checkpoint["log_temperature_optimizer_state_dict"]
            )
            self.pretrain_iter = checkpoint["pretrain_iter"]
            self.online_iter = checkpoint["online_iter"]
            self.total_transitions_sampled = checkpoint["total_transitions_sampled"]
            np.random.set_state(checkpoint["np"])
            random.setstate(checkpoint["python"])
            torch.set_rng_state(checkpoint["pytorch"])
            print(f"Model loaded at {path_prefix}/model.pt")

    def _load_dataset(self, env_name):
        """Load offline trajectories and compute state normalisation statistics.

        Args:
            env_name: dataset name; the file read is ``./data/<env_name>.pkl``.

        Returns:
            ``(trajectories, state_mean, state_std)``. ``trajectories`` is
            ordered from lowest to highest return; ``state_std`` has 1e-6 added.
            The statistics are computed over all loaded trajectories, before
            the selection step below.
        """
        dataset_path = f"./data/{env_name}.pkl"
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load datasets from a trusted source.
        with open(dataset_path, "rb") as f:
            trajectories = pickle.load(f)

        states, traj_lens, returns = [], [], []
        for path in trajectories:
            states.append(path["observations"])
            traj_lens.append(len(path["observations"]))
            returns.append(path["rewards"].sum())
        traj_lens, returns = np.array(traj_lens), np.array(returns)

        # used for input normalization
        states = np.concatenate(states, axis=0)
        state_mean, state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6
        num_timesteps = sum(traj_lens)

        print("=" * 50)
        print(f"Starting new experiment: {env_name}")
        print(f"{len(traj_lens)} trajectories, {num_timesteps} timesteps found")
        print(f"Average return: {np.mean(returns):.2f}, std: {np.std(returns):.2f}")
        print(f"Max return: {np.max(returns):.2f}, min: {np.min(returns):.2f}")
        print(f"Average length: {np.mean(traj_lens):.2f}, std: {np.std(traj_lens):.2f}")
        print(f"Max length: {np.max(traj_lens):.2f}, min: {np.min(traj_lens):.2f}")
        print("=" * 50)

        # Keep the highest-return trajectories while the running timestep count
        # stays strictly below the dataset total.
        # NOTE(review): because the comparison is a strict `<` against the full
        # total, the single lowest-return trajectory is always left out when
        # there are two or more trajectories -- confirm whether `<=` was meant.
        sorted_inds = np.argsort(returns)  # lowest to highest
        num_trajectories = 1
        timesteps = traj_lens[sorted_inds[-1]]
        ind = len(trajectories) - 2
        while ind >= 0 and timesteps + traj_lens[sorted_inds[ind]] < num_timesteps:
            timesteps += traj_lens[sorted_inds[ind]]
            num_trajectories += 1
            ind -= 1
        sorted_inds = sorted_inds[-num_trajectories:]
        trajectories = [trajectories[ii] for ii in sorted_inds]

        return trajectories, state_mean, state_std

    def _augment_trajectories(
        self,
        online_envs,
        target_explore,
        n,
        randomized=False,
    ):
        """Roll out the current policy online and add the result to the buffer.

        Args:
            online_envs: vectorised environment; one episode is run per env.
            target_explore: unscaled target return; multiplied by
                ``self.reward_scale`` before being given to the policy.
            n: unused; kept for interface compatibility.
            randomized: unused; kept for interface compatibility.

        Returns:
            dict with the mean return and mean length of the new trajectories.
        """
        max_ep_len = MAX_EPISODE_LEN

        with torch.no_grad():
            # generate init state
            target_return = [target_explore * self.reward_scale] * online_envs.num_envs

            returns, lengths, trajs = vec_evaluate_episode_rtg(
                online_envs,
                self.state_dim,
                self.act_dim,
                self.model,
                max_ep_len=max_ep_len,
                reward_scale=self.reward_scale,
                target_return=target_return,
                mode="normal",
                state_mean=self.state_mean,
                state_std=self.state_std,
                device=self.device,
                use_mean=False,
            )

        self.replay_buffer.add_new_trajs(trajs)
        self.aug_trajs += trajs
        self.total_transitions_sampled += np.sum(lengths)

        return {
            "aug_traj/return": np.mean(returns),
            "aug_traj/length": np.mean(lengths),
        }

    def pretrain(self, eval_envs, loss_fn):
        """Run the offline pretraining loop on ``self.offline_trajs``.

        Each iteration trains, evaluates, logs metrics and saves a checkpoint
        (also written as ``pretrain_model.pt``).

        Args:
            eval_envs: vectorised environment used for evaluation.
            loss_fn: callable handed to ``SequenceTrainer.train_iteration``.
        """
        print("\n\n\n*** Pretrain ***")
        print("----------------")
        print("eval_envs: {}".format(eval_envs))
        print("loss_fn: {}".format(loss_fn))

        eval_fns = [
            create_vec_eval_episodes_fn(
                vec_env=eval_envs,
                eval_rtg=self.variant["eval_rtg"],
                state_dim=self.state_dim,
                act_dim=self.act_dim,
                state_mean=self.state_mean,
                state_std=self.state_std,
                device=self.device,
                use_mean=True,
                reward_scale=self.reward_scale,
            )
        ]

        trainer = SequenceTrainer(
            model=self.model,
            optimizer=self.optimizer,
            log_temperature_optimizer=self.log_temperature_optimizer,
            scheduler=self.scheduler,
            device=self.device,
        )

        writer = (
            SummaryWriter(self.logger.log_path) if self.variant["log_to_tb"] else None
        )
        try:
            while self.pretrain_iter < self.variant["max_pretrain_iters"]:
                # in every iteration, prepare the data loader
                dataloader = create_dataloader(
                    trajectories=self.offline_trajs,
                    num_iters=self.variant["num_updates_per_pretrain_iter"],
                    batch_size=self.variant["batch_size"],
                    max_len=self.variant["K"],
                    state_dim=self.state_dim,
                    act_dim=self.act_dim,
                    state_mean=self.state_mean,
                    state_std=self.state_std,
                    reward_scale=self.reward_scale,
                    action_range=self.action_range,
                )

                train_outputs = trainer.train_iteration(
                    loss_fn=loss_fn,
                    dataloader=dataloader,
                )
                eval_outputs, eval_reward = self.evaluate(eval_fns)
                outputs = {"time/total": time.time() - self.start_time}
                outputs.update(train_outputs)
                outputs.update(eval_outputs)
                self.logger.log_metrics(
                    outputs,
                    iter_num=self.pretrain_iter,
                    total_transitions_sampled=self.total_transitions_sampled,
                    writer=writer,
                )

                self._save_model(
                    path_prefix=self.logger.log_path,
                    is_pretrain_model=True,
                )

                self.pretrain_iter += 1
        finally:
            # Flush and release the TensorBoard event file, even on failure.
            if writer is not None:
                writer.close()

    def evaluate(self, eval_fns):
        """Run every evaluation function on the model in eval mode.

        Args:
            eval_fns: callables taking the model and returning a metrics dict;
                at least one must produce ``"evaluation/return_mean_gm"``.

        Returns:
            ``(outputs, eval_reward)``: the merged metrics (plus
            ``"time/evaluation"``) and the ``"evaluation/return_mean_gm"`` value.
        """
        eval_start = time.time()
        self.model.eval()
        outputs = {}
        for eval_fn in eval_fns:
            o = eval_fn(self.model)
            outputs.update(o)
        outputs["time/evaluation"] = time.time() - eval_start

        eval_reward = outputs["evaluation/return_mean_gm"]
        return outputs, eval_reward

    def online_tuning(self, online_envs, eval_envs, loss_fn):
        """Run the online finetuning loop.

        Each iteration collects new rollouts into the replay buffer, trains on
        the buffer, evaluates every ``eval_interval`` iterations (and on the
        last one), logs metrics and saves a checkpoint.

        Args:
            online_envs: vectorised environment used to collect rollouts.
            eval_envs: vectorised environment used for evaluation.
            loss_fn: callable handed to ``SequenceTrainer.train_iteration``.
        """
        print("\n\n\n*** Online Finetuning ***")

        trainer = SequenceTrainer(
            model=self.model,
            optimizer=self.optimizer,
            log_temperature_optimizer=self.log_temperature_optimizer,
            scheduler=self.scheduler,
            device=self.device,
        )
        eval_fns = [
            create_vec_eval_episodes_fn(
                vec_env=eval_envs,
                eval_rtg=self.variant["eval_rtg"],
                state_dim=self.state_dim,
                act_dim=self.act_dim,
                state_mean=self.state_mean,
                state_std=self.state_std,
                device=self.device,
                use_mean=True,
                reward_scale=self.reward_scale,
            )
        ]
        writer = (
            SummaryWriter(self.logger.log_path) if self.variant["log_to_tb"] else None
        )
        try:
            while self.online_iter < self.variant["max_online_iters"]:

                outputs = {}
                augment_outputs = self._augment_trajectories(
                    online_envs,
                    self.variant["online_rtg"],
                    n=self.variant["num_online_rollouts"],
                )
                outputs.update(augment_outputs)

                dataloader = create_dataloader(
                    trajectories=self.replay_buffer.trajectories,
                    num_iters=self.variant["num_updates_per_online_iter"],
                    batch_size=self.variant["batch_size"],
                    max_len=self.variant["K"],
                    state_dim=self.state_dim,
                    act_dim=self.act_dim,
                    state_mean=self.state_mean,
                    state_std=self.state_std,
                    reward_scale=self.reward_scale,
                    action_range=self.action_range,
                )

                # finetuning
                is_last_iter = self.online_iter == self.variant["max_online_iters"] - 1
                evaluation = (
                    (self.online_iter + 1) % self.variant["eval_interval"] == 0
                    or is_last_iter
                )

                train_outputs = trainer.train_iteration(
                    loss_fn=loss_fn,
                    dataloader=dataloader,
                )
                outputs.update(train_outputs)

                if evaluation:
                    eval_outputs, eval_reward = self.evaluate(eval_fns)
                    outputs.update(eval_outputs)

                outputs["time/total"] = time.time() - self.start_time

                # log the metrics
                self.logger.log_metrics(
                    outputs,
                    iter_num=self.pretrain_iter + self.online_iter,
                    total_transitions_sampled=self.total_transitions_sampled,
                    writer=writer,
                )

                self._save_model(
                    path_prefix=self.logger.log_path,
                    is_pretrain_model=False,
                )

                self.online_iter += 1
        finally:
            # Flush and release the TensorBoard event file, even on failure.
            if writer is not None:
                writer.close()

    def __call__(self):
        """Run the whole experiment: seed, build envs, pretrain, finetune.

        The vectorised environments are always closed on exit, including when
        training raises, so their worker processes do not leak.
        """
        # Seed from this experiment's own configuration rather than a global.
        utils.set_seed_everywhere(self.variant["seed"])

        # Imported for its side effects (presumably registers the D4RL
        # environments with gym -- TODO confirm).
        import d4rl

        # Args-style view of the variant for make_pytorch_env; captured by the
        # env factories below.
        env_args = argparse.Namespace(**self.variant)

        def loss_fn(
            a_hat_dist,     # action_preds
            a,              # action_target
            attention_mask, # padding_mask
            entropy_reg,    # self.model.temperature().detach()
        ):
            """Negative (log-likelihood + entropy_reg * entropy).

            Returns ``(loss, nll, entropy)``. The log-likelihood is averaged
            over positions where ``attention_mask > 0``; the entropy is
            averaged over everything ``a_hat_dist.entropy()`` returns.
            """
            # a_hat is a SquashedNormal Distribution
            log_likelihood = a_hat_dist.log_likelihood(a)[attention_mask > 0].mean()

            entropy = a_hat_dist.entropy().mean()
            loss = -(log_likelihood + entropy_reg * entropy)

            return (
                loss,
                -log_likelihood,
                entropy,
            )

        def get_env_builder(seed, env_name, target_goal=None):
            """Return a zero-argument factory that builds one seeded environment.

            ``env_name`` is currently unused because the environment comes from
            ``make_pytorch_env`` rather than ``gym.make(env_name)``.
            """
            def make_env_fn():
                # Re-imported here because this factory is handed to
                # SubprocVecEnv (presumably it runs in a worker process --
                # TODO confirm).
                import d4rl

                #####env = gym.make(env_name)
                env = make_pytorch_env(env_args)
                env.max_step = MAX_EPISODE_LEN
                # Only the environment itself is seeded; the action and
                # observation spaces are left unseeded.
                env.seed(seed)

                if target_goal:
                    env.set_target_goal(target_goal)
                    print(f"Set the target goal to be {env.target_goal}")
                return env

            return make_env_fn

        print("\n\nMaking Eval Env.....")
        env_name = self.variant["env"]
        if "antmaze" in env_name:
            # Read the goal from one throw-away env so that every eval/online
            # env is given the same target goal.
            env = gym.make(env_name)
            target_goal = env.target_goal
            env.close()
            print(f"Generated the fixed target goal: {target_goal}")
        else:
            target_goal = None
        eval_envs = SubprocVecEnv(
            [
                get_env_builder(i, env_name=env_name, target_goal=target_goal)
                for i in range(self.variant["num_eval_episodes"])
            ]
        )
        online_envs = None

        try:
            self.start_time = time.time()
            if self.variant["max_pretrain_iters"]:
                self.pretrain(eval_envs, loss_fn)

            if self.variant["max_online_iters"]:
                print("\n\nMaking Online Env.....")
                # Seeds are offset by 100 so online envs differ from eval envs.
                online_envs = SubprocVecEnv(
                    [
                        get_env_builder(i + 100, env_name=env_name, target_goal=target_goal)
                        for i in range(self.variant["num_online_rollouts"])
                    ]
                )
                self.online_tuning(online_envs, eval_envs, loss_fn)
        finally:
            # Always shut the worker processes down, online envs first.
            try:
                if online_envs is not None:
                    online_envs.close()
            finally:
                eval_envs.close()

In [None]:
# Seed every RNG before construction so that model initialisation is reproducible.
utils.set_seed_everywhere(args.seed)

# The experiment is configured from the parsed arguments as a plain dict.
variant = vars(args)
experiment = Experiment(variant)

separator = "=" * 50
print(separator)

# Runs pretraining followed by online finetuning.
experiment()

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


action_range: [-0.999999, 0.999999]
Starting new experiment: drone_dataset
1254 trajectories, 1971662 timesteps found
Average return: 3687.11, std: 875.27
Max return: 5216.00, min: 1264.00
Average length: 1572.30, std: 325.37
Max length: 2000.00, min: 920.00
Experiment log path: ./exp/2023.03.22/192943-default


Making Eval Env.....



*** Pretrain ***
----------------
eval_envs: <stable_baselines3.common.vec_env.subproc_vec_env.SubprocVecEnv object at 0x7f6425ea2d30>
loss_fn: <function Experiment.__call__.<locals>.loss_fn at 0x7f6425dd7ee0>


pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17


Iteration 0
time/total: 662.2079253196716
time/training: 644.9144990444183
training/train_loss_mean: 2548.5900681017292
training/train_loss_std: 7701.98695884238
training/nll: -7.420923233032227
training/entropy: -4.700595378875732
training/temp_value: 0.13960246423171133
evaluation/return_mean_gm: -26911.09551140177
evaluation/return_std_gm: 87.56313613335602
evaluation/length_mean_gm: 4000.0
evaluation/length_std_gm: 0.0
time/evaluation: 17.25350594520569

Model saved at ./exp/2023.03.22/192943-default/model.pt
Model saved at ./exp/2023.03.22/192943-default/pretrain_model.pt


Making Online Env.....


pybullet build time: May 20 2022 19:44:17





*** Online Finetuning ***
Iteration 1
aug_traj/return: -12382.717877879148
aug_traj/length: 4000.0
time/training: 40.25926971435547
training/train_loss_mean: -7.023461218004702
training/train_loss_std: 0.4425772020724578
training/nll: -8.135189056396484
training/entropy: -5.234961986541748
training/temp_value: 0.14401037048216409
time/total: 714.4794156551361

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 2
aug_traj/return: -14047.686036687379
aug_traj/length: 4000.0
time/training: 40.36855506896973
training/train_loss_mean: -7.282544582324545
training/train_loss_std: 0.4218312368355145
training/nll: -8.343433380126953
training/entropy: -5.587865352630615
training/temp_value: 0.1491861268782931
time/total: 764.0346298217773

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 3
aug_traj/return: -13795.619374291968
aug_traj/length: 4000.0
time/training: 40.6241717338562
training/train_loss_mean: -7.497875820724148
training/train_loss_std: 0.427585330

Iteration 18
aug_traj/return: -25566.961737223995
aug_traj/length: 4000.0
time/training: 38.300838470458984
training/train_loss_mean: -8.301880053909127
training/train_loss_std: 0.5148701288477702
training/nll: -10.386176109313965
training/entropy: -7.294092655181885
training/temp_value: 0.2668084254672863
time/total: 1548.8689975738525

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 19
aug_traj/return: -6782.79328411375
aug_traj/length: 4000.0
time/training: 38.07186818122864
training/train_loss_mean: -8.341214618533526
training/train_loss_std: 0.5074280989407342
training/nll: -10.006784439086914
training/entropy: -7.114073276519775
training/temp_value: 0.2762244017172188
time/total: 1595.6253850460052

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 20
aug_traj/return: -19154.478775695094
aug_traj/length: 4000.0
time/training: 36.31813311576843
training/train_loss_mean: -8.290594971347236
training/train_loss_std: 0.5056916510636519
training/nll: -


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 35
aug_traj/return: -20142.37153513896
aug_traj/length: 4000.0
time/training: 39.324578046798706
training/train_loss_mean: -7.762956586998358
training/train_loss_std: 0.33368575975431086
training/nll: -10.627575874328613
training/entropy: -6.47216272354126
training/temp_value: 0.46951075818334304
time/total: 2375.4446477890015

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 36
aug_traj/return: -12598.322347667652
aug_traj/length: 4000.0
time/training: 36.64483094215393
training/train_loss_mean: -7.729332210127444
training/train_loss_std: 0.3874537333716998
training/nll: -11.542144775390625
training/entropy: -7.1702446937561035
training/temp_value: 0.4851638607937459
time/total: 2420.329426050186

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 37
aug_traj/return: -9032.069810821682
aug_traj/length: 4000.0
time/training: 38.583099126815796
training/train_loss_mean: -7.614002945263376



Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 52
aug_traj/return: -8377.26551536494
aug_traj/length: 4000.0
time/training: 39.12387132644653
training/train_loss_mean: -5.610066602364207
training/train_loss_std: 0.31966872990659395
training/nll: -11.185203552246094
training/entropy: -6.771195411682129
training/temp_value: 0.8143431371693256
time/total: 3203.1498606204987

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 53
aug_traj/return: -9191.0
aug_traj/length: 4000.0
time/training: 38.833773612976074
training/train_loss_mean: -5.456004610311966
training/train_loss_std: 0.32021120630690714
training/nll: -10.559442520141602
training/entropy: -6.379962921142578
training/temp_value: 0.8405106859570061
time/total: 3250.404598236084

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 54
aug_traj/return: -9938.290000135376
aug_traj/length: 4000.0
time/training: 38.11332678794861
training/train_loss_mean: -5.284225114086027
training/train


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 69
aug_traj/return: -5858.962603162014
aug_traj/length: 4000.0
time/training: 37.83768343925476
training/train_loss_mean: -4.367309402114211
training/train_loss_std: 0.295291091033241
training/nll: -7.159402847290039
training/entropy: -2.8897149562835693
training/temp_value: 1.0157169296704687
time/total: 4019.250181913376

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 70
aug_traj/return: -6984.853467545261
aug_traj/length: 4000.0
time/training: 39.785237312316895
training/train_loss_mean: -4.401947090420058
training/train_loss_std: 0.2668943881527869
training/nll: -7.062492370605469
training/entropy: -2.9032771587371826
training/temp_value: 1.0153329444627455
evaluation/return_mean_gm: -12537.052603710472
evaluation/return_std_gm: 3105.367630561282
evaluation/length_mean_gm: 4000.0
evaluation/length_std_gm: 0.0
time/evaluation: 16.308732748031616
time/total: 4084.1024103164673

Model saved at ./exp/


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 86
aug_traj/return: -5825.773590154594
aug_traj/length: 4000.0
time/training: 37.948776721954346
training/train_loss_mean: -4.362249809600634
training/train_loss_std: 0.26540936216126587
training/nll: -7.368747234344482
training/entropy: -3.0367591381073
training/temp_value: 1.0112792014982424
time/total: 4850.988021850586

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 87
aug_traj/return: -7879.9739626064575
aug_traj/length: 4000.0
time/training: 36.42337203025818
training/train_loss_mean: -4.378457937511606
training/train_loss_std: 0.28722665116815094
training/nll: -8.386518478393555
training/entropy: -3.3625717163085938
training/temp_value: 1.0134672439290928
time/total: 4896.1209626197815

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 88
aug_traj/return: -5681.931184466943
aug_traj/length: 4000.0
time/training: 38.54718351364136
training/train_loss_mean: -4.351726999655683
trai


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 103
aug_traj/return: -6088.811486198378
aug_traj/length: 4000.0
time/training: 40.1516375541687
training/train_loss_mean: -4.330961656952093
training/train_loss_std: 0.2582835292593018
training/nll: -7.622885227203369
training/entropy: -3.057396411895752
training/temp_value: 1.0138250820158836
time/total: 5688.005211830139

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 104
aug_traj/return: -4263.3558193646295
aug_traj/length: 4000.0
time/training: 37.913424015045166
training/train_loss_mean: -4.32560260728932
training/train_loss_std: 0.25468035379105375
training/nll: -7.530298233032227
training/entropy: -3.084594488143921
training/temp_value: 1.0110026578764166
time/total: 5735.145093202591

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 105
aug_traj/return: -9051.400632668732
aug_traj/length: 4000.0
time/training: 38.915268421173096
training/train_loss_mean: -4.2964838960167695
tr


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 120
aug_traj/return: -5833.0
aug_traj/length: 4000.0
time/training: 39.428258657455444
training/train_loss_mean: -4.212982191450034
training/train_loss_std: 0.2786717077649505
training/nll: -7.320705413818359
training/entropy: -3.1835014820098877
training/temp_value: 1.0124211517064865
evaluation/return_mean_gm: -27017.58857503215
evaluation/return_std_gm: 1431.2345972867242
evaluation/length_mean_gm: 4000.0
evaluation/length_std_gm: 0.0
time/evaluation: 15.998784065246582
time/total: 6521.404648780823

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 121
aug_traj/return: -5166.5922192511425
aug_traj/length: 4000.0
time/training: 38.04676795005798
training/train_loss_mean: -4.208995654466741
training/train_loss_std: 0.25370638285507857
training/nll: -7.479677677154541
training/entropy: -3.070706605911255
training/temp_value: 1.0125318792062907
time/total: 6567.916573524475

Model saved at ./exp/2023.03.


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 137
aug_traj/return: -6485.09558781535
aug_traj/length: 4000.0
time/training: 37.630889892578125
training/train_loss_mean: -4.113366163036365
training/train_loss_std: 0.2724033838063975
training/nll: -7.494516372680664
training/entropy: -2.9733927249908447
training/temp_value: 1.0110731158233566
time/total: 7336.449981451035

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 138
aug_traj/return: -7691.038829772854
aug_traj/length: 4000.0
time/training: 37.9359667301178
training/train_loss_mean: -4.085769806387927
training/train_loss_std: 0.2805129271337353
training/nll: -7.137289047241211
training/entropy: -2.822263717651367
training/temp_value: 1.011147415729543
time/total: 7382.780305147171

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 139
aug_traj/return: -4874.781039064342
aug_traj/length: 4000.0
time/training: 38.41923403739929
training/train_loss_mean: -4.117913607511143
traini


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 154
aug_traj/return: -5166.707861372368
aug_traj/length: 4000.0
time/training: 37.73251986503601
training/train_loss_mean: -3.9977769123547864
training/train_loss_std: 0.259993202216424
training/nll: -7.48676061630249
training/entropy: -3.195047378540039
training/temp_value: 1.009849387727582
time/total: 8164.684139251709

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 155
aug_traj/return: -7435.728576891736
aug_traj/length: 4000.0
time/training: 37.70838022232056
training/train_loss_mean: -3.9925199476856625
training/train_loss_std: 0.26210711158059463
training/nll: -6.9783244132995605
training/entropy: -2.961934804916382
training/temp_value: 1.0089084510255348
time/total: 8211.351840019226

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 156
aug_traj/return: -3854.852075318955
aug_traj/length: 4000.0
time/training: 39.37343764305115
training/train_loss_mean: -3.9822146506475407
tra


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 171
aug_traj/return: -5683.480637273132
aug_traj/length: 4000.0
time/training: 38.425124645233154
training/train_loss_mean: -3.878678648245386
training/train_loss_std: 0.2625075979652041
training/nll: -7.065759658813477
training/entropy: -2.9846222400665283
training/temp_value: 1.0091250591830094
time/total: 8997.937738895416

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 172
aug_traj/return: -8118.616500969356
aug_traj/length: 4000.0
time/training: 38.03668451309204
training/train_loss_mean: -3.8796418352969417
training/train_loss_std: 0.2781311533636381
training/nll: -6.570175647735596
training/entropy: -2.9101462364196777
training/temp_value: 1.0081718930672146
time/total: 9044.849130630493

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 173
aug_traj/return: -6746.041459151861
aug_traj/length: 4000.0
time/training: 38.872814416885376
training/train_loss_mean: -3.8903235025659213


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 188
aug_traj/return: -5025.196979102095
aug_traj/length: 4000.0
time/training: 38.08225417137146
training/train_loss_mean: -3.79219093873087
training/train_loss_std: 0.27912415666935725
training/nll: -7.477508068084717
training/entropy: -3.462345838546753
training/temp_value: 1.0072186407612447
time/total: 9819.690302610397

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 189
aug_traj/return: -6704.284534383052
aug_traj/length: 4000.0
time/training: 39.078513622283936
training/train_loss_mean: -3.7759557576242013
training/train_loss_std: 0.27168260769157
training/nll: -6.20850944519043
training/entropy: -2.7299630641937256
training/temp_value: 1.0092696977948512
time/total: 9867.285078048706

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 190
aug_traj/return: -7787.168089808174
aug_traj/length: 4000.0
time/training: 39.53770709037781
training/train_loss_mean: -3.7863794711290795
trai


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 205
aug_traj/return: -6466.066252423169
aug_traj/length: 4000.0
time/training: 38.05769348144531
training/train_loss_mean: -3.672567208956779
training/train_loss_std: 0.2773895202235414
training/nll: -7.014803409576416
training/entropy: -3.198704957962036
training/temp_value: 1.006344006766278
time/total: 10652.814505338669

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 206
aug_traj/return: -5901.119802169961
aug_traj/length: 4000.0
time/training: 37.945406913757324
training/train_loss_mean: -3.662348161032246
training/train_loss_std: 0.26493028134806856
training/nll: -7.078942775726318
training/entropy: -3.167912244796753
training/temp_value: 1.0060272912824586
time/total: 10699.667671918869

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 207
aug_traj/return: -5575.6154366626415
aug_traj/length: 4000.0
time/training: 38.26470232009888
training/train_loss_mean: -3.6815788440635075



Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 222
aug_traj/return: -6859.23398873659
aug_traj/length: 4000.0
time/training: 37.33485412597656
training/train_loss_mean: -3.590819630994594
training/train_loss_std: 0.26776906881435875
training/nll: -6.9165358543396
training/entropy: -3.1829066276550293
training/temp_value: 1.0077079264418625
time/total: 11484.543875932693

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 223
aug_traj/return: -7363.4813905446945
aug_traj/length: 4000.0
time/training: 38.36253786087036
training/train_loss_mean: -3.5833591108938343
training/train_loss_std: 0.2661305205563745
training/nll: -6.675202369689941
training/entropy: -3.0970382690429688
training/temp_value: 1.0061350371794293
time/total: 11531.317291736603

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 224
aug_traj/return: -5086.041847908027
aug_traj/length: 4000.0
time/training: 38.4063823223114
training/train_loss_mean: -3.574360403227093
tr


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 239
aug_traj/return: -4336.942279667521
aug_traj/length: 4000.0
time/training: 37.982481718063354
training/train_loss_mean: -3.476332429005523
training/train_loss_std: 0.25261614686483264
training/nll: -6.375866889953613
training/entropy: -3.028280019760132
training/temp_value: 1.0042928351027596
time/total: 12297.622365951538

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 240
aug_traj/return: -9072.919414385698
aug_traj/length: 4000.0
time/training: 37.57859778404236
training/train_loss_mean: -3.490267972948433
training/train_loss_std: 0.25458313274772776
training/nll: -6.277682781219482
training/entropy: -2.709136962890625
training/temp_value: 1.0048461632637238
evaluation/return_mean_gm: -25890.24265555714
evaluation/return_std_gm: 1234.9704723038988
evaluation/length_mean_gm: 4000.0
evaluation/length_std_gm: 0.0
time/evaluation: 16.188983917236328
time/total: 12360.069968223572

Model saved at ./


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 256
aug_traj/return: -5303.811026280224
aug_traj/length: 4000.0
time/training: 38.678760290145874
training/train_loss_mean: -3.413938159733763
training/train_loss_std: 0.2642750064347933
training/nll: -6.204018592834473
training/entropy: -2.9363105297088623
training/temp_value: 1.002981197134552
time/total: 13131.36930680275

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 257
aug_traj/return: -7173.1987902675355
aug_traj/length: 4000.0
time/training: 37.74834108352661
training/train_loss_mean: -3.387562664389871
training/train_loss_std: 0.2606723182074221
training/nll: -6.619773864746094
training/entropy: -3.0883028507232666
training/temp_value: 1.00350673554083
time/total: 13177.660103321075

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 258
aug_traj/return: -6640.173188318894
aug_traj/length: 4000.0
time/training: 38.17473793029785
training/train_loss_mean: -3.398057842949711
tra


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 273
aug_traj/return: -3852.0454450778493
aug_traj/length: 4000.0
time/training: 38.00166153907776
training/train_loss_mean: -3.2989287042206783
training/train_loss_std: 0.24760297847639143
training/nll: -6.356984615325928
training/entropy: -3.127495050430298
training/temp_value: 1.000440363538643
time/total: 13968.433626413345

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 274
aug_traj/return: -3971.027473068078
aug_traj/length: 4000.0
time/training: 38.420623540878296
training/train_loss_mean: -3.3030205953034653
training/train_loss_std: 0.25701370608980084
training/nll: -6.646392345428467
training/entropy: -3.202181339263916
training/temp_value: 1.0000649209206323
time/total: 14015.293743133545

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 275
aug_traj/return: -4085.059780938862
aug_traj/length: 4000.0
time/training: 37.60509634017944
training/train_loss_mean: -3.33147699533265


Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 290
aug_traj/return: -6907.81670717922
aug_traj/length: 4000.0
time/training: 37.827624559402466
training/train_loss_mean: -3.255774269680486
training/train_loss_std: 0.25989332870546206
training/nll: -6.453128814697266
training/entropy: -3.0915896892547607
training/temp_value: 1.0002174532352668
evaluation/return_mean_gm: -29147.938595767337
evaluation/return_std_gm: 378.6997290772935
evaluation/length_mean_gm: 4000.0
evaluation/length_std_gm: 0.0
time/evaluation: 16.209022521972656
time/total: 14801.730618715286

Model saved at ./exp/2023.03.22/192943-default/model.pt
Iteration 291
aug_traj/return: -5488.050614504726
aug_traj/length: 4000.0
time/training: 38.73582220077515
training/train_loss_mean: -3.2289623868247
training/train_loss_std: 0.28096445151157995
training/nll: -6.745492458343506
training/entropy: -3.2328507900238037
training/temp_value: 0.9994670285856191
time/total: 14848.921913385391

Model saved at ./e

In [None]:
def study_env(env):
    """Print the observation size, action size and action bounds of ``env``.

    Reads ``env.observation_space.shape[0]``, ``env.action_space.shape[0]``
    and the extreme values of ``env.action_space.low`` / ``.high``, then
    prints one line for each quantity. Nothing is returned.
    """
    n_state = env.observation_space.shape[0]

    act_space = env.action_space
    n_action = act_space.shape[0]

    # Pull both bounds inwards by 1e-6 so the reported range lies strictly
    # inside the limits declared by the action space.
    lowest = float(act_space.low.min()) + 1e-6
    highest = float(act_space.high.max()) - 1e-6
    bounds = [lowest, highest]

    report = (
        "state_dim: {}".format(n_state),
        "act_dim: {}".format(n_action),
        "action_range: {}".format(bounds),
    )
    for line in report:
        print(line)


In [None]:
# Build the two environments compared in the cells below: the one returned by
# the project's make_pytorch_env(args) and a Gym environment created by id.
my_env = make_pytorch_env(args)
# NOTE(review): this id is presumably registered by the `d4rl` import in the
# notebook's import cell — confirm d4rl is installed before running.
their_env = gym.make('antmaze-large-diverse-v2')

In [None]:
study_env(my_env)

In [None]:
study_env(their_env)

In [None]:
# Reset the project environment and take one step with the action `2`.
# NOTE(review): study_env() reads action_space.shape[0], which suggests a
# vector-valued action space — confirm a bare integer is a valid action here.
my_env.reset()
my_env.step(2)

In [None]:
args

In [None]:
their_env.action_space

In [None]:
# Reset the reference environment and take one step with the action `2`.
# NOTE(review): study_env() reads action_space.shape[0], which suggests a
# vector-valued action space — confirm a bare integer is a valid action here.
their_env.reset()
their_env.step(2)

In [None]:
#experiment.variant
#experiment.model.forward

In [None]:
loss

In [None]:
experiment.model.forward

In [None]:
experiment.model

In [None]:
# Sanity check: 1e-310 is below the smallest normal double (~2.2e-308) but is
# still non-zero, so its logarithm is a finite negative number.
# NOTE(review): presumably probing how small a probability can get before
# log() misbehaves — confirm the intent.
import math
math.log(1e-310)

In [None]:
# Load a previously saved object from disk. The cells below call
# .log_likelihood(), .entropy() and .perplexity on it, so it is a
# distribution-like object rather than a plain tensor.
# NOTE(review): torch.load unpickles the file — only load files you trust.
action_preds = torch.load('action_preds.pt')


In [None]:
# Load the saved object that the next cells pass to
# action_preds.log_likelihood().
# NOTE(review): presumably the target actions for those predictions — confirm.
a = torch.load("a.pt")

In [None]:
action_preds.log_likelihood(a)

In [None]:
sefude = action_preds.log_likelihood(a)

In [None]:
a

In [None]:
a

In [None]:
a

In [None]:
torch.nan_to_num(sefude)

In [None]:
action_preds

In [None]:
a[0][0]

In [None]:
# math.log raises ValueError for negative inputs. Catch and show the error so
# this demonstration does not abort a "Run All" of the notebook.
# Relies on `math` imported in an earlier cell.
try:
    math.log(-0.3)
except ValueError as err:
    print("math.log(-0.3) raised ValueError: {}".format(err))

In [None]:
action_preds.entropy().mean()

In [None]:
action_preds.log_likelihood(10)

In [None]:
action_preds.perplexity

In [None]:
import torch

# Fall back to the CPU when no GPU is visible so this cell still runs.
embed_device = 'cuda' if torch.cuda.is_available() else 'cpu'

state_dim = 4
hidden_size = 512

# Newly constructed (not loaded) embedding layer, compared below against the
# layer saved on disk.
embed_state = torch.nn.Linear(state_dim, hidden_size).to(embed_device)
# map_location lets objects that were saved on a GPU be loaded on a CPU-only
# machine. NOTE(review): torch.load unpickles these files — only load trusted ones.
embed_state_2 = torch.load('embed_state.pt', map_location=embed_device).to(embed_device)
states = torch.load('states.pt', map_location=embed_device).to(embed_device)
# Embeddings from the new layer versus the embeddings stored on disk.
state_embeddings = embed_state(states)
state_embeddings_2 = torch.load('state_embeddings.pt', map_location=embed_device).to(embed_device)


In [None]:
states[0]

In [None]:
print("state_embeddings {}".format(state_embeddings))


In [None]:
print("state_embeddings 2 {}".format(state_embeddings_2))


In [None]:
embed_state.weight

In [None]:
embed_state_2.weight

In [None]:
embed_state

In [None]:
embed_state_2

In [None]:
embed_state(states)

In [None]:
embed_state_2(states)

In [None]:
# NOTE(review): this name is not defined in the visible cells, so running the
# cell raises NameError — presumably a deliberate tripwire that stops
# "Run All" before the cells below. Confirm, or replace with an explicit raise.
stoppppppppppp

In [None]:
import torch
torch.__version__

In [None]:
!pip list | grep torch

In [None]:
!pip3 install torch --upgrade

In [None]:
# Normalizing the rewards to see whether that fixes the problem

In [None]:
# `pickle` is already imported in the notebook's import cell; the repeat here
# is harmless.
import pickle

# Load the two datasets that the following cells compare.
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
with open('data/drone_dataset.pkl', 'rb') as f:
    my_data = pickle.load(f)
    
with open('data/antmaze-large-diverse-v2.pkl', 'rb') as f:
    their_data = pickle.load(f)

In [None]:
# For every entry of my_data, print the max, min and mean of data['actions'],
# followed by a separator line. Relies on `np` from the notebook's import cell.
# NOTE(review): the variable is called `rewards` but it is read from the
# 'actions' key — confirm which field was meant to be inspected.
for data in my_data:
    rewards = data['actions']
    print("max: {}".format(np.max(rewards)))
    print("min: {}".format(np.min(rewards)))
    print("mean: {}".format(np.mean(rewards)))
    print('----------------')

In [None]:
np.shape(my_data[0]['observations'])

In [None]:
np.shape(their_data[0]['observations'])

In [None]:
# Min-max scaling: shifts `v` by its minimum and divides by its value range.
# NOTE(review): `v` is not assigned in the cells shown nearby — confirm it is
# defined before this runs. The denominator is zero when all values are equal.
(v - v.min()) / (v.max() - v.min())