In [1]:
#!cat ~/.bashrc

In [2]:
#!mv ../drone_dataset.pkl .

In [3]:
#!pip3 install --upgrade protobuf==3.20.0 

In [4]:
#!pip3 install transformers==4.5.1
#!pip3 install -U tokenizers
# The command below just solves many problems lol
#!pip3 uninstall tokenizers -y

In [5]:
from torch.utils.tensorboard import SummaryWriter
import argparse
import pickle
import random
import time
import gym
import d4rl
import torch
import numpy as np

import utils
from replay_buffer import ReplayBuffer
from lamb import Lamb
from stable_baselines3.common.vec_env import SubprocVecEnv
from pathlib import Path
from data import create_dataloader
from decision_transformer.models.decision_transformer import DecisionTransformer
from evaluation import create_vec_eval_episodes_fn, vec_evaluate_episode_rtg
from trainer import SequenceTrainer
from logger import Logger

from env import make_pytorch_env

#MAX_EPISODE_LEN = 2000 # 4000 # 2000 # 4000 # Warning: there is a similar variable in data.py! 

pybullet build time: May 20 2022 19:44:17


In [6]:
import sys

# Inside Jupyter the kernel's own command line would confuse argparse, so argv
# is reset and every option takes the default declared below.
sys.argv = ['']


def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is True
    because any non-empty string is truthy.

    Raises:
        argparse.ArgumentTypeError: if `value` is not a recognised boolean word.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("1", "true", "t", "yes", "y", "on"):
        return True
    if text in ("0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value)
    )


parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=10)
parser.add_argument("--env", type=str, default="drone_dataset")

# model options
## Training Context Length K: (default: 20)
parser.add_argument("--K", type=int, default=40)
## Embedding dimension: (default: 512)
parser.add_argument("--embed_dim", type=int, default=512)
## Number of Layers: (default: 4)
parser.add_argument("--n_layer", type=int, default=8)
## Number of Attention Heads: (default: 4)
parser.add_argument("--n_head", type=int, default=8)
## Nonlinearity function:
parser.add_argument("--activation_function", type=str, default="relu")
## Dropout:
parser.add_argument("--dropout", type=float, default=0.1)
## Evaluating Context Length K:
parser.add_argument("--eval_context_length", type=int, default=10)
## Positional embedding: absolute ordering
parser.add_argument("--ordering", type=int, default=1) # 0

# shared evaluation options
# g_eval: (default: 3600)
parser.add_argument("--eval_rtg", type=int, default=6000)
parser.add_argument("--num_eval_episodes", type=int, default=10)

# shared training options
parser.add_argument("--init_temperature", type=float, default=0.1)
## Batch Size: (default: 256)
parser.add_argument("--batch_size", type=int, default=256)
parser.add_argument("--learning_rate", "-lr", type=float, default=1e-4)
parser.add_argument("--weight_decay", "-wd", type=float, default=5e-4)
parser.add_argument("--warmup_steps", type=int, default=10000)

# pretraining options
parser.add_argument("--max_pretrain_iters", type=int, default=1)
parser.add_argument("--num_updates_per_pretrain_iter", type=int, default=5000)

# finetuning options
parser.add_argument("--max_online_iters", type=int, default=1500)
parser.add_argument("--online_rtg", type=int, default=7200)
parser.add_argument("--num_online_rollouts", type=int, default=1)
parser.add_argument("--replay_size", type=int, default=1000)
parser.add_argument("--num_updates_per_online_iter", type=int, default=300)
parser.add_argument("--eval_interval", type=int, default=10)

# environment options
parser.add_argument("--device", type=str, default="cuda")
# `--log_to_tb False` now really disables TensorBoard; a bare `-w` means True.
parser.add_argument(
    "--log_to_tb", "-w", type=_str2bool, nargs="?", const=True, default=True
)
parser.add_argument("--save_dir", type=str, default="./exp")
parser.add_argument("--exp_name", type=str, default="default")

# general options
parser.add_argument("--max_episode_len", type=int, default=2000)

# Fast-debugging overrides. Flip FAST_DEBUG to True to shrink the model and the
# training budget. They are applied with set_defaults() because re-declaring an
# option with add_argument() raises a "conflicting option string" error.
FAST_DEBUG = False
FAST_DEBUG_DEFAULTS = {
    "K": 1,                                # normal: 40
    "embed_dim": 4,                        # normal: 512
    "n_layer": 2,                          # normal: 8
    "n_head": 2,                           # normal: 8
    "eval_context_length": 1,              # normal: 10
    "eval_rtg": 600,                       # normal: 6000
    "num_eval_episodes": 2,                # normal: 10
    "batch_size": 4,                       # normal: 256
    "warmup_steps": 10,                    # normal: 10000
    "num_updates_per_pretrain_iter": 500,  # normal: 5000
    "max_online_iters": 500,               # normal: 1500
    "online_rtg": 720,                     # normal: 7200
}
if FAST_DEBUG:
    parser.set_defaults(**FAST_DEBUG_DEFAULTS)

args = parser.parse_args()

In [7]:
class Experiment:
    def __init__(self, variant):
        """Build the model, optimizers, replay buffer and logger for one run.

        Args:
            variant: dict of hyper-parameters (the notebook passes
                ``vars(args)``). Keys read here: "env", "replay_size", "device",
                "K", "eval_context_length", "max_episode_len", "embed_dim",
                "n_layer", "n_head", "activation_function", "dropout",
                "ordering", "init_temperature", "learning_rate",
                "weight_decay", "warmup_steps".
        """
        self.state_dim, self.act_dim, self.action_range = self._get_env_spec(variant)
        self.offline_trajs, self.state_mean, self.state_std = self._load_dataset(
            variant["env"]
        )
        # initialize by offline trajs
        self.replay_buffer = ReplayBuffer(variant["replay_size"], self.offline_trajs)

        # Trajectories collected online are accumulated here by
        # _augment_trajectories.
        self.aug_trajs = []

        self.device = variant.get("device", "cuda")
        self.target_entropy = -self.act_dim
        self.model = DecisionTransformer(
            state_dim=self.state_dim,
            act_dim=self.act_dim,
            action_range=self.action_range,
            max_length=variant["K"],
            eval_context_length=variant["eval_context_length"],
            max_ep_len=variant["max_episode_len"],
            hidden_size=variant["embed_dim"],
            n_layer=variant["n_layer"],
            n_head=variant["n_head"],
            n_inner=4 * variant["embed_dim"],
            activation_function=variant["activation_function"],
            n_positions=1024,
            resid_pdrop=variant["dropout"],
            attn_pdrop=variant["dropout"],
            stochastic_policy=True,
            ordering=variant["ordering"],
            init_temperature=variant["init_temperature"],
            target_entropy=self.target_entropy,
        ).to(device=self.device)

        self.optimizer = Lamb(
            self.model.parameters(),
            lr=variant["learning_rate"],
            weight_decay=variant["weight_decay"],
            eps=1e-8,
        )
        # Linear warmup from 1/warmup_steps up to 1. The max(..., 1) guard keeps
        # `--warmup_steps 0` (no warmup) from raising ZeroDivisionError.
        self.scheduler = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer,
            lambda steps: min((steps + 1) / max(variant["warmup_steps"], 1), 1),
        )

        # The temperature has its own optimizer, separate from the model weights.
        self.log_temperature_optimizer = torch.optim.Adam(
            [self.model.log_temperature],
            lr=1e-4,
            betas=[0.9, 0.999],
        )

        # track the training progress and
        # training/evaluation/online performance in all the iterations
        self.pretrain_iter = 0
        self.online_iter = 0
        self.total_transitions_sampled = 0
        self.variant = variant
        # antmaze environments keep raw rewards; every other env scales by 1e-3.
        self.reward_scale = 1.0 if "antmaze" in variant["env"] else 0.001
        self.logger = Logger(variant)

    def _get_env_spec(self, variant):
        #####env = gym.make(variant["env"])
        env = make_pytorch_env(args)
        #env.max_step = MAX_EPISODE_LEN
        state_dim = env.observation_space.shape[0]
        act_dim = env.action_space.shape[0]
        #action_range = [-0.999999, 0.999999]
        
        action_range = [
            float(env.action_space.low.min()) + 1e-6,
            float(env.action_space.high.max()) - 1e-6,
        ]
        
        print("action_range: {}".format(action_range))
        env.close()
        return state_dim, act_dim, action_range

    def _save_model(self, path_prefix, is_pretrain_model=False):
        to_save = {
            "model_state_dict": self.model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "scheduler_state_dict": self.scheduler.state_dict(),
            "pretrain_iter": self.pretrain_iter,
            "online_iter": self.online_iter,
            "args": self.variant,
            "total_transitions_sampled": self.total_transitions_sampled,
            "np": np.random.get_state(),
            "python": random.getstate(),
            "pytorch": torch.get_rng_state(),
            "log_temperature_optimizer_state_dict": self.log_temperature_optimizer.state_dict(),
        }

        with open(f"{path_prefix}/model.pt", "wb") as f:
            torch.save(to_save, f)
        print(f"\nModel saved at {path_prefix}/model.pt")

        if is_pretrain_model:
            with open(f"{path_prefix}/pretrain_model.pt", "wb") as f:
                torch.save(to_save, f)
            print(f"Model saved at {path_prefix}/pretrain_model.pt")

    def _load_model(self, path_prefix):
        if Path(f"{path_prefix}/model.pt").exists():
            with open(f"{path_prefix}/model.pt", "rb") as f:
                checkpoint = torch.load(f)
            self.model.load_state_dict(checkpoint["model_state_dict"])
            self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
            self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
            self.log_temperature_optimizer.load_state_dict(
                checkpoint["log_temperature_optimizer_state_dict"]
            )
            self.pretrain_iter = checkpoint["pretrain_iter"]
            self.online_iter = checkpoint["online_iter"]
            self.total_transitions_sampled = checkpoint["total_transitions_sampled"]
            np.random.set_state(checkpoint["np"])
            random.setstate(checkpoint["python"])
            torch.set_rng_state(checkpoint["pytorch"])
            print(f"Model loaded at {path_prefix}/model.pt")

    def _load_dataset(self, env_name):

        dataset_path = f"./data/{env_name}.pkl"
        with open(dataset_path, "rb") as f:
            trajectories = pickle.load(f)

        states, traj_lens, returns = [], [], []
        for path in trajectories:
            states.append(path["observations"])
            traj_lens.append(len(path["observations"]))
            returns.append(path["rewards"].sum())
        traj_lens, returns = np.array(traj_lens), np.array(returns)

        # used for input normalization
        states = np.concatenate(states, axis=0)
        state_mean, state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6
        num_timesteps = sum(traj_lens)

        print("=" * 50)
        print(f"Starting new experiment: {env_name}")
        print(f"{len(traj_lens)} trajectories, {num_timesteps} timesteps found")
        print(f"Average return: {np.mean(returns):.2f}, std: {np.std(returns):.2f}")
        print(f"Max return: {np.max(returns):.2f}, min: {np.min(returns):.2f}")
        print(f"Average length: {np.mean(traj_lens):.2f}, std: {np.std(traj_lens):.2f}")
        print(f"Max length: {np.max(traj_lens):.2f}, min: {np.min(traj_lens):.2f}")
        print("=" * 50)

        sorted_inds = np.argsort(returns)  # lowest to highest
        num_trajectories = 1
        timesteps = traj_lens[sorted_inds[-1]]
        ind = len(trajectories) - 2
        while ind >= 0 and timesteps + traj_lens[sorted_inds[ind]] < num_timesteps:
            timesteps += traj_lens[sorted_inds[ind]]
            num_trajectories += 1
            ind -= 1
        sorted_inds = sorted_inds[-num_trajectories:]
        trajectories = [trajectories[ii] for ii in sorted_inds]

        return trajectories, state_mean, state_std

    def _augment_trajectories(
        self,
        online_envs,
        target_explore,
        n,
        randomized=False,
    ):

        max_ep_len = self.variant["max_episode_len"]

        with torch.no_grad():
            # generate init state
            target_return = [target_explore * self.reward_scale] * online_envs.num_envs

            returns, lengths, trajs = vec_evaluate_episode_rtg(
                online_envs,
                self.state_dim,
                self.act_dim,
                self.model,
                max_ep_len=max_ep_len,
                reward_scale=self.reward_scale,
                target_return=target_return,
                mode="normal",
                state_mean=self.state_mean,
                state_std=self.state_std,
                device=self.device,
                use_mean=False,
            )

        self.replay_buffer.add_new_trajs(trajs)
        self.aug_trajs += trajs
        self.total_transitions_sampled += np.sum(lengths)

        return {
            "aug_traj/return": np.mean(returns),
            "aug_traj/length": np.mean(lengths),
        }

    def pretrain(self, eval_envs, loss_fn):
        """Run offline pretraining on `self.offline_trajs`.

        Each iteration builds a fresh dataloader, trains for
        ``num_updates_per_pretrain_iter`` updates, evaluates, logs the metrics
        and saves a checkpoint (also as ``pretrain_model.pt``). Iterations run
        until `self.pretrain_iter` reaches ``max_pretrain_iters``.

        Args:
            eval_envs: vectorized environment used for evaluation.
            loss_fn: loss callable passed to the trainer.
        """
        print("\n\n\n*** Pretrain ***")
        print("----------------")
        print("eval_envs: {}".format(eval_envs))
        print("loss_fn: {}".format(loss_fn))

        eval_fns = [
            create_vec_eval_episodes_fn(
                vec_env=eval_envs,
                eval_rtg=self.variant["eval_rtg"],
                state_dim=self.state_dim,
                act_dim=self.act_dim,
                state_mean=self.state_mean,
                state_std=self.state_std,
                device=self.device,
                use_mean=True,
                reward_scale=self.reward_scale,
                max_episode_len=self.variant["max_episode_len"],
            )
        ]

        trainer = SequenceTrainer(
            model=self.model,
            optimizer=self.optimizer,
            log_temperature_optimizer=self.log_temperature_optimizer,
            scheduler=self.scheduler,
            device=self.device,
        )

        writer = (
            SummaryWriter(self.logger.log_path) if self.variant["log_to_tb"] else None
        )
        try:
            while self.pretrain_iter < self.variant["max_pretrain_iters"]:
                # in every iteration, prepare the data loader
                dataloader = create_dataloader(
                    trajectories=self.offline_trajs,
                    num_iters=self.variant["num_updates_per_pretrain_iter"],
                    batch_size=self.variant["batch_size"],
                    max_len=self.variant["K"],
                    state_dim=self.state_dim,
                    act_dim=self.act_dim,
                    state_mean=self.state_mean,
                    state_std=self.state_std,
                    reward_scale=self.reward_scale,
                    action_range=self.action_range,
                    max_episode_len=self.variant["max_episode_len"],
                )

                train_outputs = trainer.train_iteration(
                    loss_fn=loss_fn,
                    dataloader=dataloader,
                )
                eval_outputs, _ = self.evaluate(eval_fns)
                # self.start_time is set by __call__ before pretraining starts.
                outputs = {"time/total": time.time() - self.start_time}
                outputs.update(train_outputs)
                outputs.update(eval_outputs)
                self.logger.log_metrics(
                    outputs,
                    iter_num=self.pretrain_iter,
                    total_transitions_sampled=self.total_transitions_sampled,
                    writer=writer,
                )

                self._save_model(
                    path_prefix=self.logger.log_path,
                    is_pretrain_model=True,
                )

                self.pretrain_iter += 1
        finally:
            # Flush and release the TensorBoard event file even when training
            # is interrupted or raises.
            if writer is not None:
                writer.close()

    def evaluate(self, eval_fns):
        eval_start = time.time()
        self.model.eval()
        outputs = {}
        for eval_fn in eval_fns:
            o = eval_fn(self.model)
            outputs.update(o)
        outputs["time/evaluation"] = time.time() - eval_start

        eval_reward = outputs["evaluation/return_mean_gm"]
        return outputs, eval_reward

    def online_tuning(self, online_envs, eval_envs, loss_fn):
        """Finetune the model online.

        Each iteration collects new rollouts into the replay buffer, trains for
        ``num_updates_per_online_iter`` updates on the buffer, evaluates every
        ``eval_interval`` iterations and on the last one, logs the metrics and
        saves a checkpoint. Iterations run until `self.online_iter` reaches
        ``max_online_iters``.

        Args:
            online_envs: vectorized environment used to collect rollouts.
            eval_envs: vectorized environment used for evaluation.
            loss_fn: loss callable passed to the trainer.
        """
        print("\n\n\n*** Online Finetuning ***")

        trainer = SequenceTrainer(
            model=self.model,
            optimizer=self.optimizer,
            log_temperature_optimizer=self.log_temperature_optimizer,
            scheduler=self.scheduler,
            device=self.device,
        )
        eval_fns = [
            create_vec_eval_episodes_fn(
                vec_env=eval_envs,
                eval_rtg=self.variant["eval_rtg"],
                state_dim=self.state_dim,
                act_dim=self.act_dim,
                state_mean=self.state_mean,
                state_std=self.state_std,
                device=self.device,
                use_mean=True,
                reward_scale=self.reward_scale,
                max_episode_len=self.variant["max_episode_len"],
            )
        ]
        writer = (
            SummaryWriter(self.logger.log_path) if self.variant["log_to_tb"] else None
        )
        try:
            while self.online_iter < self.variant["max_online_iters"]:

                outputs = {}
                augment_outputs = self._augment_trajectories(
                    online_envs,
                    self.variant["online_rtg"],
                    n=self.variant["num_online_rollouts"],
                )
                outputs.update(augment_outputs)

                dataloader = create_dataloader(
                    trajectories=self.replay_buffer.trajectories,
                    num_iters=self.variant["num_updates_per_online_iter"],
                    batch_size=self.variant["batch_size"],
                    max_len=self.variant["K"],
                    state_dim=self.state_dim,
                    act_dim=self.act_dim,
                    state_mean=self.state_mean,
                    state_std=self.state_std,
                    reward_scale=self.reward_scale,
                    action_range=self.action_range,
                    max_episode_len=self.variant["max_episode_len"],
                )

                # finetuning
                # Evaluate every `eval_interval` iterations and always on the
                # final iteration.
                is_last_iter = self.online_iter == self.variant["max_online_iters"] - 1
                evaluation = (
                    (self.online_iter + 1) % self.variant["eval_interval"] == 0
                    or is_last_iter
                )

                train_outputs = trainer.train_iteration(
                    loss_fn=loss_fn,
                    dataloader=dataloader,
                )
                outputs.update(train_outputs)

                if evaluation:
                    eval_outputs, _ = self.evaluate(eval_fns)
                    outputs.update(eval_outputs)

                # self.start_time is set by __call__ before training starts.
                outputs["time/total"] = time.time() - self.start_time

                # log the metrics
                self.logger.log_metrics(
                    outputs,
                    iter_num=self.pretrain_iter + self.online_iter,
                    total_transitions_sampled=self.total_transitions_sampled,
                    writer=writer,
                )

                self._save_model(
                    path_prefix=self.logger.log_path,
                    is_pretrain_model=False,
                )

                self.online_iter += 1
        finally:
            # Flush and release the TensorBoard event file even when training
            # is interrupted or raises.
            if writer is not None:
                writer.close()

    def __call__(self):
        """Run the experiment: optional pretraining, then optional online tuning.

        Creates one evaluation environment per evaluation episode and, for the
        online phase, one environment per online rollout. Every vectorized
        environment is closed on exit, including when training raises or is
        interrupted, so no worker processes are left behind.
        """
        utils.set_seed_everywhere(self.variant["seed"])

        def loss_fn(
            a_hat_dist,     # action_preds
            a,              # action_target
            attention_mask, # padding_mask
            entropy_reg,    # self.model.temperature().detach()
        ):
            """Return (loss, negative log-likelihood, entropy) for one batch."""
            # a_hat is a SquashedNormal Distribution
            # Only positions where attention_mask > 0 enter the likelihood.
            log_likelihood = a_hat_dist.log_likelihood(a)[attention_mask > 0].mean()

            entropy = a_hat_dist.entropy().mean()
            loss = -(log_likelihood + entropy_reg * entropy)

            return (
                loss,
                -log_likelihood,
                entropy,
            )

        def get_env_builder(seed, env_name, target_goal=None):
            """Return a zero-argument factory that builds one seeded environment."""
            def make_env_fn():
                # NOTE(review): imported inside the factory, presumably so that
                # worker processes also register the d4rl environments - confirm.
                import d4rl

                #####env = gym.make(env_name)
                # NOTE(review): the environment is configured from the
                # notebook-level `args`; `env_name` is not used to build it.
                env = make_pytorch_env(args)
                env.seed(seed)

                if target_goal:
                    env.set_target_goal(target_goal)
                    print(f"Set the target goal to be {env.target_goal}")
                return env

            return make_env_fn

        print("\n\nMaking Eval Env.....")
        env_name = self.variant["env"]
        if "antmaze" in env_name:
            # Sample the goal once so every environment shares the same target.
            env = gym.make(env_name)
            target_goal = env.target_goal
            env.close()
            print(f"Generated the fixed target goal: {target_goal}")
        else:
            target_goal = None
        eval_envs = SubprocVecEnv(
            [
                get_env_builder(i, env_name=env_name, target_goal=target_goal)
                for i in range(self.variant["num_eval_episodes"])
            ]
        )

        try:
            self.start_time = time.time()

            if self.variant["max_pretrain_iters"]:
                self.pretrain(eval_envs, loss_fn)

            if self.variant["max_online_iters"]:
                print("\n\nMaking Online Env.....")
                # Online environments use seeds offset by 100 from the eval ones.
                online_envs = SubprocVecEnv(
                    [
                        get_env_builder(i + 100, env_name=env_name, target_goal=target_goal)
                        for i in range(self.variant["num_online_rollouts"])
                    ]
                )
                try:
                    self.online_tuning(online_envs, eval_envs, loss_fn)
                finally:
                    online_envs.close()
        finally:
            eval_envs.close()

In [None]:
# Seed every RNG first so that model initialisation inside Experiment(...) is
# reproducible, then build the experiment from the parsed options and run it.
utils.set_seed_everywhere(args.seed)
experiment = Experiment(vars(args))

print(50 * "=")
experiment()

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


action_range: [-0.999999, 0.999999]
Starting new experiment: drone_dataset
1254 trajectories, 1971662 timesteps found
Average return: 3687.11, std: 875.27
Max return: 5216.00, min: 1264.00
Average length: 1572.30, std: 325.37
Max length: 2000.00, min: 920.00
Experiment log path: ./exp/2023.03.23/210846-default


Making Eval Env.....


pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17
pybullet build time: May 20 2022 19:44:17





*** Pretrain ***
----------------
eval_envs: <stable_baselines3.common.vec_env.subproc_vec_env.SubprocVecEnv object at 0x7fb12b024f70>
loss_fn: <function Experiment.__call__.<locals>.loss_fn at 0x7fb12af4c550>
Iteration 0
time/total: 2114.2444989681244
time/training: 2100.520931482315
training/train_loss_mean: 485.36548055641714
training/train_loss_std: 1814.8151574660537
training/nll: -8.106392860412598
training/entropy: -5.479274272918701
training/temp_value: 0.1389832622826801
evaluation/return_mean_gm: -15252.124855877486
evaluation/return_std_gm: 585.554763952337
evaluation/length_mean_gm: 2000.0
evaluation/length_std_gm: 0.0
time/evaluation: 13.651319026947021

Model saved at ./exp/2023.03.23/210846-default/model.pt
Model saved at ./exp/2023.03.23/210846-default/pretrain_model.pt


Making Online Env.....


pybullet build time: May 20 2022 19:44:17





*** Online Finetuning ***
Iteration 1
aug_traj/return: -9673.539614860325
aug_traj/length: 2000.0
time/training: 127.313720703125
training/train_loss_mean: -7.490225026782818
training/train_loss_std: 0.32555692744405623
training/nll: -8.513853073120117
training/entropy: -5.705856800079346
training/temp_value: 0.14412743401064995
time/total: 2250.48313164711

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 2
aug_traj/return: -10534.549647993284
aug_traj/length: 2000.0
time/training: 127.218674659729
training/train_loss_mean: -7.625753367341819
training/train_loss_std: 0.3218401629551256
training/nll: -8.470282554626465
training/entropy: -5.619393825531006
training/temp_value: 0.14955678086315907
time/total: 2383.7400357723236

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 3
aug_traj/return: -15603.031585460478
aug_traj/length: 2000.0
time/training: 127.3924560546875
training/train_loss_mean: -7.731088190730953
training/train_loss_std: 0.371361317


Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 18
aug_traj/return: -7944.006460521699
aug_traj/length: 2000.0
time/training: 127.60695505142212
training/train_loss_mean: -8.442174642551599
training/train_loss_std: 0.4070900462555881
training/nll: -9.66983699798584
training/entropy: -6.842126369476318
training/temp_value: 0.2632891111194778
time/total: 4537.902723789215

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 19
aug_traj/return: -10793.501987656271
aug_traj/length: 2000.0
time/training: 127.60411190986633
training/train_loss_mean: -8.529940348801723
training/train_loss_std: 0.403202286107585
training/nll: -9.390022277832031
training/entropy: -7.028358459472656
training/temp_value: 0.2728224064807252
time/total: 4671.672616958618

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 20
aug_traj/return: -4164.831370037468
aug_traj/length: 2000.0
time/training: 127.27671146392822
training/train_loss_mean: -8.421928213902673
traini


Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 35
aug_traj/return: -4802.6055832046895
aug_traj/length: 2000.0
time/training: 127.31180047988892
training/train_loss_mean: -8.394217494437163
training/train_loss_std: 0.2459819692985502
training/nll: -11.72803783416748
training/entropy: -7.603883266448975
training/temp_value: 0.46874407402065593
time/total: 6837.724826812744

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 36
aug_traj/return: -2184.0
aug_traj/length: 2000.0
time/training: 127.38120484352112
training/train_loss_mean: -8.247947189966846
training/train_loss_std: 0.3297887148231673
training/nll: -11.420433044433594
training/entropy: -7.193024635314941
training/temp_value: 0.48454824662970125
time/total: 6971.376536369324

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 37
aug_traj/return: -11877.0
aug_traj/length: 2000.0
time/training: 127.40717267990112
training/train_loss_mean: -8.153495439495861
training/train_loss_st


Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 52
aug_traj/return: -3812.0955484475235
aug_traj/length: 2000.0
time/training: 127.30602240562439
training/train_loss_mean: -6.228546736154475
training/train_loss_std: 0.21938057907267305
training/nll: -12.045866966247559
training/entropy: -7.189733982086182
training/temp_value: 0.8182223768161956
time/total: 9136.809084415436

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 53
aug_traj/return: -1287.8470630222532
aug_traj/length: 2000.0
time/training: 127.27637982368469
training/train_loss_mean: -6.051904610214445
training/train_loss_std: 0.21089151126631148
training/nll: -11.760930061340332
training/entropy: -7.032107830047607
training/temp_value: 0.8450565677472541
time/total: 9270.206394672394

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 54
aug_traj/return: -2357.0496051112173
aug_traj/length: 2000.0
time/training: 127.52279496192932
training/train_loss_mean: -5.87743032775045


Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 69
aug_traj/return: -2228.2708357778765
aug_traj/length: 2000.0
time/training: 127.32409358024597
training/train_loss_mean: -4.964005753189904
training/train_loss_std: 0.18049886563508927
training/nll: -8.59914779663086
training/entropy: -3.041093587875366
training/temp_value: 1.0219139553505876
time/total: 11425.222650527954

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 70
aug_traj/return: -2642.773358190107
aug_traj/length: 2000.0
time/training: 127.55595254898071
training/train_loss_mean: -4.955961346290846
training/train_loss_std: 0.167536105855215
training/nll: -7.830422401428223
training/entropy: -2.9493188858032227
training/temp_value: 1.021584024005158
evaluation/return_mean_gm: -7452.656333461555
evaluation/return_std_gm: 2908.188075092388
evaluation/length_mean_gm: 2000.0
evaluation/length_std_gm: 0.0
time/evaluation: 13.616621971130371
time/total: 11572.666473150253

Model saved at ./exp/


Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 86
aug_traj/return: -2321.6655872848937
aug_traj/length: 2000.0
time/training: 127.21845245361328
training/train_loss_mean: -4.950611316902603
training/train_loss_std: 0.17987583099972462
training/nll: -8.133801460266113
training/entropy: -2.921909809112549
training/temp_value: 1.023179537840018
time/total: 13723.692293167114

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 87
aug_traj/return: -2052.4071153669124
aug_traj/length: 2000.0
time/training: 127.33053803443909
training/train_loss_mean: -4.932280317543361
training/train_loss_std: 0.16781275725991118
training/nll: -8.006542205810547
training/entropy: -3.1798818111419678
training/temp_value: 1.0221022617544655
time/total: 13857.107142686844

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 88
aug_traj/return: -595.954878035112
aug_traj/length: 2000.0
time/training: 127.07156562805176
training/train_loss_mean: -4.930051889037512



Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 103
aug_traj/return: -4077.488006092503
aug_traj/length: 2000.0
time/training: 127.37723660469055
training/train_loss_mean: -4.888995764348415
training/train_loss_std: 0.17870473136631906
training/nll: -7.891871452331543
training/entropy: -3.0671756267547607
training/temp_value: 1.0204512413930622
time/total: 16020.960854768753

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 104
aug_traj/return: -3055.4539291886585
aug_traj/length: 2000.0
time/training: 127.56203937530518
training/train_loss_mean: -4.8884461689611864
training/train_loss_std: 0.18468386787160052
training/nll: -7.673949718475342
training/entropy: -2.7084896564483643
training/temp_value: 1.0197298892582949
time/total: 16154.52244234085

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 105
aug_traj/return: -20.071887450380018
aug_traj/length: 2000.0
time/training: 127.39182662963867
training/train_loss_mean: -4.8695103318


Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 120
aug_traj/return: -132.14730108438744
aug_traj/length: 2000.0
time/training: 127.9992618560791
training/train_loss_mean: -4.80662362074915
training/train_loss_std: 0.18500895245234972
training/nll: -7.059218406677246
training/entropy: -2.6377193927764893
training/temp_value: 1.0211542773438012
evaluation/return_mean_gm: -5973.018261334088
evaluation/return_std_gm: 1268.9433281010117
evaluation/length_mean_gm: 2000.0
evaluation/length_std_gm: 0.0
time/evaluation: 13.589059591293335
time/total: 18320.227265119553

Model saved at ./exp/2023.03.23/210846-default/model.pt
Iteration 121
aug_traj/return: -100.61943347653687
aug_traj/length: 2000.0
time/training: 127.19037985801697
training/train_loss_mean: -4.8312761826772626
training/train_loss_std: 0.18534708746246845
training/nll: -8.059770584106445
training/entropy: -3.0569374561309814
training/temp_value: 1.0185109095188078
time/total: 18453.49631500244

Model saved at

In [None]:
def study_env(env):
    """Print the state size, action size and clipped action range of `env`.

    The action range is [lowest lower bound + 1e-6, highest upper bound - 1e-6],
    the same formula Experiment._get_env_spec uses.
    """
    observation_space, action_space = env.observation_space, env.action_space

    state_dim = observation_space.shape[0]
    act_dim = action_space.shape[0]
    lower_bound = float(action_space.low.min()) + 1e-6
    upper_bound = float(action_space.high.max()) - 1e-6
    action_range = [lower_bound, upper_bound]

    print("state_dim: {}".format(state_dim))
    print("act_dim: {}".format(act_dim))
    print("action_range: {}".format(action_range))


In [None]:
my_env = make_pytorch_env(args)
their_env = gym.make('antmaze-large-diverse-v2')

In [None]:
study_env(my_env)

In [None]:
study_env(their_env)

In [None]:
my_env.reset()
my_env.step(2)

In [None]:
args

In [None]:
their_env.action_space

In [None]:
their_env.reset()
their_env.step(2)

In [None]:
#experiment.variant
#experiment.model.forward

In [None]:
loss

In [None]:
experiment.model.forward

In [None]:
experiment.model

In [None]:
import math
math.log(1e-310)

In [None]:
action_preds = torch.load('action_preds.pt')


In [None]:
a = torch.load("a.pt")

In [None]:
action_preds.log_likelihood(a)

In [None]:
sefude = action_preds.log_likelihood(a)

In [None]:
a

In [None]:
a

In [None]:
a

In [None]:
torch.nan_to_num(sefude)

In [None]:
action_preds

In [None]:
a[0][0]

In [None]:
math.log(-0.3)

In [None]:
action_preds.entropy().mean()

In [None]:
action_preds.log_likelihood(10)

In [None]:
action_preds.perplexity

In [None]:
# Debug cell: compare a freshly initialised state-embedding layer against
# tensors/modules dumped to disk elsewhere.
# NOTE(review): the .pt files are not written anywhere in this notebook -
# presumably they are saved from inside the model code; confirm their origin.
import torch
state_dim = 4
hidden_size = 512

# A new, randomly initialised Linear layer; everything is moved to 'cuda'.
embed_state = torch.nn.Linear(state_dim, hidden_size).to('cuda')
# NOTE(review): torch.load unpickles whole objects here - only open trusted files.
embed_state_2 = torch.load('embed_state.pt').to('cuda')
states = torch.load('states.pt').to('cuda')
# Embeddings from the fresh layer, to compare with the saved state_embeddings_2.
state_embeddings = embed_state(states)
state_embeddings_2 = torch.load('state_embeddings.pt').to('cuda')


In [None]:
states[0]

In [None]:
print("state_embeddings {}".format(state_embeddings))


In [None]:
print("state_embeddings 2 {}".format(state_embeddings_2))


In [None]:
embed_state.weight

In [None]:
embed_state_2.weight

In [None]:
embed_state

In [None]:
embed_state_2

In [None]:
embed_state(states)

In [None]:
embed_state_2(states)

In [None]:
stoppppppppppp

In [None]:
import torch
torch.__version__

In [None]:
!pip list | grep torch

In [None]:
!pip3 install torch --upgrade

In [None]:
# Normalizing the rewards to see whether that solves the problem

In [None]:
# Load both pickled datasets side by side for comparison.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
import pickle

# The drone dataset.
with open('data/drone_dataset.pkl', 'rb') as f:
    my_data = pickle.load(f)

# The antmaze dataset, used as the reference for comparison.
with open('data/antmaze-large-diverse-v2.pkl', 'rb') as f:
    their_data = pickle.load(f)

In [None]:
# Print the range and mean of the recorded actions for every trajectory.
# The variable is named after the field actually read ('actions'); it was
# previously called `rewards`, which mislabelled the printed numbers.
# To inspect rewards instead, read data['rewards'] here.
for data in my_data:
    actions = data['actions']
    print("max: {}".format(np.max(actions)))
    print("min: {}".format(np.min(actions)))
    print("mean: {}".format(np.mean(actions)))
    print('----------------')

In [None]:
np.shape(my_data[0]['observations'])

In [None]:
np.shape(their_data[0]['observations'])

In [None]:
(v - v.min()) / (v.max() - v.min())