In [None]:
import numpy as np
import torch
import pickle

In [1]:
import argparse
# import gym
import gymnasium as gym
import torch.nn as nn
import time
from data_generator_2groups_4tasks import DataGenerator
from models_en import GaussianPolicy, Value, MultiTaskGaussianPolicy,MutilValue
from environment import get_threshold
from utils import *
from collections import deque

from big_foot_half_cheetah_v4 import BigFootHalfCheetahEnv
from huge_gravity_half_cheetah_v4 import HugeGravityHalfCheetahEnv
from ten_fric_half_cheetah_v4 import TenFricHalfCheetahEnv


from collections import deque
from itertools import combinations
import numpy as np

In [2]:

class Agent:
    def __init__(self,env,envname,args,device):
        # TO-DO: add the task encoding length to observation dimension.
        obs_dim = env.observation_space.shape[0] + 4
        act_dim = env.action_space.shape[0]

        # Change to multi-gaussian policy. No task_id here.
    
        # change which to policy
        policy = GaussianPolicy(obs_dim, act_dim, args.hidden_size, args.activation, args.logstd)
        value_net = Value(obs_dim, args.hidden_size, args.activation)
        cvalue_net = Value(obs_dim, args.hidden_size, args.activation)
        policy.to(device)
        value_net.to(device)
        cvalue_net.to(device)
        
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        
        self.policy = policy
        self.value_net = value_net
        self.cvalue_net = cvalue_net
        
        # Initialize optimizer
        self.pi_optimizer = torch.optim.Adam(policy.parameters(), args.pi_lr)
        self.vf_optimizer = torch.optim.Adam(value_net.parameters(), args.vf_lr)
        self.cvf_optimizer = torch.optim.Adam(cvalue_net.parameters(), args.cvf_lr)

        lr_lambda = lambda it: max(1.0 - it / args.max_iter_num, 0)
        self.pi_scheduler = torch.optim.lr_scheduler.LambdaLR(self.pi_optimizer, lr_lambda=lr_lambda)
        self.vf_scheduler = torch.optim.lr_scheduler.LambdaLR(self.vf_optimizer, lr_lambda=lr_lambda)
        self.cvf_scheduler = torch.optim.lr_scheduler.LambdaLR(self.cvf_optimizer, lr_lambda=lr_lambda)
        
        hyperparams = vars(args)
        
        self.running_stat = RunningStats(clip=5)
        self.score_queue = deque(maxlen=100)
        self.cscore_queue = deque(maxlen=100)

        self.score_queues = [deque(maxlen=100),deque(maxlen=100), deque(maxlen=100), deque(maxlen=100)]
        self.logger = Logger(hyperparams, 1)
        # envname = envname[env]
        self.cost_lim = get_threshold(envname, constraint=args.constraint)
        
class MultiTaskAgent(Agent):
    def __init__(self, env, envname, args, device, num_tasks):
        super().__init__(env, envname, args, device)
        
        # Override the policy with MultiTaskGaussianPolicy
        self.policy = MultiTaskGaussianPolicy(self.obs_dim, self.act_dim, num_tasks, 
                                              args.hidden_size, args.activation, args.logstd)
        self.policy.to(device)
        
        #
        self.value_net = MutilValue(self.obs_dim,num_tasks, args.hidden_size, args.activation)
        #value_net = Value(obs_dim, args.hidden_size, args.activation)
        self.value_net.to(device)
        
        # Reinitialize the optimizer for the new policy
        self.pi_optimizer = torch.optim.Adam(self.policy.parameters(), args.pi_lr)
        self.pi_scheduler = torch.optim.lr_scheduler.LambdaLR(self.pi_optimizer, 
                                                              lr_lambda=lambda it: max(1.0 - it / args.max_iter_num, 0))
        #
        self.vf_optimizer = torch.optim.Adam(self.value_net.parameters(), args.vf_lr)

In [8]:
parser = argparse.ArgumentParser(description='PyTorch FOCOPS Implementation')
parser.add_argument('--epsilon',type=float, default=1000,
                   help='Maximum difference between the return of any two groups (Default: 1000)')
parser.add_argument('--rounds-of-update',type=int, default=3,
                   help='The number of times policy from each group take turn to update')

parser.add_argument('--env-id', default='Humanoid-v3',
                    help='Name of Environment (default: Humanoid-v3')
parser.add_argument('--constraint', default='velocity',
                    help='Constraint setting (default: velocity')
parser.add_argument('--activation', default="tanh",
                    help='Activation function for policy/critic network (Default: tanh)')
parser.add_argument('--hidden_size', type=float, default=(64, 64),
                    help='Tuple of size of hidden layers for policy/critic network (Default: (64, 64))')
parser.add_argument('--logstd', type=float, default=-0.5,
                    help='Log std of Policy (Default: -0.5)')
parser.add_argument('--gamma', type=float, default=0.99,
                    help='Discount factor for reward (Default: 0.99)')
parser.add_argument('--c-gamma', type=float, default=0.99,
                    help='Discount factor for cost (Default: 0.99)')
parser.add_argument('--gae-lam', type=float, default=0.95,
                    help='Lambda value for GAE for reward (Default: 0.95)')
parser.add_argument('--c-gae-lam', type=float, default=0.95,
                    help='Lambda value for GAE for cost (Default: 0.95)')
parser.add_argument('--l2-reg', type=float, default=1e-3,
                    help='L2 Regularization Rate (default: 1e-3)')
parser.add_argument('--pi-lr', type=float, default=1e-4,
                    help='Learning Rate for policy (default: 3e-4)')
parser.add_argument('--vf-lr', type=float, default=3e-4,
                    help='Learning Rate for value function (default: 3e-4)')
parser.add_argument('--cvf-lr', type=float, default=3e-4,
                    help='Learning Rate for c-value function (default: 3e-4)')
parser.add_argument('--lam', type=float, default=1.5,
                    help='Inverse temperature lambda (default: 1.5)')
parser.add_argument('--delta', type=float, default=0.02,
                    help='KL bound (default: 0.02)')
parser.add_argument('--eta', type=float, default=0.02,
                    help='KL bound for indicator function (default: 0.02)')
# parser.add_argument('--nu', type=float, default=0,
#                     help='Cost coefficient (default: 0)')
parser.add_argument('--nu', type=float, default=[0, 0],
                    help='Cost coefficient (default: 0)')
parser.add_argument('--nu_lr', type=float, default=0.01,
                    help='Cost coefficient learning rate (default: 0.01)')
parser.add_argument('--nu_max', type=float, default=2.0,
                    help='Maximum cost coefficient (default: 2.0)')
parser.add_argument('--seed', type=int, default=0,
                    help='Random Seed (default: 0)')
parser.add_argument('--max-eps-len', type=int, default=1000,
                    help='Maximum length of episode (default: 1000)')
parser.add_argument('--mb-size', type=int, default=64,
                    help='Minibatch size per update (default: 64)')
parser.add_argument('--batch-size', type=int, default=2048,
                    help='Batch Size per Update (default: 2048)')
parser.add_argument('--num-epochs', type=int, default=10,
                    help='Number of passes through each minibatch per update (default: 10)')
parser.add_argument('--max-iter-num', type=int, default=200,
                    help='Number of Main Iterations (default: 500)')
parser.add_argument("--nu-init", type=float, default=0,
                    help="the initial nu parameter")
args = parser.parse_args()

usage: ipykernel_launcher.py [-h] [--epsilon EPSILON]
                             [--rounds-of-update ROUNDS_OF_UPDATE]
                             [--env-id ENV_ID] [--constraint CONSTRAINT]
                             [--activation ACTIVATION]
                             [--hidden_size HIDDEN_SIZE] [--logstd LOGSTD]
                             [--gamma GAMMA] [--c-gamma C_GAMMA]
                             [--gae-lam GAE_LAM] [--c-gae-lam C_GAE_LAM]
                             [--l2-reg L2_REG] [--pi-lr PI_LR] [--vf-lr VF_LR]
                             [--cvf-lr CVF_LR] [--lam LAM] [--delta DELTA]
                             [--eta ETA] [--nu NU] [--nu_lr NU_LR]
                             [--nu_max NU_MAX] [--seed SEED]
                             [--max-eps-len MAX_EPS_LEN] [--mb-size MB_SIZE]
                             [--batch-size BATCH_SIZE]
                             [--num-epochs NUM_EPOCHS]
                             [--max-iter-num MAX_ITER_NUM] [--nu-init

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [4]:
agent = MultiTaskAgent(env, 'hcv4')

TypeError: __init__() missing 5 required positional arguments: 'env', 'envname', 'args', 'device', and 'num_tasks'

In [6]:
env = gym.make('HalfCheetah-v4')

In [None]:
a = [[2, 1, 3], [1, 2, 0]]

In [None]:
b = [2, 1, 3, 0]

In [None]:
torch.tensor(a)

In [None]:
np.shape(torch.tensor(a))

In [18]:
torch.max(torch.tensor(a), axis=0)

torch.return_types.max(
values=tensor([2, 2, 3]),
indices=tensor([0, 1, 0]))

In [23]:
torch.argmax(torch.tensor(b))

tensor(2)

In [40]:
with open('x.pkl', 'rb') as f:
    q_values = pickle.load(f)

In [41]:
np.shape(q_values)

torch.Size([64])

In [42]:
q_values

tensor([ 1.5688, -6.5910, -1.0667,  1.1510,  2.3388, -0.2951, -2.0908,  1.6278,
        -1.9939, -0.8018, -0.3756, -2.6624, -0.3271, -0.9380,  0.9938, -0.6569,
        -2.5282,  2.6348, -1.9309,  1.1222, -0.4571,  1.1015, -0.0088,  5.1097,
        -1.5109,  1.5340, -0.3305, -1.5948,  1.3264,  0.0164, -0.9399, -0.8253,
         1.6932, -1.7627,  0.9988, -0.7561, -1.4530, -1.7285,  5.8658, -2.0212,
        -0.2736, -1.0093,  7.7411,  2.3708,  0.4603,  0.0329, -0.1991,  0.7409,
        -2.3331, -1.1794, -1.3610, -0.5626, -2.8930, -0.2411, -1.8140, -0.4838,
         0.5190, -0.8248, -2.4412,  1.6214, -2.6672,  4.1839, -0.7971, -2.3388])

In [35]:
q_values[2]

tensor([-28.6451, -29.3490, -25.4953, -27.8936], grad_fn=<SelectBackward0>)

In [31]:
values, _ = torch.max(q_values, axis=1)

In [37]:
values[2]

tensor(-25.4953, grad_fn=<SelectBackward0>)