In [1]:
import os
import time
from datetime import datetime
import argparse
import gymnasium as gym
import numpy as np
import torch as th
import pandas as pd
import csv

from gym_pybullet_drones.utils.Logger import Logger
from gym_pybullet_drones.envs.HoverAviary import HoverAviary
from gym_pybullet_drones.envs.MultiHoverAviary import MultiHoverAviary
from gym_pybullet_drones.utils.utils import sync, str2bool
from gym_pybullet_drones.utils.enums import ObservationType, ActionType, Physics

from policies import GaussianMLPPolicy
from server import Federated_RL

# ---- Simulation defaults ----------------------------------------------------
# Only DEFAULT_OBS and DEFAULT_ACT are read by the setup cell that follows; the
# other DEFAULT_* names are not referenced in the visible part of the notebook.
DEFAULT_GUI = True
DEFAULT_RECORD_VIDEO = True
DEFAULT_OUTPUT_FOLDER = 'results'
DEFAULT_COLAB = False
DEFAULT_DYNAMICS = Physics('pyb') # pyb: Pybullet dynamics; dyn: Explicit Dynamics specified in BaseAviary.py
DEFAULT_WIND = np.array([0, 0.05, 0]) # units are in induced newtons
DEFAULT_OBS = ObservationType('kin') # 'kin' or 'rgb'
DEFAULT_ACT = ActionType('rpm') # 'rpm' or 'pid' or 'vel' or 'one_d_rpm' or 'one_d_pid'
DEFAULT_AGENTS = 2
DEFAULT_MA = False

# ---- Domain randomization (DR) ----------------------------------------------
# DR is the master switch; MASS_RANGE and WIND_RANGE are replicated per agent in
# the setup cell and handed to Federated_RL.
DR = True
MASS_RANGE = [0.027, 0.042] # Maximum recommended payload is 15g (presumably [min, max] mass in kg - confirm)
WIND_RANGE = 0.005 # Inspired by literature; the training log reports this as a maximum wind magnitude in Newtons


pybullet build time: Jun 24 2024 15:23:59


In [2]:
# One agent per algorithm; every agent trains in its own HoverAviary instance.
algorithms = ['FedSVRPG-M', 'PPO', 'SAC', 'TD3']
num_agents = len(algorithms)
envs = [HoverAviary for _ in range(num_agents)]
# A fresh kwargs dict per agent so that no two agents share a mutable mapping.
env_kwargs = [dict(obs = DEFAULT_OBS, act = DEFAULT_ACT) for _ in range(num_agents)]
agent_names = algorithms

# Per-agent domain-randomization settings. These three lists are always bound so
# that the Federated_RL(...) call below also works when DR is switched off
# (previously they only existed inside the `if DR` branch -> NameError).
domain_randomizations = [DR for _ in range(num_agents)]
if DR:
    DR_episode_thresholds = [.5 for _ in range(num_agents)] # Probability of DR at each episode
    DR_step_thresholds = [.3 for _ in range(num_agents)] # If DR episode, probability of wind at each step
else:
    # Zero probability everywhere; presumably ignored by Federated_RL when the
    # matching DR flag is False - confirm against server.py.
    DR_episode_thresholds = [0. for _ in range(num_agents)]
    DR_step_thresholds = [0. for _ in range(num_agents)]

mass_ranges = [MASS_RANGE for _ in range(num_agents)]
wind_ranges = [WIND_RANGE for _ in range(num_agents)]

# Probe environment: built only to read the observation/action dimensions.
env_example = HoverAviary(**env_kwargs[0])
# Get the state size
state_space = env_example.observation_space
state_size = state_space.shape[1]
# Get the action size
action_space = env_example.action_space
action_size = action_space.shape[1]
# The probe is not used after this point; release its simulator resources.
env_example.close()

layers = [512, 512, 256, 128]
value_layers = [32, 32]
# Maintain consistent network structures
policy_kwargs = dict(activation_fn=th.nn.Tanh,
                     net_arch=dict(pi=layers, qf=value_layers))

print("State size:", state_size)
print("Action size:", action_size)

# TODO: find a smarter way to initialise the policy inside the model.
# NOTE: a single shared policy is assumed for now; a general structure for
# diversified algorithms is still to be implemented.
policy = GaussianMLPPolicy(input_size=state_size, output_size=action_size, hidden_layers=layers)

#### Train the model #######################################
model = Federated_RL(policy = policy,
                     envs = envs,
                     env_kwargs = env_kwargs,
                     num_agents = num_agents,
                     global_iterations = 2,
                     state_size = state_size,
                     action_size = action_size,
                     local_step_size = 1e-3,
                     policy_kwargs = policy_kwargs,
                     critic_net_aggregation = True,
                     critic_net = value_layers,
                     local_iterations = 5,
                     max_episode_length=64*4,
                     agent_names = agent_names,
                     DR = domain_randomizations,
                     DR_episode_th = DR_episode_thresholds,
                     DR_step_th = DR_step_thresholds,
                     mass_ranges = mass_ranges,
                     wind_ranges = wind_ranges,
                     algorithms = algorithms)



State size: 72
Action size: 4


  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [3]:
# Run training on the Federated_RL instance built above. The returned object is
# kept because a later cell calls get_rewards() on it.
serverModel = model.learn()

  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")



Training agent FedSVRPG-M

GLOBAL ITERATION: 0
LOCAL ITERATION: 0

Episode Reward: 78.96915473295871

Importance sampling weight: 1.7999999523162842

Episode has activated Domain Randomization. Wind will be applied with a probability of 0.3 with a maximum magnitude of 0.005 Newtons.
Mass parameter for next episode: 0.03629291746542303
GLOBAL ITERATION: 0
LOCAL ITERATION: 1

Episode Reward: 67.09965754714028

Importance sampling weight: 1.7999999523162842

Episode has activated Domain Randomization. Wind will be applied with a probability of 0.3 with a maximum magnitude of 0.005 Newtons.
Mass parameter for next episode: 0.0393113446402317
GLOBAL ITERATION: 0
LOCAL ITERATION: 2

Episode Reward: 96.55948269861223

Importance sampling weight: 0.0010000000474974513

Episode has activated Domain Randomization. Wind will be applied with a probability of 0.3 with a maximum magnitude of 0.005 Newtons.
Mass parameter for next episode: 0.03388872871207071
GLOBAL ITERATION: 0
LOCAL ITERATION: 3





Eval num_timesteps=256, episode_reward=133.34 +/- 0.32
Episode length: 131.20 +/- 0.40
New best mean reward!
Eval num_timesteps=512, episode_reward=40.71 +/- 3.10
Episode length: 45.20 +/- 7.11
Eval num_timesteps=768, episode_reward=42.86 +/- 5.89
Episode length: 38.60 +/- 10.76
Eval num_timesteps=1024, episode_reward=51.56 +/- 6.08
Episode length: 68.60 +/- 6.09
Eval num_timesteps=1280, episode_reward=46.96 +/- 10.17
Episode length: 62.60 +/- 7.89

Training agent SAC





Episode has activated Domain Randomization. Wind will be applied with a probability of 0.3 with a maximum magnitude of 0.005 Newtons.
Mass parameter for next episode: 0.03051991483783581
Eval num_timesteps=256, episode_reward=21.88 +/- 0.00
Episode length: 16.00 +/- 0.00
New best mean reward!
Episode has activated Domain Randomization. Wind will be applied with a probability of 0.3 with a maximum magnitude of 0.005 Newtons.
Mass parameter for next episode: 0.03189797328842401
Eval num_timesteps=512, episode_reward=21.88 +/- 0.00
Episode length: 16.00 +/- 0.00
New best mean reward!
Episode has activated Domain Randomization. Wind will be applied with a probability of 0.3 with a maximum magnitude of 0.005 Newtons.
Mass parameter for next episode: 0.033123289921653525
Eval num_timesteps=768, episode_reward=21.90 +/- 0.00
Episode length: 16.00 +/- 0.00
New best mean reward!
Episode has activated Domain Randomization. Wind will be applied with a probability of 0.3 with a maximum magnitude o

In [4]:
# Collect the rewards recorded during learn(); the following cell treats the
# result as a sequence of sequences (it prints len() of each element).
totalRewards = serverModel.get_rewards()


In [12]:
# Outer length first, then the length of every inner reward sequence.
print(len(totalRewards))
for reward_series in totalRewards:
    print(len(reward_series))

10
5
6
5
5
4
5
6
5
5
4


Everything below this point is scratch code used for ad-hoc unit testing.

In [None]:
# List every named parameter of the server model and keep the tensors whose
# name contains 'fc' (the fully connected layers) in model_params.
main_model = model.get_model()
model_params = []
for param_name, param in main_model.named_parameters():
    print(f"{param_name}: {param.shape}")
    if 'fc' in param_name:
        model_params.append(param)
print()
# Echo the shapes of the selected tensors.
for fc_param in model_params:
    print(fc_param.size())


std_bias: torch.Size([])
fc1.weight: torch.Size([512, 27])
fc1.bias: torch.Size([512])
fc_layers.0.weight: torch.Size([512, 512])
fc_layers.0.bias: torch.Size([512])
fc_layers.1.weight: torch.Size([256, 512])
fc_layers.1.bias: torch.Size([256])
fc_layers.2.weight: torch.Size([128, 256])
fc_layers.2.bias: torch.Size([128])
fc2.weight: torch.Size([1, 128])
fc2.bias: torch.Size([1])

torch.Size([512, 27])
torch.Size([512])
torch.Size([512, 512])
torch.Size([512])
torch.Size([256, 512])
torch.Size([256])
torch.Size([128, 256])
torch.Size([128])
torch.Size([1, 128])
torch.Size([1])


In [None]:
import torch
# Set a seed for reproducibility. Everything below that draws random numbers
# (policy construction, torch.randn) runs after this call, so the statement
# order in this cell determines the generated values.
torch.manual_seed(42)
# NOTE(review): samplePolicy is constructed here but never read in this cell;
# the next statement reads from `policy` (bound in an earlier cell) instead.
# Confirm which of the two instances was intended.
samplePolicy = GaussianMLPPolicy(input_size=state_size, output_size=action_size, hidden_layers=layers)
# Reference policy tensors: all named parameters of `policy` except the first two.
policy_list = [param for name, param in policy.named_parameters()][2:]
print([param.shape for param in policy_list])
# The string literal below is a disabled alternative (random tensors with
# hard-coded shapes); as a bare expression it has no effect at runtime.
'''
# Create a list of tensors with the specified sizes
policy_list = [
    torch.randn(512, 27),  # fc1.weight
    torch.randn(512),      # fc1.bias
    torch.randn(512, 512), # fc_layers.0.weight
    torch.randn(512),      # fc_layers.0.bias
    torch.randn(256, 512), # fc_layers.1.weight
    torch.randn(256),      # fc_layers.1.bias
    torch.randn(128, 256), # fc_layers.2.weight
    torch.randn(128),      # fc_layers.2.bias
    torch.randn(1, 128),   # fc2.weight
    torch.randn(1)         # fc2.bias
]
'''
# Random stand-ins for the reference critic tensors; the trailing comments name
# the critic entry each shape is meant to correspond to.
value_list = [
    torch.randn(32, 27),   # critic_target.qf0.0.weight
    torch.randn(32),       # critic_target.qf0.0.bias
    torch.randn(32, 32),   # critic_target.qf0.2.weight
    torch.randn(32),       # critic_target.qf0.2.bias
    torch.randn(1, 32),    # critic_target.qf0.4.weight
    torch.randn(1)         # critic_target.qf0.4.bias
]

[torch.Size([512, 27]), torch.Size([512]), torch.Size([512, 512]), torch.Size([512]), torch.Size([256, 512]), torch.Size([256]), torch.Size([128, 256]), torch.Size([128]), torch.Size([1, 128]), torch.Size([1])]


In [None]:
from stable_baselines3 import SAC
import torch
from utils import SAC_policy_update, get_ActorCritic_delta
# Load the saved SAC run and pull out its policy state dict.
SAC_model = SAC.load('/Users/kevinhan/opt/anaconda3/envs/drones/lib/python3.12/site-packages/Federated_RL/SAC_test_run/final_model.zip')
params = SAC_model.get_parameters()['policy']

# Actor tensors: the latent layers plus the mean head.
policy_params = []
for key in params:
    if 'actor.mu' in key or 'actor.latent' in key:
        policy_params.append(params[key])
print([tensor.shape for tensor in policy_params])

# Critic tensors: element-wise average of the two Q-networks, paired by position.
p1 = [key for key in params if 'critic.qf0' in key]
p2 = [key for key in params if 'critic.qf1' in key]
p3 = []
for qf0_key, qf1_key in zip(p1, p2):
    p3.append((params[qf0_key] + params[qf1_key]) / 2)

# Split the last input column off the first critic layer and keep only the
# remaining columns in the critic list.
first_layer = p3[0]
state_tensor, action_tensor = first_layer[:, :-1], first_layer[:, -1:]
p3[0] = state_tensor

SAC_dp, SAC_dv = get_ActorCritic_delta(policy_list, value_list, policy_params, p3)
print(SAC_dp)

[torch.Size([512, 27]), torch.Size([512]), torch.Size([512, 512]), torch.Size([512]), torch.Size([256, 512]), torch.Size([256]), torch.Size([128, 256]), torch.Size([128]), torch.Size([1, 128]), torch.Size([1])]
[tensor([[-0.1946, -0.0768,  0.1262,  ...,  0.1544, -0.1885, -0.1533],
        [ 0.1978, -0.0163,  0.0471,  ..., -0.1965,  0.0983,  0.0018],
        [-0.2438,  0.2235, -0.1447,  ...,  0.0854, -0.1452, -0.0766],
        ...,
        [ 0.0584,  0.3280,  0.0702,  ...,  0.2089, -0.1179,  0.0209],
        [-0.0813,  0.1450,  0.2429,  ..., -0.1136, -0.0697, -0.1240],
        [-0.0536, -0.2149,  0.2787,  ...,  0.1074, -0.1406,  0.0563]],
       grad_fn=<SubBackward0>), tensor([-4.2979e-02, -1.0624e-01, -8.4359e-02,  2.3150e-01, -1.3429e-01,
        -5.7821e-02, -2.1505e-01, -5.6568e-04, -3.3384e-02,  2.8125e-01,
        -1.0576e-01, -1.6301e-01, -3.3318e-02,  5.6894e-02, -6.0403e-02,
         1.0583e-01, -4.1496e-02,  8.1576e-03,  3.4869e-03,  2.1273e-01,
        -4.7976e-02, -9.0580e-

In [None]:
from stable_baselines3 import TD3
from utils import TD3_policy_update, get_ActorCritic_delta

# Accumulators for the per-agent deltas and the split-off action weights.
policy_deltas, values_deltas, action_weights = [], [], []

TD3_model = TD3.load('/Users/kevinhan/opt/anaconda3/envs/drones/lib/python3.12/site-packages/Federated_RL/TD3_test_run/final_model.zip')
params = TD3_model.get_parameters()['policy']

# Actor tensors, then the key lists of the two critic Q-networks.
TD3_policy_params = [params[key] for key in params if 'actor.mu' in key]
TD3_critic_params1 = [key for key in params if 'critic.qf0' in key]
TD3_critic_params2 = [key for key in params if 'critic.qf1' in key]

# Consider the average of the 2 Q networks, paired by position.
TD3_critic_params = []
for qf0_key, qf1_key in zip(TD3_critic_params1, TD3_critic_params2):
    TD3_critic_params.append((params[qf0_key] + params[qf1_key]) / 2)

# TD3 uses state-action value estimation, so the action column is cut from the
# end of the first critic layer and stored separately.
first_tensor = TD3_critic_params[0]
state_tensor = first_tensor[:, :-1]
action_tensor = first_tensor[:, -1:]
TD3_critic_params[0] = state_tensor

TD3_dp, TD3_dv = get_ActorCritic_delta(policy_list, value_list, TD3_policy_params, TD3_critic_params)
policy_deltas.append(TD3_dp)
values_deltas.append(TD3_dv)
action_weights.append(action_tensor)
def create_tensors_with_same_shapes(reference_tensors, seed=42):
    """Return standard-normal tensors shaped like each reference tensor.

    Args:
        reference_tensors: iterable of objects exposing a ``shape`` attribute.
        seed: value passed to ``torch.manual_seed`` before any sampling, which
            makes the output reproducible (and reseeds torch's global RNG).

    Returns:
        A list with one ``torch.randn`` tensor per reference, in input order.
    """
    torch.manual_seed(seed)
    generated = []
    for reference in reference_tensors:
        generated.append(torch.randn(reference.shape))
    return generated
# Add two more reproducible random delta lists shaped like the TD3 delta, then
# report how many tensors the first delta list holds.
for extra_seed in (43, 15):
    policy_deltas.append(create_tensors_with_same_shapes(TD3_dp, seed=extra_seed))
print(len(policy_deltas[0]))

10


In [None]:
from utils import multiply_tensors_in_place
# Call the helper on the first delta list with scalar 2, then print both the
# input list and the first returned tensor so the two can be compared.
# NOTE(review): the helper's name suggests the input is modified in place -
# confirm against utils.py.
weighted_delta = multiply_tensors_in_place(tensors=policy_deltas[0], scalar=2)
print(policy_deltas[0])
print(weighted_delta[0])

[tensor([[-4.0915, -3.0744, -1.9402,  ..., -2.3465, -2.7450, -1.1898],
        [-2.8013,  0.7884, -0.0803,  ..., -1.2924,  1.2125, -2.5214],
        [ 1.8821,  1.6310,  2.8634,  ...,  4.1113,  1.2098,  0.2790],
        ...,
        [ 1.6250,  2.1202, -0.0757,  ...,  2.9574, -0.4262, -2.3083],
        [ 0.5302,  1.1562,  0.2127,  ..., -3.6054, -0.9754,  2.2718],
        [-2.1870,  1.2250,  0.4627,  ...,  1.0937, -1.3609,  0.2187]]), tensor([-1.3627e+00, -1.1204e+00, -7.6458e-01, -3.3671e-01,  3.2080e+00,
         8.4263e-01,  1.3523e+00, -1.4536e+00, -1.0073e+00, -1.0807e+00,
        -1.1328e+00,  2.9584e-01,  6.2563e-01, -8.0994e-01, -7.0776e-01,
         9.7964e-01,  1.3888e+00,  1.5587e+00, -1.5110e+00, -6.4336e-01,
        -1.4469e+00, -2.2614e+00, -1.9713e+00, -1.8977e+00,  5.9557e-01,
         2.9247e+00, -1.7017e+00, -1.1471e-01, -3.3423e+00,  2.2237e+00,
        -3.8842e+00, -2.3225e+00,  1.9973e+00,  1.6105e+00, -1.5293e+00,
         7.4402e-01,  5.7538e-01, -1.7939e+00,  2.746

In [4]:
from stable_baselines3 import PPO
# Collect the PPO actor tensors (policy MLP layers + action head) from a saved run.
PPO_params = []
# NOTE(review): absolute, machine-specific path - consider a configurable model directory.
PPO_model = PPO.load('/Users/kevinhan/opt/anaconda3/envs/drones/lib/python3.12/site-packages/Federated_RL/PPO/best_model.zip')
PPO_orig_params_OG = PPO_model.get_parameters()
PPO_orig_params = PPO_orig_params_OG['policy']
for param_name, tensor in PPO_orig_params.items():
    print(param_name + ': ' + str(tensor.shape))
    # Each substring needs its own membership test: the previous
    # `'mlp' and 'policy' in name` only ever checked for 'policy', because the
    # bare string 'mlp' is always truthy. This now matches the filter used by
    # the write-back cell further down.
    if ('mlp' in param_name and 'policy' in param_name) or 'action' in param_name:
        PPO_params.append(tensor)


log_std: torch.Size([3])
mlp_extractor.policy_net.0.weight: torch.Size([512, 57])
mlp_extractor.policy_net.0.bias: torch.Size([512])
mlp_extractor.policy_net.2.weight: torch.Size([512, 512])
mlp_extractor.policy_net.2.bias: torch.Size([512])
mlp_extractor.policy_net.4.weight: torch.Size([256, 512])
mlp_extractor.policy_net.4.bias: torch.Size([256])
mlp_extractor.policy_net.6.weight: torch.Size([128, 256])
mlp_extractor.policy_net.6.bias: torch.Size([128])
mlp_extractor.value_net.0.weight: torch.Size([32, 57])
mlp_extractor.value_net.0.bias: torch.Size([32])
mlp_extractor.value_net.2.weight: torch.Size([32, 32])
mlp_extractor.value_net.2.bias: torch.Size([32])
action_net.weight: torch.Size([3, 128])
action_net.bias: torch.Size([3])
value_net.weight: torch.Size([1, 32])
value_net.bias: torch.Size([1])


In [None]:
# Element-wise difference between the server-model tensors and the PPO actor
# tensors, paired by position (zip stops at the shorter list).
new_params = []
for model_tensor, ppo_tensor in zip(model_params, PPO_params):
    new_params.append(model_tensor - ppo_tensor)
print(new_params[0].size())

torch.Size([512, 27])


In [None]:
# Keys of the PPO actor entries (policy MLP layers and action head), in dict order.
actor_keys = [key for key in PPO_orig_params if ('mlp' in key and 'policy' in key) or 'action' in key]
# Overwrite each of those entries with the matching tensor from new_params.
for key, replacement in zip(actor_keys, new_params):
    PPO_orig_params[key] = replacement


In [None]:
from stable_baselines3.common.env_util import make_vec_env
# Put the edited policy state dict back into the full parameter mapping and
# load it into the PPO model. exact_match=True is passed so the call is strict
# about the supplied parameter set.
PPO_orig_params_OG['policy'] = PPO_orig_params
PPO_model.set_parameters(PPO_orig_params_OG, exact_match = True)

In [None]:
from utils import PPO_policy_update
# Exercise the project helper with the PPO actor tensors as the policy update
# and no value-net update, then show what it returns.
# NOTE: this rebinds new_params, which earlier held the list of tensor differences.
new_params = PPO_policy_update(PPO_Model=PPO_model, policy_net_update=PPO_params, value_net_update = None)
print(new_params)

{'policy': OrderedDict({'log_std': tensor([0.0155]), 'mlp_extractor.policy_net.0.weight': tensor([[ 0.2983,  0.0939,  0.0582,  ..., -0.3148, -0.2203,  0.0149],
        [ 0.1385,  0.2232, -0.0835,  ..., -0.1183, -0.1324, -0.0246],
        [ 0.1947,  0.0075, -0.1628,  ..., -0.0387,  0.0062,  0.1038],
        ...,
        [-0.1292, -0.3942, -0.0594,  ...,  0.0531, -0.0747,  0.1528],
        [-0.0552,  0.2426,  0.0551,  ..., -0.0403,  0.0743, -0.1494],
        [ 0.0859,  0.0312,  0.1152,  ...,  0.1407, -0.0177, -0.1886]]), 'mlp_extractor.policy_net.0.bias': tensor([ 1.2528e-02, -9.4204e-03, -9.1568e-03,  3.4986e-03, -1.2750e-02,
        -9.8455e-05,  6.7938e-03,  1.2609e-02, -1.8640e-04,  5.3353e-03,
         8.9538e-03,  5.2348e-04,  4.0354e-03, -4.9570e-03, -1.4039e-02,
        -8.2345e-03,  1.2211e-02,  5.4926e-03,  1.4528e-02, -1.1743e-02,
         1.1887e-02,  3.5642e-03, -8.1238e-04,  6.3826e-03,  9.9020e-03,
        -1.2047e-02, -8.4845e-03, -9.1807e-03,  8.9473e-03, -1.9909e-04,
  

In [None]:
import torch.nn as nn
from collections import OrderedDict

def build_nested_module_params(module, param_dict):
    """Group a flat ``{dotted_name: param}`` mapping by its top-level prefix.

    Names without a dot are stored directly under their own name. Names with a
    dot are split at the first dot: the part before it selects a sub-dictionary
    and the remainder becomes the key inside that sub-dictionary. For example,
    ``{'log_std': a, 'action_net.weight': b}`` becomes
    ``{'log_std': a, 'action_net': {'weight': b}}``.

    The earlier version defined an inner helper that was never called, so the
    function always returned ``None``. It also stored values in
    ``nn.ModuleDict``, which accepts neither plain tensors nor keys containing
    dots. Plain dictionaries are used instead.

    Args:
        module: unused; kept so existing call sites keep working.
        param_dict: mapping from (possibly dotted) parameter name to value.

    Returns:
        dict: the grouped mapping, in first-seen order. When the same key
        occurs twice, the first value is kept.

    Raises:
        ValueError: if a name is used both as a direct parameter and as a prefix.
    """
    nested_dict = {}
    for name, param in param_dict.items():
        module_name, dot, param_name = name.partition('.')
        if not dot:
            # No '.' in the name: it is a direct parameter of the module.
            if isinstance(nested_dict.get(name), dict):
                raise ValueError(f"'{name}' is both a parameter and a sub-module prefix")
            nested_dict.setdefault(name, param)
            continue

        group = nested_dict.setdefault(module_name, {})
        if not isinstance(group, dict):
            raise ValueError(f"'{module_name}' is both a parameter and a sub-module prefix")
        # A name ending in '.' only creates the (empty) group.
        if param_name:
            group.setdefault(param_name, param)
    return nested_dict

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the neural network class
class SimpleNN(nn.Module):
    """Scalar two-stage affine model: ``y = w2 * (w1 * x + b1) + b2``."""

    def __init__(self):
        super().__init__()
        # Registration order (w1, b1, w2, b2) fixes the order in which
        # named_parameters() yields them.
        for attr, initial in (("w1", 0.5), ("b1", 0.0), ("w2", 0.5), ("b2", 0.0)):
            setattr(self, attr, nn.Parameter(torch.tensor(initial)))

    def forward(self, x):
        """Apply the hidden affine map, then the output affine map."""
        hidden = self.w1 * x + self.b1
        return self.w2 * hidden + self.b2

# Instantiate the model
# NOTE(review): this rebinds the notebook-level name `model`, which the training
# cells use for the Federated_RL instance (model.learn(), model.get_model()).
# Re-running those cells after this one will hit a SimpleNN instead.
model = SimpleNN()

# Loss function only - no optimizer is created in this cell.
def criterion(y,t):
    """Return half the squared difference between prediction ``y`` and target ``t``."""
    residual = y - t
    return 0.5 * residual ** 2


# Sample data
x = torch.tensor([1.0], requires_grad=False)  # Ensure x does not require gradients
t = torch.tensor([2.0])

# Forward pass
y = model(x)

# Compute loss
loss = criterion(y, t)


# Print out the gradients
with th.no_grad():
    for name, param in model.named_parameters():
        param.add_(2)
        print(f"{name}: {param.item()}")

# Print the computed loss
print(f"Loss: {loss.item()}")


w1: 2.5
b1: 2.0
w2: 2.5
b2: 2.0
Loss: 1.53125


In [None]:
from utils import multiply_and_sum_tensors
import torch as th
def calculate_g(
    policy_params,
    log_probs: list[th.Tensor],
    returns: th.Tensor) -> list[th.Tensor]:
    """Differentiate each log-probability w.r.t. the policy parameters and combine.

    Args:
        policy_params: a tensor, or any iterable of tensors (including a
            generator such as ``policy.parameters()``), used as the ``inputs``
            of ``torch.autograd.grad``.
        log_probs: tensors to differentiate; one gradient tuple is produced per
            entry, using a ones-like ``grad_outputs``.
        returns: forwarded to ``multiply_and_sum_tensors`` as ``scalar_tensor``.

    Returns:
        Whatever ``multiply_and_sum_tensors`` returns for the collected gradient
        tuples (presumably the return-weighted sum of the per-sample gradients -
        confirm against utils.py).
    """
    # Materialise the parameters once: a generator would be exhausted after the
    # first autograd.grad call, leaving the later calls with no inputs.
    if isinstance(policy_params, th.Tensor):
        params = policy_params
    else:
        params = tuple(policy_params)

    # Debug prints were removed; the trailing one also raised when log_probs
    # was empty because the loop variable was never bound.
    grads = [
        th.autograd.grad(outputs=pi, inputs=params, grad_outputs=th.ones_like(pi))
        for pi in log_probs
    ]
    return multiply_and_sum_tensors(scalar_tensor=returns, tensor_lists=grads)

In [None]:
import torch
import torch.nn as nn
from policies import GaussianMLPPolicy
import torch as th


# Set random seed for reproducibility
torch.manual_seed(42)

# Tiny policy (2 inputs, one hidden unit, 1 output) so the gradients can be read by eye.
policy = GaussianMLPPolicy(input_size=2, output_size=1, hidden_layers=[1])

# Two sample states.
x = torch.tensor([1.0, 2.0])
x2 = torch.tensor([2.0, 3.0])

action1 = policy.get_action(x)[0]
print("action1:", action1)
mean1, std1 = policy.forward(x)
print("mean and std:", policy.forward(x))

# Log-probabilities, evaluated in the same call order as before: first sample,
# then a fresh action for the second state, then its log-probability.
log_probs = []
log_probs.append(policy.get_log_prob(x, action1))
log_probs.append(policy.get_log_prob(x2, policy.get_action(x2)[0]))
print("Log probabilities:", log_probs)

# One gradient tuple per log-probability, taken w.r.t. all policy parameters.
grad_list = []
for pi in log_probs:
    grad_list.append(th.autograd.grad(outputs=pi, inputs=policy.parameters(), grad_outputs=th.ones_like(pi)))
print(grad_list)

print(*policy.parameters())



action1: tensor([0.9623], grad_fn=<AddBackward0>)
mean and std: (tensor([0.6201], grad_fn=<ViewBackward0>), tensor([1.0505], grad_fn=<AddBackward0>))
Log probabilities: [tensor(-1.0213, grad_fn=<MulBackward0>), tensor(-1.3880, grad_fn=<MulBackward0>)]
[(tensor(-0.9519), tensor([[-0.0940, -0.1881]]), tensor([-0.0940]), tensor([[-0.5655]]), tensor([-0.6190])), (tensor(-0.9115), tensor([[-0.0209, -0.0314]]), tensor([-0.0105]), tensor([[-0.6015]]), tensor([-0.6072]))]
Parameter containing:
tensor(0., requires_grad=True) Parameter containing:
tensor([[0.5406, 0.5869]], requires_grad=True) Parameter containing:
tensor([-0.1657], requires_grad=True) Parameter containing:
tensor([[0.9186]], requires_grad=True) Parameter containing:
tensor([-0.2191], requires_grad=True)


In [None]:
from policies import GaussianMLPPolicy
testModel = GaussianMLPPolicy(input_size=27, hidden_layers=[512, 512, 256, 128], output_size=1)

# Snapshot the parameters and give the first two an extra leading dimension.
paramList = list(testModel.parameters())
for idx in (0, 1):
    paramList[idx] = paramList[idx].unsqueeze(dim=0)


# Count the named parameters that remain after skipping the first two.
print(len([param_name for param_name, _ in testModel.named_parameters()][2:]))

10


In [None]:
import numpy as np
# Element-by-element Gaussian log-density of action1 under (mean1, std1):
#   log p(a) = -0.5 * ( sum_i [ (a_i - mu_i)^2 / sigma_i^2 + 2*log(sigma_i) ] + k*log(2*pi) )
# where k is the number of action dimensions.
summationTerm = 0
print(len(action1))
for mean, std, act in zip(mean1, std1, action1):

    summationTerm += (act-mean)**2/std**2 + 2*torch.log(std)

# k is taken from len(action1) rather than hard-coded to 1, so this matches the
# vectorised expression in the next cell for any action dimension.
total = -0.5 * (summationTerm + len(action1) * np.log(2*np.pi))
print(total)

In [None]:
# Vectorised form of the same Gaussian log-density: sum the per-dimension terms
# with torch.sum and add len(action1) * log(2*pi) before scaling by -0.5.
total = -0.5 * (torch.sum((action1-mean1)**2/std1**2 + 2 * torch.log(std1)) + len(action1) * np.log(2 * np.pi))

In [None]:
# Two constant integer vectors; the cell's value is the sum of all elements of
# their element-wise sum.
test1 = torch.tensor([1] * 4)
test2 = torch.tensor([2] * 4)
(test2 + test1).sum()

In [None]:
from utils import multiply_tensors_in_place, subtract_lists_of_tensors, add_lists_of_tensors
import torch
# Multiplying by scalar=1 should leave the list unchanged; the printed output
# confirms the tensors keep their original values.
test = [torch.tensor([1,2,3]), torch.tensor([7,8]), torch.tensor([9,10,22])]
multiply_tensors_in_place(test, scalar=1)
print(test)

[tensor([1, 2, 3]), tensor([7, 8]), tensor([ 9, 10, 22])]


In [None]:
# Check the list-of-tensor helpers: (u_r + otherG) - test, element-wise per
# tensor. `test` is the list defined in the previous cell; with all-ones inputs
# the expected result is 2 - test.
u_r = [torch.tensor([1,1,1]), torch.tensor([1,1]), torch.tensor([1,1,1])]
otherG = [torch.tensor([1,1,1]), torch.tensor([1,1]), torch.tensor([1,1,1])]
momentum_term = subtract_lists_of_tensors(add_lists_of_tensors(u_r, otherG), test)
print(momentum_term)

[tensor([ 1,  0, -1]), tensor([-5, -6]), tensor([ -7,  -8, -20])]


In [None]:
# Star-unpacking a dict yields its keys, so this prints "a b".
testDict = {'a': 2, 'b': 3}
print(*testDict)

a b


In [6]:
# A one-element tuple wrapping a list; index 0 gives the list back.
inner = [1, 2]
test = (inner,)
print(test[0])

[1, 2]
