In [3]:
import os
import time
from datetime import datetime
import argparse
import gymnasium as gym
import numpy as np
import torch as th
import pandas as pd
import csv

from gym_pybullet_drones.utils.Logger import Logger
from gym_pybullet_drones.envs.HoverAviary import HoverAviary
from gym_pybullet_drones.envs.MultiHoverAviary import MultiHoverAviary
from gym_pybullet_drones.utils.utils import sync, str2bool
from gym_pybullet_drones.utils.enums import ObservationType, ActionType, Physics

from policies import GaussianMLPPolicy
from server import Federated_RL

DEFAULT_GUI = True
DEFAULT_RECORD_VIDEO = True
DEFAULT_OUTPUT_FOLDER = 'results'
DEFAULT_COLAB = False
DEFAULT_DYNAMICS = Physics('pyb') # pyb: Pybullet dynamics; dyn: Explicit Dynamics specified in BaseAviary.py
DEFAULT_WIND = np.array([0, 0.05, 0]) # units are in induced newtons
DEFAULT_OBS = ObservationType('kin') # 'kin' or 'rgb'
DEFAULT_ACT = ActionType('one_d_rpm') # 'rpm' or 'pid' or 'vel' or 'one_d_rpm' or 'one_d_pid'
DEFAULT_AGENTS = 2
DEFAULT_MA = False
DEFAULT_MASS = 0.037 # Actual default is 0.027

DR = False
MASS_RANGE = [0.027, 0.042] # Maximum recommended payload is 15g
WIND_RANGE = 0.005 # Inspired by literature

In [4]:
# Testing on classic gym control environment first
envs = [gym.make('MountainCarContinuous-v0') for _ in range(DEFAULT_AGENTS)]
env = envs[0]
# Get the state size
state_space = env.observation_space
state_size = state_space.shape[0]
# Get the action size
action_space = env.action_space
action_size = action_space.shape[0]

layers = [16,16]

print("State size:", state_size)
print("Action size:", action_size)

policy = GaussianMLPPolicy(input_size=state_size, output_size=action_size, hidden_layers=layers) # Will need some smarter way to initialize the policy within the model in the future
# ASSUMING ONE ALGORITHM SO FAR. WILL IMPLEMENT GENERAL STRUCTURE FOR DIVERSIFIED ALGORITHMS LATER

#### Train the model #######################################
model = Federated_RL(policy=policy,
                    envs=envs,
                    num_agents = DEFAULT_AGENTS,
                    global_iterations=5,
                    local_iterations=10,
                    max_episode_length=5
                    )

model.learn()

State size: 2
Action size: 1
GLOBAL ITERATION: 0
LOCAL ITERATION: 0

TRAINING AGENT: 1
Acquiring global rollout
Total Reward: -1.171763861893404
Episode length: 5

Acquiring local rollout
Total Reward: -0.17346065572630612
Episode length: 5

Importance sampling weight: 0.005362994037568569

GLOBAL ITERATION: 0
LOCAL ITERATION: 1

TRAINING AGENT: 1
Acquiring global rollout
Total Reward: -0.39753802590085113
Episode length: 5

Acquiring local rollout
Total Reward: -0.36363732162007156
Episode length: 5

Importance sampling weight: 0.9476621747016907

GLOBAL ITERATION: 0
LOCAL ITERATION: 2

TRAINING AGENT: 1
Acquiring global rollout
Total Reward: -0.5340036662265137
Episode length: 5

Acquiring local rollout
Total Reward: -0.336897565710252
Episode length: 5

Importance sampling weight: 0.1627451330423355

GLOBAL ITERATION: 0
LOCAL ITERATION: 3

TRAINING AGENT: 1
Acquiring global rollout
Total Reward: -0.6949878879876834
Episode length: 5

Acquiring local rollout
Total Reward: -0.21003997

<server.Federated_RL at 0x1334e8320>

Everything below is unit testing stuff

In [3]:
# verify returns
rewards = [2,-3,4,2,7,0,5]
dones = [0 for _ in range(len(rewards))]
returns = []
gamma = 1
R = 0
for reward, done in zip(reversed(rewards), reversed(dones)):
    if done:
        R = 0
    R = reward + gamma * R
    returns.insert(0, R)
print(returns)

[17, 15, 18, 14, 12, 5, 5]


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the neural network class
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        # Initialize weights and biases as nn.Parameter
        self.w1 = nn.Parameter(torch.tensor(0.5))
        self.b1 = nn.Parameter(torch.tensor(0.0))
        self.w2 = nn.Parameter(torch.tensor(0.5))
        self.b2 = nn.Parameter(torch.tensor(0.0))

    def forward(self, x):
        # Compute hidden layer output
        h = self.w1 * x + self.b1
        # Compute output layer output
        y = self.w2 * h + self.b2
        return y

# Instantiate the model
model = SimpleNN()

# Define a loss function and optimizer
def criterion(y,t):
    return 0.5 * (y-t)**2


# Sample data
x = torch.tensor([1.0], requires_grad=False)  # Ensure x does not require gradients
t = torch.tensor([2.0])

# Forward pass
y = model(x)

# Compute loss
loss = criterion(y, t)

# Zero gradients, backward pass, optimizer step

loss.backward()

# Print out the gradients
print("Gradients with respect to weights and biases:")
for name, param in model.named_parameters():
    print(f"{name}: {param.grad.item()}")

# Print the computed loss
print(f"Loss: {loss.item()}")


NameError: name 'optimizer' is not defined

In [3]:
from utils import multiply_and_sum_tensors
import torch as th
def calculate_g(
    policy_params,
    log_probs: list[th.Tensor],
    returns: th.Tensor) -> list[th.Tensor]:
        grads = []
        for pi in log_probs:
            print(pi)
            grad_tuple = th.autograd.grad(outputs=pi, inputs=policy_params, grad_outputs=th.ones_like(pi))
            grads.append(grad_tuple)
        print(grad_tuple)
        return multiply_and_sum_tensors(scalar_tensor=returns, tensor_lists=grads)

In [6]:
import torch
import torch.nn as nn
from policies import GaussianMLPPolicy
import torch as th


# Set random seed for reproducibility
torch.manual_seed(42)

# Instantiate the model
policy = GaussianMLPPolicy(input_size=2, output_size=1, hidden_layers=[1])

# Sample data
x = torch.tensor([1.0, 2.0])
x2 = torch.tensor([2.0, 3.0])
action1 = policy.get_action(x)[0]
print("action1:", action1)
mean1, std1 = policy.forward(x)
print("mean and std:", policy.forward(x))
# Compute log probabilities
log_probs = [policy.get_log_prob(x, action1), policy.get_log_prob(x2, policy.get_action(x2)[0])]

print("Log probabilities:", log_probs)
grad_list = [th.autograd.grad(outputs=pi, inputs=policy.parameters(), grad_outputs=th.ones_like(pi)) for pi in log_probs]
print(grad_list)
#grads = torch.autograd.grad(outputs=log_probs, inputs=policy.parameters(), grad_outputs=torch.ones_like(log_probs))

print(*policy.parameters())



action1: tensor([0.9623], grad_fn=<AddBackward0>)
mean and std: (tensor([0.6201], grad_fn=<ViewBackward0>), tensor([1.0505], grad_fn=<AddBackward0>))
Log probabilities: [tensor(-1.0213, grad_fn=<SumBackward1>), tensor(-1.3880, grad_fn=<SumBackward1>)]
[(tensor(-0.9519), tensor([[-0.0940, -0.1881]]), tensor([-0.0940]), tensor([[-0.5655]]), tensor([-0.6190])), (tensor(-0.9115), tensor([[-0.0209, -0.0314]]), tensor([-0.0105]), tensor([[-0.6015]]), tensor([-0.6072]))]
Parameter containing:
tensor(0., requires_grad=True) Parameter containing:
tensor([[0.5406, 0.5869]], requires_grad=True) Parameter containing:
tensor([-0.1657], requires_grad=True) Parameter containing:
tensor([[0.9186]], requires_grad=True) Parameter containing:
tensor([-0.2191], requires_grad=True)


In [4]:
print("result:", calculate_g(policy.parameters(), log_probs, th.tensor([2])))

tensor(-1.0213, grad_fn=<SumBackward1>)
tensor(-1.3880, grad_fn=<SumBackward1>)


ValueError: grad requires non-empty inputs.

In [3]:
import numpy as np
summationTerm = 0
for mean, std, act in zip(mean1, std1, action1):

    summationTerm += (act-mean)**2/std**2 + 2*torch.log(std)

total = -0.5 * (summationTerm + 2 * np.log(2*np.pi))
print(total)

tensor(-1.3031, grad_fn=<MulBackward0>)


In [32]:
-0.5 * (torch.sum((action1-mean1)**2/std1**2 + 2 * torch.log(std1)) + len(action1) * np.log(2 * np.pi))

tensor(-1.3031, grad_fn=<MulBackward0>)

In [29]:
print(len(torch.tensor([1,2])))

2
