In [1]:
import os
import time
from datetime import datetime
import argparse
import gymnasium as gym
import numpy as np
import torch as th
import pandas as pd
import csv

from gym_pybullet_drones.utils.Logger import Logger
from gym_pybullet_drones.envs.HoverAviary import HoverAviary
from gym_pybullet_drones.envs.MultiHoverAviary import MultiHoverAviary
from gym_pybullet_drones.utils.utils import sync, str2bool
from gym_pybullet_drones.utils.enums import ObservationType, ActionType, Physics

from policies import GaussianMLPPolicy
from server import Federated_RL

# ---- Run configuration -------------------------------------------------------
# NOTE(review): among the cells visible in this notebook only DEFAULT_AGENTS is
# actually read; the remaining defaults are presumably meant for the
# gym_pybullet_drones environments imported above — TODO confirm.
DEFAULT_GUI = True
DEFAULT_RECORD_VIDEO = True
DEFAULT_OUTPUT_FOLDER = 'results'
DEFAULT_COLAB = False
DEFAULT_DYNAMICS = Physics('pyb') # pyb: Pybullet dynamics; dyn: Explicit Dynamics specified in BaseAviary.py
DEFAULT_WIND = np.array([0, 0.05, 0]) # units are in induced newtons
DEFAULT_OBS = ObservationType('kin') # 'kin' or 'rgb'
DEFAULT_ACT = ActionType('one_d_rpm') # 'rpm' or 'pid' or 'vel' or 'one_d_rpm' or 'one_d_pid'
DEFAULT_AGENTS = 4 # number of environments created (and agents trained) in the training cell
DEFAULT_MA = False
DEFAULT_MASS = 0.037 # Actual default is 0.027

# ---- Randomisation ranges (DR presumably means "domain randomisation" — TODO confirm) ----
DR = False
MASS_RANGE = [0.027, 0.042] # Maximum recommended payload is 15g (0.042 = 0.027 + 0.015)
WIND_RANGE = 0.005 # Inspired by literature

pybullet build time: Jun 24 2024 15:23:59


In [3]:
# Smoke test: run the federated trainer on a classic Gymnasium control task first
envs = [gym.make('MountainCarContinuous-v0') for _ in range(DEFAULT_AGENTS)]
env = envs[0]  # every env is built from the same id, so the first one is representative

# Observation / action spaces and the size of their first dimension
state_space, action_space = env.observation_space, env.action_space
state_size, action_size = state_space.shape[0], action_space.shape[0]

layers = [256, 128]  # hidden-layer widths handed to the policy

print("State size:", state_size)
print("Action size:", action_size)

# A single policy object is handed to the trainer (one algorithm assumed so far).
# TODO: find a smarter way to initialise the policy inside the model, and add a
# general structure for diversified algorithms.
policy = GaussianMLPPolicy(
    input_size=state_size,
    output_size=action_size,
    hidden_layers=layers,
)

#### Train the model #######################################
model = Federated_RL(
    policy=policy,
    envs=envs,
    num_agents=DEFAULT_AGENTS,
    global_iterations=10,
    local_iterations=100,
    max_episode_length=5,
    global_step_size=1e-2,
    local_step_size=1e-2,
)

model.learn()

State size: 2
Action size: 1
GLOBAL ITERATION: 0
TRAINING AGENT: 1
LOCAL ITERATION: 0

Acquiring global rollout
Total Reward: -0.02143685727444675
Episode length: 5

Acquiring local rollout
Total Reward: -0.1276381945503713
Episode length: 5

Importance sampling weight: 1.7999999523162842

Step size: tensor(0.0082, grad_fn=<MulBackward0>)
GLOBAL ITERATION: 0
TRAINING AGENT: 1
LOCAL ITERATION: 1

Acquiring global rollout
Total Reward: -0.15286043561014218
Episode length: 5

Acquiring local rollout
Total Reward: -0.6627010205516303
Episode length: 5

Importance sampling weight: 1.7999999523162842

Step size: tensor(0.0081, grad_fn=<MulBackward0>)
GLOBAL ITERATION: 0
TRAINING AGENT: 1
LOCAL ITERATION: 2

Acquiring global rollout
Total Reward: -0.1649814373770259
Episode length: 5

Acquiring local rollout
Total Reward: -0.2819879179317344
Episode length: 5

Importance sampling weight: 1.7039042711257935

Step size: tensor(0.0082, grad_fn=<MulBackward0>)
GLOBAL ITERATION: 0
TRAINING AGENT: 

<server.Federated_RL at 0x12c8e3260>

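Everything below this point is scratch code used to unit-test individual pieces (return computation, gradients, log-probabilities, and the tensor-list helpers).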

In [None]:
# verify returns
# Return-to-go recursion R_t = r_t + gamma * R_{t+1}, walked backwards over the
# trajectory; the accumulator restarts whenever a step is flagged as done.
rewards = [2,-3,4,2,7,0,5]
dones = [0] * len(rewards)
returns = []
gamma = 1
R = 0
for reward, done in zip(reversed(rewards), reversed(dones)):
    if done:
        R = 0
    R = reward + gamma * R
    # Append and reverse once at the end: insert(0, ...) shifts the whole list every step.
    returns.append(R)
returns.reverse()

# With gamma = 1 and no episode boundaries each entry is the suffix sum of rewards.
assert returns == [17, 15, 18, 14, 12, 5, 5], returns
print(returns)

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the neural network class
class SimpleNN(nn.Module):
    """Two chained scalar affine maps: y = w2 * (w1 * x + b1) + b2.

    The four coefficients are 0-dim learnable parameters registered in the
    order w1, b1, w2, b2; weights start at 0.5 and biases at 0.0.
    """

    def __init__(self):
        super().__init__()
        initial_weight, initial_bias = 0.5, 0.0
        # First affine map
        self.w1 = nn.Parameter(torch.tensor(initial_weight))
        self.b1 = nn.Parameter(torch.tensor(initial_bias))
        # Second affine map
        self.w2 = nn.Parameter(torch.tensor(initial_weight))
        self.b2 = nn.Parameter(torch.tensor(initial_bias))

    def forward(self, x):
        """Apply the hidden affine map, then the output affine map, to x."""
        hidden = self.w1 * x + self.b1
        return self.w2 * hidden + self.b2

# Instantiate the model
model = SimpleNN()

# Define a loss function (no optimizer is created in this cell)
def criterion(y,t):
    """Element-wise squared-error loss 0.5 * (y - t)**2."""
    return 0.5 * (y-t)**2


# Sample data
x = torch.tensor([1.0], requires_grad=False)  # Ensure x does not require gradients
t = torch.tensor([2.0])

# Forward pass
y = model(x)

# Compute loss
loss = criterion(y, t)


# Shift every parameter by +2 in place and print the updated values.
# The update runs under no_grad() so autograd does not record it; the cell's own
# `torch` import is used here instead of the `th` alias from another cell.
with torch.no_grad():
    for name, param in model.named_parameters():
        param.add_(2)
        print(f"{name}: {param.item()}")

# `loss` was computed before the parameters were shifted, so this prints the
# pre-update loss value.
print(f"Loss: {loss.item()}")


w1: 2.5
b1: 2.0
w2: 2.5
b2: 2.0
Loss: 1.53125


In [None]:
from utils import multiply_and_sum_tensors
import torch as th
def calculate_g(
    policy_params,
    log_probs: list[th.Tensor],
    returns: th.Tensor) -> list[th.Tensor]:
    """Differentiate each log-probability w.r.t. the policy parameters and combine.

    Args:
        policy_params: A tensor or any iterable of tensors to differentiate
            with respect to. One-shot iterators such as ``module.parameters()``
            are accepted.
        log_probs: Log-probability tensors; one ``autograd.grad`` call is made
            per entry, with ``ones_like(pi)`` as the upstream gradient.
        returns: Passed unchanged as ``scalar_tensor`` to
            ``multiply_and_sum_tensors``.

    Returns:
        Whatever ``multiply_and_sum_tensors`` produces from ``returns`` and the
        list of per-log-prob gradient tuples.
    """
    # autograd.grad turns a non-tensor `inputs` into a tuple on every call, so a
    # generator would be drained by the first log-prob and every later call would
    # see no inputs. Materialise once; a lone tensor is passed through untouched.
    if isinstance(policy_params, th.Tensor):
        params = policy_params
    else:
        params = tuple(policy_params)

    grads = []
    for pi in log_probs:
        print(pi)
        grad_tuple = th.autograd.grad(outputs=pi, inputs=params, grad_outputs=th.ones_like(pi))
        grads.append(grad_tuple)
    # Debug print of the last gradient tuple; skipped when there was nothing to
    # differentiate (previously this hit an unbound local).
    if grads:
        print(grads[-1])
    return multiply_and_sum_tensors(scalar_tensor=returns, tensor_lists=grads)

In [None]:
import torch
import torch.nn as nn
from policies import GaussianMLPPolicy
import torch as th


# Set random seed for reproducibility
# (assumes GaussianMLPPolicy only draws from torch's global RNG — TODO confirm)
torch.manual_seed(42)

# Instantiate the model
policy = GaussianMLPPolicy(input_size=2, output_size=1, hidden_layers=[1])

# Sample data
x = torch.tensor([1.0, 2.0])
x2 = torch.tensor([2.0, 3.0])
# get_action presumably returns a sequence whose first item is the action — verify in policies.py
action1 = policy.get_action(x)[0]
print("action1:", action1)
mean1, std1 = policy.forward(x)
# NOTE(review): forward(x) is evaluated a second time here just for printing
print("mean and std:", policy.forward(x))
# Compute log probabilities
# One entry for (x, action1) and one for x2 paired with an action obtained from get_action(x2)
log_probs = [policy.get_log_prob(x, action1), policy.get_log_prob(x2, policy.get_action(x2)[0])]

print("Log probabilities:", log_probs)
# One autograd.grad call per log-prob; policy.parameters() is called afresh each
# time, so every call receives its own parameter iterable.
grad_list = [th.autograd.grad(outputs=pi, inputs=policy.parameters(), grad_outputs=th.ones_like(pi)) for pi in log_probs]
print(grad_list)
# Disabled single-call variant kept for reference; log_probs is a Python list here,
# so torch.ones_like(log_probs) would need a tensor instead.
#grads = torch.autograd.grad(outputs=log_probs, inputs=policy.parameters(), grad_outputs=torch.ones_like(log_probs))

print(*policy.parameters())



In [None]:
# Pass the parameters as a list: calculate_g hands them to autograd.grad once per
# log-prob, and a one-shot parameters() iterator would be used up by the first call.
# NOTE(review): `returns` has one element while log_probs has two — confirm that
# multiply_and_sum_tensors is meant to handle that.
print("result:", calculate_g(list(policy.parameters()), log_probs, th.tensor([2])))

In [None]:
import numpy as np
# Loop-based reference for the diagonal-Gaussian log-density of action1:
#   log p(a) = -0.5 * ( sum_i [ (a_i - mu_i)^2 / sigma_i^2 + 2*log(sigma_i) ] + k*log(2*pi) )
summationTerm = 0
print(len(action1))
for mean, std, act in zip(mean1, std1, action1):
    summationTerm += (act-mean)**2/std**2 + 2*torch.log(std)

# k is the action dimensionality; use len(action1) rather than a hard-coded 1 so
# this matches the vectorised expression for multi-dimensional actions too.
total = -0.5 * (summationTerm + len(action1) * np.log(2*np.pi))
print(total)

In [None]:
# Vectorised form of the same log-density: squared standardised error plus
# 2*log(std), summed over the action entries, plus len(action1) * log(2*pi).
total = -0.5 * (
    ((action1 - mean1) ** 2 / std1 ** 2 + 2 * torch.log(std1)).sum()
    + len(action1) * np.log(2 * np.pi)
)

In [None]:
# Summing an element-wise tensor addition collapses to one scalar: 4 * (1 + 2) = 12
test1 = torch.ones(4, dtype=torch.int64)
test2 = test1 * 2
(test2 + test1).sum()

In [14]:
from utils import multiply_tensors_in_place, subtract_lists_of_tensors, add_lists_of_tensors
import torch
# Ragged list of integer tensors; later cells reuse it under the name `test`
test = [torch.tensor(row) for row in ([1, 2, 3], [7, 8], [9, 10, 22])]
# Multiplying by 1 leaves the values as they were (see the printed list)
multiply_tensors_in_place(test, scalar=1)
print(test)

[tensor([1, 2, 3]), tensor([7, 8]), tensor([ 9, 10, 22])]


In [20]:
# Two lists of all-ones integer tensors with the same lengths (3, 2, 3) as `test`
u_r = [torch.tensor([1] * size) for size in (3, 2, 3)]
otherG = [torch.tensor([1] * size) for size in (3, 2, 3)]
# List-wise (u_r + otherG) - test through the project helpers
momentum_term = subtract_lists_of_tensors(
    add_lists_of_tensors(u_r, otherG),
    test,
)
print(momentum_term)

[tensor([ 1,  0, -1]), tensor([-5, -6]), tensor([ -7,  -8, -20])]


In [17]:
# Pairwise blend of `test` and `momentum_term` with weights .5 and (1 - .5)
u = [
    .5 * lhs + (1 - .5) * rhs
    for lhs, rhs in zip(test, momentum_term)
]

In [19]:
# Show the blended list `u` built from `test` and `momentum_term`
print(u)

[tensor([1., 1., 1.]), tensor([1., 1.]), tensor([1., 1., 1.])]
