In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from stable_baselines3.common.buffers import ReplayBuffer
torch.set_printoptions(linewidth=120, precision=2, sci_mode=False)
np.set_printoptions(linewidth=120, precision=2, suppress=True)
import cv2 as cv
from mlagents_envs.environment import UnityEnvironment, BaseEnv, ActionTuple
from mlagents_envs.side_channel.environment_parameters_channel import EnvironmentParametersChannel
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
import random
from collections import deque, namedtuple
from gymnasium import spaces
from numpy import inf

seed = 1
torch_deterministic = True
totalTimesteps = 1000000
buffer_size = int(1e6)
gamma = 0.99
tau = 0.005
batch_size = 512
policy_lr = 3e-4
q_lr = 1e-3
policyFrequency = 4
QNetworkFrequency = 2  # Denis Yarats' implementation delays this by 2.
alpha = 0.2
autoEntropy = True


In [2]:

# ALGO LOGIC: initialize agent here:
class SoftQNetwork(nn.Module):
    def __init__(self, obsSize, actionSize):
        super().__init__()
        self.fc1 = nn.Linear(obsSize + actionSize, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, x, a):
        # print(f"x: {x} of shape {x.shape}, a: {a} of shape {a.shape} and we will try to concatenate them")
        x = x.to(torch.float)
        print(f"Shape of x {x.shape}, shape of a {a.shape}")
        if len(x.shape) > len(a.shape):
            a = a.unsqueeze(len(x.shape)-2)
        x = torch.cat([x, a], -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

LOG_STD_MAX = 2
LOG_STD_MIN = -5

class Actor(nn.Module):
    def __init__(self, obsSize, actionSize, actionLowBound, actionHighBound):
        super().__init__()
        self.fc1 = nn.Linear(obsSize, 256)
        self.fc2 = nn.Linear(256, 256)
        self.actionSpace = actionSize
        self.fc_mean = nn.Linear(256, actionSize)
        self.fc_logstd = nn.Linear(256, actionSize)
        self.register_buffer("action_scale", torch.tensor((actionHighBound - actionLowBound) / 2.0, dtype=torch.float32))
        self.register_buffer("action_bias", torch.tensor((actionHighBound + actionLowBound) / 2.0, dtype=torch.float32))

    def forward(self, x):
        x = x.to(torch.float)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        mean = self.fc_mean(x)
        log_std = torch.clamp(self.fc_logstd(x), LOG_STD_MIN, LOG_STD_MAX)
        return mean, log_std

    def get_action(self, x, evaluation=False):
        mean, log_std = self(x)
        std = log_std.exp()
        # print(f"mean, std shapes: {mean.shape}, {std.shape}")
        policyDistribution = torch.distributions.Normal(mean, std)
        if evaluation:
            actionSample = mean
        else:
            actionSample = policyDistribution.rsample()
        # print(f"actionSample.shape {actionSample.shape}")
        tanhAction = torch.tanh(actionSample)
        # print(f"tanhAction.shape {tanhAction.shape}")
        action = tanhAction * self.action_scale + self.action_bias
        # print(f"action.shape {action.shape}")
        log_prob = policyDistribution.log_prob(actionSample)
        # print(f"log_prob.shape {log_prob.shape}, log_prob {log_prob}")
        # print(f"tanhAction.shape {tanhAction.shape}, tanhAction {tanhAction}")
        # print(f"action_scale.shape {self.action_scale.shape}, action_scale {self.action_scale}")
        # print(f"Trying to add to log prob log of self.action_scale * (1 - tanhAction.pow(2)) + 1e-6: {self.action_scale * (1 - tanhAction.pow(2)) + 1e-6}")
        # Enforcing Action Bound
        log_prob -= torch.log(self.action_scale * (1 - tanhAction.pow(2)) + 1e-6)
        # print(f"Added it, so now log_prob is {log_prob} of shape {log_prob.shape}")
        log_prob = log_prob.sum(-1, keepdim=True)
        # print(f"Summed it so now logprob is {log_prob} of shape {log_prob.shape}")
        mean = torch.tanh(mean) * self.action_scale + self.action_bias
        return action, log_prob, mean


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
environmentChannel = EnvironmentParametersChannel()
# engineChannel = EnvironmentParametersChannel() for now I can only have one channel at a time
env = UnityEnvironment(file_name=None, side_channels=[environmentChannel])
# channel.set_float_parameter("parameter_1", 2.0)
env.reset()

print(f"env.behavior_specs: {env.behavior_specs} So first element will be our behaviorName")
behaviorName = list(env.behavior_specs)[0]
specs = env.behavior_specs[behaviorName]
print(f"All specs: {specs}")

env.reset()
decisionSteps, terminalSteps = env.get_steps(behaviorName)
agentID = decisionSteps.agent_id[0] # I dont get that part yet
obs = decisionSteps.obs[agentID] # obs have shape 1, 1, 10 for now: nAgents, nStackedVectors, obsSize

continuousActionSize = specs.action_spec.continuous_size
print(f"We have {continuousActionSize} continuous action size")
print(f"We have {specs.action_spec.discrete_size} branches of {specs.action_spec.discrete_branches} discrete action sizes respectively")

env.behavior_specs: <mlagents_envs.base_env.BehaviorMapping object at 0x0000023E1D803100> So first element will be our behaviorName
All specs: BehaviorSpec(observation_specs=[ObservationSpec(shape=(126,), dimension_property=(<DimensionProperty.NONE: 1>,), observation_type=<ObservationType.DEFAULT: 0>, name='PhysicsBodySensor:Body'), ObservationSpec(shape=(32,), dimension_property=(<DimensionProperty.NONE: 1>,), observation_type=<ObservationType.DEFAULT: 0>, name='VectorSensor_size32')], action_spec=ActionSpec(continuous_size=20, discrete_branches=()))
We have 20 continuous action size
We have 0 branches of () continuous action sizes respectively


In [4]:
observationSize = np.array(specs.observation_specs[agentID].shape).prod()
continuousActionSize = specs.action_spec.continuous_size

actor = Actor(observationSize, continuousActionSize, 0, 1).to(device)
QNet1 = SoftQNetwork(observationSize, continuousActionSize).to(device)
QNet2 = SoftQNetwork(observationSize, continuousActionSize).to(device)
QNet1_target = SoftQNetwork(observationSize, continuousActionSize).to(device)
QNet2_target = SoftQNetwork(observationSize, continuousActionSize).to(device)
QNet1_target.load_state_dict(QNet1.state_dict())
QNet2_target.load_state_dict(QNet2.state_dict())
QNetsOptimizer = optim.Adam(list(QNet1.parameters()) + list(QNet2.parameters()), lr=q_lr)
actorOptimizer = optim.Adam(list(actor.parameters()), lr=policy_lr)

# Automatic entropy tuning
if autoEntropy:
    target_entropy = -torch.tensor(continuousActionSize).item()
    logAlpha = torch.zeros(1, requires_grad=True, device=device)
    alpha = logAlpha.exp().item()
    alphaOptimizer = optim.Adam([logAlpha], lr=q_lr)

nEnvs = 10
experiences = ReplayBuffer(
    buffer_size,
    spaces.Box(low=-inf, high=inf, shape=(nEnvs, observationSize), dtype=np.float32),
    spaces.Box(low=-inf, high=inf, shape=(nEnvs, continuousActionSize), dtype=np.float32),
    device,
    handle_timeout_termination=False
    # n_envs=10
)

env.reset()
decisionSteps, terminalSteps = env.get_steps(behaviorName)
agentID = decisionSteps.agent_id[0] # I dont get that part yet
obs = decisionSteps.obs[agentID] # obs have shape 1, 1, 10 for now: nAgents, nStackedVectors, obsSize

allScores = []
episodicReward = 0.0
for globalStep in range(totalTimesteps):
    actionsContinuous, _, _ = actor.get_action(torch.Tensor(obs).to(device))
    actionsContinuous = actionsContinuous.detach().cpu().numpy()
    actions = ActionTuple(continuous=actionsContinuous)

    # print(f"Based on observation {obs} we take action {actionsContinuous}")

    env.set_actions(behaviorName, actions)
    env.step()
    decisionSteps, terminalSteps = env.get_steps(behaviorName)



    if agentID in terminalSteps:
        next_obs = terminalSteps.obs[agentID]
        print(f"Total episodic reward: {episodicReward}")
        allScores.append(episodicReward)
        episodicReward = 0
        env.reset()
    else:
        episodicReward += decisionSteps.reward
        next_obs = decisionSteps.obs[agentID]

    # print(f"### Pushing {obs, next_obs, actionsContinuous} to experience as first 3 elements")
    experiences.add(obs, next_obs, actionsContinuous, episodicReward, True if agentID in terminalSteps else False, "")
    obs = next_obs


    sampledExperiences = experiences.sample(min(batch_size, experiences.size()))
    with torch.no_grad():
        nextStateActions, nextStateLogProbs, _ = actor.get_action(sampledExperiences.next_observations)
        # print(f"### Actor just spit out {nextStateActions} from input {sampledExperiences.next_observations}")
        # print(f"### So we're giving QNet: {sampledExperiences.next_observations} and {nextStateActions} and its fine")
        QFunction1NextTarget = QNet1_target(sampledExperiences.next_observations, nextStateActions)
        QFunction2NextTarget = QNet2_target(sampledExperiences.next_observations, nextStateActions)
        minQNextTarget = torch.min(QFunction1NextTarget, QFunction2NextTarget) - alpha * nextStateLogProbs
        nextQValue = sampledExperiences.rewards.flatten() + (1 - sampledExperiences.dones.flatten()) * gamma * (minQNextTarget).view(-1)


    # print(f"### But we're giving QNet: {sampledExperiences.observations} and {sampledExperiences.actions} and its NOTTTT fine")
    QFunction1ActionValues = QNet1(sampledExperiences.observations, sampledExperiences.actions).view(-1)
    QFunction2ActionValues = QNet2(sampledExperiences.observations, sampledExperiences.actions).view(-1)
    QFunction1Loss = F.mse_loss(QFunction1ActionValues, nextQValue)
    QFunction2Loss = F.mse_loss(QFunction2ActionValues, nextQValue)
    QFunctionsTotalLoss = QFunction1Loss + QFunction2Loss

    QNetsOptimizer.zero_grad()
    QFunctionsTotalLoss.backward()
    QNetsOptimizer.step()

    if globalStep % policyFrequency == 0:
        for i in range(policyFrequency):
            if i > 0:
                # Sample new experiences if we make multiple updates
                sampledExperiences = experiences.sample(min(batch_size, experiences.size()))
            actions, logProbabilities, _ = actor.get_action(sampledExperiences.observations)
            QFunction1Evaluation = QNet1(sampledExperiences.observations, actions)
            QFunction2Evaluation = QNet2(sampledExperiences.observations, actions)
            minQEvalutaion = torch.min(QFunction1Evaluation, QFunction2Evaluation)
            actorLoss = ((alpha * logProbabilities) - minQEvalutaion).mean()

            actorOptimizer.zero_grad()
            actorLoss.backward()
            actorOptimizer.step()

            if autoEntropy:
                with torch.no_grad():
                    _, logProbabilities, _ = actor.get_action(sampledExperiences.observations)
                alphaLoss = (-logAlpha.exp()*(logProbabilities + target_entropy)).mean()

                alphaOptimizer.zero_grad()
                alphaLoss.backward()
                alphaOptimizer.step()
                alpha = logAlpha.exp().item()

    if globalStep % QNetworkFrequency == 0:
        for param, targetParam in zip(QNet1.parameters(), QNet1_target.parameters()):
            targetParam.data.copy_(tau*param.data + (1 - tau)*targetParam.data)
        for param, targetParam in zip(QNet2.parameters(), QNet2_target.parameters()):
            targetParam.data.copy_(tau*param.data + (1 - tau)*targetParam.data)
env.close()

ValueError: could not broadcast input array from shape (10,) into shape (1,)