In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.nn.init import kaiming_uniform_

In [2]:
import numpy as np
import gym
from tqdm import tqdm
import random as rand
from itertools import count

In [3]:
class OU_noise():
    """Discretised Ornstein-Uhlenbeck process producing temporally correlated noise.

    Every call advances the internal state by one step using

        x_next = x + theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)

    Parameters
    ----------
    mu : scalar or array_like
        Value the process reverts to. Its shape determines the shape of the
        returned noise (a scalar ``mu`` yields scalar noise).
    sigma : float
        Scale of the Gaussian term.
    theta : float
        Strength of the pull towards ``mu``.
    dt : float
        Step size used in both the drift and the diffusion term.
    x0 : scalar, array_like or None
        Initial state. When ``None`` the process starts at zeros shaped like ``mu``.
    """

    def __init__(self,mu,sigma=0.3,theta=0.15,dt=1e-5,x0=None):
        self.theta = theta
        self.mu    = mu
        self.sigma = sigma
        self.dt    = dt
        self.x0    = x0
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new state."""
        # np.shape works for ndarrays, lists and plain Python scalars, whereas
        # the attribute access `mu.shape` only works for ndarrays.
        gaussian = np.random.normal(size = np.shape(self.mu))
        x = self.x_prev + self.theta * (self.mu - self.x_prev)* \
                self.dt +self.sigma * np.sqrt(self.dt)*gaussian
        self.x_prev = x
        return x

    def reset(self):
        """Restore the state to ``x0`` (or to zeros shaped like ``mu``)."""
        # np.asarray returns an existing ndarray unchanged and lets a list or
        # scalar x0 take part in the array arithmetic of __call__.
        self.x_prev = np.asarray(self.x0) if self.x0 is not None else np.zeros_like(self.mu)

In [4]:
class linearApproximator_FCGSAP(nn.Module):
    """Fully connected network with two heads: a per-action mean and a per-action log std-dev.

    The instance also owns ``log_alpha`` (a learnable scalar with its own Adam
    optimizer) and ``target_entropy`` (the negated number of outputs).

    Parameters
    ----------
    state_shape : int
        Number of input features; passed directly to the first ``nn.Linear``.
    outputs : int
        Number of action dimensions produced by each head.
    hidden_dims : sequence of int
        Width of every hidden layer, in order.
    log_entropy_lr : float
        Learning rate of the optimizer that updates ``log_alpha``.
    log_std_dev_min, log_std_dev_max : float
        Bounds the log std-dev head is clamped to in ``forward``.
    """

    def __init__(self,state_shape,outputs,hidden_dims=(32,32), log_entropy_lr =0.001,\
                log_std_dev_min=-20, log_std_dev_max= 2):
        super(linearApproximator_FCGSAP, self).__init__()
        self.input_size = state_shape
        self.out = outputs
        # Kept on the instance because forward() needs them for clamping;
        # as constructor locals only they were out of scope there (NameError).
        self.log_std_dev_min = log_std_dev_min
        self.log_std_dev_max = log_std_dev_max
        self.device = torch.device("cuda" if torch.cuda.is_available()\
                                   else "cpu")

        self.fc1  = nn.Linear(self.input_size,hidden_dims[0])
        self.hidden_layers = nn.ModuleList()
        for i in range(len(hidden_dims)-1):
            hidden_layer = nn.Linear(\
                                hidden_dims[i], hidden_dims[i+1])
            self.hidden_layers.append(hidden_layer)

        # Two heads sharing the same trunk: log std-dev and mean.
        self.output_layer_log  = nn.Linear(hidden_dims[-1],self.out)
        self.output_layer_mean = nn.Linear(hidden_dims[-1],self.out)

        self.target_entropy = -self.out
        #according to the eq, log alpha is a learnable parameter
        # It is a plain tensor (not an nn.Parameter), so it is updated only by
        # its dedicated optimizer below and is not part of self.parameters().
        self.log_alpha = torch.zeros(1,\
                                     requires_grad=True,\
                                     device = self.device)
        self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha],\
                                                    lr=log_entropy_lr)

        self.to(self.device)

    def forward(self, state_shape):
        """Return ``(mean, log_std)`` for the given state(s).

        ``state_shape`` is the state input (the parameter name is kept for
        backward compatibility). Non-tensor input is converted to a float32
        tensor; the input is moved to ``self.device``. ``log_std`` is clamped
        to ``[log_std_dev_min, log_std_dev_max]``.
        """
        if not isinstance(state_shape, torch.Tensor):
            state_shape = torch.tensor(state_shape, dtype=torch.float32)
        state_shape = state_shape.to(self.device)
        x = self.fc1(state_shape)
        x = F.relu(x)

        for hidden_layer in self.hidden_layers:
            x = F.relu(hidden_layer(x))

        log_std = self.output_layer_log(x)
        mean    = self.output_layer_mean(x)
        log_std = torch.clamp(log_std, self.log_std_dev_min, self.log_std_dev_max)
        return mean, log_std

    def full_pass(self, state):
        """Sample an action from the Gaussian defined by ``forward(state)``.

        Returns
        -------
        action : float or numpy.ndarray
            A Python float when exactly one value was sampled, otherwise a
            NumPy array with the sampled values.
        log_prob_action : torch.Tensor
            Log-probability of the sample, summed over the last dimension
            (kept as a trailing dimension of size 1).
        entropy : torch.Tensor
            Entropy of the distribution, summed over the last dimension
            (kept as a trailing dimension of size 1).
        log_std : torch.Tensor
            The clamped log std-dev returned by ``forward``, on the CPU.

        NOTE(review): the sample is not squashed or rescaled to the
        environment's action bounds; callers must clip/scale if required.
        """
        # forward() returns a tuple, so each tensor is moved to the CPU separately.
        mean, log_std = self.forward(state)
        mean, log_std = mean.cpu(), log_std.cpu()

        # The heads parameterise a diagonal Gaussian (mean, log std-dev), so a
        # Normal distribution is used instead of a categorical over logits.
        distribution = torch.distributions.Normal(mean, log_std.exp())
        action = distribution.sample()#sample action
        log_prob_action = distribution.log_prob(action).sum(dim=-1, keepdim=True)
        entropy = distribution.entropy().sum(dim=-1, keepdim=True)

        # .item() is only valid for a single element; fall back to an array otherwise.
        sampled = action.item() if action.numel() == 1 else action.numpy()
        return sampled, log_prob_action, entropy, log_std

In [5]:
# Scratch check: negated product of the scalar 4, which evaluates to -4.
# NOTE(review): presumably a sanity check of a "-prod(action shape)" style
# target-entropy value for a 4-dimensional action space — TODO confirm.
# numpy is already imported in earlier cells; the repeated import is redundant.
import numpy as np
-np.prod(4)

-4

In [7]:
# Create the BipedalWalker-v3 environment; `env` is inspected by the cells below.
# gym is already imported in an earlier cell; the repeated import is redundant.
import gym
env = gym.make('BipedalWalker-v3')

In [12]:
# Keep the `high` bound array of the environment's action space for inspection.
act_space_high = env.action_space.high

In [14]:
# Product of the entries of the bound array's shape, i.e. the total number of
# action components (4 according to the recorded output).
np.prod(act_space_high.shape)

4

In [9]:
# Display the `low` bound array of the action space (four float32 entries of
# -1.0 according to the recorded output).
env.action_space.low

array([-1., -1., -1., -1.], dtype=float32)

In [11]:
# Confirm the container type of the action-space shape (a tuple per the output).
type(env.action_space.shape)

tuple