In [1]:
import torch
import torch.optim as optim
from torch.distributions import Normal
import torch.nn as nn
import numpy as np
from gym.wrappers.monitoring.video_recorder import VideoRecorder
import warnings
from typing import Union
from utils import ReplayBuffer, get_env, run_episode
from scipy.stats import norm

In [3]:
TRAIN_EPISODES = 50
TEST_EPISODES = 300

In [45]:
class NeuralNetwork(nn.Module):
    '''
    This class implements a neural network with a variable number of hidden layers and hidden units.
    You may use this function to parametrize your policy and critic networks.
    '''
    def __init__(self, input_dim: int, output_dim: int, hidden_size: int, 
                                hidden_layers: int, activation: str):
        super(NeuralNetwork, self).__init__()

        # TODO: Implement this function which should define a neural network 
        # with a variable number of hidden layers and hidden units.
        # Here you should define layers which your network will use.
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_size = hidden_size
        self.hidden_layers = hidden_layers
        self.activations = {
            'relu': nn.ReLU(),
            'sigmoid': nn.Sigmoid(),
            'tanh': nn.Tanh()
            }
        #self.activation = self.activations[activation]
        self.activation = nn.ReLU()
        self.input = nn.Linear(self.input_dim, self.hidden_size)
        self.linears = nn.ModuleList([nn.Linear(self.hidden_size,self.hidden_size) for i in range(self.hidden_layers)])
        self.putput = nn.Linear(self.hidden_size, self.output_dim)

    def forward(self, s: torch.Tensor) -> torch.Tensor:
        # TODO: Implement the forward pass for the neural network you have defined.
        #pass
        s = self.input(s)
        s = self.activation(s)
        #print("after activation", s)
        for i in range(0,self.hidden_layers):
            
            s = self.linears[i](s)
            #print("linear layer", s)

            s = self.activation(s)
            #print("activation in linear layer", s)
            
        s = self.putput(s)
        #print("output is", s)
        #s = self.activation(s)
        #print("after activation output layer", s)
        #log_s = nn.Softmax(s)
        return s #, log_s

In [117]:
    
class Actor:
    def __init__(self,hidden_size: int, hidden_layers: int, actor_lr: float,
                state_dim: int = 3, action_dim: int = 1, device: torch.device = torch.device('cpu')):
        super(Actor, self).__init__()

        self.hidden_size = hidden_size
        self.hidden_layers = hidden_layers
        self.actor_lr = actor_lr
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.device = device
        self.LOG_STD_MIN = -20
        self.LOG_STD_MAX = 2
        self.setup_actor()

    def setup_actor(self):
        '''
        This function sets up the actor network in the Actor class.
        '''
        # TODO: Implement this function which sets up the actor network. 
        # Take a look at the NeuralNetwork class in utils.py. 
        #pass
        self.NN_actor = NeuralNetwork(input_dim=self.state_dim, output_dim=2*self.action_dim, hidden_size=self.hidden_size, hidden_layers=self.hidden_layers, activation="relu")
        self.NN_actor.to(self.device)
        self.optimizer= optim.Adam(self.NN_actor.parameters(),lr = self.actor_lr)
        self.temperature = TrainableParameter(init_param=0.005, lr_param=0.1, train_param=True)

    def clamp_log_std(self, log_std: torch.Tensor) -> torch.Tensor:
        '''
        :param log_std: torch.Tensor, log_std of the policy.
        Returns:
        :param log_std: torch.Tensor, log_std of the policy clamped between LOG_STD_MIN and LOG_STD_MAX.
        '''
        return torch.clamp(log_std, self.LOG_STD_MIN, self.LOG_STD_MAX)

    def get_action_and_log_prob(self, state: torch.Tensor, 
                                deterministic: bool = False) -> (torch.Tensor, torch.Tensor):
        '''
        :param state: torch.Tensor, state of the agent
        :param deterministic: boolean, if true return a deterministic action 
                                otherwise sample from the policy distribution.
        Returns:
        :param action: torch.Tensor, action the policy returns for the state.
        :param log_prob: log_probability of the the action.
        '''
        
        print("input state is", state.shape)
        assert state.shape == (3,) or state.shape[1] == self.state_dim, 'State passed to this method has a wrong shape'
        action , log_prob = torch.zeros(state.shape[0]), torch.ones(state.shape[0])
        # TODO: Implement this function which returns an action and its log probability.
        # If working with stochastic policies, make sure that its log_std are clamped 
        # using the clamp_log_std function.
        
        #state = torch.tensor(state) # assume state is either [3,] or [200, 3]

        #mean, std = torch.chunk(self.NN_actor(state), 2, dim=-1)#.to(self.device)

        outputs = self.NN_actor(state)


        if state.shape == (3,): # one state
            outputs = [outputs]

        print("outputs", outputs)
        
            # mean, std = out # need to check if can unpack

            #     std = torch.tensor(torch.abs(std))
            #     mean = torch.tensor(mean)

            #     log_std = self.clamp_log_std(torch.log(std))   #The log of the standard deviation must be clamped not the standard deviation
            #     std = torch.exp(log_std)

            #     print("mean", mean)
            #     print("std", std)

            #     dist = torch.distributions.normal.Normal(mean, std)

            #     if deterministic == False:  #We aren't sure about the placement of the clamping, as it makes a difference for the probability, what its std is
        
            #         action = dist.rsample() #rsample includes the reparametrization trick
            #         action = torch.tanh(action)

            #     else:
            #         action = mean

            #     log_prob = dist.log_prob(action)

        actions, log_probs = [], []

        for out in outputs:

            print("out", out)

            mean, std = out # need to check if can unpack

            std = torch.tensor(torch.abs(std))
            mean = torch.tensor(mean)

            log_std = self.clamp_log_std(torch.log(std))   #The log of the standard deviation must be clamped not the standard deviation
            std = torch.exp(log_std)

            print("mean", mean)
            print("std", std)

            dist = torch.distributions.normal.Normal(mean, std)

            if deterministic == False:  #We aren't sure about the placement of the clamping, as it makes a difference for the probability, what its std is
        
                action = dist.rsample() #rsample includes the reparametrization trick
                action = torch.tanh(action)

            else:
                action = mean

            log_prob = dist.log_prob(action)

            actions.append(action) 
            log_probs.append(log_prob)

        print("actions", actions)
        print("log_probs", log_probs)

        if state.shape[0] == self.state_dim: # working with a single state
            
            actions = torch.tensor(actions).reshape((self.action_dim, ))
            log_probs = torch.tensor(log_probs).reshape((self.action_dim, ))

        else:  # second dimension is the state shape

            N = state.shape[0]
            actions = torch.tensor(actions).reshape((N, self.action_dim))
            log_probs = torch.tensor(log_probs).reshape((N, self.action_dim))


        print("-------after conversion to tensors")
        print("actions", actions)
        print("log_probs", log_probs)

        assert (actions.shape == (self.action_dim, ) and \
            log_probs.shape == (self.action_dim, ), 'Incorrect shape for action or log_prob.' ) or \
                ( actions.shape[1] == self.action_dim and log_probs.shape[1] == self.action_dim )
             
        return actions, log_probs



In [120]:
agent = Agent()

# testing forward pass

input_tensor = torch.tensor([[-0.9985, -0.0543,  0.4931],
        [-0.9988, -0.0494,  0.3297],
        [-0.9996, -0.0295, -0.4181],
        [-0.9924,  0.1230, -0.0246],
        [-0.9926,  0.1210, -0.0899],
        [-0.9924,  0.1230, -0.0246],
        [-0.9924,  0.1230, -0.0246]])

#input_tensor = torch.tensor([[-0.9985,-0.0543,0.4931]]).reshape((3,))

print(input_tensor.shape)


agent.actor.get_action_and_log_prob(input_tensor, False)

Using device: cpu
torch.Size([7, 3])
input state is torch.Size([7, 3])
outputs tensor([[-0.0541, -0.0497],
        [-0.0562, -0.0474],
        [-0.0815, -0.0309],
        [-0.0638, -0.0482],
        [-0.0670, -0.0455],
        [-0.0638, -0.0482],
        [-0.0638, -0.0482]], grad_fn=<AddmmBackward0>)
out tensor([-0.0541, -0.0497], grad_fn=<UnbindBackward0>)
mean tensor(-0.0541)
std tensor(0.0497)
out tensor([-0.0562, -0.0474], grad_fn=<UnbindBackward0>)
mean tensor(-0.0562)
std tensor(0.0474)
out tensor([-0.0815, -0.0309], grad_fn=<UnbindBackward0>)
mean tensor(-0.0815)
std tensor(0.0309)
out tensor([-0.0638, -0.0482], grad_fn=<UnbindBackward0>)
mean tensor(-0.0638)
std tensor(0.0482)
out tensor([-0.0670, -0.0455], grad_fn=<UnbindBackward0>)
mean tensor(-0.0670)
std tensor(0.0455)
out tensor([-0.0638, -0.0482], grad_fn=<UnbindBackward0>)
mean tensor(-0.0638)
std tensor(0.0482)
out tensor([-0.0638, -0.0482], grad_fn=<UnbindBackward0>)
mean tensor(-0.0638)
std tensor(0.0482)
actions [ten

  std = torch.tensor(torch.abs(std))
  mean = torch.tensor(mean)


(tensor([[-0.0178],
         [ 0.0320],
         [-0.0869],
         [-0.0379],
         [-0.0181],
         [-0.1669],
         [-0.0492]]),
 tensor([[ 1.8175],
         [ 0.3974],
         [ 2.5429],
         [ 1.9690],
         [ 1.5929],
         [-0.1773],
         [ 2.0680]]))

In [66]:
class Critic:
    def __init__(self, hidden_size: int, 
                 hidden_layers: int, critic_lr: int, state_dim: int = 3, 
                    action_dim: int = 1,device: torch.device = torch.device('cpu')):
        super(Critic, self).__init__()
        self.hidden_size = hidden_size
        self.hidden_layers = hidden_layers
        self.critic_lr = critic_lr
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.device = device
        self.setup_critic()

    def setup_critic(self):
        # TODO: Implement this function which sets up the critic(s). Take a look at the NeuralNetwork 
        # class in utils.py. Note that you can have MULTIPLE critic networks in this class.

        self.NN_critic = NeuralNetwork(input_dim = self.state_dim, output_dim=1, hidden_size=self.hidden_size, hidden_layers=self.hidden_layers, activation="relu")
        self.NN_critic.to(self.device)
        self.optimizer = optim.Adam(self.NN_critic.parameters(),lr = self.critic_lr)
        


In [67]:
class TrainableParameter:
    '''
    This class could be used to define a trainable parameter in your method. You could find it 
    useful if you try to implement the entropy temerature parameter for SAC algorithm.
    '''
    def __init__(self, init_param: float, lr_param: float, 
                 train_param: bool, device: torch.device = torch.device('cpu')):
        
        self.log_param = torch.tensor(np.log(init_param), requires_grad=train_param, device=device)
        self.optimizer = optim.Adam([self.log_param], lr=lr_param)

    def get_param(self) -> torch.Tensor:
        return torch.exp(self.log_param)

    def get_log_param(self) -> torch.Tensor:
        return self.log_param

In [118]:

class Agent:
    def __init__(self):
        # Environment variables. You don't need to change this.
        self.state_dim = 3  # [cos(theta), sin(theta), theta_dot]
        self.action_dim = 1  # [torque] in[-1,1]
        self.batch_size = 200
        self.min_buffer_size = 1000
        self.max_buffer_size = 100000
        # If your PC possesses a GPU, you should be able to use it for training, 
        # as self.device should be 'cuda' in that case.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("Using device: {}".format(self.device))
        self.memory = ReplayBuffer(self.min_buffer_size, self.max_buffer_size, self.device)
        
        self.setup_agent()

    def setup_agent(self):
        # TODO: Setup off-policy agent with policy and critic classes. 
        # Feel free to instantiate any other parameters you feel you might need.   
        #pass
        self.hidden_layers = 2
        self.hidden_size = 256
        self.lr = 3E-3

        self.actor = Actor(self.hidden_size, self.hidden_layers, self.lr)
        self.critic_Q2 = Critic(state_dim=self.state_dim+self.action_dim,
                                hidden_size=self.hidden_size, 
                                hidden_layers=self.hidden_layers,
                                critic_lr=self.lr)
        
        self.critic_Q1 = Critic(state_dim=self.state_dim+self.action_dim,
                                hidden_size=self.hidden_size,
                                hidden_layers=self.hidden_layers,
                                critic_lr=self.lr)
        #self.critic = Critic(self.hidden_size, self.hidden_layers, self.lr)
        #self.trainable_params = TrainableParameter(init_param: float, self.lr, train_param: bool)
        #Name parameters from the paper
        #self.log_prob = []
        self.Tau = 0.005
        self.gamma = 0.99

    def get_action(self, s: np.ndarray, train: bool) -> np.ndarray:
        """
        :param s: np.ndarray, state of the pendulum. shape (3, )
        :param train: boolean to indicate if you are in eval or train mode. 
                    You can find it useful if you want to sample from deterministic policy.
        :return: np.ndarray,, action to apply on the environment, shape (1,)
        """
        # TODO: Implement a function that returns an action from the policy for the state s.
        #action = np.random.uniform(-1, 1, (1,))
        #Convert the state to a torch tensor, which is the required input for the actor
        s = torch.tensor(s)
        #Import action from the actor and discard the log probability here, possibly used elsewhere
        action, _ = self.actor.get_action_and_log_prob(s, False)
        # only get one action -> we have to sample in get_action_and_log_prob
        #Convert the returned tensor action to an nd.array
        action = action.clone().detach().numpy()
        #Need log probability for something -------> ?

        assert action.shape == (1,), 'Incorrect action shape.'
        assert isinstance(action, np.ndarray ), 'Action dtype must be np.ndarray' 
        return action

    @staticmethod
    # loss: 200 x 1
    def run_gradient_update_step(object: Union[Actor, Critic], loss: torch.Tensor):
        '''
        This function takes in a object containing trainable parameters and an optimizer, 
        and using a given loss, runs one step of gradient update. If you set up trainable parameters 
        and optimizer inside the object, you could find this function useful while training.
        :param object: object containing trainable parameters and an optimizer
        '''
        object.optimizer.zero_grad()
        loss.mean().backward()
        object.optimizer.step()

    def critic_target_update(self, base_net: NeuralNetwork, target_net: NeuralNetwork, 
                             tau: float, soft_update: bool):
        '''
        This method updates the target network parameters using the source network parameters.
        If soft_update is True, then perform a soft update, otherwise a hard update (copy).
        :param base_net: source network
        :param target_net: target network
        :param tau: soft update parameter
        :param soft_update: boolean to indicate whether to perform a soft update or not
        '''
        for param_target, param in zip(target_net.parameters(), base_net.parameters()):
            if soft_update:
                param_target.data.copy_(param_target.data * (1.0 - tau) + param.data * tau)
            else:
                param_target.data.copy_(param.data)

    def train_agent(self): 
        '''
        This function represents one training iteration for the agent. It samples a batch 
        from the replay buffer,and then updates the policy and critic networks 
        using the sampled batch.
        '''
        # TODO: Implement one step of training for the agent.
        # Hint: You can use the run_gradient_update_step for each policy and critic.
        # Example: self.run_gradient_update_step(self.policy, policy_loss)
        # Batch sampling
        batch = self.memory.sample(self.batch_size)
        s_batch, a_batch, r_batch, s_prime_batch = batch

        print("#############################")
        print("train_agent")
        print("#############################")

        #Get the temperature - We still need to figure out which network uses this
        alpha = self.actor.temperature.get_param()
        print("alpha before optimization", alpha)
        #alpha = torch.tensor(0.5)
        reward =  1/alpha * r_batch # smth to investigate
        print("modified reward", reward[0:5, :])
        #reward = r_batch + alpha * entropy <--

        #Store the basic Psi network - which I guess we still need
        base_net1 = NeuralNetwork(input_dim = self.state_dim + self.action_dim, 
                                  output_dim = 1, 
                                  hidden_size = 256,
                                  hidden_layers = 2,
                                  activation="relu").to(self.device) #self.critic_Q1.NN_critic #self.critic_Q1.NN_critic
        
        base_net2 = NeuralNetwork(input_dim = self.state_dim + self.action_dim, 
                                  output_dim = 1, 
                                  hidden_size = 256,
                                  hidden_layers = 2,
                                  activation="relu").to(self.device) #self.critic_Q2.NN_critic

        base_net1.load_state_dict(copy.deepcopy(self.critic_Q1.NN_critic.state_dict()))
        base_net1.to(self.device)
        base_net2.load_state_dict(copy.deepcopy(self.critic_Q2.NN_critic.state_dict()))
        base_net2.to(self.device)

        print("Q1 before gradient", base_net1.state_dict()['putput.weight'][0,:5])

        #Optimize the critic networks
        #Run a gradient update step for critic V
        # TODO: Implement Critic(s) update here.

       
        with torch.no_grad():

            results_list = [self.actor.get_action_and_log_prob(state, False) for state in s_prime_batch] 
            
            next_sampled_action, next_sampled_log_prob = zip(*results_list)

            next_sampled_action = torch.tensor(next_sampled_action).flatten().reshape(self.batch_size, 1)
            next_sampled_log_prob = torch.tensor(next_sampled_log_prob).flatten().reshape(self.batch_size, 1)

            print("next_sampled_action",next_sampled_action[0:5,:])
            print("next_sampled_log_prob", next_sampled_log_prob[0:5,:])

            input = torch.cat((s_prime_batch, next_sampled_action), dim = 1).to(self.device)
            print("input looks like", input[0:5,])

            qf1_next = self.critic_Q1.NN_critic(input)   
            qf2_next = self.critic_Q2.NN_critic(input)



            print("Total number of zero outputs", (qf1_next == 0).sum(), "out of", qf1_next.shape)

            min_qf_next = torch.min(qf1_next,qf2_next) - next_sampled_log_prob

            print("min_qf_next",min_qf_next[0:5,:])

            next_q_value = reward + self.gamma * min_qf_next # 200 x 1

        print("next_q_value", next_q_value[0:5,:])

        #Get the current values and optimize with respect to the next ones
        input_Q = torch.cat((s_batch, a_batch), dim = 1).to(self.device)
    
        qf1 = self.critic_Q1.NN_critic(input_Q) # 200 x 1
        qf2 = self.critic_Q2.NN_critic(input_Q) # 200 x 1

        print("Total number of zero outputs", (qf1 == 0).sum(), "out of", qf1.shape)

        q1_loss = nn.functional.mse_loss(qf1, next_q_value)  
        q2_loss = nn.functional.mse_loss(qf2,next_q_value)

        print("q1 loss", q1_loss)

        self.run_gradient_update_step(self.critic_Q1, q1_loss)
        self.run_gradient_update_step(self.critic_Q2, q2_loss)

        # print some gradients

        print("grad of NN_critic Q1 putput weight", self.critic_Q1.NN_critic.putput.weight.grad)
        print("grad of NN_critic Q2 putput weight", self.critic_Q2.NN_critic.putput.weight.grad)
        #print("grad of NN_critic Q2", )

        #Sample current action and its log_prob
        with torch.no_grad():
            results_list2 = [self.actor.get_action_and_log_prob(state, False) for state in s_batch]

            sampled_action, sampled_log_prob = zip(*results_list2) #self.actor.get_action_and_log_prob(state=s_batch, deterministic=False)
        
            sampled_action = torch.tensor(sampled_action).flatten().reshape(self.batch_size, 1)
            sampled_log_prob = torch.tensor(sampled_log_prob).flatten().reshape(self.batch_size, 1)

        input_policy = torch.cat((s_batch, sampled_action), dim = 1).to(self.device)
        Q1_pi = self.critic_Q1.NN_critic(input_policy) #s_batch,sampled_action)
        Q2_pi = self.critic_Q2.NN_critic(input_policy) #s_batch,sampled_action)
        min_q_pi = torch.min(Q1_pi, Q2_pi)
        
        #Policy loss


        # TODO: Implement Policy update here
        policy_loss = ((sampled_log_prob) - min_q_pi) # self.alpha * removed
        print("policy loss", policy_loss[0:5,])

        #Gradient update for policy
        self.run_gradient_update_step(self.actor, policy_loss)

        print("grad of policy network", self.actor.NN_actor.input.weight.grad)


        # print some gradients


        # Temperature (alpha) loss
        print("------ Training temperature -------")

        H = -1.
        alpha_loss = - alpha * sampled_log_prob - alpha * H

        self.actor.temperature.optimizer.zero_grad()
        alpha_loss.mean().backward()
        self.actor.temperature.optimizer.step()
        
        print("targetnet --> Q1 after gradient, before soft update", self.critic_Q1.NN_critic.state_dict()['putput.weight'][0,:5])
        #Critic target update step
        print("basenet1 -->", base_net1.state_dict()['putput.weight'][0,:5])

        self.critic_target_update(base_net1, self.critic_Q1.NN_critic, self.Tau,True)
        self.critic_target_update(base_net2, self.critic_Q2.NN_critic, self.Tau,True)

        print("Q1 after update", self.critic_Q1.NN_critic.state_dict()['putput.weight'][0,:5])

        alpha = self.actor.temperature.get_param()
        print("alpha after optimization", alpha)
        


In [59]:
agent = Agent()

# testing forward pass


input_tensor = torch.tensor([[-0.9994, -0.0352,  0.2698],
        [-0.9947, -0.1029, -0.0644],
        [-0.9969,  0.0783, -0.1867],
        [-1.0000, -0.0034, -0.2585],
        [-0.9993,  0.0362, -0.3160]])

#input_tensor = torch.tensor([[-0.9994, -0.0352,  0.2698, -0.0068]])

# different state dicts for Q1 and Q2

print(agent.actor.NN_actor(input_tensor))

print(agent.actor.NN_actor.state_dict()['input.weight'])



Using device: cpu
tensor([[-0.0683, -0.0202],
        [-0.0764, -0.0191],
        [-0.0667, -0.0296],
        [-0.0714, -0.0252],
        [-0.0694, -0.0275]], grad_fn=<AddmmBackward0>)
tensor([[ 5.7183e-01, -3.3239e-01, -3.7381e-01],
        [-1.1470e-02, -3.5208e-03, -5.6288e-01],
        [ 3.6049e-01,  9.2758e-02,  4.6839e-01],
        [ 4.4821e-01, -4.6854e-01, -5.5328e-01],
        [-2.3429e-01, -5.3039e-01,  1.5475e-01],
        [ 2.2095e-01, -2.7516e-01, -4.2512e-01],
        [-2.1257e-01, -2.8474e-01,  3.6621e-01],
        [-8.9758e-02, -3.7507e-01,  5.1281e-01],
        [-1.8807e-01, -6.5548e-02, -3.5136e-01],
        [-3.4773e-01, -1.3617e-01,  1.7860e-01],
        [-1.3889e-01, -5.1517e-01, -1.7246e-01],
        [-1.1834e-01, -1.4372e-01,  3.8751e-01],
        [-1.2922e-02, -2.1914e-01, -9.9491e-03],
        [ 2.1823e-01, -2.6642e-02,  2.1151e-03],
        [ 8.8240e-02, -2.1016e-01, -5.7161e-01],
        [-3.2454e-01, -2.9064e-01, -3.2408e-01],
        [ 2.0516e-01, -3.5161e-

In [57]:
agent = Agent()
env = get_env(g=10.0, train=True)

for EP in range(TRAIN_EPISODES):
    print("Running episode: ", EP)
    run_episode(env, agent, None, verbose, train=True)

Using device: cpu
Running episode:  0


  state = torch.tensor(state)
  std = torch.tensor(std)
  mean = torch.tensor(mean)


ValueError: Expected parameter scale (Tensor of shape (1,)) of distribution Normal(loc: tensor([0.0753]), scale: tensor([nan])) to satisfy the constraint GreaterThan(lower_bound=0.0), but found invalid values:
tensor([nan])

In [None]:
type(env)

gym.wrappers.time_limit.TimeLimit