### **Import Dependencies** 

In [272]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

# import gymnasium library of OpenAI to create a environment
import gymnasium as gym

### **Create Policy Network**

In [273]:
class Actor(nn.Module):
    def __init__(self, state_size, action_size, hidden_size):
        super(Actor, self).__init__()

        # define first fully connected layer --- the input size is same as our state_size and the output size is same as the hidden_size (**SET HYPERPARAMETERS**)
        self.FullyConnectedLayer_1 = nn.Linear(state_size, hidden_size)
        
        # define first fully connected layer --- the input size is same as our hidden_size as the output of the last layer and the output size is same as action_size
        self.FullyConnectedLayer_2 = nn.Linear(hidden_size, action_size)

    # define forward function
        """
         1. state go to the first layer as input
          2. relu activation function apply on the output of first layer
           3. the output of relu function goes into the second layer as input
            4. softmax activation function apply on the output of the second layer
             5. the output of the second layer go to the Categorical Function as input that gives us a probability distribution over actions
              6. sampling one action in the distribution and return
              """
    def forward(self, state):
        state = Variable(torch.from_numpy(state).float().unsqueeze(0))
        state = nn.functional.relu(self.FullyConnectedLayer_1(state))
        state = self.FullyConnectedLayer_2(state)
        state =  nn.functional.softmax(state, dim=1)
        policy_dist = Categorical(state)
        action = policy_dist.sample()
        return state, action
    

### **Create Value Function Network**

In [274]:
class Critic(nn.Module):
    def __init__(self, state_size, hidden_size): # note that the output of this network is a value of a action (the action that done by policy network), so the output size is equal to 1.
        super(Critic, self).__init__()

        # define first fully connected layer --- the input size is same as our state_size and the output size is same as the hidden_size (**SET HYPERPARAMETERS**)
        self.FullyConnectedLayer_1 = nn.Linear(state_size, hidden_size)
        
        # define first fully connected layer --- the input size is same as our hidden_size as the output of the last layer and the output size is same as action_size
        self.FullyConnectedLayer_2 = nn.Linear(hidden_size, 1)

    # define forward function
        """
         1. state go to the first layer as input
          2. relu activation function apply on the output of first layer
           3. the output of relu function goes into the second layer as input and return
              """
    def forward(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        state = nn.functional.relu(self.FullyConnectedLayer_1(state))
        state = self.FullyConnectedLayer_2(state)
        return state

### **Define Advantage Function**

the figure below show the definition of advantage function

<img src="https://cdn-media-1.freecodecamp.org/images/1*SvSFYWx5-u5zf38baqBgyQ.png"  width="25%" height="20%">

we estimate Q(s,a) as TD error, so the formula convert to:

<img src="https://cdn-media-1.freecodecamp.org/images/1*fmWayfCY4QVIounYXWi2rg.png"  width="25%" height="20%">

so for define the advantage function, we need reward of an action, value of the next state, discount factor and value of the current state

In [275]:
def AdvantageFunction(reward, next_value, current_value, discount_factor):
    advantage = reward + discount_factor * next_value.detach() - current_value.detach()   # note that detach() returns a new tensor that doesn't require a gradient (dont follow gradient computation)
    # print(f"next value: {next_value}")
    return advantage

### **Define Policy Loss Function**

<img src="https://dev-to-uploads.s3.amazonaws.com/uploads/articles/lxdvqu1lno5xulbujb9l.jpeg"  width="25%" height="20%">

the figure show that the poliucy loss is equal to logarithm of probability of a action * advantage of that action

In [276]:
def PolicyLoss(prob_dist, action, advantage):

    # note that the negative sign is used to convert the problem of maximizing the expected return into a problem of minimizing the negative expected return.(gradient descent minimize the error)

    """ 
     to stabilize the training process and prevent the policy from changing too much in a single update, we define regularization term as
      0.01 * advantage.pow(2) that is the squared L2 norm of the advantage, which is a measure of how much variance there is in the advantage estimates.
       By adding this term to the policy loss, we encourage the policy to be more stable and smooth. """
    # prob_dist.item()[action]
    policy_loss = -(torch.log(prob_dist[0, action]).detach() * advantage.detach() + 0.01 * advantage.pow(2))
    policy_loss = torch.tensor(policy_loss, requires_grad=True)
    return policy_loss

### **Define Value Loss Function**

The value loss is a measure of how well the value network is doing at estimating the expected return.\
It is computed as **the mean squared error between the actual return and the estimated return**

In [277]:
def ValueLoss(advantage):
    value_loss = 0.5 * advantage.pow(2)
    value_loss = torch.tensor(value_loss, requires_grad=True)
    return value_loss

### **Create Environment** 

In [278]:
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

### **Set Hyperparameters**

In [279]:
hidden_size = 256
learning_rate = 0.0001
discount_factor = 0.99
num_epochs = 1000 # 1000
num_steps = 200 # 200

### **Create Policy and Value Network**

In [280]:
# Create Policy Network
Actor_Net = Actor(state_size, action_size, hidden_size)

# Create Value Function Network
Critic_Net = Critic(state_size, hidden_size)

### **Define an Optimizer**

In [281]:
parameters_list = list(Actor_Net.parameters()) + list(Critic_Net.parameters())
optimizer = optim.Adam(parameters_list, lr=learning_rate)

### **Define Train Function**

In [284]:
# define a blank list to save total reward of each epoch
epochs_rewards = []

# define a blank list to save total loss of each epoch
epochs_loss = []

# define a blank list to save the length of each epoch
epochs_length = []
def train(env, Actor_Net, Critic_Net, optimizer, num_epochs, num_steps):

    for episode in range(num_epochs):

        # this variable show that the episode has been ended or no, if it has been ended, done is True and no action can be taken
        # we set this variable to False at the beginnig of each episode
        done = False

        # reset the environment in the beginning of each episode
        state = env.reset()[0]

        # define a blank list to save the reward of each step in a specific epoch
        rewards = []

        # define a blank list to save the loss of each step in a specific epoch
        loss = []

        for step in range(num_steps):

            
            
            # applying state to each network and give action & action_value
            prob_dist , action = Actor_Net(state)
            # print(policy_dist.detach().numpy()[0])
            action_value = Critic_Net(state)
            # applying action to the environment and give (next_state, reward, done , _ , _)
            next_state, reward, done, _ , _ = env.step(action.item())

            # save the reward that come from env to the reward list
            rewards.append(reward)

            
            next_value = Critic_Net(next_state)
            advantage = AdvantageFunction(reward, next_value, action_value, discount_factor)
            policy_loss = PolicyLoss(prob_dist, action, advantage)
            value_loss = ValueLoss(advantage)
            total_loss = policy_loss + value_loss
            loss.append(total_loss)
            # total_loss.backward()
            policy_loss.backward()
            value_loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            state = next_state

            # print(f"episode: {episode + 1}, step: {step + 1}, action: {action.item()}, reward: {reward}, Policy loss: {policy_loss.item()}, Value loss: {value_loss.item()}")

            if done:
                epochs_length.append(step + 1)
                break

        epochs_rewards.append(sum(rewards))
        epochs_loss.append(sum(loss))
        if episode % 10 == 0:
            print(f"episode: {episode}, reward: {epochs_rewards[episode]}, total length: {epochs_length[episode]}, total loss: {epochs_loss[episode]}")
    

In [285]:
# train(env, Actor_Net, Critic_Net, optimizer, num_epochs, num_steps):
train(env, Actor_Net, Critic_Net, optimizer, num_epochs, num_steps)

  policy_loss = torch.tensor(policy_loss, requires_grad=True)
  value_loss = torch.tensor(value_loss, requires_grad=True)


episode: 0, reward: 9.0, total length: 9, total loss: tensor([[11.0779]], grad_fn=<AddBackward0>)
episode: 10, reward: 25.0, total length: 25, total loss: tensor([[29.4721]], grad_fn=<AddBackward0>)
episode: 20, reward: 11.0, total length: 11, total loss: tensor([[13.2655]], grad_fn=<AddBackward0>)
episode: 30, reward: 27.0, total length: 27, total loss: tensor([[31.7683]], grad_fn=<AddBackward0>)
episode: 40, reward: 17.0, total length: 17, total loss: tensor([[20.1618]], grad_fn=<AddBackward0>)
episode: 50, reward: 26.0, total length: 26, total loss: tensor([[31.3910]], grad_fn=<AddBackward0>)
episode: 60, reward: 17.0, total length: 17, total loss: tensor([[20.3263]], grad_fn=<AddBackward0>)
episode: 70, reward: 31.0, total length: 31, total loss: tensor([[36.7293]], grad_fn=<AddBackward0>)
episode: 80, reward: 23.0, total length: 23, total loss: tensor([[27.3833]], grad_fn=<AddBackward0>)
episode: 90, reward: 12.0, total length: 12, total loss: tensor([[14.2215]], grad_fn=<AddBackw