In [1]:
import torch
import torch.optim as optim
from src.Environment import Environment
from src.actor_critic import ActorCritic, Actor, Critic
import torch.distributions as dist
from torch.distributions import Categorical

In [2]:
def train(n_episodes, max_steps):
    env = Environment()
    for episode in range(n_episodes):
        state = env.reset(45,135)
        episode_reward = 0
        
        for step in range(max_steps):
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs, state_value = ac_net(state_tensor)
            
            # Sample action from the probability distribution
            #print(action_probs)
            dist = Categorical(action_probs)
            action = dist.sample()
            
            # Take action in the environment
            next_state, reward, done, _ = env.step(action.item())
            episode_reward += reward
            
            # Compute TD error
            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
            _, next_state_value = ac_net(next_state_tensor)
            td_error = reward + (0.99 * next_state_value * (1 - int(done))) - state_value
            
            # Compute losses
            actor_loss = dist.log_prob(action) * td_error.detach()
            critic_loss = td_error * state_value
            print(actor_loss.item(),"..", critic_loss.item())
            loss = actor_loss + critic_loss/1000
            
            
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if done:
                break
            
            state = next_state
        
        print(f"Episode {episode+1}, Reward: {episode_reward}")


In [3]:
def train2(n_episodes, max_steps):
    env = Environment()
    for episode in range(n_episodes):
        state = env.reset(90,135)
        episode_reward = 0
        print(state)
        
        for step in range(max_steps):
            #### ACTOR ####
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs = a_net(state_tensor)
            #print(action_probs)
            
            # Sample action from the probability distribution
            dist = Categorical(action_probs)
            action = dist.sample()
            
            # Take action in the environment
            next_state, reward, done, _ = env.step(action.item())
            # print("reward", reward)
            episode_reward += reward

            #### CRITIC ####
            state_value = c_net(state_tensor)
            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
            next_state_value = c_net(next_state_tensor)
            
            td_error = reward + (0.01 * next_state_value) - state_value
            
            # Compute losses
            loss_actor = dist.log_prob(action) * td_error.detach()
            loss_critic = td_error.pow(2)

            #### OPTIM ACTOR ####
            optimizer_actor.zero_grad()
            loss_actor.backward()
            optimizer_actor.step()
            
            #### OPTIM CRITIC ####
            optimizer_critic.zero_grad()
            loss_critic.backward()
            optimizer_critic.step()
            
            if done:
                break
            
            state = next_state
        
        print(f"Episode {episode+1}, Reward: {episode_reward}")


In [4]:
import random
from collections import deque
import torch
import torch.nn.functional as F
from torch.distributions import Categorical

# Assuming Environment, a_net, c_net, optimizer_actor, and optimizer_critic are defined elsewhere

class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def size(self):
        return len(self.buffer)

def train3(n_episodes, max_steps, batch_size, replay_capacity):
    env = Environment()
    replay_buffer = ReplayBuffer(replay_capacity)
    
    for episode in range(n_episodes):
        state = env.reset(90, 135)
        episode_reward = 0
        
        for step in range(max_steps):
            #### ACTOR ####
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs = a_net(state_tensor)
            
            # Sample action from the probability distribution
            dist = Categorical(action_probs)
            action = dist.sample()
            
            # Take action in the environment
            next_state, reward, done, _ = env.step(action.item())
            episode_reward += reward

            # Store experience in replay buffer
            replay_buffer.add((state, action.item(), reward, next_state, int(done)))

            state = next_state

            # Only start training once we have enough experiences in the replay buffer
            if replay_buffer.size() >= batch_size:
                experiences = replay_buffer.sample(batch_size)
                batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(*experiences)

                # Convert to tensors
                batch_state_tensor = torch.FloatTensor(batch_state)
                batch_action_tensor = torch.LongTensor(batch_action)
                batch_reward_tensor = torch.FloatTensor(batch_reward)
                batch_next_state_tensor = torch.FloatTensor(batch_next_state)
                batch_done_tensor = torch.FloatTensor(batch_done)

                #### CRITIC ####
                state_values = c_net(batch_state_tensor).squeeze()
                next_state_values = c_net(batch_next_state_tensor).squeeze()
                td_errors = batch_reward_tensor + (0.99 * next_state_values * (1 - batch_done_tensor)) - state_values

                # Compute losses
                action_probs = a_net(batch_state_tensor)
                dist = Categorical(action_probs)
                log_probs = dist.log_prob(batch_action_tensor)
                loss_actor = -(log_probs * td_errors.detach()).mean()
                print("loss_actor", loss_actor)
                loss_critic = td_errors.mean()
                print("loss_critic", loss_critic)

                #### OPTIM ACTOR ####
                optimizer_actor.zero_grad()
                loss_actor.backward()
                optimizer_actor.step()
                
                #### OPTIM CRITIC ####
                optimizer_critic.zero_grad()
                loss_critic.backward()
                optimizer_critic.step()

            if done:
                break
        
        print(state)
        print(f"Episode {episode + 1}, Reward: {episode_reward}")

In [5]:
# Train the agent
a_net = Actor(7,4)
c_net = Critic(7)

optimizer_actor  = optim.Adam(a_net.parameters(), lr = 1e-2)
optimizer_critic = optim.Adam(c_net.parameters(), lr = 1e-2)
train3(n_episodes=100, max_steps=100, batch_size=128, replay_capacity=1000)

(np.float64(150.1527377077958), np.float64(277.9893514867113), np.float64(90.0), np.float64(0.0030240466631540597), np.float64(-0.00021080637293957396), np.float64(0.0), 3000.0)
Episode 1, Reward: 1000
loss_actor tensor(1.1272e-06, grad_fn=<NegBackward0>)
loss_critic tensor(9.4555, grad_fn=<MeanBackward0>)
loss_actor tensor(8.6910e-07, grad_fn=<NegBackward0>)
loss_critic tensor(7.2906, grad_fn=<MeanBackward0>)
loss_actor tensor(6.1415e-07, grad_fn=<NegBackward0>)
loss_critic tensor(5.1518, grad_fn=<MeanBackward0>)
loss_actor tensor(3.5675e-07, grad_fn=<NegBackward0>)
loss_critic tensor(2.9927, grad_fn=<MeanBackward0>)
loss_actor tensor(9.5906e-08, grad_fn=<NegBackward0>)
loss_critic tensor(0.8045, grad_fn=<MeanBackward0>)
loss_actor tensor(-1.6984e-07, grad_fn=<NegBackward0>)
loss_critic tensor(-1.4247, grad_fn=<MeanBackward0>)
loss_actor tensor(-4.4234e-07, grad_fn=<NegBackward0>)
loss_critic tensor(-3.7106, grad_fn=<MeanBackward0>)
loss_actor tensor(-7.2496e-07, grad_fn=<NegBackward0