In [1]:
import torch
import torch.optim as optim
from src.Environment import Environment
from src.actor_critic import ActorCritic, Actor, Critic
import torch.distributions as dist
from torch.distributions import Categorical

In [2]:
def train(n_episodes, max_steps):
    env = Environment()
    for episode in range(n_episodes):
        state = env.reset(45,135)
        episode_reward = 0
        
        for step in range(max_steps):
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs, state_value = ac_net(state_tensor)
            
            # Sample action from the probability distribution
            #print(action_probs)
            dist = Categorical(action_probs)
            action = dist.sample()
            
            # Take action in the environment
            next_state, reward, done, _ = env.step(action.item())
            episode_reward += reward
            
            # Compute TD error
            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
            _, next_state_value = ac_net(next_state_tensor)
            td_error = reward + (0.99 * next_state_value * (1 - int(done))) - state_value
            
            # Compute losses
            actor_loss = dist.log_prob(action) * td_error.detach()
            critic_loss = td_error * state_value
            print(actor_loss.item(),"..", critic_loss.item())
            loss = actor_loss + critic_loss/1000
            
            
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if done:
                break
            
            state = next_state
        
        print(f"Episode {episode+1}, Reward: {episode_reward}")


In [3]:
def train2(n_episodes, max_steps):
    env = Environment()
    for episode in range(n_episodes):
        state = env.reset(90,135)
        episode_reward = 0
        print(state)
        
        for step in range(max_steps):
            #### ACTOR ####
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs = a_net(state_tensor)
            #print(action_probs)
            
            # Sample action from the probability distribution
            dist = Categorical(action_probs)
            action = dist.sample()
            
            # Take action in the environment
            next_state, reward, done, _ = env.step(action.item())
            # print("reward", reward)
            episode_reward += reward

            #### CRITIC ####
            state_value = c_net(state_tensor)
            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
            next_state_value = c_net(next_state_tensor)
            
            td_error = reward + (0.01 * next_state_value) - state_value
            
            # Compute losses
            loss_actor = dist.log_prob(action) * td_error.detach()
            loss_critic = td_error.pow(2)

            #### OPTIM ACTOR ####
            optimizer_actor.zero_grad()
            loss_actor.backward()
            optimizer_actor.step()
            
            #### OPTIM CRITIC ####
            optimizer_critic.zero_grad()
            loss_critic.backward()
            optimizer_critic.step()
            
            if done:
                break
            
            state = next_state
        
        print(f"Episode {episode+1}, Reward: {episode_reward}")


In [4]:
import random
from collections import deque
import torch
import torch.nn.functional as F
from torch.distributions import Categorical

# Assuming Environment, a_net, c_net, optimizer_actor, and optimizer_critic are defined elsewhere

class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def size(self):
        return len(self.buffer)

def train3(n_episodes, max_steps, batch_size, replay_capacity):
    env = Environment()
    replay_buffer = ReplayBuffer(replay_capacity)
    
    for episode in range(n_episodes):
        state = env.reset(90, 135)
        episode_reward = 0
        
        for step in range(max_steps):
            #### ACTOR ####
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs = a_net(state_tensor)
            
            # Sample action from the probability distribution
            dist = Categorical(action_probs)
            action = dist.sample()
            
            # Take action in the environment
            next_state, reward, done, _ = env.step(action.item())
            episode_reward += reward

            # Store experience in replay buffer
            replay_buffer.add((state, action.item(), reward, next_state, int(done)))

            state = next_state

            # Only start training once we have enough experiences in the replay buffer
            if replay_buffer.size() >= batch_size:
                experiences = replay_buffer.sample(batch_size)
                batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(*experiences)

                # Convert to tensors
                batch_state_tensor = torch.FloatTensor(batch_state)
                batch_action_tensor = torch.LongTensor(batch_action)
                batch_reward_tensor = torch.FloatTensor(batch_reward)
                batch_next_state_tensor = torch.FloatTensor(batch_next_state)
                batch_done_tensor = torch.FloatTensor(batch_done)

                #### CRITIC ####
                state_values = c_net(batch_state_tensor).squeeze()
                next_state_values = c_net(batch_next_state_tensor).squeeze()
                td_errors = batch_reward_tensor + (0.99 * next_state_values * (1 - batch_done_tensor)) - state_values

                # Compute losses
                action_probs = a_net(batch_state_tensor)
                dist = Categorical(action_probs)
                log_probs = dist.log_prob(batch_action_tensor)
                loss_actor = -(log_probs * td_errors.detach()).mean()
                loss_critic = -td_errors.pow(2).mean()

                #### OPTIM ACTOR ####
                optimizer_actor.zero_grad()
                loss_actor.backward()
                optimizer_actor.step()
                
                #### OPTIM CRITIC ####
                optimizer_critic.zero_grad()
                loss_critic.backward()
                optimizer_critic.step()

            if done:
                break
        
        print(state)
        print(f"Episode {episode + 1}, Reward: {episode_reward}")

In [None]:
# Train the agent
a_net = Actor(7,4)
c_net = Critic(7)

optimizer_actor  = optim.Adam(a_net.parameters(), lr = 1e-2)
optimizer_critic = optim.Adam(c_net.parameters(), lr = 1e-2)
train3(n_episodes=1000, max_steps=500, batch_size=512, replay_capacity=10000)
#train2(n_episodes=1000, max_steps=2000)

(np.float64(62.12015467696554), np.float64(1318.768179201108), np.float64(137.72970773009177), np.float64(-4.477327584904561), np.float64(22.066122678653535), np.float64(1.2149380149477957), 2860.5)
Episode 1, Reward: 587.5
(np.float64(-4.388308580205468), np.float64(829.451111113848), np.float64(173.65716045783358), np.float64(-8.583446523795194), np.float64(14.452269834995603), np.float64(2.603438603459562), 2895.0)
Episode 2, Reward: 77.5
(np.float64(-4.028571733770867), np.float64(1065.1497427679963), np.float64(156.82159082212848), np.float64(-6.500313522032571), np.float64(16.609353744999016), np.float64(1.5620631620757368), 2888.5)
Episode 3, Reward: 502.5
(np.float64(-3.446603457585228), np.float64(1317.6272715507848), np.float64(147.62277442323807), np.float64(-6.970142050064414), np.float64(22.196349745848764), np.float64(1.2149380149477957), 2853.0)
Episode 4, Reward: 495.0
(np.float64(25.427007054279436), np.float64(1310.9512612335598), np.float64(149.01127501174983), np.fl

  batch_done_tensor = torch.FloatTensor(batch_done)


(np.float64(-1.7852017447492567), np.float64(797.4140335766272), np.float64(288.90270930431075), np.float64(-6.857416192569143), np.float64(7.644985013013398), np.float64(7.983878383942662), 2924.0)
Episode 7, Reward: 0.0
(np.float64(-2.4057341744962555), np.float64(509.99042793371734), np.float64(320.1439725458256), np.float64(-5.711140047830477), np.float64(2.9657612247832046), np.float64(8.851691251762514), 2949.0)
Episode 8, Reward: 0.0
(np.float64(-2.4057341744962555), np.float64(509.99042793371734), np.float64(320.1439725458256), np.float64(-5.711140047830477), np.float64(2.9657612247832046), np.float64(8.851691251762514), 2949.0)
Episode 9, Reward: 0.0
(np.float64(-2.4057341744962555), np.float64(509.99042793371734), np.float64(320.1439725458256), np.float64(-5.711140047830477), np.float64(2.9657612247832046), np.float64(8.851691251762514), 2949.0)
Episode 10, Reward: 0.0
(np.float64(-2.4057341744962555), np.float64(509.99042793371734), np.float64(320.1439725458256), np.float64(