In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from simulations import sim_randomised, sim_constrained, sim_automated
import random

In [2]:
class ActorCritic(nn.Module):
    def __init__(self, n_states, n_actions):
        super(ActorCritic, self).__init__()
        # Network layers !! Replace with GNN implementation
        
        self.actor = nn.Sequential(
            nn.Linear(n_states, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions),
            nn.Softmax(dim=-1)
        )
        self.critic = nn.Sequential(
            nn.Linear(n_states, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, state):
        action_probs = self.actor(state)
        state_value = self.critic(state)
        return action_probs, state_value

In [3]:
class PPO:
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, gamma, K_epochs, eps_clip):
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(n_states, n_actions)
        self.optimizer = optim.Adam([
            {'params': self.policy.actor.parameters(), 'lr': lr_actor},
            {'params': self.policy.critic.parameters(), 'lr': lr_critic}
        ])

        self.policy_old = ActorCritic(n_states, n_actions)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state, memory,action_mask):
        state = torch.FloatTensor(state.reshape(1, -1))
        with torch.no_grad():
            action_probs, _ = self.policy_old(state)
        masked_action_probs = action_probs * action_mask
        print(action_probs)
        #action = np.random.choice(len(masked_action_probs.squeeze()), p=masked_action_probs.squeeze().numpy())
        action = torch.multinomial(masked_action_probs, 1).item()
        memory.states.append(state)
        memory.actions.append(action)
        return action

    def update(self, memory):
        # Monte Carlo estimate of state rewards:
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards
        rewards = torch.tensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # Convert list to tensor
        old_states = torch.squeeze(torch.stack(memory.states).detach())
        old_actions = torch.squeeze(torch.tensor(memory.actions).detach())

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):
            # Evaluating old actions and values
            logprobs, state_values = self.policy(old_states)
            state_values = torch.squeeze(state_values)

            # Match state-action pairs
            dist_entropy = -torch.sum(logprobs * torch.exp(logprobs), dim=1)
            new_logprobs = logprobs.gather(1, old_actions.unsqueeze(1)).squeeze(1)
            old_logprobs = self.policy_old(old_states).gather(1, old_actions.unsqueeze(1)).squeeze(1)

            # Calculate ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(new_logprobs - old_logprobs.detach())

            # Calculate surrogate losses
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            # Final loss
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

            # Take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

In [4]:
class Memory:
    def __init__(self):
        self.actions = []
        self.states = []
        self.rewards = []
        self.is_terminals = []

    def clear_memory(self):
        del self.actions[:]
        del self.states[:]
        del self.rewards[:]
        del self.is_terminals[:]

In [5]:
# Initialize your environment and the PPO agent
env = sim_constrained.Env(visualise=False, verbose=False)
n_states = env.n_states
n_actions = env.n_actions

ppo_agent = PPO(n_states, n_actions, lr_actor=0.0003, lr_critic=0.0003, gamma=0.99, K_epochs=4, eps_clip=0.2)
memory = Memory()

# Training loop
num_episodes = 100
for episode in range(num_episodes):
    state, action_mask = env.reset()
    print(state)
    
    done = False

    while not done:
        action = ppo_agent.select_action(state, memory, action_mask)
        state, reward, done, action_mask = env.step(action)
        print(reward)

        # Save in memory
        memory.rewards.append(reward)
        memory.is_terminals.append(done)

    # Update PPO agent
    ppo_agent.update(memory)
    memory.clear_memory()

    # Log results, etc.


[180.0, 270.0, 180.0, 180.0, 140.0, 180.0, 270.0, 405.0, 270.0, 270.0, 210.0, 270.0, 25.0, 25.0, 180.0, 270.0, 180.0, 180.0, 140.0, 180.0, 140.0, 210.0, 210.0, 120.0, 190.0, 190.0, 190.0, 190.0, 30.0, 45.0, 45.0] only setup and processing times
[180.0, 270.0, 180.0, 180.0, 140.0, 180.0, 270.0, 405.0, 270.0, 270.0, 210.0, 270.0, 25.0, 25.0, 180.0, 270.0, 180.0, 180.0, 140.0, 180.0, 140.0, 210.0, 210.0, 120.0, 190.0, 190.0, 190.0, 190.0, 30.0, 45.0, 45.0, 55] orders not initialised


SyntaxError: 'return' outside function (3973421556.py, line 14)