In [1]:
from dvrprl.networks import ParallelMultilayerPerceptron, Transformer, RecurrentNeuralNetwork
from dvrprl.ppo import PPOAgent
from dvrprl.tsp_env import TSPEnv
from dvrprl.dqn import DQNAgent

In [2]:
# Step 1: set up the environment.
#
# To do this we need to import the environment class and instantiate it.
# The environment class is called TSPEnv and is located in dvrprl/tsp_env.py.
# TSPEnv inherits from the base class Env from the gym library.
# The TSPEnv class has the following arguments:
# num_nodes: Number of nodes in each generated graph. Defaults to 20.
# seed: Seed of the environment. Defaults to 123.

# First, we need to import the environment.
# (TSPEnv is already imported in the first cell; repeating it here is redundant but harmless.)
from dvrprl.tsp_env import TSPEnv

# Instantiate the environment with the default arguments
env = TSPEnv()

# Instantiate the environment with a different number of nodes
env = TSPEnv(num_nodes=10)

# Instantiate the environment with a set random seed.
# NOTE: every assignment in this cell rebinds `env`, so only this last instance
# (seed=1234, num_nodes left at its default) is what later cells see as `env`
# until `env` is reassigned.
env = TSPEnv(seed=1234)

[5]
[1]
[11]


In [3]:
# Step 2: define an agent.
#
# The agent contains all of the code needed to train a reinforcement learning
# agent in an environment. The algorithm used for training differs between
# agents: the DQN agent uses the DQN algorithm and the PPO agent uses the PPO
# algorithm (more on this at a later point).
# NOTE(review): the original notes say the agent is defined in dvrprl/agent.py,
# but this notebook imports DQNAgent from dvrprl.dqn and PPOAgent from
# dvrprl.ppo -- confirm which location is current.
#
# When defining an agent, it is necessary to supply it with one or more neural
# networks to act as trainable function approximators. For a DQN agent this is
# a Q network, which is copied within the agent to make the target network.

# Import the Parallel Multilayer Perceptron network class
from dvrprl.networks import ParallelMultilayerPerceptron

EMBEDDING_DIM = 64

# Build the Q network. Meaning of the arguments:
#   mode="action-value" -- the network should output action-values. This is the
#       only mode necessary for the DQN agent; other modes will be necessary
#       for the PPO agent.
#   in_features=4 -- for the dvrprl environment the input features will always
#       be 4: the x and y coordinates of the node and the current x and y
#       coordinates of the vehicle.
#   hidden_layers=[64, 64] -- a list of hidden layer sizes: the first hidden
#       layer has 64 nodes and the second hidden layer has 64 nodes. The final
#       layer has 1 node, the action-value of the state-action pair.
q_network = ParallelMultilayerPerceptron(
    in_features=4,
    embedding_dim=EMBEDDING_DIM,
    hidden_layers=[64, 64],
    mode="action-value",
)

# value_network is the only argument that needs to be passed when creating a
# DQN agent. Many other arguments can be passed as well; the full list can be
# found in the documentation.
agent = DQNAgent(value_network=q_network)

# Name of the folder in which the logs will be saved
logdir = "logs/dqn-test-pmlp-24-07-23"

# Train the agent in the environment for 1000 episodes. The train method also
# accepts more arguments than the ones used here; the full list can be found
# in the documentation.
agent.train(
    env,
    episodes=1000,
    batch_size=64,
    logdir=logdir,
)

  0%|          | 0/1000 [00:00<?, ?episode/s]

[15]
[[0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]]
[[0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.59220053]
 [0.80913712 0.5922

KeyboardInterrupt: 

In [None]:


# Name of the folder in which the results of this run are saved
logdir = "logs/dqn-pmlp-24-07-23"

# Train for 100 episodes.
# NOTE(review): this cell defines neither `agent` nor `env`; it uses whatever
# earlier cells bound to those names, so it must be run after them.
agent.train(
    env,
    episodes=100,
    batch_size=64,
    logdir=logdir,
)

In [None]:
# Testing the PPO agent
# NOTE: this cell rebinds `env`, `agent` and `logdir`, replacing any values
# bound to those names by earlier cells.

# Environment with 20 nodes
env = TSPEnv(num_nodes=20)

# Define the networks: one in "policy" mode and one in "value" mode.
# Apart from `mode`, both are built with the same arguments.
policy_network = ParallelMultilayerPerceptron(
    in_features=4,
    embedding_dim=128,
    hidden_layers=[128, 128],
    mode="policy",
)
value_network = ParallelMultilayerPerceptron(
    in_features=4,
    embedding_dim=128,
    hidden_layers=[128, 128],
    mode="value",
)

# Define the agent from the two networks
agent = PPOAgent(
    policy_network=policy_network,
    value_network=value_network,
)

# Name of the folder in which the results are saved; change this for each run
# or the results will be overwritten
logdir = "logs/ppo-pmlp-24-07-23"

# Train for 10 epochs with 100 episodes and a batch size of 64
agent.train(
    env,
    epochs=10,
    episodes=100,
    batch_size=64,
    logdir=logdir,
)

  0%|          | 0/10 [00:00<?, ?epoch/s]

  0%|          | 0/100 [00:00<?, ?episodes/s]

KeyboardInterrupt: 