In [1]:
import os
import numpy as np
import tsplib95 as tsp

import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box

from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

%load_ext tensorboard

In [2]:
def load_tsp(problem_file):
    problem_path = os.path.join(problem_file)
    problem = tsp.load(problem_path)
    return problem

In [3]:
gr24 = load_tsp("gr24.tsp")
#print(gr24)
gr24_opt = load_tsp("gr24.opt.tour")
#print(gr24_opt)

In [4]:
class tspEnv(Env):
    def __init__(self, problem):
        self.problem = problem
        self.action_space = Discrete(self.problem.dimension)
        self.observation_space = Box(0, np.inf, (self.problem.dimension,))
        self.tour = []
        self.max_length = 2*self.problem.dimension
        self.start = 0
        self.w_matrix = self._get_w_matrix()
        
    def step(self, action):
         # Get current state
        state = self._get_state()
        new_obs = self._get_obs(action)

        # Get reward for such a move
        reward = self._get_reward(state,action)

        # Append reached node to tour
        self.tour.append(int(action))
        self._update_matrix(action)
        
        tour_sort = self.tour.copy()
        tour_sort.sort()
        if (action == self.start) and self._is_subset(list(range(self.problem.dimension)), self.tour):
            done = True
            reward += 10**3
            if len(self.tour) == self.problem.dimension + 1:
                reward += 10**3
        else:
            done = False
        
        if len(self.tour) == self.max_length:
            force_stop = True
        else:
            force_stop = False

        info = {"tour": self.tour}

        return new_obs, reward, done, force_stop, info

    def next_rand_action(self):
        if len(self.tour) == (self.problem.dimension):
            return self.start
        else:
            while True:
                a = self.action_space.sample()
                if (((a not in self.tour) and (a != self.start)) and (a != self._get_state())):
                    break
            return a
        
    def render(self):
        pass
        
    def reset(self, seed = None, option = None):
        super().reset(seed=seed)
        self.tour = []
        self.start = 0
        self.tour.append(self.start)
        self.w_matrix = self._get_w_matrix()
        info = {}
        return self._get_obs(self.start), info

    def _get_state(self):
        return self.tour[-1]

    def _get_reward(self, state, new_state):
        return -self.w_matrix[state][new_state]

    def _get_obs(self, action):
        return np.array(self.w_matrix[action])

    def _get_w_matrix(self):
        data = []
        weight = []
        for i in self.problem.edge_weights:
            for j in i:
                data.append(j)

        # convert lower triangle matrix to square matrix
        if self.problem.edge_weight_format == "LOWER_DIAG_ROW":
            for x in range(self.problem.dimension):   # format lower triangle matrix
                node = []
                w = data.pop(0)
                while w != 0:
                    node.append(w)
                    w = data.pop(0)
                while len(node) != self.problem.dimension:
                    node.append(0)
                weight.append(node)
            matrix = np.triu(np.array(weight).T,1) + weight   #convert to square matrix
            
        matrix[matrix == 0] = 10**5
        return matrix

    def _update_matrix(self, new_state):
        self.w_matrix[:, new_state] = 10**5

    def _is_subset(self, sub_list, list):
        if set(sub_list).intersection(set(list)) == set(sub_list):
            return True
        else:
            return False
        

In [5]:
env = tspEnv(gr24)

In [6]:
episode = 5
for episode in range(episode):
    obs = env.reset()
    done = False
    score = 0

    while not done:
        action = env.next_rand_action()
        obs, reward, done, force_stop, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score-10**5))
    print(env.tour)
env.close

Episode:0 Score:-101979
[0, 3, 18, 15, 22, 10, 21, 20, 8, 2, 11, 1, 5, 4, 13, 7, 14, 9, 17, 16, 6, 19, 23, 12, 0]
Episode:1 Score:-101534
[0, 14, 17, 11, 16, 7, 20, 21, 10, 12, 3, 18, 1, 23, 4, 9, 6, 8, 13, 2, 22, 19, 5, 15, 0]
Episode:2 Score:-101639
[0, 2, 19, 14, 12, 4, 23, 16, 8, 15, 5, 7, 11, 9, 22, 17, 20, 18, 21, 3, 1, 6, 13, 10, 0]
Episode:3 Score:-101735
[0, 6, 18, 11, 10, 2, 16, 1, 5, 8, 19, 9, 17, 3, 21, 20, 14, 22, 15, 12, 7, 13, 23, 4, 0]
Episode:4 Score:-101518
[0, 12, 9, 6, 1, 16, 14, 15, 8, 17, 20, 13, 22, 21, 4, 11, 3, 19, 23, 18, 2, 10, 5, 7, 0]


<bound method Env.close of <__main__.tspEnv object at 0x000001A6AAE05D30>>

In [7]:
env.reset()

(array([100000,    257,    187,     91,    150,     80,    130,    134,
           243,    185,    214,     70,    272,    219,    293,     54,
           211,    290,    268,    261,    175,    250,    192,    121]),
 {})

In [8]:
log_path = os.path.join('Training', 'Logs')
timesteps = 1000000

In [9]:
model1 = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)
model1.learn(total_timesteps = timesteps)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\PPO_1
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 47        |
|    ep_rew_mean     | -2.62e+06 |
| time/              |           |
|    fps             | 390       |
|    iterations      | 1         |
|    time_elapsed    | 5         |
|    total_timesteps | 2048      |
----------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 47            |
|    ep_rew_mean          | -2.63e+06     |
| time/                   |               |
|    fps                  | 315           |
|    iterations           | 2             |
|    time_elapsed         | 12            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 1.1001248e-08 |
|    clip_fraction        | 0           

<stable_baselines3.ppo.ppo.PPO at 0x1a6ac351520>

In [10]:
model2 = A2C('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)
model2.learn(total_timesteps = timesteps)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\A2C_1
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 47        |
|    ep_rew_mean        | -2.65e+06 |
| time/                 |           |
|    fps                | 223       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -3.12     |
|    explained_variance | -1.67e-06 |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | -6.94e+05 |
|    value_loss         | 6.03e+10  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 47        |
|    ep_rew_mean        | -2.66e+06 |
| time/                 |           |
|    fps                | 228       |


<stable_baselines3.a2c.a2c.A2C at 0x1a696853ce0>

In [None]:
model3 = DQN('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)
model3.learn(total_timesteps = timesteps)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\DQN_1
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 47       |
|    ep_rew_mean      | -2.7e+06 |
|    exploration_rate | 0.998    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 482      |
|    time_elapsed     | 0        |
|    total_timesteps  | 188      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 5.79e+04 |
|    n_updates        | 21       |
----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 47        |
|    ep_rew_mean      | -2.63e+06 |
|    exploration_rate | 0.996     |
| time/               |           |
|    episodes         | 8         |
|    fps              | 399       |
|    time_elapsed     | 0         |
|    total_timesteps  | 3

In [None]:
evaluate_policy(model1, env, n_eval_episodes = 10)

In [None]:
evaluate_policy(model2, env, n_eval_episodes = 10)

In [None]:
evaluate_policy(model3, env, n_eval_episodes = 10)

In [None]:
for x in range(10):
    obs, info = env.reset()
    result = 0
    for i in range(100):
        action, _states = model1.predict(obs)
        obs, reward, done, force_stop, info = env.step(action)
        result += reward
        
        if done or force_stop:
            break
    
    print(env.tour)
    print(len(env.tour))
    print(result)
    print(result)
    print("---------------------------------")

In [None]:
for x in range(10):
    obs, info = env.reset()
    result = 0
    for i in range(100):
        action, _states = model2.predict(obs)
        obs, reward, done, force_stop, info = env.step(action)
        result += reward
        
        if done or force_stop:
            break
    
    print(env.tour)
    print(len(env.tour))
    print(result)
    print(result)
    print("---------------------------------")

In [None]:
for x in range(10):
    obs, info = env.reset()
    result = 0
    for i in range(100):
        action, _states = model3.predict(obs)
        obs, reward, done, force_stop, info = env.step(action)
        result += reward
        
        if done or force_stop:
            break
    
    print(env.tour)
    print(len(env.tour))
    print(result)
    print(result)
    print("---------------------------------")

In [None]:
print(gr24_opt)

In [22]:
%tensorboard --logdir='F:\UCL\Year_3\FYP\src\Training\Logs\PPO_1'

Reusing TensorBoard on port 6006 (pid 30036), started 0:08:10 ago. (Use '!kill 30036' to kill it.)

In [23]:
%tensorboard --logdir='F:\UCL\Year_3\FYP\src\Training\Logs\A2C_1'

Reusing TensorBoard on port 6007 (pid 32472), started 0:08:27 ago. (Use '!kill 32472' to kill it.)

In [24]:
%tensorboard --logdir='F:\UCL\Year_3\FYP\src\Training\Logs\DQN_1'

Reusing TensorBoard on port 6006 (pid 40384), started 3 days, 19:08:54 ago. (Use '!kill 40384' to kill it.)