# Imports

In [1]:
import os
import tempfile
import glob
import numpy as np
import tsplib95 as tsp
import math

import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box

from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback

%load_ext tensorboard

## Tensorboard trouble shooting

In [2]:
# delete content on this path to reset tensorboard if error occurred 
tensor_path = tempfile.gettempdir() + '\\.tensorboard-info'
print(tensor_path)

# run this command to kill all tensorboard background activity to reset if error occurred
# !taskkill /IM "tensorboard.exe" /F

C:\Users\im_ki\AppData\Local\Temp\.tensorboard-info


# Helper Functions

In [3]:
def load_tsp(problem_file):
    problem_path = os.path.join(problem_file)
    problem = tsp.load(problem_path)
    return problem

In [4]:
def random_tour(env, episode = 10):
    sum_length = 0
    for i in range(episode):
        obs, info = env.reset()
        done = False
        score = 0

        while not done or force_stop:
            action = env.next_rand_action()
            obs, reward, done, force_stop, info = env.step(action)
            score += reward
        length = score-2*env.penalty
        sum_length += (length)
        
        print('Episode:{} Score:{} Length:{}'.format(i, score, length))
        print(env.tour)
    env.close
    return sum_length / episode

In [5]:
def manual_eval(model, env, episode = 10):
    for x in range(episode):
        obs, info = env.reset()
        result = 0
        for i in range(env.max_length):
            action, _states = model.predict(obs)
            obs, reward, done, force_stop, info = env.step(action)
            result += reward
        
            if done or force_stop:
                break
    
        print(env.tour)
        print(len(env.tour))
        print(result)
        print("---------------------------------")
    env.close

In [6]:
def best_score_eval(model, env, episode = 10000):
    best_r = 0 
    for x in range(episode):
        obs, info = env.reset()
        result = 0
        for i in range(env.max_length):
            action, _states = model.predict(obs)
            obs, reward, done, force_stop, info = env.step(action)
            result += reward
        
            if done or force_stop:
                break
                
        if result > best_r:
            best_r = result
            min_length = -(best_r - 2*env.penalty)
            
            print(env.tour)
            print(len(env.tour))
            print(min_length)
            print("---------------------------------")
    env.close
    return min_length

# Environment Class

In [7]:
class tspEnv(Env):
    def __init__(self, problem, penalty = 10**4):
        self.problem = problem
        self.action_space = Discrete(self.problem.dimension)
        self.observation_space = Box(0, np.inf, (self.problem.dimension,))
        self.tour = []
        self.max_length = 2*self.problem.dimension
        self.start = 0
        self.penalty = penalty
        self.w0_matrix = self._get_w_matrix()
        self.w_matrix = self.w0_matrix
        
    def step(self, action):
         # Get current state
        state = self._get_state()
        new_obs = self._get_obs(action)

        # Get reward for such a move
        reward = self._get_reward(state,action)

        # Append reached node to tour
        self.tour.append(int(action))
        self._update_matrix(action, self.penalty)
        
        done = False
        force_stop = False
        if self._is_subset(list(range(self.problem.dimension)), self.tour):
            done = True
            reward += self.penalty
            reward -= np.array(self.w0_matrix[action])[self.start]
            if len(self.tour) == self.problem.dimension:
                reward += self.penalty
        
        if len(self.tour) == self.max_length:
            force_stop = True

        info = {"tour": self.tour}

        return new_obs, reward, done, force_stop, info

    def next_rand_action(self):
        while True:
            a = self.action_space.sample()
            if (((a not in self.tour) and (a != self.start)) and (a != self._get_state())):
                break
        return a
        
    def render(self):
        pass
        
    def reset(self, seed = None, option = None):
        super().reset(seed=seed)
        self.tour = []
        self.tour.append(self.start)
        self.w_matrix = self._get_w_matrix()
        self._update_matrix(self.start, self.penalty)
        info = {}
        return self._get_obs(self.start), info

    def _get_state(self):
        return self.tour[-1]

    def _get_reward(self, state, new_state):
        return -self.w_matrix[state][new_state]

    def _get_obs(self, action):
        return np.array(self.w_matrix[action])

    def _get_w_matrix(self):
    
        # convert lower triangle matrix to square matrix
        if self.problem.edge_weight_type == "EXPLICIT":
            if self.problem.edge_weight_format == "LOWER_DIAG_ROW":
                data = []
                weight = []
                for i in self.problem.edge_weights:
                    for j in i:
                        data.append(j)
                for x in range(self.problem.dimension):   # format lower triangle matrix
                    node = []
                    w = data.pop(0)
                    while w != 0:
                        node.append(w)
                        w = data.pop(0)
                    while len(node) != self.problem.dimension:
                        node.append(0)
                    weight.append(node)
                matrix = np.triu(np.array(weight).T,1) + weight   #convert to square matrix
        # extract weight between nodes in euclidean 2d coordinate system
        elif self.problem.edge_weight_type == "EUC_2D":
            matrix = np.zeros((self.problem.dimension,self.problem.dimension))
            for i in range(self.problem.dimension):
                for j in range(self.problem.dimension):
                    x1, y1 = self.problem.node_coords[i+1]
                    x2, y2 = self.problem.node_coords[j+1]
                    w = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
                    matrix[i,j] = round(w)
                    
        else: raise Exception("Invalid edge_weight_type")
            
        matrix[matrix == 0] = self.penalty
        return matrix

    def _update_matrix(self, new_state, penalty):
        self.w_matrix[:, new_state] = penalty

    def _is_subset(self, sub_list, list):
        if set(sub_list).intersection(set(list)) == set(sub_list):
            return True
        else:
            return False
        

# Problem gr24

## Problem attributes display

In [21]:
# loading tsplib files into tsplib95 custom class
gr24 = load_tsp("gr24.tsp")
print(gr24)
print()
# loading optimised solution for study purpose
gr24_opt = load_tsp("gr24.opt.tour")
print(gr24_opt)

NAME: gr24
COMMENT: 24-city problem (Groetschel)
TYPE: TSP
DIMENSION: 24
EDGE_WEIGHT_TYPE: EXPLICIT
EDGE_WEIGHT_FORMAT: LOWER_DIAG_ROW
EDGE_WEIGHT_SECTION:
0 257 0 187 196 0 91 228 158 0 150 112
96 120 0 80 196 88 77 63 0 130 167 59
101 56 25 0 134 154 63 105 34 29 22 0
243 209 286 159 190 216 229 225 0 185 86 124
156 40 124 95 82 207 0 214 223 49 185 123
115 86 90 313 151 0 70 191 121 27 83 47
64 68 173 119 148 0 272 180 315 188 193 245
258 228 29 159 342 209 0 219 83 172 149 79
139 134 112 126 62 199 153 97 0 293 50 232
264 148 232 203 190 248 122 259 227 219 134 0
54 219 92 82 119 31 43 58 238 147 84 53
267 170 255 0 211 74 81 182 105 150 121 108
310 37 160 145 196 99 125 173 0 290 139 98
261 144 176 164 136 389 116 147 224 275 178 154
190 79 0 268 53 138 239 123 207 178 165 367
86 187 202 227 130 68 230 57 86 0 261 43
200 232 98 200 171 131 166 90 227 195 137 69
82 223 90 176 90 0 175 128 76 146 32 76
47 30 222 56 103 109 225 104 164 99 57 112
114 134 0 250 99 89 221 105 189 160 14

## Training

In [79]:
# establish training environment
gr24_env = tspEnv(gr24, 22000)

In [36]:
# testing environment with random actions
random_tour(gr24_env, episode = 10)

Episode:0 Score:16296 Length:-3704
[0, 15, 19, 23, 9, 17, 22, 2, 6, 14, 7, 3, 21, 8, 13, 1, 11, 5, 10, 16, 4, 20, 18, 12]
Episode:1 Score:16747 Length:-3253
[0, 5, 7, 11, 10, 23, 6, 21, 20, 1, 18, 9, 2, 17, 8, 15, 19, 4, 13, 16, 12, 22, 14, 3]
Episode:2 Score:16390 Length:-3610
[0, 16, 14, 10, 23, 9, 22, 20, 7, 17, 1, 15, 3, 13, 21, 8, 18, 19, 12, 2, 4, 11, 5, 6]
Episode:3 Score:16431 Length:-3569
[0, 13, 4, 14, 10, 7, 22, 17, 1, 21, 19, 5, 11, 15, 16, 6, 20, 3, 8, 2, 18, 9, 12, 23]
Episode:4 Score:16590 Length:-3410
[0, 1, 2, 18, 21, 3, 11, 12, 17, 6, 19, 8, 9, 7, 22, 13, 10, 23, 5, 14, 16, 4, 20, 15]
Episode:5 Score:16554 Length:-3446
[0, 3, 18, 15, 11, 16, 21, 14, 5, 1, 8, 2, 12, 19, 22, 13, 17, 7, 10, 23, 6, 4, 9, 20]
Episode:6 Score:16676 Length:-3324
[0, 2, 15, 18, 7, 6, 5, 16, 8, 3, 19, 13, 17, 23, 10, 11, 12, 1, 14, 21, 20, 9, 4, 22]
Episode:7 Score:16567 Length:-3433
[0, 23, 20, 9, 7, 6, 14, 5, 2, 18, 3, 10, 19, 22, 15, 4, 16, 17, 11, 21, 13, 12, 8, 1]
Episode:8 Score:16307 Le

-3539.8

In [6]:
gr24_env.reset()

(array([1000,  257,  187,   91,  150,   80,  130,  134,  243,  185,  214,
          70,  272,  219,  293,   54,  211,  290,  268,  261,  175,  250,
         192,  121]),
 {})

In [84]:
# Set up log path to save log files
log_path = os.path.join('Training', 'Logs')
save_path = os.path.join('Training', 'Saved Models')

# Setup evaluation environment for locating best model
gr24_eval_env = tspEnv(gr24, 22000)

# Use deterministic actions for evaluation
eval_callback = EvalCallback(gr24_eval_env, best_model_save_path = save_path,
                             eval_freq = 10000,
                             deterministic=True, render=False)
timesteps = 750000

### PPO

In [9]:
model_PPO = PPO('MlpPolicy', gr24_env, verbose = 1, tensorboard_log=log_path)
model_PPO.learn(total_timesteps = timesteps)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\PPO_1
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 47        |
|    ep_rew_mean     | -2.63e+05 |
| time/              |           |
|    fps             | 375       |
|    iterations      | 1         |
|    time_elapsed    | 5         |
|    total_timesteps | 2048      |
----------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 47            |
|    ep_rew_mean          | -2.65e+05     |
| time/                   |               |
|    fps                  | 312           |
|    iterations           | 2             |
|    time_elapsed         | 13            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 1.7276034e-07 |
|    clip_fraction        | 0           

<stable_baselines3.ppo.ppo.PPO at 0x277cbdeb8f0>

In [85]:
# tsplib95 function for evaluating model on environment for result checking
evaluate_policy(model_PPO, gr24_env, n_eval_episodes = 10)

NameError: name 'model1' is not defined

In [15]:
# manual evaluation displaying founded tour and reward
manual_eval(model_PPO, gr24_env, episode = 10):

[0, 17, 21, 20, 14, 23, 6, 4, 11, 14, 3, 20, 11, 16, 13, 16, 3, 17, 14, 20, 10, 17, 7, 16, 16, 1, 8, 2, 7, 0, 5, 22, 3, 4, 9, 5, 9, 5, 14, 19, 14, 11, 11, 22, 19, 16, 22, 20]
48
-263632
-263632
---------------------------------
[0, 1, 19, 9, 11, 5, 15, 14, 0, 18, 19, 5, 17, 16, 3, 16, 10, 13, 7, 14, 14, 12, 19, 12, 2, 11, 15, 21, 20, 14, 6, 22, 13, 10, 8, 8, 2, 15, 19, 20, 2, 23, 15, 3, 6, 20, 10, 15]
48
-244653
-244653
---------------------------------
[0, 16, 13, 7, 4, 7, 11, 20, 22, 10, 8, 0, 0, 2, 22, 8, 23, 1, 2, 22, 15, 8, 3, 7, 5, 19, 21, 5, 15, 17, 13, 15, 10, 4, 7, 7, 7, 19, 12, 9, 23, 9, 7, 22, 20, 12, 21, 14]
48
-254219
-254219
---------------------------------
[0, 12, 19, 16, 5, 20, 0, 23, 22, 8, 0, 10, 5, 3, 2, 8, 14, 3, 21, 8, 9, 12, 22, 11, 21, 7, 8, 14, 16, 1, 2, 17, 7, 16, 15, 9, 17, 1, 20, 3, 19, 8, 8, 4, 19, 5, 12, 1]
48
-264003
-264003
---------------------------------
[0, 15, 18, 18, 9, 7, 14, 19, 8, 18, 12, 1, 18, 18, 23, 3, 21, 9, 16, 2, 5, 19, 23, 4, 20, 11, 16,

### A2C

In [10]:
model_A2C = A2C('MlpPolicy', gr24_env, verbose = 1, tensorboard_log=log_path)
model_A2C.learn(total_timesteps = timesteps)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\A2C_1
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 47        |
|    ep_rew_mean        | -2.68e+05 |
| time/                 |           |
|    fps                | 188       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -3.03     |
|    explained_variance | -5.02e-05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | -4.51e+04 |
|    value_loss         | 2.68e+08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 47        |
|    ep_rew_mean        | -2.74e+05 |
| time/                 |           |
|    fps                | 187       |


<stable_baselines3.a2c.a2c.A2C at 0x277db513c20>

In [13]:
evaluate_policy(model_A2C, gr24_env, n_eval_episodes = 10)

(-321641.0, 0.0)

In [16]:
manual_eval(model_A2C, gr24_env)

[0, 3, 5, 16, 21, 1, 18, 4, 20, 12, 6, 13, 23, 15, 18, 8, 22, 18, 2, 8, 7, 8, 8, 22, 18, 11, 10, 3, 19, 0, 0, 6, 8, 8, 2, 8, 8, 8, 9, 8, 7, 8, 8, 0, 8, 22, 18, 3]
48
-254035
-254035
---------------------------------
[0, 3, 11, 1, 16, 8, 14, 5, 1, 22, 0, 9, 20, 6, 19, 4, 14, 15, 8, 8, 7, 8, 21, 23, 1, 17, 0, 4, 8, 1, 8, 1, 1, 4, 1, 1, 4, 8, 4, 8, 3, 4, 0, 16, 0, 4, 1, 8]
48
-284061
-284061
---------------------------------
[0, 3, 1, 16, 14, 1, 11, 5, 9, 8, 1, 1, 7, 12, 13, 23, 16, 20, 4, 10, 8, 18, 21, 19, 21, 8, 15, 21, 3, 21, 21, 6, 1, 7, 20, 1, 7, 7, 20, 7, 20, 3, 3, 1, 7, 7, 1, 1]
48
-272829
-272829
---------------------------------
[0, 17, 1, 16, 20, 3, 2, 5, 1, 1, 1, 1, 13, 23, 21, 19, 11, 14, 12, 18, 4, 5, 1, 1, 9, 6, 1, 1, 1, 1, 1, 15, 0, 8, 1, 22, 3, 1, 7, 7, 15, 3, 1, 1, 1, 1, 1, 1]
48
-244426
-244426
---------------------------------
[0, 1, 22, 8, 2, 20, 8, 1, 11, 10, 5, 7, 3, 4, 14, 16, 6, 19, 15, 8, 2, 3, 8, 21, 17, 0, 13, 14, 8, 23, 4, 7, 4, 4, 3, 4, 4, 16, 16, 21, 15, 4, 

### Saving PPO & A2C Models

In [None]:
PPO_Path  = os.path.join('Training', 'Saved Models', 'PPO_Model')
A2C_Path  = os.path.join('Training', 'Saved Models', 'A2C_Model')
model1.save()
model2.save()

### DQN_1

In [None]:
model_DQN_1 = DQN('MlpPolicy', gr24_env, verbose = 1, tensorboard_log=log_path)
model_DQN_1.learn(total_timesteps = timesteps, callback = eval_callback)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\DQN_2
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 47        |
|    ep_rew_mean      | -2.59e+05 |
|    exploration_rate | 0.998     |
| time/               |           |
|    episodes         | 4         |
|    fps              | 445       |
|    time_elapsed     | 0         |
|    total_timesteps  | 188       |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 4.99e+03  |
|    n_updates        | 21        |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 47        |
|    ep_rew_mean      | -2.64e+05 |
|    exploration_rate | 0.996     |
| time/               |           |
|    episodes         | 8         |
|    fps              | 548       |
|    time_elapsed     | 0         |
|    total



Eval num_timesteps=10000, episode_reward=-332164.00 +/- 0.00
Episode length: 47.00 +/- 0.00
-----------------------------------
| eval/               |           |
|    mean_ep_length   | 47        |
|    mean_reward      | -3.32e+05 |
| rollout/            |           |
|    exploration_rate | 0.905     |
| time/               |           |
|    total_timesteps  | 10000     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 2.85e+03  |
|    n_updates        | 2474      |
-----------------------------------
New best mean reward!
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 47        |
|    ep_rew_mean      | -2.59e+05 |
|    exploration_rate | 0.904     |
| time/               |           |
|    episodes         | 216       |
|    fps              | 739       |
|    time_elapsed     | 13        |
|    total_timesteps  | 10149     |
| train/              |           |
|    learning_rate    

In [14]:
evaluate_policy(model_DQN_1, gr24_env, n_eval_episodes = 10)

(-14355.0, 0.0)

In [17]:
manual_eval(model_DQN_1, gr24_env)

[0, 23, 21, 5, 18, 20, 9, 4, 6, 14, 3, 15, 17, 12, 10, 19, 20, 1, 13, 22, 16, 7, 11, 0, 0, 2, 8, 0]
28
-24639
-24639
---------------------------------
[0, 23, 21, 5, 18, 20, 9, 4, 6, 16, 15, 3, 17, 12, 10, 14, 19, 22, 13, 7, 1, 11, 0, 0, 2, 8, 0]
27
-14704
-14704
---------------------------------
[0, 23, 21, 5, 18, 20, 9, 4, 6, 14, 3, 12, 17, 10, 15, 19, 1, 13, 22, 16, 7, 11, 0, 0, 21, 2, 8, 0]
28
-24257
-24257
---------------------------------
[0, 23, 21, 5, 18, 20, 9, 4, 6, 14, 3, 12, 17, 10, 15, 19, 1, 13, 22, 16, 7, 11, 0, 0, 2, 8, 0]
27
-14355
-14355
---------------------------------
[0, 23, 21, 5, 18, 3, 9, 6, 20, 4, 12, 14, 17, 10, 15, 19, 1, 13, 22, 16, 3, 22, 7, 11, 0, 0, 20, 2, 8, 0]
30
-44229
-44229
---------------------------------
[0, 23, 21, 5, 18, 20, 9, 4, 6, 14, 3, 12, 17, 10, 15, 19, 1, 13, 22, 16, 7, 11, 0, 0, 2, 1, 8, 0]
28
-24278
-24278
---------------------------------
[0, 15, 6, 3, 9, 19, 14, 12, 10, 18, 1, 23, 21, 20, 5, 13, 17, 4, 22, 16, 7, 11, 0, 0, 2, 8, 0]


Load best DQN model from model_DQN_1

In [11]:
best_model_path = os.path.join('Training', 'Saved Models', 'best_DQN_Model_gr24_1')
best_model = DQN.load(best_model_path, env = gr24_env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [12]:
evaluate_policy(model_best, gr24_env, n_eval_episodes = 10)

(16409.0, 0.0)

In [17]:
# manual evaluation to output the best score
# the score displayed is altered by +20000 due to model rewarding scheme
best_score_eval(best_model, gr24_env, episode = 10000)

[0, 5, 20, 2, 10, 18, 3, 7, 4, 1, 19, 8, 16, 14, 23, 9, 13, 22, 12, 11, 15, 6, 17, 21, 0]
25
16409
---------------------------------
[0, 5, 20, 2, 10, 18, 11, 23, 14, 21, 7, 19, 4, 8, 12, 13, 22, 1, 16, 9, 3, 15, 6, 17, 0]
25
16411
---------------------------------
[0, 5, 20, 2, 10, 18, 3, 7, 4, 1, 19, 14, 8, 16, 23, 9, 13, 22, 12, 11, 15, 6, 17, 21, 0]
25
16452
---------------------------------
[0, 5, 20, 2, 10, 18, 11, 23, 4, 14, 21, 7, 19, 8, 12, 13, 22, 1, 16, 9, 3, 15, 6, 17, 0]
25
16528
---------------------------------
[0, 5, 20, 2, 10, 18, 3, 23, 7, 4, 19, 1, 8, 14, 16, 9, 13, 22, 12, 11, 15, 6, 17, 21, 0]
25
16638
---------------------------------
[0, 5, 20, 2, 10, 7, 23, 18, 14, 1, 19, 8, 3, 4, 16, 9, 13, 22, 12, 11, 15, 6, 17, 21, 0]
25
16951
---------------------------------


### DQN_2

In [None]:
model_DQN_2 = DQN('MlpPolicy', gr24_env, verbose = 1, tensorboard_log=log_path)
model_DQN_2.learn(total_timesteps = timesteps, callback = eval_callback)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\DQN_8
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 47        |
|    ep_rew_mean      | -6.09e+05 |
|    exploration_rate | 0.998     |
| time/               |           |
|    episodes         | 4         |
|    fps              | 672       |
|    time_elapsed     | 0         |
|    total_timesteps  | 188       |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 9.64e+03  |
|    n_updates        | 21        |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 46.5      |
|    ep_rew_mean      | -5.68e+05 |
|    exploration_rate | 0.995     |
| time/               |           |
|    episodes         | 8         |
|    fps              | 503       |
|    time_elapsed     | 0         |
|    total

In [66]:
best_model_path = os.path.join('Training', 'Saved Models', 'best_DQN_Model_gr24_2')
best_model = DQN.load(best_model_path, env = gr24_env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [67]:
# manual evaluation to output the best score
# the score displayed is altered by +44000 due to model rewarding scheme
best_score_eval(best_model, gr24_env, episode = 10000)

[0, 8, 13, 3, 12, 19, 20, 14, 18, 7, 4, 21, 16, 15, 10, 1, 9, 22, 23, 17, 2, 6, 11, 5]
24
41054
---------------------------------
[0, 8, 13, 3, 12, 19, 20, 14, 18, 7, 4, 21, 16, 10, 15, 1, 9, 22, 23, 17, 2, 6, 11, 5]
24
41071
---------------------------------
[0, 8, 13, 17, 16, 10, 4, 21, 12, 19, 1, 20, 15, 9, 14, 18, 7, 22, 23, 2, 6, 11, 5, 3]
24
41082
---------------------------------
[0, 23, 13, 19, 14, 1, 2, 10, 8, 21, 16, 7, 20, 9, 4, 6, 15, 18, 17, 12, 11, 5, 22, 3]
24
41084
---------------------------------
[0, 7, 13, 19, 16, 17, 21, 10, 23, 15, 9, 14, 18, 11, 5, 1, 4, 22, 8, 12, 2, 6, 20, 3]
24
41381
---------------------------------
[0, 7, 13, 19, 16, 17, 21, 10, 23, 15, 20, 9, 14, 1, 2, 4, 8, 22, 12, 18, 6, 11, 5, 3]
24
41514
---------------------------------


## Tensorboard log analysis

In [38]:
%tensorboard --logdir='F:\UCL\Year_3\FYP\src\Training\Logs\PPO_gr24'

In [39]:
%tensorboard --logdir='F:\UCL\Year_3\FYP\src\Training\Logs\A2C_gr24'

In [40]:
%tensorboard --logdir='F:\UCL\Year_3\FYP\src\Training\Logs\DQN_gr24_1'

In [63]:
%tensorboard --logdir='F:\UCL\Year_3\FYP\src\Training\Logs\DQN_gr24_2'

# Problem gr48

## Problem Attributes display

In [8]:
# loading tsplib files into tsplib95 custom class
gr48 = load_tsp("gr48.tsp")
print(gr48)
print()
# loading optimised solution for study purpose
gr48_opt = load_tsp("gr48.opt.tour")
print(gr48_opt)

NAME: gr48
COMMENT: 48-city problem (Groetschel)
TYPE: TSP
DIMENSION: 48
EDGE_WEIGHT_TYPE: EXPLICIT
EDGE_WEIGHT_FORMAT: LOWER_DIAG_ROW
EDGE_WEIGHT_SECTION:
0 593 0 409 258 0 566 331 171 0
633 586 723 874 0 257 602 522 679 390
0 91 509 325 482 598 228 0 412 627
506 663 227 169 383 0 378 755 634 791
397 175 349 167 0 593 416 564 721 271
445 509 293 429 0 150 598 414 571 488
112 120 267 233 541 0 659 488 630 787
205 511 575 304 470 76 607 0 80 566
382 539 572 196 77 351 317 563 63 629
0 434 893 699 856 524 231 405 303 138
595 289 606 373 0 455 417 433 590 313
304 371 228 394 158 399 224 425 530 0
134 583 399 566 530 154 105 309 275 575
34 638 29 298 434 0 649 945 824 981
446 423 620 357 280 649 504 648 588 416
584 546 0 259 364 180 337 555 272 175
338 466 403 264 469 232 549 265 249 656
0 505 354 110 70 819 618 421 602 730
660 509 728 478 795 529 494 920 276 0
710 117 375 354 679 693 626 720 848 533
715 610 683 986 534 700 1038 481 345 0
488 784 663 820 289 262 459 196 119 488
343 502 427

## Training

In [9]:
# establish training environment
gr48_env = tspEnv(gr48)
random_tour(gr48_env, episode = 10)

Episode:0 Score:-1103 Length:-21103
[0, 46, 24, 6, 29, 11, 14, 31, 44, 9, 47, 32, 28, 10, 2, 34, 18, 22, 26, 35, 30, 40, 33, 1, 20, 25, 36, 27, 4, 12, 13, 19, 45, 43, 17, 41, 42, 23, 3, 8, 7, 16, 37, 38, 5, 21, 15, 39]
Episode:1 Score:99 Length:-19901
[0, 5, 47, 16, 37, 19, 15, 4, 12, 38, 27, 39, 13, 24, 46, 9, 42, 14, 6, 21, 35, 31, 32, 30, 2, 22, 20, 40, 44, 26, 10, 18, 25, 17, 41, 36, 34, 7, 8, 45, 43, 28, 33, 1, 3, 29, 23, 11]
Episode:2 Score:-1176 Length:-21176
[0, 1, 30, 36, 35, 41, 34, 32, 2, 11, 23, 27, 31, 37, 46, 9, 42, 19, 24, 45, 6, 18, 38, 15, 21, 33, 16, 22, 13, 43, 17, 4, 28, 26, 3, 40, 5, 10, 47, 20, 7, 8, 39, 29, 44, 25, 12, 14]
Episode:3 Score:-812 Length:-20812
[0, 19, 34, 9, 6, 32, 11, 25, 47, 24, 17, 10, 33, 31, 38, 22, 4, 5, 15, 43, 13, 30, 28, 36, 46, 8, 7, 14, 44, 35, 29, 1, 39, 45, 23, 18, 16, 20, 41, 37, 26, 3, 27, 42, 12, 21, 40, 2]
Episode:4 Score:-2566 Length:-22566
[0, 39, 22, 6, 46, 27, 38, 16, 20, 21, 26, 17, 9, 2, 42, 12, 36, 35, 34, 25, 1, 10, 19, 40, 

-20869.7

In [22]:
# set penalty as the averege score of random tour
gr48_env = tspEnv(gr48, 20000)
gr48_eval_env = tspEnv(gr48, 20000)

In [23]:
# Set up log path to save log files
log_path = os.path.join('Training', 'Logs')
save_path = os.path.join('Training', 'Saved Models')

# Use deterministic actions for evaluation
eval_callback = EvalCallback(gr48_eval_env, best_model_save_path = save_path,
                             eval_freq = 10000,
                             deterministic=True, render=False)
timesteps = 750000

In [24]:
model_DQN = DQN('MlpPolicy', gr48_env, verbose = 1, tensorboard_log=log_path)
model_DQN.learn(total_timesteps = timesteps, callback = eval_callback, reset_num_timesteps = True)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\DQN_4
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 95        |
|    ep_rew_mean      | -1.11e+06 |
|    exploration_rate | 0.995     |
| time/               |           |
|    episodes         | 4         |
|    fps              | 445       |
|    time_elapsed     | 0         |
|    total_timesteps  | 380       |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 9.97e+03  |
|    n_updates        | 69        |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 95        |
|    ep_rew_mean      | -1.11e+06 |
|    exploration_rate | 0.99      |
| time/               |           |
|    episodes         | 8         |
|    fps              | 375       |
|    time_elapsed     | 2         |
|    total

<stable_baselines3.dqn.dqn.DQN at 0x226aeb4b6b0>

In [25]:
best_model_path = os.path.join('Training', 'Saved Models', 'best_model_gr48')
best_model = DQN.load(best_model_path, env = gr48_env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [26]:
manual_eval(best_model,gr48_env)

[0, 13, 25, 28, 42, 17, 40, 22, 26, 36, 24, 18, 1, 9, 45, 39, 32, 12, 33, 47, 20, 43, 23, 44, 34, 35, 37, 11, 8, 29, 15, 10, 3, 19, 31, 7, 3, 21, 6, 4, 16, 14, 41, 5, 27, 30, 2, 46, 38]
49
-21563
---------------------------------
[0, 13, 25, 17, 36, 20, 8, 40, 22, 24, 1, 45, 42, 43, 18, 39, 26, 12, 28, 29, 9, 33, 10, 32, 11, 37, 15, 7, 47, 3, 34, 46, 6, 14, 16, 35, 11, 23, 31, 41, 5, 19, 44, 4, 27, 30, 2, 21, 38]
49
-21048
---------------------------------
[0, 13, 25, 17, 36, 20, 8, 40, 22, 24, 1, 45, 42, 18, 39, 47, 23, 34, 26, 32, 12, 9, 29, 4, 28, 37, 33, 6, 10, 15, 3, 11, 23, 35, 14, 31, 19, 16, 43, 41, 44, 5, 27, 30, 2, 7, 21, 46, 38]
49
-21523
---------------------------------
[0, 13, 25, 26, 17, 36, 12, 24, 39, 45, 11, 32, 22, 1, 20, 47, 43, 15, 8, 29, 3, 7, 33, 10, 46, 37, 21, 6, 27, 14, 16, 34, 9, 28, 18, 35, 2, 23, 31, 4, 41, 5, 19, 44, 30, 42, 40, 38]
48
18043
---------------------------------
[0, 13, 25, 17, 36, 20, 8, 40, 22, 24, 1, 45, 42, 18, 39, 47, 26, 32, 12, 34, 9, 2

In [27]:
%tensorboard --logdir='F:\UCL\Year_3\FYP\src\Training\Logs\DQN_gr48'

# Problem kroA100

## Problem Attributes display

In [44]:
kroA100 = load_tsp("kroA100.tsp")
print(kroA100)
kroA100_opt = load_tsp("kroA100.opt.tour")
print(kroA100_opt)

NAME: kroA100
COMMENT: 100-city problem A (Krolak/Felts/Nelson)
TYPE: TSP
DIMENSION: 100
EDGE_WEIGHT_TYPE: EUC_2D
NODE_COORD_SECTION:
1 1380 939
2 2848 96
3 3510 1671
4 457 334
5 3888 666
6 984 965
7 2721 1482
8 1286 525
9 2716 1432
10 738 1325
11 1251 1832
12 2728 1698
13 3815 169
14 3683 1533
15 1247 1945
16 123 862
17 1234 1946
18 252 1240
19 611 673
20 2576 1676
21 928 1700
22 53 857
23 1807 1711
24 274 1420
25 2574 946
26 178 24
27 2678 1825
28 1795 962
29 3384 1498
30 3520 1079
31 1256 61
32 1424 1728
33 3913 192
34 3085 1528
35 2573 1969
36 463 1670
37 3875 598
38 298 1513
39 3479 821
40 2542 236
41 3955 1743
42 1323 280
43 3447 1830
44 2936 337
45 1621 1830
46 3373 1646
47 1393 1368
48 3874 1318
49 938 955
50 3022 474
51 2482 1183
52 3854 923
53 376 825
54 2519 135
55 2945 1622
56 953 268
57 2628 1479
58 2097 981
59 890 1846
60 2139 1806
61 2421 1007
62 2290 1810
63 1115 1052
64 2588 302
65 327 265
66 241 341
67 1917 687
68 2991 792
69 2573 599
70 19 674
71 3911 1673
72 872 155

## Training

In [49]:
kroA100_env = tspEnv(kroA100, 22000)

In [50]:
# Set up log path to save log files
log_path = os.path.join('Training', 'Logs')
save_path = os.path.join('Training', 'Saved Models')

# Setup evaluation environment for locating best model
kroA100_eval_env = tspEnv(kroA100, 22000)

# Use deterministic actions for evaluation
eval_callback = EvalCallback(kroA100_eval_env, best_model_save_path = save_path,
                             eval_freq = 10000,
                             deterministic=True, render=False)
timesteps = 750000

In [None]:
### DQN

In [51]:
model_DQN = DQN('MlpPolicy', kroA100_env, verbose = 1, tensorboard_log=log_path)
model_DQN.learn(total_timesteps = timesteps, callback = eval_callback)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\DQN_7
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 199       |
|    ep_rew_mean      | -2.56e+06 |
|    exploration_rate | 0.99      |
| time/               |           |
|    episodes         | 4         |
|    fps              | 382       |
|    time_elapsed     | 2         |
|    total_timesteps  | 796       |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 9.08e+03  |
|    n_updates        | 173       |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 199       |
|    ep_rew_mean      | -2.61e+06 |
|    exploration_rate | 0.98      |
| time/               |           |
|    episodes         | 8         |
|    fps              | 368       |
|    time_elapsed     | 4         |
|    total

<stable_baselines3.dqn.dqn.DQN at 0x1d7824987d0>

In [53]:
best_model_path = os.path.join('Training', 'Saved Models', 'best_model_karA100')
best_model = DQN.load(best_model_path, env = kroA100_env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [58]:
manual_eval(best_model, kroA100_env)

[0, 72, 51, 34, 10, 7, 39, 57, 12, 0, 38, 76, 82, 91, 74, 84, 90, 85, 92, 8, 67, 61, 91, 44, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 92, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 63, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 3, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 55, 59, 53, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 93]
101
-1621783.0
---------------------------------
[0, 72, 51, 34, 57, 7, 39, 36, 20, 12, 76, 43, 34, 97, 14, 79, 31, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 20, 43, 57, 57, 57, 57, 57, 96, 26, 82, 18, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31]
101
-1812708.0
---------------------------------
[0, 72, 51, 34, 57, 7, 39, 40, 91, 84, 38, 43, 53, 31, 82, 59, 76, 66, 47, 73, 44, 84, 84, 84, 84, 

## Tensorboard log analysis

In [59]:
%tensorboard --logdir='F:\UCL\Year_3\FYP\src\Training\Logs\DQN_karA100'