In [34]:
import os
import tempfile
import glob
import numpy as np
import tsplib95 as tsp

import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box

from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback

%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [2]:
def load_tsp(problem_file):
    problem_path = os.path.join(problem_file)
    problem = tsp.load(problem_path)
    return problem

In [3]:
gr24 = load_tsp("gr24.tsp")
#print(gr24)
gr24_opt = load_tsp("gr24.opt.tour")
#print(gr24_opt)

In [4]:
class tspEnv(Env):
    def __init__(self, problem):
        self.problem = problem
        self.action_space = Discrete(self.problem.dimension)
        self.observation_space = Box(0, np.inf, (self.problem.dimension,))
        self.tour = []
        self.max_length = 2*self.problem.dimension
        self.start = 0
        self.large_reward = 10**4
        self.w_matrix = self._get_w_matrix()
        
        
    def step(self, action):
         # Get current state
        state = self._get_state()
        new_obs = self._get_obs(action)

        # Get reward for such a move
        reward = self._get_reward(state,action)

        # Append reached node to tour
        self.tour.append(int(action))
        self._update_matrix(action, self.large_reward)
        
        tour_sort = self.tour.copy()
        tour_sort.sort()
        
        done = False
        force_stop = False
        if (action == self.start) and self._is_subset(list(range(self.problem.dimension)), self.tour):
            done = True
            reward += self.large_reward
            if len(self.tour) == self.problem.dimension + 1:
                reward += self.large_reward
        
        if len(self.tour) == self.max_length:
            force_stop = True

        info = {"tour": self.tour}

        return new_obs, reward, done, force_stop, info

    def next_rand_action(self):
        if len(self.tour) == (self.problem.dimension):
            return self.start
        else:
            while True:
                a = self.action_space.sample()
                if (((a not in self.tour) and (a != self.start)) and (a != self._get_state())):
                    break
            return a
        
    def render(self):
        pass
        
    def reset(self, seed = None, option = None):
        super().reset(seed=seed)
        self.tour = []
        self.start = 0
        self.tour.append(self.start)
        self.w_matrix = self._get_w_matrix()
        self._update_matrix(self.start, self.large_reward/10)
        info = {}
        return self._get_obs(self.start), info

    def _get_state(self):
        return self.tour[-1]

    def _get_reward(self, state, new_state):
        return -self.w_matrix[state][new_state]

    def _get_obs(self, action):
        return np.array(self.w_matrix[action])

    def _get_w_matrix(self):
        data = []
        weight = []
        for i in self.problem.edge_weights:
            for j in i:
                data.append(j)

        # convert lower triangle matrix to square matrix
        if self.problem.edge_weight_format == "LOWER_DIAG_ROW":
            for x in range(self.problem.dimension):   # format lower triangle matrix
                node = []
                w = data.pop(0)
                while w != 0:
                    node.append(w)
                    w = data.pop(0)
                while len(node) != self.problem.dimension:
                    node.append(0)
                weight.append(node)
            matrix = np.triu(np.array(weight).T,1) + weight   #convert to square matrix
            
        matrix[matrix == 0] = self.large_reward
        return matrix

    def _update_matrix(self, new_state, reward):
        self.w_matrix[:, new_state] = reward

    def _is_subset(self, sub_list, list):
        if set(sub_list).intersection(set(list)) == set(sub_list):
            return True
        else:
            return False
        

In [5]:
env = tspEnv(gr24)

In [6]:
episode = 5
for episode in range(episode):
    obs = env.reset()
    done = False
    score = 0

    while not done:
        action = env.next_rand_action()
        obs, reward, done, force_stop, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score-10**5))
    print(env.tour)
env.close

Episode:0 Score:-83912
[0, 14, 17, 9, 18, 21, 16, 11, 20, 2, 4, 19, 7, 13, 6, 3, 23, 22, 1, 8, 12, 10, 15, 5, 0]
Episode:1 Score:-84268
[0, 11, 9, 23, 20, 2, 19, 3, 12, 15, 14, 5, 7, 4, 1, 16, 21, 6, 17, 10, 8, 22, 18, 13, 0]
Episode:2 Score:-84306
[0, 21, 11, 22, 3, 4, 5, 8, 20, 10, 14, 1, 17, 9, 15, 23, 19, 13, 6, 7, 18, 12, 2, 16, 0]
Episode:3 Score:-84658
[0, 13, 2, 1, 20, 5, 16, 4, 23, 14, 9, 10, 17, 21, 15, 11, 22, 7, 8, 18, 12, 6, 3, 19, 0]
Episode:4 Score:-84264
[0, 12, 6, 20, 17, 5, 16, 3, 18, 19, 8, 2, 10, 4, 22, 11, 15, 9, 14, 23, 7, 13, 21, 1, 0]


<bound method Env.close of <__main__.tspEnv object at 0x00000277CBD7A720>>

In [6]:
env.reset()

(array([1000,  257,  187,   91,  150,   80,  130,  134,  243,  185,  214,
          70,  272,  219,  293,   54,  211,  290,  268,  261,  175,  250,
         192,  121]),
 {})

In [7]:
log_path = os.path.join('Training', 'Logs')
save_path = os.path.join('Training', 'Saved Models')
# Separate evaluation env
eval_env = tspEnv(gr24)
# Use deterministic actions for evaluation
eval_callback = EvalCallback(eval_env, best_model_save_path = save_path,
                             eval_freq = 10000,
                             deterministic=True, render=False)
timesteps = 1000000

In [9]:
model1 = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)
model1.learn(total_timesteps = timesteps)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\PPO_1
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 47        |
|    ep_rew_mean     | -2.63e+05 |
| time/              |           |
|    fps             | 375       |
|    iterations      | 1         |
|    time_elapsed    | 5         |
|    total_timesteps | 2048      |
----------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 47            |
|    ep_rew_mean          | -2.65e+05     |
| time/                   |               |
|    fps                  | 312           |
|    iterations           | 2             |
|    time_elapsed         | 13            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 1.7276034e-07 |
|    clip_fraction        | 0           

<stable_baselines3.ppo.ppo.PPO at 0x277cbdeb8f0>

In [10]:
model2 = A2C('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)
model2.learn(total_timesteps = timesteps)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\A2C_1
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 47        |
|    ep_rew_mean        | -2.68e+05 |
| time/                 |           |
|    fps                | 188       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -3.03     |
|    explained_variance | -5.02e-05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | -4.51e+04 |
|    value_loss         | 2.68e+08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 47        |
|    ep_rew_mean        | -2.74e+05 |
| time/                 |           |
|    fps                | 187       |


<stable_baselines3.a2c.a2c.A2C at 0x277db513c20>

In [None]:
model3 = DQN('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)
model3.learn(total_timesteps = timesteps, callback = eval_callback)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\DQN_2
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 47        |
|    ep_rew_mean      | -2.59e+05 |
|    exploration_rate | 0.998     |
| time/               |           |
|    episodes         | 4         |
|    fps              | 445       |
|    time_elapsed     | 0         |
|    total_timesteps  | 188       |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 4.99e+03  |
|    n_updates        | 21        |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 47        |
|    ep_rew_mean      | -2.64e+05 |
|    exploration_rate | 0.996     |
| time/               |           |
|    episodes         | 8         |
|    fps              | 548       |
|    time_elapsed     | 0         |
|    total



Eval num_timesteps=10000, episode_reward=-332164.00 +/- 0.00
Episode length: 47.00 +/- 0.00
-----------------------------------
| eval/               |           |
|    mean_ep_length   | 47        |
|    mean_reward      | -3.32e+05 |
| rollout/            |           |
|    exploration_rate | 0.905     |
| time/               |           |
|    total_timesteps  | 10000     |
| train/              |           |
|    learning_rate    | 0.0001    |
|    loss             | 2.85e+03  |
|    n_updates        | 2474      |
-----------------------------------
New best mean reward!
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 47        |
|    ep_rew_mean      | -2.59e+05 |
|    exploration_rate | 0.904     |
| time/               |           |
|    episodes         | 216       |
|    fps              | 739       |
|    time_elapsed     | 13        |
|    total_timesteps  | 10149     |
| train/              |           |
|    learning_rate    

In [12]:
evaluate_policy(model1, env, n_eval_episodes = 10)



(-420474.0, 0.0)

In [13]:
evaluate_policy(model2, env, n_eval_episodes = 10)

(-321641.0, 0.0)

In [14]:
evaluate_policy(model3, env, n_eval_episodes = 10)

(-14355.0, 0.0)

In [15]:
for x in range(10):
    obs, info = env.reset()
    result = 0
    for i in range(100):
        action, _states = model1.predict(obs)
        obs, reward, done, force_stop, info = env.step(action)
        result += reward
        
        if done or force_stop:
            break
    
    print(env.tour)
    print(len(env.tour))
    print(result)
    print("---------------------------------")

[0, 17, 21, 20, 14, 23, 6, 4, 11, 14, 3, 20, 11, 16, 13, 16, 3, 17, 14, 20, 10, 17, 7, 16, 16, 1, 8, 2, 7, 0, 5, 22, 3, 4, 9, 5, 9, 5, 14, 19, 14, 11, 11, 22, 19, 16, 22, 20]
48
-263632
-263632
---------------------------------
[0, 1, 19, 9, 11, 5, 15, 14, 0, 18, 19, 5, 17, 16, 3, 16, 10, 13, 7, 14, 14, 12, 19, 12, 2, 11, 15, 21, 20, 14, 6, 22, 13, 10, 8, 8, 2, 15, 19, 20, 2, 23, 15, 3, 6, 20, 10, 15]
48
-244653
-244653
---------------------------------
[0, 16, 13, 7, 4, 7, 11, 20, 22, 10, 8, 0, 0, 2, 22, 8, 23, 1, 2, 22, 15, 8, 3, 7, 5, 19, 21, 5, 15, 17, 13, 15, 10, 4, 7, 7, 7, 19, 12, 9, 23, 9, 7, 22, 20, 12, 21, 14]
48
-254219
-254219
---------------------------------
[0, 12, 19, 16, 5, 20, 0, 23, 22, 8, 0, 10, 5, 3, 2, 8, 14, 3, 21, 8, 9, 12, 22, 11, 21, 7, 8, 14, 16, 1, 2, 17, 7, 16, 15, 9, 17, 1, 20, 3, 19, 8, 8, 4, 19, 5, 12, 1]
48
-264003
-264003
---------------------------------
[0, 15, 18, 18, 9, 7, 14, 19, 8, 18, 12, 1, 18, 18, 23, 3, 21, 9, 16, 2, 5, 19, 23, 4, 20, 11, 16,

In [16]:
for x in range(10):
    obs, info = env.reset()
    result = 0
    for i in range(100):
        action, _states = model2.predict(obs)
        obs, reward, done, force_stop, info = env.step(action)
        result += reward
        
        if done or force_stop:
            break
    
    print(env.tour)
    print(len(env.tour))
    print(result)
    print("---------------------------------")

[0, 3, 5, 16, 21, 1, 18, 4, 20, 12, 6, 13, 23, 15, 18, 8, 22, 18, 2, 8, 7, 8, 8, 22, 18, 11, 10, 3, 19, 0, 0, 6, 8, 8, 2, 8, 8, 8, 9, 8, 7, 8, 8, 0, 8, 22, 18, 3]
48
-254035
-254035
---------------------------------
[0, 3, 11, 1, 16, 8, 14, 5, 1, 22, 0, 9, 20, 6, 19, 4, 14, 15, 8, 8, 7, 8, 21, 23, 1, 17, 0, 4, 8, 1, 8, 1, 1, 4, 1, 1, 4, 8, 4, 8, 3, 4, 0, 16, 0, 4, 1, 8]
48
-284061
-284061
---------------------------------
[0, 3, 1, 16, 14, 1, 11, 5, 9, 8, 1, 1, 7, 12, 13, 23, 16, 20, 4, 10, 8, 18, 21, 19, 21, 8, 15, 21, 3, 21, 21, 6, 1, 7, 20, 1, 7, 7, 20, 7, 20, 3, 3, 1, 7, 7, 1, 1]
48
-272829
-272829
---------------------------------
[0, 17, 1, 16, 20, 3, 2, 5, 1, 1, 1, 1, 13, 23, 21, 19, 11, 14, 12, 18, 4, 5, 1, 1, 9, 6, 1, 1, 1, 1, 1, 15, 0, 8, 1, 22, 3, 1, 7, 7, 15, 3, 1, 1, 1, 1, 1, 1]
48
-244426
-244426
---------------------------------
[0, 1, 22, 8, 2, 20, 8, 1, 11, 10, 5, 7, 3, 4, 14, 16, 6, 19, 15, 8, 2, 3, 8, 21, 17, 0, 13, 14, 8, 23, 4, 7, 4, 4, 3, 4, 4, 16, 16, 21, 15, 4, 

In [17]:
for x in range(10):
    obs, info = env.reset()
    result = 0
    for i in range(100):
        action, _states = model3.predict(obs)
        obs, reward, done, force_stop, info = env.step(action)
        result += reward
        
        if done or force_stop:
            break
    
    print(env.tour)
    print(len(env.tour))
    print(result)
    print("---------------------------------")

[0, 23, 21, 5, 18, 20, 9, 4, 6, 14, 3, 15, 17, 12, 10, 19, 20, 1, 13, 22, 16, 7, 11, 0, 0, 2, 8, 0]
28
-24639
-24639
---------------------------------
[0, 23, 21, 5, 18, 20, 9, 4, 6, 16, 15, 3, 17, 12, 10, 14, 19, 22, 13, 7, 1, 11, 0, 0, 2, 8, 0]
27
-14704
-14704
---------------------------------
[0, 23, 21, 5, 18, 20, 9, 4, 6, 14, 3, 12, 17, 10, 15, 19, 1, 13, 22, 16, 7, 11, 0, 0, 21, 2, 8, 0]
28
-24257
-24257
---------------------------------
[0, 23, 21, 5, 18, 20, 9, 4, 6, 14, 3, 12, 17, 10, 15, 19, 1, 13, 22, 16, 7, 11, 0, 0, 2, 8, 0]
27
-14355
-14355
---------------------------------
[0, 23, 21, 5, 18, 3, 9, 6, 20, 4, 12, 14, 17, 10, 15, 19, 1, 13, 22, 16, 3, 22, 7, 11, 0, 0, 20, 2, 8, 0]
30
-44229
-44229
---------------------------------
[0, 23, 21, 5, 18, 20, 9, 4, 6, 14, 3, 12, 17, 10, 15, 19, 1, 13, 22, 16, 7, 11, 0, 0, 2, 1, 8, 0]
28
-24278
-24278
---------------------------------
[0, 15, 6, 3, 9, 19, 14, 12, 10, 18, 1, 23, 21, 20, 5, 13, 17, 4, 22, 16, 7, 11, 0, 0, 2, 8, 0]


In [18]:
print(gr24_opt)

NAME: gr24.opt.tour
COMMENT: Optimal solution for gr24 (1272)
TYPE: TOUR
DIMENSION: 24
TOUR_SECTION:
16 11 3 7 6 24 8 21 5 10 17 22 18 19 15 2 20 14 13 9 23 4 12 1 -1
-1
EOF


In [37]:
tensor_path = tempfile.gettempdir() + '\\.tensorboard-info'
print(tensor_path)
# !taskkill /IM "tensorboard.exe" /F

C:\Users\im_ki\AppData\Local\Temp\.tensorboard-info


In [38]:
%tensorboard --logdir='F:\UCL\Year_3\FYP\src\Training\Logs\PPO_1'

In [39]:
%tensorboard --logdir='F:\UCL\Year_3\FYP\src\Training\Logs\A2C_1'

In [40]:
%tensorboard --logdir='F:\UCL\Year_3\FYP\src\Training\Logs\DQN_2'

In [None]:
PPO_Path  = os.path.join('Training', 'Saved Models', 'PPO_Model')
A2C_Path  = os.path.join('Training', 'Saved Models', 'A2C_Model')
model1.save()
model2.save()

In [10]:
best_model_path = os.path.join('Training', 'Saved Models', 'best_model')

In [11]:
model4 = DQN.load(best_model_path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [12]:
evaluate_policy(model4, env, n_eval_episodes = 10)

(16409.0, 0.0)

In [17]:
best_r = 0 
for x in range(10000):
    obs, info = env.reset()
    result = 0
    for i in range(48):
        action, _states = model4.predict(obs)
        obs, reward, done, force_stop, info = env.step(action)
        result += reward
        
        if done or force_stop:
            break
    if result > best_r:
        best_r = result
        print(env.tour)
        print(len(env.tour))
        print(result)
        print("---------------------------------")

[0, 5, 20, 2, 10, 18, 3, 7, 4, 1, 19, 8, 16, 14, 23, 9, 13, 22, 12, 11, 15, 6, 17, 21, 0]
25
16409
---------------------------------
[0, 5, 20, 2, 10, 18, 11, 23, 14, 21, 7, 19, 4, 8, 12, 13, 22, 1, 16, 9, 3, 15, 6, 17, 0]
25
16411
---------------------------------
[0, 5, 20, 2, 10, 18, 3, 7, 4, 1, 19, 14, 8, 16, 23, 9, 13, 22, 12, 11, 15, 6, 17, 21, 0]
25
16452
---------------------------------
[0, 5, 20, 2, 10, 18, 11, 23, 4, 14, 21, 7, 19, 8, 12, 13, 22, 1, 16, 9, 3, 15, 6, 17, 0]
25
16528
---------------------------------
[0, 5, 20, 2, 10, 18, 3, 23, 7, 4, 19, 1, 8, 14, 16, 9, 13, 22, 12, 11, 15, 6, 17, 21, 0]
25
16638
---------------------------------
[0, 5, 20, 2, 10, 7, 23, 18, 14, 1, 19, 8, 3, 4, 16, 9, 13, 22, 12, 11, 15, 6, 17, 21, 0]
25
16951
---------------------------------
