In [3]:
import os
import numpy as np
import tsplib95 as tsp

import gym
from gym import Env
from gym.spaces import Discrete

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [4]:
def load_tsp(problem_file):
    script_dir = os.getcwd()
    problem_path = os.path.join(script_dir, problem_file)
    problem = tsp.load(problem_path)
    return problem

In [5]:
gr24 = load_tsp("gr24.tsp")
#print(gr24)
gr24_opt = load_tsp("gr24.opt.tour")
#print(gr24_opt)

In [76]:
class tspEnv(Env):
    def __init__(self, problem, start_node = 0):
        self.problem = problem
        self.action_space = Discrete(self.problem.dimension)
        self.observation_space = Discrete(self.problem.dimension)
        self.tour = []
        self.max_length = self.problem.dimension
        self.start = start_node
        self.w_matrix = self._get_w_matrix()
        
    def step(self, action):
         # Get current state
        state = self._get_state()
        new_state = action

        # Get reward for such a move
        reward = self._get_reward(state,new_state)

        self._update_matrix(new_state)

        # Append new_state to stops
        self.tour.append(new_state)
        done = len(self.tour) == self.max_length

        return new_state, reward ,done

    def next_rand_action(self):
        if len(self.tour) == (self.max_length - 1):
            return self.start
        else:
            while True:
                a = self.action_space.sample()
                if (((a not in self.tour) and (a != self.start)) and (a != self._get_state())):
                    break
            return a
        
    def render(self):
        pass
        
    def reset(self):
        self.tour = []
        self.tour.append(self.start)
        self.w_matrix = self._get_w_matrix()
        return self.start

    def _get_state(self):
        return self.tour[-1]

    def _get_reward(self, state, new_state):
        return -self.w_matrix[state][new_state]

    def _get_w_matrix(self):
        data = []
        weight = []
        for i in self.problem.edge_weights:
            for j in i:
                data.append(j)

        # convert lower triangle matrix to square matrix
        if self.problem.edge_weight_format == "LOWER_DIAG_ROW":
            for x in range(self.problem.dimension):   # format lower triangle matrix
                node = []
                w = data.pop(0)
                while w != 0:
                    node.append(w)
                    w = data.pop(0)
                while len(node) != self.problem.dimension:
                    node.append(0)
                weight.append(node)
            matrix = np.triu(np.array(weight).T,1) + weight   #convert to square matrix
        matrix[x == 0] = 10**5
        return matrix

    def _update_matrix(self, new_state):
        self.w_matrix[:, new_state] = 10**5

In [77]:
env = tspEnv(gr24)

In [78]:
episode = 5
for episode in range(episode):
    obs = env.reset()
    done = False
    score = 0

    while not done:
        action = env.next_rand_action()
        obs, reward, done = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close

Episode:0 Score:-3521
Episode:1 Score:-3339
Episode:2 Score:-3010
Episode:3 Score:-3108
Episode:4 Score:-3514


<bound method Env.close of <__main__.tspEnv object at 0x00000146C44E7B60>>