In [12]:
pip install gymnasium



In [16]:
"""Environment for Travelling Salesman Problem."""

from typing import Dict, List, Optional, Tuple

import gymnasium as gym
import numpy as np


class TSP(gym.Env):
    """Travelling Salesman Problem (TSP) RL environment for persistent monitoring.

    Target coordinates are drawn once at construction time and the pairwise
    distance matrix is precomputed from them. Every episode starts at target 0
    and lasts ``num_targets`` steps; each step moves the agent to the chosen
    target. The reward is the negative distance travelled, or a fixed large
    penalty when the chosen target is already listed in ``visited_targets``.

    ``reset`` does not mark the start target (index 0) as visited, so choosing
    it once during an episode is not penalised.

    Observation layout (float32, length ``1 + 3 * num_targets``)::

        [current target index,
         distances from the current target to every target,
         flattened (x, y) coordinates of all targets]
    """

    # Reward returned when the agent moves to a target it has already visited.
    _REVISIT_PENALTY: float = -10000.0

    def __init__(
        self, num_targets: int, max_area: int = 30, seed: Optional[int] = None
    ) -> None:
        """Initialize the TSP environment.

        Args:
            num_targets (int): Number of targets the agent needs to visit.
            max_area (int): Side length of the square in which targets are placed. Defaults to 30.
            seed (int, optional): Seed for NumPy's global RNG, used to draw the target
                locations. Defaults to None (RNG left untouched).

        Raises:
            ValueError: If ``num_targets`` is smaller than 1.
        """
        super().__init__()
        if num_targets < 1:
            raise ValueError(f"num_targets must be >= 1, got {num_targets}")
        if seed is not None:
            np.random.seed(seed=seed)

        self.steps: int = 0
        self.num_targets: int = num_targets

        self.max_steps: int = num_targets
        self.max_area: int = max_area

        self.locations: np.ndarray = self._generate_points(self.num_targets)
        self.distances: np.ndarray = self._calculate_distances(self.locations)

        # Episode state; (re)initialised by reset(). Set here as well so the
        # attributes always exist, even if step() is called before reset().
        self.loc: int = 0
        self.visited_targets: List = []
        self.dist: np.ndarray = self.distances[self.loc]

        # Observation Space : {current loc (loc), dist_array (distances), coordinates (locations)}
        self.obs_low = np.concatenate(
            [
                np.array([0], dtype=np.float32),
                np.zeros(self.num_targets, dtype=np.float32),
                np.zeros(2 * self.num_targets, dtype=np.float32),
            ]
        )

        self.obs_high = np.concatenate(
            [
                np.array([self.num_targets], dtype=np.float32),
                # Loose bound: no two points in the square are further apart than this.
                2 * self.max_area * np.ones(self.num_targets, dtype=np.float32),
                self.max_area * np.ones(2 * self.num_targets, dtype=np.float32),
            ]
        )

        # Action Space : {next_target}
        self.observation_space = gym.spaces.Box(low=self.obs_low, high=self.obs_high)
        self.action_space = gym.spaces.Discrete(self.num_targets)

    def _get_obs(self) -> np.ndarray:
        """Build the flat float32 observation for the current location."""
        return np.concatenate(
            (
                np.array([self.loc]),
                np.asarray(self.dist),
                np.asarray(self.locations).reshape(-1),
            ),
            dtype=np.float32,
        )

    def reset(
        self,
        *,
        seed: Optional[int] = None,
        options: Optional[dict] = None,
    ) -> Tuple[np.ndarray, dict]:
        """Reset the environment to the initial state.

        The target locations are kept; only the episode state (step counter,
        current location, visited list) is reset.

        Args:
            seed (Optional[int], optional): Seed forwarded to ``gym.Env.reset``. Defaults to None.
            options (Optional[dict], optional): Additional reset options (unused). Defaults to None.

        Returns:
            Tuple[np.ndarray, dict]: The initial observation and an empty info dictionary.
        """
        # Forward the seed to the base class so the `seed` argument is honoured.
        super().reset(seed=seed)

        self.steps = 0
        self.loc = 0
        self.visited_targets = []
        self.dist = self.distances[self.loc]

        return self._get_obs(), {}

    def step(
        self, action: int
    ) -> Tuple[np.ndarray, float, bool, bool, dict]:
        """Take an action (move to the next target).

        Args:
            action (int): The index of the next target to move to.

        Returns:
            Tuple[np.ndarray, float, bool, bool, dict]:
                - The new state of the environment.
                - The reward for the action.
                - A boolean indicating whether the episode has terminated.
                - A boolean indicating if the episode is truncated (always False).
                - An empty info dictionary.

        Raises:
            IndexError: If ``action`` is not a valid target index.
        """
        next_loc = int(action)
        # Reject negative indices explicitly: NumPy would silently wrap them around.
        if not 0 <= next_loc < self.num_targets:
            raise IndexError(
                f"action {action} is out of range for {self.num_targets} targets"
            )

        self.steps += 1

        # The reward must be computed before the target is recorded as visited.
        reward = self._get_rewards(self.loc, next_loc)
        self.visited_targets.append(next_loc)

        self.loc, self.dist = next_loc, self.distances[next_loc]

        # `>=` keeps the episode flagged as finished if the caller over-steps.
        terminated = bool(self.steps >= self.max_steps)
        truncated = False

        return (self._get_obs(), reward, terminated, truncated, {})

    def _generate_points(self, num_points: int) -> np.ndarray:
        """Generate random 2D points representing target locations.

        Args:
            num_points (int): Number of points to generate.

        Returns:
            np.ndarray: Array of shape ``(num_points, 2)`` with the coordinates of each target.
        """
        points = []
        seen = set()
        # Draw distinct points uniformly inside the max_area x max_area square.
        while len(points) < num_points:
            x = np.random.random() * self.max_area
            y = np.random.random() * self.max_area
            if (x, y) not in seen:
                seen.add((x, y))
                points.append([x, y])

        return np.array(points)

    def _calculate_distances(self, locations: List) -> np.ndarray:
        """Calculate the distance matrix between all target locations.

        Args:
            locations (List): Sequence (or array) of target coordinates, one row per target.

        Returns:
            np.ndarray: ``(n, n)`` matrix of pairwise Euclidean distances between targets.
        """
        pts = np.asarray(locations, dtype=float)
        if pts.size == 0:
            return np.zeros((0, 0))

        # Broadcast to an (n, n, dims) array of coordinate differences.
        diff = pts[:, np.newaxis, :] - pts[np.newaxis, :, :]
        return np.linalg.norm(diff, axis=-1)

    def _get_rewards(self, past_loc: int, next_loc: int) -> float:
        """Calculate the reward for moving from ``past_loc`` to ``next_loc``.

        Args:
            past_loc (int): Previous location of the agent.
            next_loc (int): Next location of the agent.

        Returns:
            float: Negative travel distance between the two locations, or a large
                negative penalty if ``next_loc`` has already been visited this episode.
        """
        if next_loc in self.visited_targets:
            return self._REVISIT_PENALTY
        return float(-self.distances[past_loc][next_loc])


if __name__ == "__main__":
    # Baseline: roll out a uniformly random policy and report the mean return.
    num_targets = 50

    env = TSP(num_targets)
    ep_rets = []

    for ep in range(100):
        ret = 0
        # reset() returns (observation, info); unpack instead of keeping the tuple.
        obs, _info = env.reset()
        for _ in range(100):
            action = (
                env.action_space.sample()
            )  # You need to replace this with your algorithm that predicts the action.

            obs, reward, terminated, truncated, _info = env.step(action)
            ret += reward

            if terminated or truncated:
                break

        ep_rets.append(ret)
        print(f"Episode {ep} : {ret}")

    print(np.mean(ep_rets))

Episode 0 : -130627.18664893821
Episode 1 : -180514.35306187256
Episode 2 : -170424.04792070037
Episode 3 : -180499.15038853214
Episode 4 : -160516.74462474248
Episode 5 : -200486.27271322315
Episode 6 : -150485.29296108687
Episode 7 : -140593.1220440809
Episode 8 : -210431.3748213466
Episode 9 : -140608.58114143155
Episode 10 : -170521.38383412457
Episode 11 : -180523.60975241387
Episode 12 : -160570.50151922947
Episode 13 : -190521.42235727346
Episode 14 : -210460.71191647908
Episode 15 : -180534.45793769782
Episode 16 : -180507.9919525675
Episode 17 : -210418.90442410234
Episode 18 : -160543.68273922364
Episode 19 : -150568.98963839532
Episode 20 : -180493.50656491696
Episode 21 : -190508.64347938471
Episode 22 : -170535.66117748973
Episode 23 : -190410.551913164
Episode 24 : -170506.75282639923
Episode 25 : -180511.83450659947
Episode 26 : -210393.81399415366
Episode 27 : -150504.401913685
Episode 28 : -150531.26094496765
Episode 29 : -160521.70676653576
Episode 30 : -170484.693374

## Dynamic Programming (implemented as tabular Q-learning)

In [18]:
import numpy as np

class DPSolver:
    """Tabular Q-learning solver for the TSP environment.

    The Q-table is indexed by ``(current target, next target)``. Actions are
    always chosen greedily from the table.
    """

    def __init__(self, env: "TSP", learning_rate: float = 0.1, discount_factor: float = 0.9):
        """Create the solver.

        Args:
            env: Environment exposing ``num_targets``, ``reset()`` and ``step(action)``.
            learning_rate: Step size applied to each temporal-difference update.
            discount_factor: Discount applied to the bootstrapped next-state value.
        """
        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.q_table = np.zeros((env.num_targets, env.num_targets))

    def train(self, num_episodes: int = 1000):
        """Run ``num_episodes`` greedy episodes, updating the Q-table after every step.

        Args:
            num_episodes: Number of episodes to run.
        """
        for episode in range(num_episodes):
            state, _ = self.env.reset()
            # The first entry of the observation is the current target index.
            current_target = int(state[0])
            done = False

            while not done:
                action = int(np.argmax(self.q_table[current_target]))
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                next_target = int(next_state[0])

                # Q-learning update. A terminal transition has no successor
                # value, so the target must not bootstrap from the next state.
                if terminated:
                    future_value = 0.0
                else:
                    future_value = self.discount_factor * np.max(self.q_table[next_target])
                td_error = reward + future_value - self.q_table[current_target, action]
                self.q_table[current_target, action] += self.learning_rate * td_error

                current_target = next_target
                done = terminated or truncated

            if episode % 100 == 0:
                print(f"Episode {episode} completed")

    def get_optimal_policy(self):
        """Return the greedy action (next target) for every current target.

        Returns:
            np.ndarray: Array of length ``num_targets`` with the argmax action per row.
        """
        return np.argmax(self.q_table, axis=1)

if __name__ == "__main__":
    # Train the tabular solver on a fresh environment, then roll out its greedy policy once.
    num_targets = 50
    env = TSP(num_targets)
    solver = DPSolver(env)

    print("Executing DP")
    solver.train()

    optimal_policy = solver.get_optimal_policy()
    print("Optimal policy:", optimal_policy)

    # Evaluate the policy
    state, _ = env.reset()
    total_reward = 0
    for _ in range(100):
        # Convert the NumPy scalar to a plain int before handing it to the env.
        action = int(optimal_policy[int(state[0])])
        state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        # Stop on either end-of-episode signal, not only on termination.
        if terminated or truncated:
            break

    print(f"Total reward using optimal policy: {total_reward}")

Executing DP solver
Episode 0 completed
Episode 100 completed
Episode 200 completed
Episode 300 completed
Episode 400 completed
Episode 500 completed
Episode 600 completed
Episode 700 completed
Episode 800 completed
Episode 900 completed
Optimal policy: [47  0 23 21  5 23 15 39 22  1 46 31 45 14  8 25 41 49  8 46 33 27 34  9
 29  9 49 46 12 25 36 46 25 20 27 48 16 27  2 34 35  4  8 21 19 21 43 18
 25 37]
Total reward using optimal policy: -410107.4259394547


## Monte-Carlo

In [19]:
import numpy as np
from collections import defaultdict

class MCSolver:
    """Every-visit Monte-Carlo control with an epsilon-greedy policy.

    ``q_table`` maps a state (current target index) to an array of action
    values; ``returns`` keeps every sampled return per ``(state, action)`` pair.
    """

    def __init__(self, env: "TSP", epsilon: float = 0.1, discount_factor: float = 0.9):
        """Create the solver.

        Args:
            env: Environment exposing ``num_targets``, ``action_space``, ``reset()``
                and ``step(action)``.
            epsilon: Probability of picking a random action instead of the greedy one.
            discount_factor: Discount applied when accumulating returns.
        """
        self.env = env
        self.epsilon = epsilon
        self.discount_factor = discount_factor
        self.q_table = defaultdict(lambda: np.zeros(env.num_targets))
        self.returns = defaultdict(list)

    def choose_action(self, state):
        """Pick an action epsilon-greedily with respect to the current Q-table.

        Args:
            state: Current target index.

        Returns:
            int: A random action with probability ``epsilon``, otherwise the greedy action.
        """
        if np.random.random() < self.epsilon:
            return int(self.env.action_space.sample())
        return int(np.argmax(self.q_table[state]))

    def train(self, num_episodes: int = 1000):
        """Generate ``num_episodes`` episodes and update the Q-values from their returns.

        Args:
            num_episodes: Number of episodes to sample.
        """
        for episode in range(num_episodes):
            episode_history = []
            state, _ = self.env.reset()
            # The first entry of the observation is the current target index.
            state = int(state[0])
            done = False

            while not done:
                action = self.choose_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                episode_history.append((state, action, reward))
                state = int(next_state[0])
                # A truncated episode must also end the rollout; otherwise it never stops.
                done = terminated or truncated

            # Walk the episode backwards, accumulating the discounted return G.
            G = 0
            for state, action, reward in reversed(episode_history):
                G = reward + self.discount_factor * G
                samples = self.returns[(state, action)]
                samples.append(G)
                # Incremental running mean: same average as np.mean(samples)
                # without re-scanning the whole list on every update.
                action_values = self.q_table[state]
                action_values[action] += (G - action_values[action]) / len(samples)

            if episode % 100 == 0:
                print(f"Episode {episode} completed")

    def get_optimal_policy(self):
        """Return the greedy action for every state present in the Q-table.

        Returns:
            dict: Mapping ``state -> argmax action``.
        """
        return {state: np.argmax(actions) for state, actions in self.q_table.items()}

if __name__ == "__main__":
    # Train the Monte-Carlo solver on a fresh environment, then roll out its greedy policy once.
    num_targets = 50
    env = TSP(num_targets)
    solver = MCSolver(env)

    print("Executing MC")
    solver.train()

    optimal_policy = solver.get_optimal_policy()
    print("Optimal policy:", optimal_policy)

    # Evaluate the policy
    state, _ = env.reset()
    total_reward = 0
    for _ in range(100):
        current_target = int(state[0])
        # Only draw a random action when the policy has no entry for this state;
        # a `.get(key, sample())` default would sample on every step regardless.
        if current_target in optimal_policy:
            action = int(optimal_policy[current_target])
        else:
            action = env.action_space.sample()
        state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        # Stop on either end-of-episode signal, not only on termination.
        if terminated or truncated:
            break

    print(f"Total reward using optimal policy: {total_reward}")

Executing MC
Episode 0 completed
Episode 100 completed
Episode 200 completed
Episode 300 completed
Episode 400 completed
Episode 500 completed
Episode 600 completed
Episode 700 completed
Episode 800 completed
Episode 900 completed
Optimal policy: {4: 12, 8: 1, 0: 47, 29: 20, 48: 5, 47: 18, 1: 22, 42: 14, 37: 33, 41: 24, 23: 36, 7: 46, 2: 19, 20: 36, 36: 8, 32: 44, 3: 49, 45: 13, 39: 45, 25: 35, 40: 7, 5: 39, 31: 2, 15: 40, 6: 41, 35: 11, 38: 48, 34: 6, 19: 15, 27: 33, 28: 37, 30: 5, 9: 45, 21: 32, 10: 9, 11: 31, 17: 25, 14: 18, 12: 10, 46: 4, 24: 42, 13: 23, 49: 2, 16: 4, 22: 26, 18: 28, 26: 17, 44: 4, 33: 3, 43: 7}
Total reward using optimal policy: -200491.4952307807
