In [1]:
pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [2]:
"""Environment for Travelling Salesman Problem."""

from typing import Dict, List, Optional, Tuple

import gymnasium as gym
import numpy as np


class TSP(gym.Env):
    """Traveling Salesman Problem (TSP) RL environment for persistent monitoring.

    The agent navigates a set of targets based on precomputed distances. It aims to visit
    all targets in the least number of steps, with rewards determined by the distance traveled.
    """

    def __init__(self, num_targets: int, max_area: int = 30, seed: int = None) -> None:
        """Initialize the TSP environment.

        Args:
            num_targets (int): Number of targets the agent needs to visit.
            max_area (int): Max Square area where the targets are defined. Defaults to 30
            seed (int, optional): Random seed for reproducibility. Defaults to None.
        """
        super().__init__()
        if seed is not None:
            np.random.seed(seed=seed)

        self.steps: int = 0
        self.num_targets: int = num_targets

        self.max_steps: int = num_targets
        self.max_area: int = max_area

        self.locations: np.ndarray = self._generate_points(self.num_targets)
        self.distances: np.ndarray = self._calculate_distances(self.locations)

        # Observation Space : {current loc (loc), dist_array (distances), coordintates (locations)}
        self.obs_low = np.concatenate(
            [
                np.array([0], dtype=np.float32),
                np.zeros(self.num_targets, dtype=np.float32),
                np.zeros(2 * self.num_targets, dtype=np.float32),
            ]
        )

        self.obs_high = np.concatenate(
            [
                np.array([self.num_targets], dtype=np.float32),
                2 * self.max_area * np.ones(self.num_targets, dtype=np.float32),
                self.max_area * np.ones(2 * self.num_targets, dtype=np.float32),
            ]
        )

        # Action Space : {next_target}
        self.observation_space = gym.spaces.Box(low=self.obs_low, high=self.obs_high)
        self.action_space = gym.spaces.Discrete(self.num_targets)

    def reset(
        self,
        *,
        seed: Optional[int] = None,
        options: Optional[dict] = None,
    ) -> Tuple[np.ndarray, Dict[str, None]]:
        """Reset the environment to the initial state.

        Args:
            seed (Optional[int], optional): Seed to reset the environment. Defaults to None.
            options (Optional[dict], optional): Additional reset options. Defaults to None.

        Returns:
            Tuple[np.ndarray, Dict[str, None]]: The initial state of the environment and an empty info dictionary.
        """
        self.steps: int = 0

        self.loc: int = 0
        self.visited_targets: List = []
        self.dist: List = self.distances[self.loc]

        state = np.concatenate(
            (
                np.array([self.loc]),
                np.array(self.dist),
                np.array(self.locations).reshape(-1),
            ),
            dtype=np.float32,
        )
        return state, {}

    def step(
        self, action: int
    ) -> Tuple[np.ndarray, float, bool, bool, Dict[str, None]]:
        """Take an action (move to the next target).

        Args:
            action (int): The index of the next target to move to.

        Returns:
            Tuple[np.ndarray, float, bool, bool, Dict[str, None]]:
                - The new state of the environment.
                - The reward for the action.
                - A boolean indicating whether the episode has terminated.
                - A boolean indicating if the episode is truncated.
                - An empty info dictionary.
        """
        self.steps += 1
        past_loc = self.loc
        next_loc = action

        reward = self._get_rewards(past_loc, next_loc)
        self.visited_targets.append(next_loc)

        next_dist = self.distances[next_loc]
        terminated = bool(self.steps == self.max_steps)
        truncated = False

        next_state = np.concatenate(
            [
                np.array([next_loc]),
                next_dist,
                np.array(self.locations).reshape(-1),
            ],
            dtype=np.float32,
        )

        self.loc, self.dist = next_loc, next_dist
        return (next_state, reward, terminated, truncated, {})

    def _generate_points(self, num_points: int) -> np.ndarray:
        """Generate random 2D points representing target locations.

        Args:
            num_points (int): Number of points to generate.

        Returns:
            np.ndarray: Array of 2D coordinates for each target.
        """
        points = []
        # Generate n random 2D points within the 10x10 grid
        while len(points) < num_points:
            x = np.random.random() * self.max_area
            y = np.random.random() * self.max_area
            if [x, y] not in points:
                points.append([x, y])

        return np.array(points)

    def _calculate_distances(self, locations: List) -> float:
        """Calculate the distance matrix between all target locations.

        Args:
            locations (List): List of 2D target locations.

        Returns:
            np.ndarray: Matrix of pairwise distances between targets.
        """
        n = len(locations)

        distances = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                distances[i, j] = np.linalg.norm(locations[i] - locations[j])
        return distances

    def _get_rewards(self, past_loc: int, next_loc: int) -> float:
        """Calculate the reward based on the distance traveled, however if a target gets visited again then it incurs a high penalty.

        Args:
            past_loc (int): Previous location of the agent.
            next_loc (int): Next location of the agent.

        Returns:
            float: Reward based on the travel distance between past and next locations, or negative reward if repeats visit.
        """
        if next_loc not in self.visited_targets:
            reward = -self.distances[past_loc][next_loc]
        else:
            reward = -10000
        return reward


if __name__ == "__main__":
    num_targets = 50

    env = TSP(num_targets)
    obs = env.reset()
    ep_rets = []

    for ep in range(100):
        ret = 0
        obs = env.reset()
        for _ in range(100):
            action = (
                env.action_space.sample()
            )  # You need to replace this with your algorithm that predicts the action.

            obs_, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            ret += reward

            if done:
                break

        ep_rets.append(ret)
        print(f"Episode {ep} : {ret}")

    print(np.mean(ep_rets))

Episode 0 : -140512.33125923917
Episode 1 : -150502.2969613711
Episode 2 : -210431.6129667064
Episode 3 : -170520.74828515903
Episode 4 : -180419.16312399905
Episode 5 : -190440.46973971443
Episode 6 : -200401.51508388447
Episode 7 : -200420.0105397771
Episode 8 : -190499.20678100703
Episode 9 : -200417.70876545028
Episode 10 : -180550.81756837803
Episode 11 : -170454.6781822006
Episode 12 : -220389.0257112962
Episode 13 : -180441.30943675502
Episode 14 : -200429.96125434732
Episode 15 : -150516.2119171686
Episode 16 : -170396.17185623312
Episode 17 : -180440.24463606963
Episode 18 : -140578.02811394184
Episode 19 : -180449.6571804799
Episode 20 : -220416.31956299476
Episode 21 : -190403.63612231618
Episode 22 : -200412.73025388966
Episode 23 : -180542.59237384374
Episode 24 : -160506.50371200047
Episode 25 : -200445.09641695322
Episode 26 : -140558.67126569475
Episode 27 : -170461.68970970972
Episode 28 : -170415.4882131488
Episode 29 : -190392.2623358844
Episode 30 : -160466.06615245

## Dynamic Programming

In [3]:
import numpy as np
from itertools import permutations, combinations


class ValueIterationTSP:
    def __init__(self, env, gamma=0.9, threshold=1e-4):
        self.env = env  # TSP environment
        self.gamma = gamma  # Discount factor
        self.threshold = threshold  # Convergence threshold
        self.num_targets = env.num_targets

        # State space: (current location, visited targets)
        self.states = self._generate_all_states()  # Modified to include all possible states
        self.value_func = {state: 0 for state in self.states}  # Value function V(s)
        self.policy = {}  # Optimal policy π(s) -> a

    def _generate_all_states(self):
        """Generate all possible states for value iteration."""
        states = []
        # Iterate over all possible combinations of visited targets
        for r in range(1, self.num_targets + 1):
            for visited in combinations(range(self.num_targets), r):
                for current_loc in visited:
                    states.append((current_loc, frozenset(visited)))
        return states

    def value_iteration(self):
        while True:
            delta = 0
            # Iterate over all states
            for state in self.states:
                current_loc, visited = state
                unvisited = set(range(self.num_targets)) - visited

                if not unvisited:
                    # No more targets to visit (terminal state)
                    continue

                min_value = float('inf')
                best_action = None

                # Try all possible actions (next target to visit)
                for action in unvisited:
                    next_loc = action
                    next_visited = visited | {next_loc}
                    next_state = (next_loc, next_visited)

                    # Calculate the immediate reward (negative distance)
                    reward = -self.env.distances[current_loc][next_loc]
                    next_value = reward + self.gamma * self.value_func.get(next_state, 0)

                    # Minimize value for the current state
                    if next_value < min_value:
                        min_value = next_value
                        best_action = action

                # Update value function for the current state
                delta = max(delta, abs(self.value_func[state] - min_value))
                self.value_func[state] = min_value
                self.policy[state] = best_action

            if delta < self.threshold:
                break

    def get_policy(self):
        """Returns the optimal policy after value iteration."""
        return self.policy

    def simulate_optimal_policy(self):
        """Simulates the environment following the optimal policy to calculate the total reward."""
        current_loc = 0  # Start at the first location
        visited = frozenset([current_loc])
        total_reward = 0
        steps = 0

        # Simulate following the optimal policy until all targets are visited
        while len(visited) < self.num_targets:
            state = (current_loc, visited)
            action = self.policy[state]  # Get the optimal action from the policy
            next_loc = action
            reward = -self.env.distances[current_loc][next_loc]  # Immediate reward (negative distance)
            total_reward += reward

            # Update current state
            visited = visited | {next_loc}
            current_loc = next_loc
            steps += 1

        print(f"Total Reward following the optimal policy: {total_reward}")
        print(f"Total Steps: {steps}")
        return total_reward


if __name__ == "__main__":
    num_targets = 15  # Number of targets in the TSP
    env = TSP(num_targets)

    # Initialize the value iteration solver
    solver = ValueIterationTSP(env)

    # Perform value iteration to compute the optimal policy
    solver.value_iteration()

    # Output the optimal policy
    optimal_policy = solver.get_policy()

    print("Optimal Policy:")
    for state, action in optimal_policy.items():
        print(f"State {state} -> Optimal Action: {action}")

    # Simulate the environment following the optimal policy to get the total reward
    total_reward = solver.simulate_optimal_policy()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
State (10, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 14})) -> Optimal Action: 3
State (12, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 14})) -> Optimal Action: 11
State (14, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 14})) -> Optimal Action: 11
State (0, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 12
State (1, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 12
State (2, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 12
State (4, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 11
State (5, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 3
State (6, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 11
State (7, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 12
State (8, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 11
State (9, 

## Monte-Carlo

In [12]:
from typing import Dict, List, Optional, Tuple
import numpy as np
import random

# Monte Carlo Solver
class TSPMonteCarlo:
    def __init__(self, num_targets: int, distances: np.ndarray):
        self.num_targets = num_targets
        self.distances = distances
        self.Q = {}
        self.returns = {}
        self.policy = {}

    def generate_episode(self):
        episode = []
        visited = set()
        current = random.randint(0, self.num_targets - 1)
        visited.add(current)

        while len(visited) < self.num_targets:
            available_actions = [i for i in range(self.num_targets) if i not in visited]
            action = random.choice(available_actions)
            reward = -self.distances[current][action]
            episode.append((current, action, reward))
            current = action
            visited.add(current)

        return episode

    def solve(self, num_episodes: int, method: str = "first_visit"):
        """Solve the TSP using Monte Carlo (First-Visit or Every-Visit)."""
        for _ in range(num_episodes):
            episode = self.generate_episode()
            G = 0
            visited_sa = set()

            for t in range(len(episode) - 1, -1, -1):
                state, action, reward = episode[t]
                G += reward

                if method == "first_visit" and (state, action) in visited_sa:
                    continue

                visited_sa.add((state, action))
                if (state, action) not in self.returns:
                    self.returns[(state, action)] = []
                self.returns[(state, action)].append(G)
                self.Q[(state, action)] = np.mean(self.returns[(state, action)])

                if state not in self.policy or self.Q[(state, action)] > self.Q.get(
                    (state, self.policy.get(state)), float("-inf")
                ):
                    self.policy[state] = action

    def get_action(self, current_state: int, visited: List[int]) -> int:
        if current_state in self.policy:
            return self.policy[current_state]
        unvisited = [i for i in range(self.num_targets) if i not in visited]
        return min(unvisited, key=lambda x: self.distances[current_state][x])


if __name__ == "__main__":
    num_targets = 15
    num_episodes = 1000  # Increase episodes to 1000

    # Initializing the environment and Monte Carlo solver
    env = TSP(num_targets, seed=42)

    # Running for First-Visit Monte Carlo
    print("Running First-Visit Monte Carlo:")
    first_visit_solver = TSPMonteCarlo(num_targets, env.distances)
    ep_rets_first_visit = []

    for ep in range(num_episodes):
        ret = 0
        obs, _ = env.reset()  # Reset the environment for each episode
        for step in range(100):
            action = env.action_space.sample()  # Replace this with your algorithm
            obs_, reward, terminated, truncated, info = env.step(action)
            ret += reward
            if terminated or truncated:
                break
        ep_rets_first_visit.append(ret)
        print(f"First-Visit Episode {ep+1}: Return = {ret}")

    # Print average return for First-Visit Monte Carlo
    print(f"Average Return for First-Visit Monte Carlo after {num_episodes} episodes: {np.mean(ep_rets_first_visit)}\n")

    # Run for Every-Visit Monte Carlo
    print("Running Every-Visit Monte Carlo:")
    every_visit_solver = TSPMonteCarlo(num_targets, env.distances)
    ep_rets_every_visit = []

    for ep in range(num_episodes):
        ret = 0
        obs, _ = env.reset()  # Reset the environment for each episode
        for step in range(100):
            action = env.action_space.sample()  # Replace this with your algorithm
            obs_, reward, terminated, truncated, info = env.step(action)
            ret += reward
            if terminated or truncated:
                break
        ep_rets_every_visit.append(ret)
        print(f"Every-Visit Episode {ep+1}: Return = {ret}")

    # Print average return for Every-Visit Monte Carlo
    print(f"Average Return for Every-Visit Monte Carlo after {num_episodes} episodes: {np.mean(ep_rets_every_visit)}")

Running First-Visit Monte Carlo:
First-Visit Episode 1: Return = -70101.2850824541
First-Visit Episode 2: Return = -40150.346003266575
First-Visit Episode 3: Return = -70118.7371836632
First-Visit Episode 4: Return = -70117.03494235258
First-Visit Episode 5: Return = -40207.88996680701
First-Visit Episode 6: Return = -50199.15575119802
First-Visit Episode 7: Return = -40117.667427753666
First-Visit Episode 8: Return = -50171.77297509217
First-Visit Episode 9: Return = -80113.08449596961
First-Visit Episode 10: Return = -60131.54512466072
First-Visit Episode 11: Return = -40155.45591092546
First-Visit Episode 12: Return = -60106.73380958538
First-Visit Episode 13: Return = -40172.46903082877
First-Visit Episode 14: Return = -50125.10865377819
First-Visit Episode 15: Return = -10203.650644376112
First-Visit Episode 16: Return = -60114.00826841105
First-Visit Episode 17: Return = -70112.52413702972
First-Visit Episode 18: Return = -50165.20981305841
First-Visit Episode 19: Return = -30213