In [None]:
pip install gymnasium



In [None]:
"""Environment for Travelling Salesman Problem."""

from typing import Dict, List, Optional, Tuple

import gymnasium as gym
import numpy as np


class TSP(gym.Env):
    """Traveling Salesman Problem (TSP) RL environment for persistent monitoring.

    The agent navigates a set of targets based on precomputed distances. It aims to visit
    all targets in the least number of steps, with rewards determined by the distance traveled.
    """

    def __init__(self, num_targets: int, max_area: int = 30, seed: int = None) -> None:
        """Initialize the TSP environment.

        Args:
            num_targets (int): Number of targets the agent needs to visit.
            max_area (int): Max Square area where the targets are defined. Defaults to 30
            seed (int, optional): Random seed for reproducibility. Defaults to None.
        """
        super().__init__()
        if seed is not None:
            np.random.seed(seed=seed)

        self.steps: int = 0
        self.num_targets: int = num_targets

        self.max_steps: int = num_targets
        self.max_area: int = max_area

        self.locations: np.ndarray = self._generate_points(self.num_targets)
        self.distances: np.ndarray = self._calculate_distances(self.locations)

        # Observation Space : {current loc (loc), dist_array (distances), coordintates (locations)}
        self.obs_low = np.concatenate(
            [
                np.array([0], dtype=np.float32),
                np.zeros(self.num_targets, dtype=np.float32),
                np.zeros(2 * self.num_targets, dtype=np.float32),
            ]
        )

        self.obs_high = np.concatenate(
            [
                np.array([self.num_targets], dtype=np.float32),
                2 * self.max_area * np.ones(self.num_targets, dtype=np.float32),
                self.max_area * np.ones(2 * self.num_targets, dtype=np.float32),
            ]
        )

        # Action Space : {next_target}
        self.observation_space = gym.spaces.Box(low=self.obs_low, high=self.obs_high)
        self.action_space = gym.spaces.Discrete(self.num_targets)

    def reset(
        self,
        *,
        seed: Optional[int] = None,
        options: Optional[dict] = None,
    ) -> Tuple[np.ndarray, Dict[str, None]]:
        """Reset the environment to the initial state.

        Args:
            seed (Optional[int], optional): Seed to reset the environment. Defaults to None.
            options (Optional[dict], optional): Additional reset options. Defaults to None.

        Returns:
            Tuple[np.ndarray, Dict[str, None]]: The initial state of the environment and an empty info dictionary.
        """
        self.steps: int = 0

        self.loc: int = 0
        self.visited_targets: List = []
        self.dist: List = self.distances[self.loc]

        state = np.concatenate(
            (
                np.array([self.loc]),
                np.array(self.dist),
                np.array(self.locations).reshape(-1),
            ),
            dtype=np.float32,
        )
        return state, {}

    def step(
        self, action: int
    ) -> Tuple[np.ndarray, float, bool, bool, Dict[str, None]]:
        """Take an action (move to the next target).

        Args:
            action (int): The index of the next target to move to.

        Returns:
            Tuple[np.ndarray, float, bool, bool, Dict[str, None]]:
                - The new state of the environment.
                - The reward for the action.
                - A boolean indicating whether the episode has terminated.
                - A boolean indicating if the episode is truncated.
                - An empty info dictionary.
        """
        self.steps += 1
        past_loc = self.loc
        next_loc = action

        reward = self._get_rewards(past_loc, next_loc)
        self.visited_targets.append(next_loc)

        next_dist = self.distances[next_loc]
        terminated = bool(self.steps == self.max_steps)
        truncated = False

        next_state = np.concatenate(
            [
                np.array([next_loc]),
                next_dist,
                np.array(self.locations).reshape(-1),
            ],
            dtype=np.float32,
        )

        self.loc, self.dist = next_loc, next_dist
        return (next_state, reward, terminated, truncated, {})

    def _generate_points(self, num_points: int) -> np.ndarray:
        """Generate random 2D points representing target locations.

        Args:
            num_points (int): Number of points to generate.

        Returns:
            np.ndarray: Array of 2D coordinates for each target.
        """
        points = []
        # Generate n random 2D points within the 10x10 grid
        while len(points) < num_points:
            x = np.random.random() * self.max_area
            y = np.random.random() * self.max_area
            if [x, y] not in points:
                points.append([x, y])

        return np.array(points)

    def _calculate_distances(self, locations: List) -> float:
        """Calculate the distance matrix between all target locations.

        Args:
            locations (List): List of 2D target locations.

        Returns:
            np.ndarray: Matrix of pairwise distances between targets.
        """
        n = len(locations)

        distances = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                distances[i, j] = np.linalg.norm(locations[i] - locations[j])
        return distances

    def _get_rewards(self, past_loc: int, next_loc: int) -> float:
        """Calculate the reward based on the distance traveled, however if a target gets visited again then it incurs a high penalty.

        Args:
            past_loc (int): Previous location of the agent.
            next_loc (int): Next location of the agent.

        Returns:
            float: Reward based on the travel distance between past and next locations, or negative reward if repeats visit.
        """
        if next_loc not in self.visited_targets:
            reward = -self.distances[past_loc][next_loc]
        else:
            reward = -10000
        return reward


if __name__ == "__main__":
    num_targets = 50

    env = TSP(num_targets)
    obs = env.reset()
    ep_rets = []

    for ep in range(100):
        ret = 0
        obs = env.reset()
        for _ in range(100):
            action = (
                env.action_space.sample()
            )  # You need to replace this with your algorithm that predicts the action.

            obs_, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            ret += reward

            if done:
                break

        ep_rets.append(ret)
        print(f"Episode {ep} : {ret}")

    print(np.mean(ep_rets))

Episode 0 : -130627.18664893821
Episode 1 : -180514.35306187256
Episode 2 : -170424.04792070037
Episode 3 : -180499.15038853214
Episode 4 : -160516.74462474248
Episode 5 : -200486.27271322315
Episode 6 : -150485.29296108687
Episode 7 : -140593.1220440809
Episode 8 : -210431.3748213466
Episode 9 : -140608.58114143155
Episode 10 : -170521.38383412457
Episode 11 : -180523.60975241387
Episode 12 : -160570.50151922947
Episode 13 : -190521.42235727346
Episode 14 : -210460.71191647908
Episode 15 : -180534.45793769782
Episode 16 : -180507.9919525675
Episode 17 : -210418.90442410234
Episode 18 : -160543.68273922364
Episode 19 : -150568.98963839532
Episode 20 : -180493.50656491696
Episode 21 : -190508.64347938471
Episode 22 : -170535.66117748973
Episode 23 : -190410.551913164
Episode 24 : -170506.75282639923
Episode 25 : -180511.83450659947
Episode 26 : -210393.81399415366
Episode 27 : -150504.401913685
Episode 28 : -150531.26094496765
Episode 29 : -160521.70676653576
Episode 30 : -170484.693374

## Dynamic Programming

In [None]:
import numpy as np
from itertools import permutations, combinations


class ValueIterationTSP:
    def __init__(self, env, gamma=0.9, threshold=1e-4):
        self.env = env  # TSP environment
        self.gamma = gamma  # Discount factor
        self.threshold = threshold  # Convergence threshold
        self.num_targets = env.num_targets

        # State space: (current location, visited targets)
        self.states = self._generate_all_states()  # Modified to include all possible states
        self.value_func = {state: 0 for state in self.states}  # Value function V(s)
        self.policy = {}  # Optimal policy π(s) -> a

    def _generate_all_states(self):
        """Generate all possible states for value iteration."""
        states = []
        # Iterate over all possible combinations of visited targets
        for r in range(1, self.num_targets + 1):
            for visited in combinations(range(self.num_targets), r):
                for current_loc in visited:
                    states.append((current_loc, frozenset(visited)))
        return states

    def value_iteration(self):
        while True:
            delta = 0
            # Iterate over all states
            for state in self.states:
                current_loc, visited = state
                unvisited = set(range(self.num_targets)) - visited

                if not unvisited:
                    # No more targets to visit (terminal state)
                    continue

                min_value = float('inf')
                best_action = None

                # Try all possible actions (next target to visit)
                for action in unvisited:
                    next_loc = action
                    next_visited = visited | {next_loc}
                    next_state = (next_loc, next_visited)

                    # Calculate the immediate reward (negative distance)
                    reward = -self.env.distances[current_loc][next_loc]
                    next_value = reward + self.gamma * self.value_func.get(next_state, 0)

                    # Minimize value for the current state
                    if next_value < min_value:
                        min_value = next_value
                        best_action = action

                # Update value function for the current state
                delta = max(delta, abs(self.value_func[state] - min_value))
                self.value_func[state] = min_value
                self.policy[state] = best_action

            if delta < self.threshold:
                break

    def get_policy(self):
        """Returns the optimal policy after value iteration."""
        return self.policy

    def simulate_optimal_policy(self):
        """Simulates the environment following the optimal policy to calculate the total reward."""
        current_loc = 0  # Start at the first location
        visited = frozenset([current_loc])
        total_reward = 0
        steps = 0

        # Simulate following the optimal policy until all targets are visited
        while len(visited) < self.num_targets:
            state = (current_loc, visited)
            action = self.policy[state]  # Get the optimal action from the policy
            next_loc = action
            reward = -self.env.distances[current_loc][next_loc]  # Immediate reward (negative distance)
            total_reward += reward

            # Update current state
            visited = visited | {next_loc}
            current_loc = next_loc
            steps += 1

        print(f"Total Reward following the optimal policy: {total_reward}")
        print(f"Total Steps: {steps}")
        return total_reward


if __name__ == "__main__":
    num_targets = 15  # Number of targets in the TSP
    env = TSP(num_targets)

    # Initialize the value iteration solver
    solver = ValueIterationTSP(env)

    # Perform value iteration to compute the optimal policy
    solver.value_iteration()

    # Output the optimal policy
    optimal_policy = solver.get_policy()

    print("Optimal Policy:")
    for state, action in optimal_policy.items():
        print(f"State {state} -> Optimal Action: {action}")

    # Simulate the environment following the optimal policy to get the total reward
    total_reward = solver.simulate_optimal_policy()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
State (10, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 14})) -> Optimal Action: 3
State (12, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 14})) -> Optimal Action: 3
State (14, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 14})) -> Optimal Action: 13
State (0, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 12
State (1, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 3
State (2, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 12
State (4, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 3
State (5, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 12
State (6, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 12
State (7, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 12
State (8, frozenset({0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14})) -> Optimal Action: 3
State (9, fro

## Monte-Carlo

In [44]:
import numpy as np
import random
from typing import List, Tuple, Dict
import gymnasium as gym

class TSPMonteCarloEpsilonGreedy:
    def __init__(self, num_targets: int, distances: np.ndarray, epsilon: float = 0.1):
        self.num_targets = num_targets
        self.distances = distances
        self.Q = {}  # Action-value function
        self.returns = {}  # Returns for each state-action pair
        self.policy = {}  # Policy (best action for each state)
        self.epsilon = epsilon  # Exploration rate

    def epsilon_greedy_action(self, state: int, available_actions: List[int]) -> int:
        """Choose an action based on epsilon-greedy policy."""
        if random.random() < self.epsilon:  # Exploration
            return random.choice(available_actions)
        else:  # Exploitation (choose the action with highest Q value)
            q_values = [self.Q.get((state, a), 0) for a in available_actions]
            return available_actions[np.argmax(q_values)]

    def generate_episode(self) -> List[Tuple[int, int, float]]:
        """Generate an episode (sequence of state-action-reward) using the current policy."""
        episode = []
        visited = set()
        current = random.randint(0, self.num_targets - 1)  # Start at a random location
        visited.add(current)

        while len(visited) < self.num_targets:
            available_actions = [i for i in range(self.num_targets) if i not in visited]
            action = self.epsilon_greedy_action(current, available_actions)
            reward = -self.distances[current][action]  # Negative distance as reward
            episode.append((current, action, reward))
            current = action
            visited.add(current)

        # Return to the starting point after visiting all nodes
        reward = -self.distances[current][episode[0][0]]  # Reward for returning to start
        episode.append((current, episode[0][0], reward))

        return episode

    def solve(self, num_episodes: int, method: str = "first_visit"):
        """Solve the TSP using Monte Carlo method (first-visit or every-visit) and calculate total return."""
        total_returns = []  # To store the return (G) for each episode

        for ep in range(num_episodes):
            episode = self.generate_episode()
            G = 0  # Total return for the current episode
            visited_sa = set()  # To track visited state-action pairs (for first-visit)

            # Loop backwards through the episode to calculate returns and update Q values
            for t in range(len(episode) - 1, -1, -1):
                state, action, reward = episode[t]
                G += reward  # Calculate total return

                if method == "first_visit" and (state, action) in visited_sa:
                    continue  # Skip if it's not the first visit to this state-action pair

                visited_sa.add((state, action))

                if (state, action) not in self.returns:
                    self.returns[(state, action)] = []
                self.returns[(state, action)].append(G)
                self.Q[(state, action)] = np.mean(self.returns[(state, action)])  # Update Q-value

                # Update the policy based on updated Q values
                available_actions = [i for i in range(self.num_targets) if i != state]
                if available_actions:  # Choose the action with the highest Q value
                    best_action = max(available_actions, key=lambda a: self.Q.get((state, a), 0))
                    self.policy[state] = best_action

            total_returns.append(G)

        return total_returns  # Return all the total returns for the episodes

    def get_action(self, current_state: int, visited: List[int]) -> int:
        """Get action based on the current policy for the given state."""
        available_actions = [i for i in range(self.num_targets) if i not in visited]
        if not available_actions:
            return current_state  # Return to starting point if all targets visited
        return self.epsilon_greedy_action(current_state, available_actions)


# Environment setup
class TSP(gym.Env):
    """Travelling Salesman Problem (TSP) RL environment."""

    def __init__(self, num_targets: int, max_area: int = 30, seed: int = None) -> None:
        super().__init__()
        if seed is not None:
            np.random.seed(seed=seed)

        self.steps: int = 0
        self.num_targets: int = num_targets
        self.max_steps: int = num_targets
        self.max_area: int = max_area

        self.locations: np.ndarray = self._generate_points(self.num_targets)
        self.distances: np.ndarray = self._calculate_distances(self.locations)

        # Observation and Action Spaces
        self.obs_low = np.concatenate(
            [
                np.array([0], dtype=np.float32),
                np.zeros(self.num_targets, dtype=np.float32),
                np.zeros(2 * self.num_targets, dtype=np.float32),
            ]
        )

        self.obs_high = np.concatenate(
            [
                np.array([self.num_targets], dtype=np.float32),
                2 * self.max_area * np.ones(self.num_targets, dtype=np.float32),
                self.max_area * np.ones(2 * self.num_targets, dtype=np.float32),
            ]
        )

        self.observation_space = gym.spaces.Box(low=self.obs_low, high=self.obs_high)
        self.action_space = gym.spaces.Discrete(self.num_targets)

    def reset(self, *, seed: int = None, options: dict = None) -> Tuple[np.ndarray, Dict]:
        self.steps: int = 0
        self.loc: int = 0
        self.visited_targets: List[int] = []
        self.dist: List[float] = self.distances[self.loc]

        state = np.concatenate(
            (
                np.array([self.loc]),
                np.array(self.dist),
                np.array(self.locations).reshape(-1),
            ),
            dtype=np.float32,
        )
        return state, {}

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, Dict]:
        self.steps += 1
        past_loc = self.loc
        next_loc = action

        reward = self._get_rewards(past_loc, next_loc)
        self.visited_targets.append(next_loc)

        next_dist = self.distances[next_loc]
        terminated = bool(self.steps == self.max_steps)
        truncated = False

        next_state = np.concatenate(
            [
                np.array([next_loc]),
                next_dist,
                np.array(self.locations).reshape(-1),
            ],
            dtype=np.float32,
        )

        self.loc, self.dist = next_loc, next_dist
        return next_state, reward, terminated, truncated, {}

    def _generate_points(self, num_points: int) -> np.ndarray:
        points = []
        while len(points) < num_points:
            x = np.random.random() * self.max_area
            y = np.random.random() * self.max_area
            if [x, y] not in points:
                points.append([x, y])
        return np.array(points)

    def _calculate_distances(self, locations: np.ndarray) -> np.ndarray:
        n = len(locations)
        distances = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                distances[i, j] = np.linalg.norm(locations[i] - locations[j])
        return distances

    def _get_rewards(self, past_loc: int, next_loc: int) -> float:
        if next_loc not in self.visited_targets:
            reward = -self.distances[past_loc][next_loc]
        else:
            reward = -10000
        return reward


# Running the solver
if __name__ == "__main__":
    num_targets = 15  # Set number of targets
    env = TSP(num_targets)  # Create TSP environment

    # Create and run Monte Carlo solver
    mc_solver, total_returns = create_mc_epsilon_greedy_solver(env, num_episodes=10000, method="every_visit", epsilon=0.1)

    # Print the learned policy
    print("Learned Policy:", mc_solver.policy)

    # Print the total return from the last few episodes
    print("Total Returns", total_returns[:])
    print("Average Return:", np.mean(total_returns))

Learned Policy: {14: 5, 13: 12, 11: 9, 10: 3, 9: 11, 8: 2, 7: 1, 6: 0, 4: 2, 3: 1, 2: 4, 1: 7, 0: 10, 5: 4, 12: 7}
Total Returns [-237.06428465804817, -259.5131465902842, -257.4921024812908, -273.5267568740959, -229.45411299258294, -236.7461355027317, -243.54520400028696, -227.7671689032901, -282.41515858936526, -299.0730623358298, -281.1136388816738, -304.4614222880889, -284.8136647644675, -282.71912139262076, -271.68615134178435, -239.4166712444811, -215.90497236498481, -262.4950517909797, -215.46521944269543, -219.14295147594333, -241.24003643742444, -229.76166994831564, -229.23588165988414, -246.22243634533072, -228.96847474106653, -236.67768490368945, -261.5573292753064, -255.83656306936547, -258.9730962879461, -226.19394136202925, -252.74060103900666, -272.84431113797837, -221.43508579635596, -236.39933014621883, -242.16306437585936, -216.87203723141678, -229.62323980772686, -209.7431988564846, -206.15481814897726, -218.61675428390856, -237.95119933459733, -219.10416100022968, -2