In [2]:
pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [10]:
import numpy as np
from typing import Dict, List, Optional, Tuple
import gymnasium as gym


class TSP(gym.Env):
    """Traveling Salesman Problem (TSP) RL environment."""

    def __init__(self, num_targets: int, max_area: int = 30, seed: int = None) -> None:
        """Initialize the environment by generating random target locations."""
        super().__init__()
        if seed is not None:
            np.random.seed(seed=seed)
        self.num_targets = num_targets
        self.max_area = max_area
        self.locations = self._generate_points(num_targets)
        self.distances = self._calculate_distances(self.locations)
        self.visited = None
        self.current_location = None
        self.steps = 0

    def _generate_points(self, num_points: int) -> np.ndarray:
        """Generate random 2D coordinates for each target."""
        points = []
        while len(points) < num_points:
            x = np.random.random() * self.max_area
            y = np.random.random() * self.max_area
            points.append([x, y])
        return np.array(points)

    def _calculate_distances(self, locations: List) -> np.ndarray:
        """Compute pairwise distances between all target locations."""
        n = len(locations)
        distances = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                distances[i, j] = np.linalg.norm(locations[i] - locations[j])
        return distances

    def reset(self) -> Tuple[np.ndarray, dict]:
        """Reset the environment by setting the agent at the starting location and resetting visits."""
        self.steps = 0
        self.current_location = 0  # Start at the first location
        self.visited = [False] * self.num_targets
        self.visited[self.current_location] = True
        state = (self.current_location, tuple(self.visited))
        return state, {}

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, dict]:
        """Move to the selected target (action)."""
        if self.visited[action]:
            return (self.current_location, tuple(self.visited)), -10000, True, False, {}

        reward = -self.distances[self.current_location, action]
        self.visited[action] = True
        self.current_location = action
        self.steps += 1

        # Check if all targets have been visited
        done = all(self.visited)
        return (self.current_location, tuple(self.visited)), reward, done, False, {}


def value_iteration(env: TSP, gamma=1.0, max_iterations=1000, tolerance=1e-6):
    """Run Value Iteration to solve the TSP."""
    n = env.num_targets
    state_space_size = (n, 1 << n)  # State space: num_targets * 2^num_targets
    value_function = np.zeros(state_space_size)
    policy = np.zeros(state_space_size, dtype=int)

    for iteration in range(max_iterations):
        delta = 0

        for loc in range(n):
            for visited_set in range(1 << n):
                if (visited_set & (1 << loc)) == 0:  # Skipping invalid states
                    continue

                best_value = float('inf')
                best_action = -1

                # Check all possible next locations
                for next_loc in range(n):
                    if visited_set & (1 << next_loc):  # Skipping already visited locations
                        continue

                    next_visited_set = visited_set | (1 << next_loc)
                    reward = -env.distances[loc, next_loc]
                    value = reward + gamma * value_function[next_loc, next_visited_set]

                    if value < best_value:
                        best_value = value
                        best_action = next_loc

                if best_action != -1:
                    delta = max(delta, abs(value_function[loc, visited_set] - best_value))
                    value_function[loc, visited_set] = best_value
                    policy[loc, visited_set] = best_action

        # If the change in value is smaller than tolerance, exit early
        if delta < tolerance:
            print(f"Value Iteration converged after {iteration} iterations.")
            break

    return value_function, policy


def extract_policy(env: TSP, policy: np.ndarray) -> List[int]:
    """Extract the optimal path based on the policy."""
    current_city = 0
    visited_set = 1  # Start at city 0 (marked as visited)
    path = [current_city]

    while len(path) < env.num_targets:
        next_city = policy[current_city, visited_set]
        path.append(int(next_city))  # Adding the next city to the path
        visited_set |= (1 << next_city)  # Marking the next city as visited
        current_city = next_city

    return path


if __name__ == "__main__":
    # Initializing environment with 10 targets
    num_targets = 10
    env = TSP(num_targets)

    # Running value iteration
    value_function, policy = value_iteration(env)

    # Extracting and display the optimal path
    optimal_path = extract_policy(env, policy)
    print(f"Optimal Path (Visiting Order): {optimal_path}")

    # Displaying state values and policy decisions after convergence
    print("\nOptimal State Values and Policy Decisions:")
    for loc in range(num_targets):
        for visited_set in range(1 << num_targets):
            if (visited_set & (1 << loc)) != 0:  # Only show valid states
                state_value = value_function[loc, visited_set]
                next_city = policy[loc, visited_set]
                visited_set_str = bin(visited_set)[2:].zfill(num_targets)
                print(f"From City {loc} with Visited {visited_set_str}: Value={state_value:.4f}, "
                      f"Next City={next_city}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
From City 0 with Visited 0011110001: Value=-92.7195, Next City=3
From City 0 with Visited 0011110011: Value=-78.7163, Next City=3
From City 0 with Visited 0011110101: Value=-74.6954, Next City=1
From City 0 with Visited 0011110111: Value=-56.1268, Next City=3
From City 0 with Visited 0011111001: Value=-68.4654, Next City=1
From City 0 with Visited 0011111011: Value=-48.4509, Next City=2
From City 0 with Visited 0011111101: Value=-45.8759, Next City=1
From City 0 with Visited 0011111111: Value=-18.8649, Next City=8
From City 0 with Visited 0100000001: Value=-166.3476, Next City=2
From City 0 with Visited 0100000011: Value=-132.5246, Next City=2
From City 0 with Visited 0100000101: Value=-140.1280, Next City=3
From City 0 with Visited 0100000111: Value=-105.6470, Next City=3
From City 0 with Visited 0100001001: Value=-132.4521, Next City=2
From City 0 with Visited 0100001011: Value=-97.9710, Next City=2
From City 0 with Vis