In [1]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/958.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0


In [6]:
"""Traveling Salesman Problem (TSP) Solving with Reinforcement Learning Techniques."""

import numpy as np
import gymnasium as gym
from typing import List, Tuple, Optional, Dict


class TravelingSalesmanEnviroment(gym.Env):
    """Custom Gymnasium environment for the Traveling Salesman Problem."""

    def __init__(self, num_targts: int, max_arena: int = 50, seed: Optional[int] = None):
        """
        Initialize the TSP environment with configurable parameters.

        Args:
            num_targts: Number of cities/targets to visit
            max_arena: Maximum coordinate range for target locations
            seed: Random seed for reproducibility
        """
        super().__init__()
        np.random.seed(seed)

        self.num_targts = num_targts
        self.max_arena = max_arena
        self.max_steps = num_targts

        self.locations = self._genrate_unique_points()
        self.dist_matrix = self._compute_dist_matrix()

        obs_shape = 1 + self.num_targts + 2 * self.num_targts
        obs_low = np.zeros(obs_shape)
        obs_high = np.concatenate([
            [num_targts],
            2 * max_arena * np.ones(self.num_targts),
            max_arena * np.ones(2 * self.num_targts)
        ])

        self.observation_space = gym.spaces.Box(low=obs_low, high=obs_high, dtype=np.float32)
        self.action_space = gym.spaces.Discrete(num_targts)

    def _genrate_unique_points(self) -> np.ndarray:
        """Generate unique random points within the defined area."""
        points = set()
        while len(points) < self.num_targts:
            x = np.random.uniform(0, self.max_arena)
            y = np.random.uniform(0, self.max_arena)
            points.add((x, y))
        return np.array(list(points))

    def _compute_dist_matrix(self) -> np.ndarray:
        """Compute pairwise Euclidean distances between points."""
        return np.linalg.norm(
            self.locations[:, np.newaxis] - self.locations,
            axis=2
        )

    def reset(self, seed=None, options=None) -> Tuple[np.ndarray, Dict]:
        """Reset the environment to initial state."""
        super().reset(seed=seed)

        self.curr_location = 0
        self.visited_targts = set()
        self.steps = 0

        state = np.concatenate([
            [self.curr_location],
            self.dist_matrix[self.curr_location],
            self.locations.flatten()
        ])

        return state, {}

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, Dict]:
        """Take a step in the environment."""
        self.steps += 1

        reward = -self.dist_matrix[self.curr_location][action]

        if action in self.visited_targts:
            reward -= 1000

        self.curr_location = action
        self.visited_targts.add(action)

        terminated = len(self.visited_targts) == self.num_targts
        truncated = self.steps >= self.max_steps

        next_state = np.concatenate([
            [self.curr_location],
            self.dist_matrix[self.curr_location],
            self.locations.flatten()
        ])

        return next_state, reward, terminated, truncated, {}


class TSPValueIterSolver:
    """Value Iteration approach for solving Traveling Salesman Problem."""

    def __init__(self, env, gamma=0.9, convergence_thresh=1e-5):
        self.env = env
        self.gamma = gamma
        self.convergence_thresh = convergence_thresh
        self.val_table = np.zeros(env.num_targts)

    def solve(self) -> np.ndarray:
        """Perform value iteration to find optimal policy."""
        while True:
            max_change = 0
            for state in range(self.env.num_targts):
                old_value = self.val_table[state]

                act_values = [
                    -self.env.dist_matrix[state][action] +
                    self.gamma * self.val_table[action]
                    for action in range(self.env.num_targts)
                ]

                self.val_table[state] = max(act_values)
                max_change = max(max_change, abs(old_value - self.val_table[state]))

            if max_change < self.convergence_thresh:
                break

        return np.argmax(
            [
                -self.env.dist_matrix[state][action] +
                self.gamma * self.val_table[action]
                for action in range(self.env.num_targts)
            ]
            for state in range(self.env.num_targts)
        )


class TSPMonteCarloPolSolver:
    """Monte Carlo Learning approach for solving Traveling Salesman Problem."""

    def __init__(self, env, gamma=0.95):
        self.env = env
        self.gamma = gamma
        self.policy = np.random.randint(env.num_targts, size=env.num_targts)
        self.retrns = {(s, a): [] for s in range(env.num_targts) for a in range(env.num_targts)}

    def genrate_episode(self, epsilon=0.1) -> List[Tuple]:
        """Generate an episode using epsilon-greedy policy."""
        episode = []
        state, _ = self.env.reset()
        visited = set()

        for _ in range(self.env.num_targts):
            curr_state = int(state[0])

            # Epsilon-greedy action selection
            if np.random.random() < epsilon:
                action = np.random.choice(self.env.num_targts)
            else:
                action = self.policy[curr_state]

            next_state, reward, done, _, _ = self.env.step(action)
            episode.append((curr_state, action, reward))

            visited.add(curr_state)
            state = next_state

            if done:
                break

        return episode

    def monte_carlo_ctl(self, episodes=1000):
        """Perform Monte Carlo Control to improve policy."""
        for _ in range(episodes):
            episode = self.genrate_episode()
            G = 0
            visited_state_actions = set()

            for t in reversed(range(len(episode))):
                state, action, reward = episode[t]
                G = self.gamma * G + reward

                if (state, action) not in visited_state_actions:
                    self.retrns[(state, action)].append(G)
                    visited_state_actions.add((state, action))

                    # Update value-based policy
                    action_values = [
                        np.mean(self.retrns.get((state, a), [0]))
                        for a in range(self.env.num_targts)
                    ]
                    self.policy[state] = np.argmax(action_values)

        return self.policy


def main():
    """Demonstrate TSP solving techniques."""
    np.random.seed(42)
    env = TravelingSalesmanEnviroment(num_targts=6)

    print("Value Iteration Solution:")
    vi_solver = TSPValueIterSolver(env)
    vi_policy = vi_solver.solve()
    print("Policy:", vi_policy)
    print("Value Table:", vi_solver.val_table)

    print("\nMonte Carlo Solution:")
    mc_solver = TSPMonteCarloPolSolver(env)
    mc_policy = mc_solver.monte_carlo_ctl(episodes=500)
    print("Policy:", mc_policy)


if __name__ == "__main__":
    main()


Value Iteration Solution:
Policy: 0
Value Table: [0. 0. 0. 0. 0. 0.]

Monte Carlo Solution:
Policy: [0 2 2 3 1 5]
