In [64]:
import numpy as np
from tsp import TSP

In [65]:
class TSP_DP:
    def __init__(self, env, gamma=0.7, theta=1e-6):
        self.env = env
        self.gamma = gamma
        self.theta = theta
        self.value_table = np.zeros((env.num_targets,))  # Value for each target (state)

    def value_iteration(self):
        """Performs value iteration to calculate the optimal value function."""
        while True:
            delta = 0
            for state_index in range(self.env.num_targets):
                # Store current value of the state
                v = self.value_table[state_index]

                # Calculate the action values for all possible actions from this state
                action_values = self._get_action_values(state_index)

                # Select the best action value
                best_action_value = max(action_values)

                # Update the value table for this state
                self.value_table[state_index] = best_action_value

                # Calculate the maximum difference for convergence
                delta = max(delta, abs(v - best_action_value))

            # If the difference is small enough, we assume convergence
            if delta < self.theta:
                break

    def _get_action_values(self, state_index):
        """Helper function to calculate the value of all actions from a given state."""
        action_values = []
        for action in range(self.env.num_targets):
            # We simulate taking the action by calculating the reward and expected value
            current_state = self.env.locations[state_index]
            next_state = self.env.locations[action]
            
            # Get the distance between the current state and the action (next state)
            distance = self.env.distances[state_index, action]
            
            # Reward is negative distance to minimize the travel cost
            reward = -distance
            
            # Calculate the action value: immediate reward + discounted future value
            action_value = reward + self.gamma * self.value_table[action]
            action_values.append(action_value)

        return action_values

    def get_policy(self):
        """Extract the optimal policy based on the value table."""
        policy = np.zeros(self.env.num_targets, dtype=int)
        for state_index in range(self.env.num_targets):
            # For each state, calculate the best action
            action_values = self._get_action_values(state_index)
            best_action = np.argmax(action_values)
            policy[state_index] = best_action
        return policy


In [66]:
import numpy as np

class TSP_MC:
    def __init__(self, env, gamma=0.9):
        self.env = env
        self.gamma = gamma
        self.returns_sum = np.zeros((env.num_targets, env.num_targets))
        self.returns_count = np.zeros((env.num_targets, env.num_targets))
        self.value_table = np.zeros(env.num_targets)
        self.policy = np.random.choice(env.num_targets, env.num_targets)  # Random initial policy

    def epsilon_greedy_policy(self, state, epsilon=0.1, decay=0.999):
        """Select an action using an epsilon-greedy policy."""
        if np.random.random() < epsilon:
            return np.random.choice(self.env.num_targets)  # Explore: random action
        else:
            return self.policy[state]  # Exploit: greedy action

    def generate_episode(self, epsilon=0.1):
        """Generate an episode using epsilon-greedy action selection, ensuring all targets are visited."""
        episode = []
        state, _ = self.env.reset()
        visited_targets = set()
        
        for _ in range(self.env.num_targets):
            action = self.epsilon_greedy_policy(int(state[0]), epsilon)
            next_state, reward, done, _, _ = self.env.step(action)
            episode.append((state, action, reward))
            
            visited_targets.add(int(state[0]))  # Add the target to visited set
            state = next_state
            
            # End only if all targets have been visited
            if len(visited_targets) == self.env.num_targets:
                break
        
        return episode

    def first_visit_mc(self, episodes=5000, epsilon=0.1, decay=0.999):
        """Monte Carlo with First-Visit updating."""
        for _ in range(episodes):
            episode = self.generate_episode(epsilon)
            visited = set()
            G = 0
            for t in reversed(range(len(episode))):
                state, action, reward = episode[t]
                G = self.gamma * G + reward
                state_action = (int(state[0]), action)
                if state_action not in visited:
                    visited.add(state_action)
                    self.returns_sum[state_action] += G
                    self.returns_count[state_action] += 1
                    self.value_table[int(state[0])] = (
                        self.returns_sum[state_action] / self.returns_count[state_action]
                    )
                    
            epsilon *= decay
            self.update_policy()
            
    def every_visit_mc(self, episodes=5000, epsilon=0.1, decay=0.999):
        """Monte Carlo with Every-Visit updating."""
        for _ in range(episodes):
            episode = self.generate_episode(epsilon)
            G = 0
            for t in reversed(range(len(episode))):
                state, action, reward = episode[t]
                G = self.gamma * G + reward
                state_action = (int(state[0]), action)
                self.returns_sum[state_action] += G
                self.returns_count[state_action] += 1
                self.value_table[int(state[0])] = (
                    self.returns_sum[state_action] / self.returns_count[state_action]
                )
                
            epsilon *= decay
            self.update_policy()

    def update_policy(self):
        """Update the policy based on the current value table."""
        for state in range(self.env.num_targets):
            action_values = []
            for action in range(self.env.num_targets):
                # Use the stored value table instead of interacting with the environment
                next_state_value = self.value_table[action]
                distance = self.env.distances[state, action]
                reward = -distance  # Reward is negative distance
                action_values.append(reward + self.gamma * next_state_value)
            self.policy[state] = np.argmax(action_values)


In [67]:
import numpy as np

# Initialize the TSP environment
num_targets = 6  # You can set this to 50 as per the problem statement
env = TSP(num_targets=num_targets, max_area=30, seed=42)

# Dynamic Programming (Value Iteration) Solution
print("Running Dynamic Programming (Value Iteration) Solution...")
dp_solver = TSP_DP(env)
dp_solver.value_iteration()
dp_policy = dp_solver.get_policy()

print("Optimal Policy from Dynamic Programming (DP):", dp_policy)
print("Value Table from DP:", dp_solver.value_table)

# Monte Carlo (First-Visit) Solution
print("\nRunning Monte Carlo (First-Visit) Solution...")
mc_solver_first_visit = TSP_MC(env)
mc_solver_first_visit.first_visit_mc(episodes=500)  # You can adjust the number of episodes

print("Policy from Monte Carlo (First-Visit):", mc_solver_first_visit.policy)
print("Value Table from MC (First-Visit):", mc_solver_first_visit.value_table)

# Monte Carlo (Every-Visit) Solution
print("\nRunning Monte Carlo (Every-Visit) Solution...")
mc_solver_every_visit = TSP_MC(env)
mc_solver_every_visit.every_visit_mc(episodes=500)  # You can adjust the number of episodes

print("Policy from Monte Carlo (Every-Visit):", mc_solver_every_visit.policy)
print("Value Table from MC (Every-Visit):", mc_solver_every_visit.value_table)

# Compare DP and MC policies
print("\nComparison of DP and MC Solutions:")
print("DP Policy: ", dp_policy)
print("MC First-Visit Policy: ", mc_solver_first_visit.policy)
print("MC Every-Visit Policy: ", mc_solver_every_visit.policy)


Running Dynamic Programming (Value Iteration) Solution...
Optimal Policy from Dynamic Programming (DP): [0 1 2 3 4 5]
Value Table from DP: [0. 0. 0. 0. 0. 0.]

Running Monte Carlo (First-Visit) Solution...
Policy from Monte Carlo (First-Visit): [1 1 1 1 1 1]
Value Table from MC (First-Visit): [-33992.57752487 -10737.27231851 -23785.12859689 -19091.83465818
 -20285.12775162 -21143.81121275]

Running Monte Carlo (Every-Visit) Solution...
Policy from Monte Carlo (Every-Visit): [2 1 2 2 1 2]
Value Table from MC (Every-Visit): [-34064.52352334 -25210.29997131 -25199.47501583 -28182.03728123
 -26088.30601061 -25338.93792017]

Comparison of DP and MC Solutions:
DP Policy:  [0 1 2 3 4 5]
MC First-Visit Policy:  [1 1 1 1 1 1]
MC Every-Visit Policy:  [2 1 2 2 1 2]
