In [1]:
import numpy as np

# AVI implementation
class ApproximateValueIteration:
    def __init__(self, state_dim, action_dim, feature_dim, gamma=0.9, epsilon=1e-6, max_iterations=1000):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.feature_dim = feature_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.max_iterations = max_iterations
        self.weights = np.zeros((action_dim, feature_dim))  # Initialize weights to zeros

    def compute_q_values(self, state):
        # Compute Q-values for a given state using the weights
        q_values = np.dot(self.weights, state)
        return q_values

    def train(self, feature_matrix, reward_matrix):
        for _ in range(self.max_iterations):
            prev_weights = np.copy(self.weights)  # Save the current weights
            for state_idx in range(self.state_dim):
                state = feature_matrix[state_idx]  # Get the feature vector for the current state
                q_values = self.compute_q_values(state)  # Compute Q-values for the current state
                best_action_value = np.max(q_values)  # Get the best action value
                for action_idx in range(self.action_dim):
                    reward = reward_matrix[action_idx, state_idx]  # Get the reward for the current action and state
                    bellman_residual = reward + self.gamma * best_action_value - q_values[action_idx]
                    self.weights[action_idx] += np.dot(state, bellman_residual)  # Update the weights
            if np.linalg.norm(prev_weights - self.weights) < self.epsilon:  # Check for convergence
                break

    def get_policy(self, feature_matrix):
        policy = np.zeros(self.state_dim, dtype=int)  # Initialize the policy
        for state_idx in range(self.state_dim):
            state = feature_matrix[state_idx]  # Get the feature vector for the current state
            q_values = self.compute_q_values(state)  # Compute Q-values for the current state
            policy[state_idx] = np.argmax(q_values)  # Select the action with the highest Q-value
        return policy

# API implementation
class ApproximatePolicyIteration:
    def __init__(self, state_dim, action_dim, feature_dim, gamma=0.9, epsilon=1e-6, max_iterations=1000):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.feature_dim = feature_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.max_iterations = max_iterations
        self.weights = np.zeros((action_dim, feature_dim))  # Initialize weights to zeros

    def compute_q_values(self, state):
        # Compute Q-values for a given state using the weights
        q_values = np.dot(self.weights, state)
        return q_values

    def compute_value_function(self, feature_matrix):
        value_function = np.zeros(self.state_dim)  # Initialize the value function
        for state_idx in range(self.state_dim):
            state = feature_matrix[state_idx]  # Get the feature vector for the current state
            q_values = self.compute_q_values(state)  # Compute Q-values for the current state
            value_function[state_idx] = np.max(q_values)  # Select the maximum Q-value as the value function
        return value_function

    def train(self, feature_matrix, reward_matrix):
        for _ in range(self.max_iterations):
            prev_weights = np.copy(self.weights)  # Save the current weights
            for _ in range(self.max_iterations):
                prev_value_function = self.compute_value_function(feature_matrix)  # Compute the previous value function
                for state_idx in range(self.state_dim):
                    state = feature_matrix[state_idx]  # Get the feature vector for the current state
                    q_values = self.compute_q_values(state)  # Compute Q-values for the current state
                    policy = np.argmax(q_values)  # Select the action with the highest Q-value
                    reward = reward_matrix[policy, state_idx]  # Get the reward for the chosen action and state
                    bellman_residual = reward + self.gamma * prev_value_function[state_idx] - q_values[policy]
                    self.weights[policy] += np.dot(state, bellman_residual)  # Update the weights
                value_function = self.compute_value_function(feature_matrix)  # Compute the new value function
                if np.linalg.norm(prev_value_function - value_function) < self.epsilon:  # Check for convergence
                    break
            if np.linalg.norm(prev_weights - self.weights) < self.epsilon:  # Check for convergence
                break

    def get_policy(self, feature_matrix):
        policy = np.zeros(self.state_dim, dtype=int)  # Initialize the policy
        for state_idx in range(self.state_dim):
            state = feature_matrix[state_idx]  # Get the feature vector for the current state
            q_values = self.compute_q_values(state)  # Compute Q-values for the current state
            policy[state_idx] = np.argmax(q_values)  # Select the action with the highest Q-value
        return policy

# Example Usage and Output:

# Define example data
state_dim = 5
action_dim = 2
feature_dim = 3
gamma = 0.9
epsilon = 1e-6
max_iterations = 1000

feature_matrix = np.random.rand(state_dim, feature_dim)  # Randomly generate the feature matrix
reward_matrix = np.random.rand(action_dim, state_dim)  # Randomly generate the reward matrix

# Instantiate and train AVI
avi = ApproximateValueIteration(state_dim, action_dim, feature_dim, gamma, epsilon, max_iterations)
avi.train(feature_matrix, reward_matrix)

# Obtain AVI policy
avi_policy = avi.get_policy(feature_matrix)
print("Approximate Value Iteration (AVI) Policy:")
print(avi_policy)

# Instantiate and train API
api = ApproximatePolicyIteration(state_dim, action_dim, feature_dim, gamma, epsilon, max_iterations)
api.train(feature_matrix, reward_matrix)

# Obtain API policy
api_policy = api.get_policy(feature_matrix)
print("Approximate Policy Iteration (API) Policy:")
print(api_policy)


Approximate Value Iteration (AVI) Policy:
[1 1 1 1 1]
Approximate Policy Iteration (API) Policy:
[0 0 0 0 0]
