In [None]:
import numpy as np
import gym


def policy_function(M, state, delta, nu, v2, sigma=None, mu=None):
    """Evaluate the perturbed linear policy on a single state.

    The weights actually used are ``M + nu * delta``.

    Args:
        M: Current policy weight matrix.
        state: Observation the policy is applied to.
        delta: Perturbation direction, same shape as ``M``.
        nu: Scale applied to ``delta`` before it is added to ``M``.
        v2: When true, the state is normalized before the weights are applied
            (``sigma`` and ``mu`` are then required).
        sigma: Covariance matrix of the states; only its diagonal is read.
            Ignored unless ``v2`` is true.
        mu: Mean state subtracted from ``state``. Ignored unless ``v2`` is true.

    Returns:
        The matrix-vector product of the perturbed weights with the
        (optionally normalized) state.
    """
    perturbed_weights = M + nu * delta

    # Plain variant: apply the perturbed weights straight to the raw state.
    if not v2:
        return perturbed_weights @ state

    # Normalized variant: scale each state dimension by 1/sqrt(variance),
    # taken from the diagonal of sigma, and center the state on mu.
    inverse_std = np.sqrt(1 / np.diag(sigma))
    scaled_weights = perturbed_weights @ np.diag(inverse_std)
    centered_state = state - mu
    return scaled_weights @ centered_state


def evaluate_policy(policy, env, max_steps=1000):
    """Run one episode of ``env`` under ``policy`` and collect its reward.

    Works with both Gym step/reset conventions: the older one
    (``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and the
    newer one (``reset() -> (obs, info)``,
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        policy: Callable mapping an observation to the raw policy output.
        env: Gym-style environment providing ``reset``, ``step`` and
            ``action_space``.
        max_steps: Upper bound on the number of environment steps taken.

    Returns:
        A ``(total_reward, states)`` tuple. ``total_reward`` is the sum of the
        rewards received; ``states`` is the list of observations returned by
        ``env.step`` (the initial observation from ``reset`` is not included).
    """
    reset_result = env.reset()
    # Newer Gym / Gymnasium return ``(observation, info_dict)`` from reset;
    # older Gym returns the observation alone.
    if (
        isinstance(reset_result, tuple)
        and len(reset_result) == 2
        and isinstance(reset_result[1], dict)
    ):
        state = reset_result[0]
    else:
        state = reset_result

    total_reward = 0
    done = False
    steps = 0
    states = []

    # Step until the episode ends or the step budget is exhausted.
    while not done and steps < max_steps:
        raw_action = np.asarray(policy(state))

        if isinstance(env.action_space, gym.spaces.Discrete):
            if raw_action.size == 1:
                # Single policy output: threshold it into a binary action.
                action = 1 if raw_action.item() > 0 else 0
            else:
                # One output per action: pick the highest-scoring action.
                action = int(np.argmax(raw_action))
        else:
            low, high = env.action_space.low, env.action_space.high
            try:
                action = np.clip(raw_action, low, high)
            except ValueError:
                # NOTE(review): fallback kept from the original code. It only
                # triggers when the policy output cannot be broadcast against
                # the bounds, and then uses the *index* of the largest output
                # as the action value -- confirm this is intended.
                action = np.clip(np.argmax(raw_action), low, high)

        step_result = env.step(action)
        if len(step_result) == 5:
            # Newer API: episode end is split into terminated / truncated.
            state, reward, terminated, truncated, _ = step_result
            done = bool(terminated or truncated)
        else:
            state, reward, done, _ = step_result

        total_reward += reward
        states.append(state)  # observation reached *after* taking the action
        steps += 1

    return total_reward, states


def augmented_random_search(env, alpha, N, nu, b, max_episodes, v2=False):
    """Train a linear policy with Augmented Random Search.

    Each iteration samples ``N`` random perturbations of the weight matrix,
    evaluates the policy with every perturbation added and subtracted, keeps
    the ``b`` perturbations with the highest ``max(reward+, reward-)`` and
    moves the weights along the reward-weighted sum of those perturbations,
    scaled by the standard deviation of the rewards that were kept.

    With ``v2=True`` the policy input is normalized using the mean and
    covariance of all states seen so far. These statistics are maintained
    incrementally (running count, mean and scatter matrix), so memory use
    does not grow with the number of states visited.

    Args:
        env: Gym-style environment. It is closed before this function
            returns, so create a new environment for any further use.
        alpha: Step size of the weight update.
        N: Number of perturbations sampled per iteration.
        nu: Scale of the perturbations applied to the weights.
        b: Number of top-performing perturbations used in the update.
        max_episodes: Number of update iterations (each one runs ``2 * N``
            rollouts).
        v2: Enable state normalization.

    Returns:
        ``(M, mu, sigma)``: the learned weights, the mean state and the state
        covariance matrix. ``mu`` and ``sigma`` stay at zeros / identity when
        ``v2`` is false.
    """
    n = env.observation_space.shape[0]  # number of state features
    # Number of policy outputs: one per action for discrete spaces,
    # one per action dimension otherwise.
    if isinstance(env.action_space, gym.spaces.Discrete):
        p = env.action_space.n
    else:
        p = env.action_space.shape[0]

    M = np.zeros((p, n))
    mu = np.zeros(n)
    sigma = np.eye(n)

    # Running statistics for v2: number of states seen and the sum of outer
    # products of their deviations from the running mean (scatter matrix).
    state_count = 0
    scatter = np.zeros((n, n))

    for _ in range(max_episodes):
        deltas = np.random.randn(N, p, n)

        rewards_plus = []
        rewards_minus = []
        episode_states = []

        for delta in deltas:
            # Bind the perturbation as a default argument so each policy is
            # tied to its own delta rather than to the loop variable.
            policy_plus = lambda state, d=delta: policy_function(
                M, state, d, nu, v2, sigma, mu
            )
            policy_minus = lambda state, d=delta: policy_function(
                M, state, -d, nu, v2, sigma, mu
            )

            reward_plus, visited_plus = evaluate_policy(policy_plus, env)
            reward_minus, visited_minus = evaluate_policy(policy_minus, env)
            rewards_plus.append(reward_plus)
            rewards_minus.append(reward_minus)
            if v2:
                episode_states.extend(visited_plus)
                episode_states.extend(visited_minus)

        # Keep the b perturbations whose better rollout scored highest.
        top_scores = sorted(
            zip(deltas, rewards_plus, rewards_minus),
            key=lambda entry: max(entry[1], entry[2]),
            reverse=True,
        )[:b]

        # Standard deviation of the rewards used in the update; the small
        # constant avoids division by zero when all rewards are equal.
        sigma_rewards = np.std(
            [r for _, r_plus, r_minus in top_scores for r in (r_plus, r_minus)]
        ) + 1e-4

        update_step = np.zeros((p, n))
        for delta, reward_plus, reward_minus in top_scores:
            update_step += (reward_plus - reward_minus) * delta

        M += alpha / (b * sigma_rewards) * update_step

        if v2 and episode_states:
            # Merge this iteration's states into the running mean / scatter
            # instead of recomputing over every state ever seen.
            batch = np.asarray(episode_states, dtype=float)
            batch_count = batch.shape[0]
            batch_mean = batch.mean(axis=0)
            centered = batch - batch_mean
            shift = batch_mean - mu
            new_count = state_count + batch_count

            scatter += centered.T @ centered + np.outer(shift, shift) * (
                state_count * batch_count / new_count
            )
            mu = mu + shift * (batch_count / new_count)
            state_count = new_count

            # The sample covariance needs at least two states; until then the
            # previous sigma is kept rather than being replaced with NaNs.
            if state_count > 1:
                sigma = scatter / (state_count - 1) + 1e-6 * np.eye(n)

    env.close()

    return M, mu, sigma

In [None]:
# Pendulum hyperparameters: update step size, perturbations per iteration,
# perturbation scale, number of top perturbations kept, and iteration count.
pend_alpha, pend_N, pend_nu = 0.1, 10, 0.01
pend_b, pend_max_episodes = 5, 1000

# Gym environment
env = gym.make('Pendulum-v1')

# Train without state normalization (v2 disabled).
M, mu, sigma = augmented_random_search(
    env,
    alpha=pend_alpha,
    N=pend_N,
    nu=pend_nu,
    b=pend_b,
    max_episodes=pend_max_episodes,
    v2=False,
)

for _label, _value in (
    ("Final policy weights (M):", M),
    ("Final mean state (mu):", mu),
    ("Final covariance matrix (sigma):", sigma),
):
    print(_label, _value)

In [None]:
# augmented_random_search closes the environment it is given before returning,
# so create a fresh Pendulum environment for the state-normalized (v2) run
# instead of reusing the one from the previous cell.
env = gym.make('Pendulum-v1')

M, mu, sigma = augmented_random_search(env, pend_alpha, pend_N, pend_nu, pend_b, pend_max_episodes, v2=True)

print("Final policy weights (M):", M)
print("Final mean state (mu):", mu)
print("Final covariance matrix (sigma):", sigma)

In [None]:
# CartPole hyperparameters: update step size, perturbations per iteration,
# perturbation scale, number of top perturbations kept, and iteration count.
cp_alpha, cp_N, cp_nu = 0.1, 10, 0.01
cp_b, cp_max_episodes = 5, 1000

# Gym environment
env = gym.make('CartPole-v1')

# Train without state normalization (v2 disabled).
M, mu, sigma = augmented_random_search(
    env,
    alpha=cp_alpha,
    N=cp_N,
    nu=cp_nu,
    b=cp_b,
    max_episodes=cp_max_episodes,
    v2=False,
)

for _label, _value in (
    ("Final policy weights (M):", M),
    ("Final mean state (mu):", mu),
    ("Final covariance matrix (sigma):", sigma),
):
    print(_label, _value)

In [None]:
# augmented_random_search closes the environment it is given before returning,
# so create a fresh CartPole environment for the state-normalized (v2) run
# instead of reusing the one from the previous cell.
env = gym.make('CartPole-v1')

M, mu, sigma = augmented_random_search(env, cp_alpha, cp_N, cp_nu, cp_b, cp_max_episodes, v2=True)

print("Final policy weights (M):", M)
print("Final mean state (mu):", mu)
print("Final covariance matrix (sigma):", sigma)