In [115]:
import numpy as np
import gym

def policy_function(M, state, delta, nu, v2, sigma=None, mu=None):
    """Pick a binary action from a perturbed linear policy.

    The policy weights ``M`` are shifted by ``nu * delta`` and applied to the
    state, giving a scalar linear score (not a probability). The action is 1
    when that score is non-negative and 0 otherwise.

    Args:
        M: Policy weight vector.
        state: Current observation vector.
        delta: Perturbation direction; pass ``-delta`` for the mirrored policy.
        nu: Scale applied to ``delta``.
        v2: If truthy, whiten the state with ``sigma`` and ``mu`` before
            applying the weights; otherwise use the raw state.
        sigma: Covariance matrix of observed states. Required when ``v2``.
        mu: Mean of observed states. Required when ``v2``.

    Returns:
        ``1`` if the score is >= 0, else ``0``.

    Raises:
        ValueError: If ``v2`` is truthy and ``sigma`` or ``mu`` is ``None``.
    """
    weights = M + nu * delta

    if v2:
        if sigma is None or mu is None:
            raise ValueError("sigma and mu must be provided when v2 is True")
        # Mathematically weights @ inv(sigma) @ (state - mu); solving the
        # linear system avoids forming the explicit inverse on every call.
        # NOTE(review): the ARS paper normalizes with diag(sigma)^(-1/2), not
        # the full inverse covariance -- confirm which one is intended.
        score = weights @ np.linalg.solve(sigma, state - mu)
    else:
        score = weights @ state

    # A score of exactly zero maps to action 1.
    return int(score >= 0)

In [None]:
def evaluate_policy(policy, env, max_steps=1000):
    """Run one episode with ``policy`` and return the summed reward.

    Works with both environment API shapes:

    * legacy gym: ``reset()`` returns the observation and ``step()`` returns
      ``(obs, reward, done, info)``;
    * gym >= 0.26 / gymnasium: ``reset()`` returns ``(obs, info)`` and
      ``step()`` returns ``(obs, reward, terminated, truncated, info)``.

    Args:
        policy: Callable mapping an observation to an action.
        env: Environment exposing ``reset()`` and ``step(action)``.
        max_steps: Upper bound on the number of steps taken in the episode.

    Returns:
        The sum of rewards collected until the episode ends or ``max_steps``
        steps have been taken.
    """
    reset_result = env.reset()
    # The newer API returns (observation, info_dict). Requiring a dict in the
    # second slot avoids unpacking a legacy observation that is itself a tuple.
    if (
        isinstance(reset_result, tuple)
        and len(reset_result) == 2
        and isinstance(reset_result[1], dict)
    ):
        state = reset_result[0]
    else:
        state = reset_result

    total_reward = 0
    done = False
    steps = 0

    while not done and steps < max_steps:
        action = policy(state)
        step_result = env.step(action)

        if len(step_result) == 5:
            # Newer API: the episode ends on termination or truncation.
            state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            state, reward, done, _ = step_result

        total_reward += reward
        steps += 1

    return total_reward


In [None]:
def augmented_random_search(env, alpha, N, nu, b, max_episodes, v2=False):
    """Train a linear policy with Augmented Random Search and return its weights.

    Each iteration samples ``N`` random directions, rolls out the policy
    perturbed by ``+nu * delta`` and ``-nu * delta`` for every direction,
    keeps the ``b`` directions with the largest ``max(r_plus, r_minus)`` and
    moves the weights along the sum of ``(r_plus - r_minus) * delta`` over
    those directions, scaled by ``alpha / (b * std)`` where ``std`` is the
    standard deviation of the kept rewards.

    Args:
        env: Environment; ``env.observation_space.shape[0]`` gives the state
            dimension, and the environment is passed to ``evaluate_policy``.
        alpha: Step size of the weight update.
        N: Number of perturbation directions sampled per iteration.
        nu: Scale of the perturbation applied to the weights.
        b: Number of top-performing directions used in the update.
        max_episodes: Number of update iterations (each runs ``2 * N``
            rollouts).
        v2: Forwarded to ``policy_function`` to select state normalization.
            The normalization statistics are not updated yet (see the TODO in
            the loop), so they stay at zero mean / identity covariance.

    Returns:
        The learned weight vector ``M`` of shape ``(n,)``. With
        ``max_episodes == 0`` this is the all-zero initial vector.
    """
    n = env.observation_space.shape[0]

    M = np.zeros(n)     # policy weights
    mu = np.zeros(n)    # running mean of states (V2 only; never updated yet)
    sigma = np.eye(n)   # running covariance of states (V2 only; never updated yet)

    for _ in range(max_episodes):
        deltas = np.random.randn(N, n)

        rewards_plus = []
        rewards_minus = []

        for delta in deltas:
            # ``d=delta`` binds the current direction at definition time.
            policy_plus = lambda s, d=delta: policy_function(M, s, d, nu, v2, sigma, mu)
            policy_minus = lambda s, d=delta: policy_function(M, s, -d, nu, v2, sigma, mu)

            rewards_plus.append(evaluate_policy(policy_plus, env))
            rewards_minus.append(evaluate_policy(policy_minus, env))

        # Rank directions by the better of their two rollouts and keep the top b.
        scores = sorted(
            zip(deltas, rewards_plus, rewards_minus),
            key=lambda x: max(x[1], x[2]),
            reverse=True,
        )
        top_scores = scores[:b]

        # The small constant keeps the step finite when all kept rewards are equal.
        sigma_rewards = np.std([r for _, r_plus, r_minus in top_scores for r in (r_plus, r_minus)]) + 1e-4

        update_step = np.zeros(n)
        for delta, reward_plus, reward_minus in top_scores:
            update_step += (reward_plus - reward_minus) * delta

        M += (alpha / (b * sigma_rewards)) * update_step

        # TODO: V2 is not implemented yet -- set mu_{j+1}, Sigma_{j+1} to the
        # mean and covariance of the 2NH(j + 1) states encountered from the
        # start of training.

    # Hand the trained weights back to the caller; without this the result of
    # the whole search would be discarded.
    return M
