In [1]:
# Bootstrap easypip when it is missing, then install the Python and system libraries
try:
    from easypip import easyimport, easyinstall, is_notebook
except ModuleNotFoundError:
    # First run on a fresh kernel: install easypip itself, then retry the import.
    get_ipython().run_line_magic("pip", "install 'easypip>=1.2.0'")
    from easypip import easyimport, easyinstall, is_notebook

# Installed in this order.
for _package in ("swig", "gymnasium", "tensorboard", "box2d-kengz"):
    easyinstall(_package)

[easypip] Installing bbrl_gymnasium>=0.2.0


In [2]:
import os
from typing import Tuple, List

import gymnasium as gym
import numpy as np
if is_notebook():
    get_ipython().run_line_magic("matplotlib", "inline")
import matplotlib.pyplot as plt

[easypip] Installing bbrl_gymnasium


Matplotlib backend: module://matplotlib_inline.backend_inline


In [4]:
def _ars_rollout(env, weights, mu, scale, discrete, visited):
    """Run one full episode with a linear policy and return its total reward.

    The policy scores are ``weights @ ((state - mu) / scale)``.  For a discrete
    action space the action is the index of the highest score; otherwise the
    score vector itself is the action.  When ``visited`` is a list, every state
    fed to the policy is appended to it.
    """
    state, _ = env.reset()  # Gymnasium API: reset() -> (observation, info)
    total = 0.0
    done = False
    while not done:
        state = np.asarray(state, dtype=float)
        if visited is not None:
            visited.append(state)
        scores = weights @ ((state - mu) / scale)
        action = int(np.argmax(scores)) if discrete else scores
        state, reward, terminated, truncated, _ = env.step(action)
        total += reward
        done = terminated or truncated
    return total


def _ars_merge_stats(count, mean, scatter, batch):
    """Fold a batch of states into running ``(count, mean, scatter matrix)`` statistics."""
    batch = np.asarray(batch, dtype=float)
    size = batch.shape[0]
    batch_mean = batch.mean(axis=0)
    centered = batch - batch_mean
    shift = batch_mean - mean
    total = count + size
    merged_scatter = (
        scatter + centered.T @ centered + np.outer(shift, shift) * (count * size / total)
    )
    return total, mean + shift * (size / total), merged_scatter


def augmented_random_search(env, alpha, N, nu, b, max_episodes, v2=False,
                            return_rewards=False):
    """Train a linear policy with Augmented Random Search (ARS).

    Each iteration samples ``N`` random perturbations of the weight matrix,
    evaluates every perturbation in both directions with a *separate* episode
    each, keeps the ``b`` perturbations with the best reward, and moves the
    weights along the reward-weighted sum of those perturbations, scaled by the
    standard deviation of the rewards that were kept.

    Parameters
    ----------
    env : Gymnasium-style environment
        ``reset()`` must return ``(observation, info)`` and ``step(action)``
        must return ``(observation, reward, terminated, truncated, info)``.
        The observation space must be 1-D.  The action space is either 1-D
        (the policy output is the action) or discrete with an ``n`` attribute
        (the action is the argmax over ``n`` scores).
    alpha : float
        Step size of the weight update.
    N : int
        Number of perturbations sampled per iteration.
    nu : float
        Scale applied to each perturbation before it is added to the weights.
    b : int
        Number of top-ranked perturbations used in the update (capped at ``N``).
    max_episodes : int
        Number of training iterations.
    v2 : bool, optional
        When true, states are normalised with the running mean and per-feature
        standard deviation of all states seen so far; the statistics are
        refreshed at the end of each iteration.
    return_rewards : bool, optional
        When true, the per-iteration average reward is returned as a 4th item.

    Returns
    -------
    (M, mu, sigma) or (M, mu, sigma, rewards)
        ``M`` is the learned weight matrix, ``mu`` the state mean and ``sigma``
        the state covariance matrix.  ``mu`` and ``sigma`` stay at zero /
        identity unless ``v2`` is true.

    Raises
    ------
    ValueError
        If ``N`` or ``b`` is smaller than 1.
    """
    if N < 1 or b < 1:
        raise ValueError("N and b must both be at least 1")
    b = min(b, N)  # cannot keep more directions than were sampled

    n = env.observation_space.shape[0]
    action_shape = getattr(env.action_space, "shape", None)
    discrete = not action_shape  # discrete spaces expose an empty shape
    p = int(env.action_space.n) if discrete else int(action_shape[0])

    M = np.zeros((p, n))
    mu = np.zeros(n)
    sigma = np.eye(n)
    scale = np.ones(n)  # per-feature divisor used by the policy
    seen, scatter = 0, np.zeros((n, n))
    rewards = []

    for _ in range(max_episodes):
        deltas = np.random.randn(N, p, n)
        visited = [] if v2 else None

        # One independent episode per direction and sign.
        rewards_plus = np.empty(N)
        rewards_minus = np.empty(N)
        for k, delta in enumerate(deltas):
            rewards_plus[k] = _ars_rollout(env, M + nu * delta, mu, scale, discrete, visited)
            rewards_minus[k] = _ars_rollout(env, M - nu * delta, mu, scale, discrete, visited)

        # Keep the b directions whose better side scored highest.
        top = np.argsort(np.maximum(rewards_plus, rewards_minus))[::-1][:b]
        reward_diff = rewards_plus[top] - rewards_minus[top]
        std_rewards = np.std(np.concatenate((rewards_plus[top], rewards_minus[top])))
        if std_rewards > 0:  # identical rewards carry no signal; avoid 0/0
            M += (alpha / (b * std_rewards)) * np.sum(
                reward_diff[:, None, None] * deltas[top], axis=0
            )

        rewards.append(float((rewards_plus.mean() + rewards_minus.mean()) / 2))

        if v2 and visited:
            seen, mu, scatter = _ars_merge_stats(seen, mu, scatter, visited)
            if seen > 1:
                sigma = scatter / (seen - 1)
                std = np.sqrt(np.clip(np.diag(sigma), 0.0, None))
                # Features that never varied keep a divisor of 1 (no division by zero).
                scale = np.where(std > 1e-8, std, 1.0)

    if return_rewards:
        return M, mu, sigma, rewards
    return M, mu, sigma


# ARS hyperparameters
alpha, nu = 0.1, 0.01  # update step size; scale applied to each sampled perturbation
N, b = 10, 5           # perturbations sampled per iteration; top-ranked ones kept for the update
max_episodes = 1000    # Maximum number of episodes for visualization (training-loop bound)

# Augmented random search training routine
def augmented_random_search(env, alpha, N, nu, b, V2=False, max_episodes=None):
    """Train a linear policy with Augmented Random Search and log its progress.

    Every iteration draws ``N`` random perturbations of the weight matrix ``M``,
    runs one episode with ``M + nu * delta`` and one with ``M - nu * delta`` for
    each of them, keeps the ``b`` perturbations whose better side scored
    highest, and updates ``M`` along the reward-weighted sum of those
    perturbations divided by the standard deviation of the kept rewards.

    NOTE(review): an earlier function with the same name is defined in this
    notebook cell and is replaced by this one at run time; consider keeping
    only one of the two.

    Parameters
    ----------
    env : Gymnasium-style environment
        ``reset()`` must return ``(observation, info)`` and ``step(action)``
        must return ``(observation, reward, terminated, truncated, info)``.
        The observation space must be 1-D; the action space is either 1-D or
        discrete with an ``n`` attribute (the action is then the argmax score).
    alpha : float
        Step size of the weight update.
    N : int
        Number of perturbations sampled per iteration.
    nu : float
        Scale of the perturbations.
    b : int
        Number of top-ranked perturbations used in the update (capped at ``N``).
    V2 : bool, optional
        When true, states are normalised with the running mean and per-feature
        standard deviation of the states seen in previous iterations.
    max_episodes : int, optional
        Number of training iterations.  When omitted, the notebook-level
        ``max_episodes`` constant is used, as in earlier versions.

    Returns
    -------
    (M, mu, Sigma, rewards)
        Learned weights, state mean, state covariance matrix, and the list of
        per-iteration average rewards.

    Raises
    ------
    ValueError
        If ``N`` or ``b`` is smaller than 1.
    NameError
        If ``max_episodes`` is neither passed nor defined at notebook level.
    """
    if max_episodes is None:
        # Backward compatibility: fall back to the notebook-level constant.
        try:
            max_episodes = globals()["max_episodes"]
        except KeyError:
            raise NameError(
                "max_episodes must be passed or defined at notebook level"
            ) from None
    if N < 1 or b < 1:
        raise ValueError("N and b must both be at least 1")
    b = min(b, N)

    obs_dim = env.observation_space.shape[0]
    act_shape = getattr(env.action_space, "shape", None)
    discrete = not act_shape  # discrete spaces expose an empty shape
    act_dim = int(env.action_space.n) if discrete else int(act_shape[0])

    # Initialize policy weights M, mean state mu, and covariance matrix Sigma
    M = np.zeros((act_dim, obs_dim))
    mu = np.zeros(obs_dim)
    Sigma = np.eye(obs_dim)
    scale = np.ones(obs_dim)  # per-feature divisor applied before the policy
    n_seen, scatter = 0, np.zeros((obs_dim, obs_dim))

    # Per-iteration average reward, kept for visualization
    rewards = []

    for _ in range(max_episodes):
        deltas = np.random.randn(N, act_dim, obs_dim)
        visited = []
        returns = np.zeros((2, N))  # row 0: +delta episodes, row 1: -delta episodes

        for k, delta in enumerate(deltas):
            for row, sign in enumerate((1.0, -1.0)):
                weights = M + sign * nu * delta
                state, _ = env.reset()
                done = False
                while not done:
                    state = np.asarray(state, dtype=float)
                    if V2:
                        visited.append(state)
                    scores = weights @ ((state - mu) / scale)
                    action = int(np.argmax(scores)) if discrete else scores
                    state, reward, terminated, truncated, _ = env.step(action)
                    returns[row, k] += reward
                    done = terminated or truncated

        # Update policy weights M from the b best directions.
        top = np.argsort(returns.max(axis=0))[::-1][:b]
        spread = returns[:, top].std()
        if spread > 0:  # identical rewards carry no signal; avoid 0/0
            diff = returns[0, top] - returns[1, top]
            M += (alpha / (b * spread)) * np.einsum("k,kij->ij", diff, deltas[top])

        rewards.append(float(returns.mean()))

        if V2 and visited:
            # Merge this iteration's states into the running mean / scatter matrix.
            batch = np.asarray(visited)
            batch_mean = batch.mean(axis=0)
            centered = batch - batch_mean
            shift = batch_mean - mu
            total = n_seen + len(batch)
            scatter = (
                scatter
                + centered.T @ centered
                + np.outer(shift, shift) * (n_seen * len(batch) / total)
            )
            mu = mu + shift * (len(batch) / total)
            n_seen = total
            if n_seen > 1:
                Sigma = scatter / (n_seen - 1)
                std = np.sqrt(np.clip(np.diag(Sigma), 0.0, None))
                # Features that never varied keep a divisor of 1.
                scale = np.where(std > 1e-8, std, 1.0)

    return M, mu, Sigma, rewards

# Create Gym environment
env = gym.make('CartPole-v1')

# Run augmented random search with state normalisation requested (V2=True).
# The call passes exactly the positional arguments the function defines; the
# previous extra `ending_condition` argument was undefined and collided with V2.
M, mu, Sigma, rewards = augmented_random_search(env, alpha, N, nu, b, V2=True)
env.close()  # release the environment's resources once training is done

# Plot rewards
fig, ax = plt.subplots()
ax.plot(rewards)
ax.set_xlabel('Episode')
ax.set_ylabel('Average Reward')
ax.set_title('Average Reward over Episodes')
plt.show()  # must be called; a bare `plt.show` does nothing

Optimal Parameters: [  3.5980421  -25.75099216  21.10023133]
