In [25]:
from random_search.mujoco_random_search_learned import MujocoRandomSearchLearned
import click
import json
import gym
import ray 

# Leftover click CLI decorators, kept commented out: in the notebook the
# parameter file is chosen by hand below.
# @click.command()
# @click.option('--param_file', default='params.json', help='JSON file for exp parameters')

# JSON file with the experiment parameters (replace with the path to your own file).
param_file = 'hc.json'

# Read the parameters; the file handle is released when the `with` block exits.
with open(param_file, 'r') as json_params:
    params = json.load(json_params)
print(params)

# Experiment id: every "key=value" pair of the loaded parameters, joined by '|'.
exp_identifier = '|'.join(f'{key}={val}' for key, val in params.items())
params['exp_id'] = exp_identifier

# Instantiate the environment to read the observation / action sizes.
env = gym.make(params['env_name'])
obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]

# Policy parameters. Possible filters: 'MeanStdFilter' for v2, 'NoFilter' for v1.
policy_params = dict(type='linear',
                     ob_filter="MeanStdFilter",
                     ob_dim=obs_dim,
                     ac_dim=act_dim)
params["policy_params"] = policy_params
params["dimension"] = obs_dim * act_dim
# Explicit ray initialisation, currently disabled:
# ray.init(num_cpus=params["num_workers"], ignore_reinit_error=True, dashboard_host="127.0.0.1")

# Build the random-search model from the assembled parameter dict.
model = MujocoRandomSearchLearned(params)

[2024-09-30 16:57:33,847] Making new env: HalfCheetah-v1
  result = entry_point.load(False)
2024-09-30 16:57:33,882	INFO worker.py:1619 -- Calling ray.init() again after it has already been called.


{'env_name': 'HalfCheetah-v1', 'seed': 4242, 'policy_type': 'linear', 'dir_std': 0.03, 'step_size': 0.02, 'num_workers': 8, 'rollout_length': 1000, 'num_rollouts': 8, 'sampler': 'jacobian', 'n_iter': 2500, 'every_val': 10, 'shift': 0, 'optimizer': 'sgd', 'learning_rate': 0.01, 'num_hidden_dim': 8, 'num_learning_iterations': 10, 'gram_schmidt': True, 'variance_reduced': False, 'filter_corrected': True}
Sampler is jacobian; so a learner is created
Learner with 102->8->1 using SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: True
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 0
) with gs:True
TOP-K is 8


In [26]:
# Keep a handle on the policy; the next cell reuses `policy`.
policy = model.policy

# Observation-filter statistics, unpacked as (mu, std).
mu, std = policy.observation_filter.get_stats()

# Only the final expression of a cell is echoed, so the shape check goes last.
# NOTE(review): the original evaluated this (and get_stats) more than once and
# discarded the results; assumes both calls are read-only — confirm.
policy.get_weights_plus_stats().shape

In [30]:

# Imported here so the cell also runs on a fresh kernel: in the visible part of
# the notebook the only other `import numpy as np` sits in a later cell.
import numpy as np

mu, std = policy.observation_filter.get_stats()
# Add a leading axis so each statistic becomes a single row that can be
# stacked under the weight matrix.
mu_expanded = mu[np.newaxis, :]  # Shape becomes (1, 17)
std_expanded = std[np.newaxis, :]  # Shape becomes (1, 17)

# Concatenate weights, mu, and std along the first axis (rows)
aux = np.concatenate([policy.weights, mu_expanded, std_expanded], axis=0)
aux.shape

(8, 17)

In [93]:
import torch

# Bookkeeping for one hand-driven iteration.
num_episodes = 0
iteration = 0
validation_epoch = 10

# Give the sampler the current solution as a float row tensor (view(1, -1)).
model.sampler.current_solution = (
    torch.from_numpy(model.current_solution).view(1, -1).float()
)
model.ts = iteration + 1

# Evaluate whenever the iteration index is a multiple of `validation_epoch`
# (always the case here, since `iteration` is 0).
if not iteration % validation_epoch:
    model.evaluate(num_episodes)

# Sample directions, then run the rollouts for them.
directions = model.sampler.sample()
directions.shape  # evaluated but not displayed: it is not the cell's last expression
# 8*6*17
rewards, num_eval = model.rollouts(directions)
num_episodes = num_episodes + num_eval

{'AverageRewards': -0.7374557836879756, 'StdRewards': 0.7436504429711981, 'MaxRewards': 1.378270869378289, 'MinRewards': -2.2905339439886228}


In [60]:
# Feed the rollout rewards and the sampled directions to the model's update
# step (the recorded run printed two tensor values while doing so).
model.update(rewards, directions)

tensor(2326.3784)
tensor(2326.3784)


In [61]:
# Run the model's post-iteration cleanup, then display `train_stats`
# (an empty list in the recorded run).
model.post_iteration_cleanup()
model.train_stats 

[]

### How to evaluate the function

In [68]:
import numpy as np

# Share the current solution with the workers through the ray object store.
policy_id = ray.put(model.current_solution)
# floor(50 / num_workers) + 1 rollouts per worker, i.e. more than 50 in total.
rollout_per_worker = int(50/model.num_workers) + 1

# One remote call per worker; the perturbation argument is None and
# evaluate=True is passed for these rollouts.
rollouts = [
    worker.do_rollouts_same_policy.remote(policy_id, None,
                                          num_rollouts=rollout_per_worker,
                                          evaluate=True)
    for worker in model.workers
]

results = ray.get(rollouts)

# Stored under its own name on purpose: `rewards` is the dict returned by
# model.rollouts, which the update cells index with '+' / '-'. Rebinding it to
# a float array here broke those cells when the notebook is run top to bottom.
eval_rewards = []
for result in results:
    eval_rewards += result["rollout_rewards"]

eval_rewards = np.array(eval_rewards, dtype=np.float64)

### How the sampler works

In [83]:
# Inspect the sampler. Only the final expression (`dimension`, 102 in the
# recorded run) is displayed; `num_directions` is evaluated but not shown.
sampler = model.sampler
sampler.num_directions
sampler.dimension

102

### How to roll out
Select different directions $d_i$ and apply them as perturbations to the original parameters. Collect rollouts as an estimate of the value function $R_{\pi_{\theta}}(\tau)$. Finally, use these estimates to approximate the derivative with respect to $\theta$.

In [84]:
directions_n = directions.numpy()
rollout_per_worker = int(directions_n.shape[0] / model.num_workers)
# Only an even split of the directions across the workers is supported.
assert rollout_per_worker * model.num_workers == directions_n.shape[0]

# Put the current solution in the object store once; every remote call reuses it.
current_policy = ray.put(model.current_solution)

# Dispatch one remote rollout per direction. Direction i is handled by worker
# i % num_workers, which is the same order a (round, worker) nested loop yields.
rollouts = []
for direction_index in range(rollout_per_worker * model.num_workers):
    worker = direction_index % model.num_workers
    perturbation_for_worker = ray.put(directions_n[direction_index])
    rollouts.append(
        model.workers[worker].do_rollouts_same_policy.remote(
            current_policy, perturbation_for_worker, evaluate=False
        )
    )

results = ray.get(rollouts)

# Total number of steps reported across all results.
time = sum(result['steps'] for result in results)

# From the first 'rollout_rewards' entry of each result, collect the '+' / '-'
# rewards and their 'un_+' / 'un_-' counterparts.
pos_rollouts = [result['rollout_rewards'][0]['+'] for result in results]
neg_rollouts = [result['rollout_rewards'][0]['-'] for result in results]
pos_rollouts_un = [result['rollout_rewards'][0]['un_+'] for result in results]
neg_rollouts_un = [result['rollout_rewards'][0]['un_-'] for result in results]


### Update a policy
How to use the obtained reward data and the directions to update the policy.

In [110]:
# Let the model run its own update on the rewards and directions first.
model.update(rewards, directions)

# Element-wise larger of the '+' and '-' rewards, ranked in descending order
# along dim 0.
mx_rewards = torch.max(rewards['+'], rewards['-'])
ss, ind = mx_rewards.sort(dim=0, descending=True)

# Column 0 of the first `model.top_k` rows of the ranking.
chosen_indices = ind[:model.top_k, 0]

tensor(0.7072)
tensor(0.7072)


torch.Size([8, 102])

In [107]:

# Keep only the selected rows of the directions and of every reward tensor.
directions = directions[chosen_indices, :]
for key in ('+', '-', 'un_+', 'un_-'):
    rewards[key] = rewards[key][chosen_indices, :]

# Scale the '+' / '-' rewards by their joint std (unbiased=False); a std below 1
# is replaced by 1 before dividing.
stddev = torch.cat((rewards['+'], rewards['-']), 0).std(unbiased=False)
print(stddev)
if stddev < 1:
    stddev = 1
rewards['+'] /= stddev
rewards['-'] /= stddev
directional_grads = rewards['+'] - rewards['-']

# directions^T @ (r+ - r-), divided by the number of kept directions and then
# by the model's dir_std.
final_direction = directions.transpose(0, 1) @ directional_grads
final_direction = (final_direction / directions.shape[0]) / model.dir_std

# Append to the gradient history, dropping the oldest entry first when more
# than five are already stored.
if len(model.old_gradients) > 5:
    model.old_gradients.pop(0)
model.old_gradients.append(final_direction)

# Step the current solution in place along the estimated direction.
update = model.step_size * final_direction
model.current_solution += update.numpy().reshape(model.current_solution.shape)

# Sampler types that do not contain "unit_normal" also get update_models called.
if "unit_normal" not in model.sampler_type:
    model.update_models(model.current_solution, directions, rewards)

# Same std computation on the 'un_+' / 'un_-' rewards, printed for comparison.
stddev2 = torch.cat((rewards['un_+'], rewards['un_-']), 0).std(unbiased=False)
print(stddev2)


tensor(0.7072)
tensor(0.7072)


In [1]:
# Needs `model` from the first code cell: on a fresh kernel that cell has to be
# run first, otherwise this raises the NameError recorded below.
model.learner

NameError: name 'model' is not defined