In [1]:
import numpy as np
import matplotlib.pyplot as plt
import time
import tqdm
import gymnasium as gym
import moviepy.editor as mpy
import stable_baselines3 as sb3
from stable_baselines3 import PPO,SAC,A2C,DQN
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

In [2]:
# env = gym.make("Pusher-v4",render_mode="rgb_array")
env = make_vec_env("Pusher-v4", n_envs=4)
env = VecNormalize(env,norm_obs=True, norm_reward=True, clip_obs=10.)

In [3]:
print("_____OBSERVATION SPACE_____ \n")
print("Observation Space", env.observation_space)
print("Sample observation", env.observation_space.sample())  # Get a random observation

_____OBSERVATION SPACE_____ 

Observation Space Box(-inf, inf, (23,), float64)
Sample observation [ 0.10693588 -1.45736352  0.56168233  0.31165048 -0.19846846 -1.06896805
 -1.39573209  0.65461877  0.24592515  0.04769211 -0.83719801  1.3144478
  1.10735103 -1.62614845 -0.49208871 -0.5053764   1.34836321 -0.96561512
  0.66317161 -1.73365373 -1.13627005 -1.16616795  0.75919473]


In [4]:
print("\n _____ACTION SPACE_____ \n")
print("Action Space Shape", env.action_space.shape)
print("Action Space Sample", env.action_space.sample())  # Take a random action


 _____ACTION SPACE_____ 

Action Space Shape (7,)
Action Space Sample [ 1.5906184   1.1811805  -0.53761786 -0.14646472  1.5923935  -1.8549227
  1.3608754 ]


In [5]:
policy_kwargs = dict(
    net_arch=[dict(pi=[64, 64], vf=[64, 64])]
)

In [6]:
model = A2C(policy = "MlpPolicy",
            env = env
            ,policy_kwargs=policy_kwargs, verbose=1, 
            learning_rate=0.0007, n_steps=5, gamma=0.99, gae_lambda=0.95, 
            ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5)

Using cuda device


In [7]:
model.learn(total_timesteps=2_000_000)

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 100      |
|    ep_rew_mean        | -122     |
| time/                 |          |
|    fps                | 336      |
|    iterations         | 100      |
|    time_elapsed       | 5        |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -9.88    |
|    explained_variance | 0.942    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.778   |
|    std                | 0.993    |
|    value_loss         | 0.0485   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 100      |
|    ep_rew_mean        | -120     |
| time/                 |          |
|    fps                | 222      |
|    iterations         | 200      |
|    time_elapsed       | 17       |
|    total_timesteps    | 4000     |
|

<stable_baselines3.a2c.a2c.A2C at 0x7f312baeb400>

In [8]:
model.save("a2c-Pusher-v4")
env.save("vec_normalize.pkl")

In [9]:


eval_env = DummyVecEnv([lambda: gym.make("Pusher-v4")])
eval_env = VecNormalize.load("vec_normalize.pkl", eval_env)

# We need to override the render_mode
eval_env.render_mode = "rgb_array"

#  do not update them at test time
eval_env.training = False
# reward normalization is not needed at test time
eval_env.norm_reward = False

# Load the agent
model = A2C.load("a2c-Pusher-v4")

mean_reward, std_reward = evaluate_policy(model, eval_env)

print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")

Mean reward = -44.40 +/- 4.30


In [12]:
mean_reward, std_reward

(-64.71723785400391, 2.338777727123102)

In [33]:
from huggingface_hub import notebook_login

In [34]:
notebook_login()
!git config --global credential.helper store

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
env_id = "Pusher-v4"

In [37]:
from huggingface_sb3 import package_to_hub

package_to_hub(
    model=model,
    model_name=f"a2c-{env_id}",
    model_architecture="A2C",
    env_id=env_id,
    eval_env=eval_env,
    repo_id=f"Hevagog/a2c-{env_id}",
    commit_message="Initial commit",
)

[38;5;4mℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: if you encounter a bug, please open an issue.[0m


  logger.warn(


Saving video to /tmp/tmpsghlryvz/-step-0-to-step-1000.mp4
[38;5;4mℹ Pushing repo Hevagog/a2c-Pusher-v4 to the Hugging Face Hub[0m


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

a2c-Pusher-v4.zip:   0%|          | 0.00/125k [00:00<?, ?B/s]

vec_normalize.pkl:   0%|          | 0.00/2.78k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

[38;5;4mℹ Your model is pushed to the Hub. You can view your model here:
https://huggingface.co/Hevagog/a2c-Pusher-v4/tree/main/[0m


CommitInfo(commit_url='https://huggingface.co/Hevagog/a2c-Pusher-v4/commit/75433f55ab397a3b7bd62fe5c151b248c48d22e8', commit_message='Initial commit', commit_description='', oid='75433f55ab397a3b7bd62fe5c151b248c48d22e8', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
eval_env = DummyVecEnv([lambda: gym.make("Pusher-v4")])
eval_env = VecNormalize.load("vec_normalize.pkl", eval_env)
eval_env.render_mode = "rgb_array"

In [11]:
model_loaded = A2C.load("a2c-Pusher-v4", env=eval_env)

In [12]:
eval_env = gym.make("Pusher-v4", render_mode="rgb_array")

In [13]:
frames = []
obs,*_ = eval_env.reset()

for i in tqdm.tqdm(range(15_000)):  # Adjust episode length as needed
    action, _ = model_loaded.predict(obs)
    obs, reward, done, info,*_ = eval_env.step(action)
    eval_env.render()  # Render the environment without blocking

    if eval_env.render_mode == 'rgb_array':
        frame = eval_env.render()  # Capture the rendered frame
        frames.append(frame)

    if done:
        break

eval_env.close()  # Close the environment

# Create the video from the captured frames
if frames:
    clip = mpy.ImageSequenceClip(frames, fps=24)  # Adjust FPS if needed
    clip.write_videofile("Pusher-v4.mp4", codec='libx264')


  0%|          | 0/15000 [00:00<?, ?it/s]

100%|██████████| 15000/15000 [33:14<00:00,  7.52it/s]


Moviepy - Building video Pusher-v4.mp4.
Moviepy - Writing video Pusher-v4.mp4



                                                                   

Moviepy - Done !
Moviepy - video ready Pusher-v4.mp4




: 

In [50]:
eval_env = make_vec_env("Pusher-v4", n_envs=1)  # Create a VecEnv with 1 environment
eval_env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

# obs, _ = eval_env.reset()  # Unpack the tuple, discard info
# action, _states = model.predict(obs)  # Use obs for prediction
