# CartPol Environment to balance a stick

In [None]:
%pip install stable-baselines3[extra]

In [7]:
import os
import gymnasium as gym
from stable_baselines3 import PPO #PPO is the algorithm
from stable_baselines3.common.vec_env import DummyVecEnv #To vectorize the environments
from stable_baselines3.common.evaluation import evaluate_policy #evaluating the model
from IPython.display import display

display(env.render())


# Load Environment

In [21]:
env_name='CartPole-v1' #prebuilt environment
env=gym.make(env_name)

## Understanding Environment

In [9]:
print(env.reset())

(array([ 0.04919025, -0.0019744 ,  0.01278595,  0.00967455], dtype=float32), {})


In [10]:
print(env.action_space)
print(env.observation_space) #cart position, cart velocity, pole angle, pole angular velocity

Discrete(2)
Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)


In [11]:
print(env.action_space.sample() ) #every environment has an action space and an observation space
print(env.observation_space.sample())

1
[ 1.7875730e+00  1.3884745e+38 -9.2699967e-02  2.0723724e+35]


In [25]:
episodes=10 #testing the environment 5 times (for cartpole each episode is 200 frames)
for episode in range(episodes):
  state=env.reset() #initial state of the environment and set of observations
  done=False
  score=0

  while not done:
    env.render() #view the graphical representaion of the environment
    action=env.action_space.sample() #random action from the sample action space
    n_state,reward,done,info=env.step(action) #getting the new environment variables by passing the action into the environment, next state, reward, is it done?
    score+=reward
    print('Episode{} Score{}'.format(episode+1,score))

ValueError: too many values to unpack (expected 4)

## Training
 Model based or Model free?

Model-free: only uses the current state values to make a decision
A2C, DDPG, DQN, HER, PPO, SAC, TD3

Model-based: makes a prediction about the future state of a model to make the best decision for the desired outcome

 (Model-free algorithm:PPO stable baselines only deals with model-free
 RL lib can be used for model-based)

 spinningup.openai.com


Algorithm choice depends on:
1. Action space

In [None]:
#logging and saving logs
#log_path=os.path.join('training','logs')
model=PPO('MlpPolicy',env,verbose=1,tensorboard_log="logs")

In [13]:
env=DummyVecEnv([lambda:env]) #part of environment creation

In [14]:
model=PPO('MlpPolicy',env,verbose=1) #defining the model, multilayer perceptron policy, env

Using cpu device


In [15]:
model.learn(total_timesteps=20000)

-----------------------------
| time/              |      |
|    fps             | 1566 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 949        |
|    iterations           | 2          |
|    time_elapsed         | 4          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00833503 |
|    clip_fraction        | 0.0729     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.687     |
|    explained_variance   | -0.00518   |
|    learning_rate        | 0.0003     |
|    loss                 | 6.91       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0108    |
|    value_loss           | 56.2       |
----------------------------------------
-----------------------------------------
| time/   

<stable_baselines3.ppo.ppo.PPO at 0x1f278afdad0>

In [16]:
model.save("ppo_path")

In [17]:
del model

In [26]:
model=PPO.load("ppo_path",env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


# Evaluation

Training Metrics depend on the algorithm

- Evaluation metrics
    - Episode Length Mean
    - Reward Mean

- Time Metrics
    - fps
    - iterations
    - time elapsed
    - total_timesteps

- Loss Metrics
    - Entropy_loss
    - Policy loss
    - Value Loss

- Other metrics
    - Explained variance
    - Learning rate
    - n_updates

In [27]:
evaluate_policy(model,env,n_eval_episodes=10,render=True)

(500.0, 0.0)

In [20]:
env.close()

# Testing and deploying

Time to use the model to take decisions
so we predict from the observations and choose the best possible action based on the prediction from the observation

obs=env.reset

model.predict(obs)


In [28]:
#action, _=model.predict(obs)
env=gym.make('CartPole-v1')
env=DummyVecEnv([lambda:env])
obs=env.reset() #initial observations
model.predict(obs) #gives a prediction of action (not next state)


(array([0], dtype=int64), None)

In [29]:
episodes=10
for episode in range(episodes):
  obs=env.reset()
  done=False
  score=0

  while not done:
    env.render()
    action,_=model.predict(obs)
    obs,reward,done,info=env.step(action)
    score+=reward
    print('Episode{} Score{}'.format(episode+1,score))

Episode1 Score[1.]
Episode1 Score[2.]
Episode1 Score[3.]
Episode1 Score[4.]
Episode1 Score[5.]
Episode1 Score[6.]
Episode1 Score[7.]
Episode1 Score[8.]
Episode1 Score[9.]
Episode1 Score[10.]
Episode1 Score[11.]
Episode1 Score[12.]
Episode1 Score[13.]
Episode1 Score[14.]
Episode1 Score[15.]
Episode1 Score[16.]
Episode1 Score[17.]
Episode1 Score[18.]
Episode1 Score[19.]
Episode1 Score[20.]
Episode1 Score[21.]
Episode1 Score[22.]
Episode1 Score[23.]
Episode1 Score[24.]
Episode1 Score[25.]
Episode1 Score[26.]
Episode1 Score[27.]
Episode1 Score[28.]
Episode1 Score[29.]
Episode1 Score[30.]
Episode1 Score[31.]
Episode1 Score[32.]
Episode1 Score[33.]
Episode1 Score[34.]
Episode1 Score[35.]
Episode1 Score[36.]
Episode1 Score[37.]
Episode1 Score[38.]
Episode1 Score[39.]
Episode1 Score[40.]
Episode1 Score[41.]
Episode1 Score[42.]
Episode1 Score[43.]
Episode1 Score[44.]
Episode1 Score[45.]
Episode1 Score[46.]
Episode1 Score[47.]
Episode1 Score[48.]
Episode1 Score[49.]
Episode1 Score[50.]
Episode1 