# CartPole Environment: balancing a pole on a cart

In [None]:
!pip install stable-baselines3[extra]

In [73]:
import os
import gymnasium as gym
from stable_baselines3 import PPO #PPO is the algorithm
from stable_baselines3.common.vec_env import DummyVecEnv #To vectorize the environments
from stable_baselines3.common.evaluation import evaluate_policy #evaluating the model

# Load Environment

In [105]:
 # Id of the prebuilt environment, and the environment instance built from it.
 env_name = 'CartPole-v1'
 env = gym.make(env_name)

## Understanding Environment

In [43]:
# Gymnasium's reset() returns an (observation, info) pair, so unpack it and
# print only the initial observation vector.
initial_obs, reset_info = env.reset()
print(initial_obs)

[ 0.04332942 -0.04489931  0.01443436  0.03380908]


In [None]:
# Print the action space, then the observation space.
# Observation components: cart position, cart velocity, pole angle, pole angular velocity.
for space in (env.action_space, env.observation_space):
    print(space)

In [23]:
# Every environment has an action space and an observation space;
# draw one random sample from each and print it.
sampled_action = env.action_space.sample()
print(sampled_action)
sampled_observation = env.observation_space.sample()
print(sampled_observation)

1
[ 8.8849670e-01  2.5844497e+38 -3.6456153e-01  2.2438203e+38]


In [None]:
# Play a few episodes with purely random actions to get a feel for the environment.
episodes = 10  # number of test episodes (CartPole-v1 cuts an episode off after 500 steps)
for episode in range(episodes):
  # Gymnasium's reset() returns (observation, info), not just the observation.
  state, info = env.reset()
  done = False
  score = 0

  while not done:
    # Draws a frame only when the env was built with a render_mode.
    # NOTE(review): gym.make() in this notebook passes none -- confirm rendering is wanted here.
    env.render()
    action = env.action_space.sample()  # random action from the action space
    # Gymnasium's step() returns five values; the episode is over when it either
    # terminates (pole fell / cart left the track) or is truncated (step limit).
    n_state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    score += reward

  # Report once per episode, after it has finished, rather than on every step.
  print('Episode{} Score{}'.format(episode+1, score))

## Training
Model-based or model-free?

Model-free: uses only the current state values to make a decision.
Examples: A2C, DDPG, DQN, HER, PPO, SAC, TD3

Model-based: predicts the future state of the environment in order to choose the best action for the desired outcome.

(PPO is a model-free algorithm. Stable Baselines only provides model-free algorithms;
RLlib can be used for model-based ones.)

 spinningup.openai.com


Algorithm choice depends on:
1. Action space

In [26]:
# Build a PPO agent with a multilayer-perceptron policy and write its
# TensorBoard logs to the "logs" directory.
# (Alternative log location, currently disabled: os.path.join('training', 'logs').)
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log="logs",
)

'training/logs'

In [63]:
# Wrap the environment in a single-env vectorized wrapper (part of environment creation).
# The factory builds a fresh environment instead of closing over `env` itself, so the cell
# is idempotent: re-running it never tries to wrap an already vectorized environment.
env = DummyVecEnv([lambda: gym.make(env_name)])



In [75]:
# Define the agent: PPO with a multilayer-perceptron policy ('MlpPolicy') acting on `env`.
model = PPO(policy='MlpPolicy', env=env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
# Train the agent for 20,000 environment timesteps.
model.learn(total_timesteps=20_000)

In [79]:
# Persist the trained agent to disk under the name "ppo_path".
model.save(path="ppo_path")

In [80]:
# Drop the in-memory agent; the next cell reloads it from the saved file.
del model

In [81]:
# Restore the agent saved as "ppo_path" and attach it to the current environment.
model = PPO.load("ppo_path", env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


# Evaluation

Training Metrics depend on the algorithm

- Evaluation metrics
    - Episode Length Mean
    - Reward Mean

- Time Metrics
    - fps
    - iterations
    - time elapsed
    - total_timesteps

- Loss Metrics
    - Entropy_loss
    - Policy loss
    - Value Loss

- Other metrics
    - Explained variance
    - Learning rate
    - n_updates

In [82]:
# Roll the policy out for 10 evaluation episodes (rendering as it goes) and
# display the resulting (mean reward, reward standard deviation) pair.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, render=True)
mean_reward, std_reward



(500.0, 0.0)

In [106]:
# Close the environment now that evaluation is done.
env.close()

# Testing and deploying

Time to use the model to make decisions:
we pass the current observation to the model, and it predicts the best action to take.

obs = env.reset()

model.predict(obs)


In [122]:
# Build a fresh single-env vectorized CartPole, take its initial observation, and ask
# the trained policy for a prediction. The prediction is an action, not the next state.
env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
obs = env.reset()  # initial observations
model.predict(obs)


[[-0.03619986  0.02306512  0.0447387  -0.01729192]]


(array([0]), None)

In [None]:
# Mount Google Drive into the Colab filesystem (this import only exists inside Google Colab).
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Play a few episodes with the trained policy choosing every action.
episodes = 10  # number of episodes to run
for episode in range(episodes):
  obs = env.reset()  # the vectorized env's reset returns the batched observations only
  done = False
  score = 0

  while not done:
    env.render()
    # predict() returns (action, state); the state slot is not needed here.
    # A named throwaway is used because `_` is IPython's "last output" variable.
    action, _states = model.predict(obs)
    # The vectorized env's step returns batched (obs, rewards, dones, infos).
    # `env` wraps exactly one sub-environment, so read index 0 to get plain scalars.
    obs, rewards, dones, info = env.step(action)
    score += float(rewards[0])
    done = bool(dones[0])

  # Report once per episode, after it has finished, rather than on every step.
  print('Episode{} Score{}'.format(episode+1, score))