In [1]:
# !pip install stable-baselines3[extra] 
#Python library to work with model-free RL tasks 
# !pip install gymnasium

In [1]:
import os
import gymnasium as gym #open ai's library for environments 
from stable_baselines3 import PPO #Proximal policy optimization
from stable_baselines3.common.vec_env import DummyVecEnv #to vectorize the environment 
from stable_baselines3.common.evaluation import evaluate_policy #returns average rewards for episodes and also std deviation for our agent

In [9]:
env_name='CartPole-v1'
env=gym.make(env_name,render_mode='human')

UNDERSTANDING THE ENVIRONMENT

In [10]:
print(f"a random action that is possible:{env.action_space.sample()}\n") #0-->push cart to left, 1-->push cart to right
initial_state, info = env.reset()
cart_position = initial_state[0]
cart_velocity = initial_state[1]
pole_angle = initial_state[2]
pole_angular_velocity = initial_state[3]
print(f"A random possible state in the environment\ncart position: {cart_position}\ncart velocity: {cart_velocity}\npole angle: {pole_angle}\npole angular velocity:  {pole_angular_velocity}\n")
new_state, reward, terminated, truncated, info = env.step(env.action_space.sample())
done = terminated or truncated
print(f"Outcome of an action\nNew state: {new_state}\nReward: {reward}\nDone: {done}\nInfo: {info}")

a random action that is possible:0

A random possible state in the environment
cart position: -0.017919812351465225
cart velocity: 0.02868456393480301
pole angle: 0.03616403043270111
pole angular velocity:  0.03311875835061073

Outcome of an action
New state: [-0.01734612 -0.16693683  0.03682641  0.3369889 ]
Reward: 1.0
Done: False
Info: {}


sample run and visualization of environment, action and reward

In [11]:
eps=5
for e in range(eps):
    env.reset()
    done=False
    score=0
    while not done:
        env.render()
        action=env.action_space.sample()
        new_state, reward, terminated, truncated, info = env.step(action)
        score+=reward
        done=terminated or truncated
    print(f"episode {e}, score = {score}")

episode 0, score = 28.0
episode 1, score = 13.0
episode 2, score = 26.0
episode 3, score = 31.0
episode 4, score = 12.0


In [12]:
env.close()

### TRAINING THE MODEL

In [13]:
log_path=os.path.join('training','logs')

In [14]:
env=gym.make(env_name)
env=DummyVecEnv([lambda: env])
agent=PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

Using cpu device


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
total_timesteps=5000
agent.learn(total_timesteps)

Logging to training\logs\PPO_1


-----------------------------
| time/              |      |
|    fps             | 926  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 684        |
|    iterations           | 2          |
|    time_elapsed         | 5          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00859977 |
|    clip_fraction        | 0.114      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.686     |
|    explained_variance   | -0.000638  |
|    learning_rate        | 0.0003     |
|    loss                 | 7.73       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0177    |
|    value_loss           | 51.2       |
----------------------------------------
-----------------------------------------
| time/   

<stable_baselines3.ppo.ppo.PPO at 0x15a38a54310>

In [16]:
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_model')
# agent.save(PPO_path)

In [17]:
#del agent
# agent= PPO.load(PPO_path, env=env)

### EVALUATION

In [18]:
# for cartpole an avg score of 200 means it is solved
env=gym.make(env_name,render_mode='human')
evaluate_policy(agent,env,n_eval_episodes=10,render=True)



(226.7, 114.89477794921753)

(older values)<br>
+ ***Average Reward (453.1)*** : The agent is doing quite well, balancing the pole for most of the episode length (near 500 steps).<br>
+ ***Standard Deviation (109.77)*** : The agent's performance is somewhat inconsistent across episodes. Some episodes might see the agent failing earlier, while in others it performs near the maximum.

In [19]:
env.close()

### Testing the agent

In [26]:
env=gym.make(env_name,render_mode='human')
eps=5
for e in range(eps):
    observation,info=env.reset()
    done=False
    score=0
    while not done:
        env.render()
        action,_=agent.predict(observation) #returns action, next state
        observation, reward, terminated, truncated, info = env.step(action)
        score+=reward
        done=terminated or truncated
    print(f"episode {e}, score = {score}")

episode 0, score = 98.0
episode 1, score = 73.0
episode 2, score = 128.0
episode 3, score = 41.0
episode 4, score = 60.0


In [27]:
env.close()

### Viewing logs in tensorboard

In [28]:
training_log_path=os.path.join(log_path,'PPO_1')

In [31]:
!tensorboard --logdir={training_log_path}

^C


Average reward & average episode length are important measures to evaluate the model <br>
to improve: <br>
1. train for longer <br>
2. Hyper-parameter tuning <br>
3. Try different algorithms