# **1. Import dependencies**

In [80]:
!pip install stable-baselines3[extra]



In [81]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import numpy as np
import matplotlib.pyplot as plt

# **2. Load Environment**

In [82]:
 environment_name = 'CartPole-v1'
 env = gym.make(environment_name,render_mode="rgb_array")

In [83]:
#Insight of the environment
episodes = 5
#Number of full games in the environment insight
for episode in range(1,episodes+1):
  #Reset our observations in the current environment and obtain initial observations
  state = env.reset()
  done = False;
  score = 0

  while not done:
    #View the graphical representation in the current environment
    env.render()
    #Generate random actions
    action = env.action_space.sample()
    #Apply an action to the evnironment
    n_state, reward, done, info = env.step(action)[:4]
    score = score + reward
  print('Episode:{} Score:{}'.format(episode,score))
#Close the render
env.close()

Episode:1 Score:13.0
Episode:2 Score:18.0
Episode:3 Score:16.0
Episode:4 Score:21.0
Episode:5 Score:14.0


In [94]:
env = gym.make(environment_name,render_mode="rgb_array")
#Wrapper dummy for the environment
env = DummyVecEnv([lambda:env])
#Creating the agent
model = PPO('MlpPolicy',env, verbose = 1)

Using cpu device


In [95]:
#Training the model
model.learn(total_timesteps=20000)

Logging to Training/Logs/PPO_2
-----------------------------
| time/              |      |
|    fps             | 1133 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 812        |
|    iterations           | 2          |
|    time_elapsed         | 5          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00818188 |
|    clip_fraction        | 0.0938     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.687     |
|    explained_variance   | 0.00521    |
|    learning_rate        | 0.0003     |
|    loss                 | 4.86       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0145    |
|    value_loss           | 45.2       |
----------------------------------------
---------------------

<stable_baselines3.ppo.ppo.PPO at 0x781ed9d2e3e0>

# **4. Evaluation**



In [96]:
#Evaluating the reward, it needs to be over 200 to be considered as solved
evaluate_policy(model, env, n_eval_episodes=10, render = True)

(500.0, 0.0)

In [97]:
env.close()

# **5. Testing**

In [98]:
#Our agent is controlling the CartPole
episodes = 5
for episode in range(1,episodes+1):
  obs = env.reset()
  done = False;
  score = 0

  while not done:
    env.render()
    #Predicting the agent moves
    action, _ = model.predict(obs)
    obs, reward, done, info = env.step(action)[:4]
    score = score + reward
  print('Episode:{} Score:{}'.format(episode,score))

env.close()

Episode:1 Score:[500.]
Episode:2 Score:[419.]
Episode:3 Score:[500.]
Episode:4 Score:[500.]
Episode:5 Score:[500.]
