# CartPole

---

# Game Setup

In [1]:
import warnings
warnings.simplefilter("ignore")

In [2]:
# Import game environment
import gym

In [3]:
# Build the CartPole-v1 environment from the gym registry
env = gym.make(id="CartPole-v1")

In [4]:
# Inspect the action space; the cell output below is its repr
# (Discrete(2), i.e. two discrete actions).
env.action_space

Discrete(2)

In [5]:
# Inspect the observation space; the cell output below is its repr
# (a float32 Box of shape (4,) with per-component lower/upper bounds).
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [6]:
# Sanity-check the environment by playing 1000 steps with random actions.
# `done` starts as True so that the very first iteration performs a reset.
done = True

for step in range(1000):
    if done:
        # Previous episode finished (or nothing started yet): begin a new one
        env.reset()
    random_action = env.action_space.sample()
    new_state, reward, done, info = env.step(random_action)
    # Draw the current frame after every step
    env.render()

In [7]:
# Close the environment now that the random-action demo has finished.
# NOTE(review): `env` is used again by later cells (reset / wrapping) after
# this close() — confirm the installed gym version allows reuse after close.
env.close()

# Preprocessing Environment

In [6]:
# Reset once and report the shape of a single, un-vectorized observation
state = env.reset()
print("dimensions =", state.shape)

dimensions = (4,)


In [7]:
from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv

# Wrapping inside a dummy (single-process) vectorized environment.
# The isinstance guard keeps this cell idempotent: re-running it must not
# wrap an already-vectorized environment a second time.
if not isinstance(env, VecEnv):
    # Bind the unwrapped env to its own name so the factory below does not
    # close over `env`, which is rebound to the wrapper on the next line.
    raw_env = env
    env = DummyVecEnv([lambda: raw_env])

# Observations returned by the wrapper carry a leading batch axis
state = env.reset()
print("dimensions =", state.shape)

dimensions = (1, 4)


# Training The RL Model

In [9]:
from stable_baselines3 import PPO

# Fixed seed so that training is repeatable across kernel restarts
# (policy initialisation and action sampling are otherwise random).
SEED = 42

# Creating PPO model with the multi-layer-perceptron policy on the wrapped
# environment; verbose=1 prints the training log tables during learn().
model = PPO("MlpPolicy", env, verbose=1, seed=SEED)

Using cpu device


In [11]:
# Train the PPO agent for a fixed budget of environment steps.
# learn() is the last expression, so its return value (the model) is
# what the cell displays after the training log.
TOTAL_TIMESTEPS = 60_000
model.learn(total_timesteps=TOTAL_TIMESTEPS)

-----------------------------
| time/              |      |
|    fps             | 2730 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1831        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008355977 |
|    clip_fraction        | 0.117       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00435     |
|    learning_rate        | 0.0003      |
|    loss                 | 6.8         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0184     |
|    value_loss           | 60.3        |
-----------------------------------------
----------------------------------

------------------------------------------
| time/                   |              |
|    fps                  | 1404         |
|    iterations           | 13           |
|    time_elapsed         | 18           |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0015157561 |
|    clip_fraction        | 0.00703      |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.463       |
|    explained_variance   | 0.0649       |
|    learning_rate        | 0.0003       |
|    loss                 | 0.113        |
|    n_updates            | 120          |
|    policy_gradient_loss | -0.00127     |
|    value_loss           | 16.5         |
------------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1403         |
|    iterations           | 14           |
|    time_elapsed         | 20           |
|    total_

-----------------------------------------
| time/                   |             |
|    fps                  | 1389        |
|    iterations           | 24          |
|    time_elapsed         | 35          |
|    total_timesteps      | 49152       |
| train/                  |             |
|    approx_kl            | 0.005469546 |
|    clip_fraction        | 0.0729      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.442      |
|    explained_variance   | 0.419       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0227     |
|    n_updates            | 230         |
|    policy_gradient_loss | -0.00355    |
|    value_loss           | 0.0197      |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1388        |
|    iterations           | 25          |
|    time_elapsed         | 36          |
|    total_timesteps      | 51200 

<stable_baselines3.ppo.ppo.PPO at 0x25bfcb2fdc0>

In [13]:
# Saving the trained model to disk under the given base file name
model.save(path="best_model_60000")

# Testing The Model

In [14]:
# Loading model
# `load` is a classmethod that builds a new model from the saved file rather
# than updating an instance in place, so it is called on the PPO class; this
# also means the cell no longer needs a previously trained `model` object.
model = PPO.load("best_model_60000")

In [12]:
# Starting the game
state = env.reset()
# Run the trained policy for 1000 environment steps (not episodes).
# There is no explicit reset on `done`: the vectorized wrapper is expected to
# reset a finished episode automatically (see the SB3 VecEnv documentation).
for step in range(1000):
    # deterministic=True takes the policy's most likely action instead of
    # sampling, so the rollout reflects what the agent has actually learned
    action, _states = model.predict(state, deterministic=True)
    state, reward, done, info = env.step(action)
    env.render()

In [15]:
# Close the environment once the trained agent's rollout is finished
env.close()