# 1. Install Stable Baselines

In [4]:
#!pip install stable-baselines3[extra]

In [5]:
import os 
import gym #for building environments
from stable_baselines3 import PPO#Selected a one basic model for it 
from stable_baselines3.common.vec_env import DummyVecEnv#mostly open Ai gym allows you to train on multiple environment this is the wrapper function
from stable_baselines3.common.evaluation import evaluate_policy #for Evaluation of Policy

<!-- Box: n-dimensional tensor with a range of values
E.g. Box(0, 1, shape=(3, 3))
Discrete: set of items
E.g. Discrete(3)
Tuple: tuple of other spaces, e.g. Box or Discrete
E.g. Tuple((Discrete(2), Box(0, 100, shape=(1,))))
Dict: dictionary of spaces, e.g. Box or Discrete
E.g. Dict({'height': Discrete(2), 'speed': Box(0, 100, shape=(1,))})
MultiBinary: one-hot encoded binary values
E.g. MultiBinary(4)
MultiDiscrete: multiple discrete values
E.g. MultiDiscrete([2, 4, 2])
-->

# 2. Load an Environment

In [7]:
# Gym registry id of the task; a later cell reuses this name to re-create the environment.
environment_name = 'CartPole-v0'
# Build the environment from that id.
env = gym.make(environment_name)

In [6]:
# Display the configured environment id.
environment_name

'CartPole-v0'

In [15]:
# Baseline: play a few episodes with randomly sampled actions and print each
# episode's total reward.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        state = env.reset()  # initial observation (not used by the random agent)
        done = False
        score = 0

        while not done:
            env.render()  # view the environment
            action = env.action_space.sample()  # pick a random action from the action space
            # step() returns (observation, reward, done, info)
            n_states, reward, done, info = env.step(action)
            score += reward
        print('Episode: {} Score: {}'.format(episode, score))
finally:
    # Close the environment even when the rollout raises or is interrupted,
    # so a render window is not left open.
    env.close()







Episode: 1 Score: 35.0
Episode: 2 Score: 11.0
Episode: 3 Score: 17.0
Episode: 4 Score: 12.0
Episode: 5 Score: 46.0


In [16]:
# An episode is like one full game. Some environments cap an episode at a fixed length (e.g. 200 frames for CartPole), while in others the length is variable.

# 3. Understanding the Environment

In [17]:
# env.action_space  # shows the type of the action space
# env.action_space.sample()  # returns 0 or 1: there are two discrete actions, 0 = left and 1 = right

In [24]:
env.observation_space  # not the last expression in the cell, so its value is not displayed
env.observation_space.sample()  # draw one random sample from the observation space
# Observation layout: cart position, cart velocity, pole angle, pole angular velocity

array([-4.4787750e+00, -2.7301290e+37, -1.2654702e-01, -1.6757597e+38],
      dtype=float32)

In [20]:
# Reset the environment; the value shown is the initial observation.
env.reset()

array([-0.01098211,  0.01933999,  0.02342481,  0.01444309])

In [21]:
# Apply action 1 for a single step; the result is (observation, reward, done, info).
env.step(1)

(array([-0.01059531,  0.2141183 ,  0.02371367, -0.27075796]), 1.0, False, {})

# 4. Training a model

In [1]:
# Directory handed to PPO as its TensorBoard log location. It is created here
# so the "make directory first" step does not have to be done by hand;
# exist_ok=True keeps the cell safe to re-run.
log_path = os.path.join('Training', 'Logs')
os.makedirs(log_path, exist_ok=True)


In [2]:
# Display the log directory path.
log_path

'Training\\Logs'

In [8]:
# Wrap a single environment in DummyVecEnv. The factory builds its own
# environment instead of closing over `env`, which is rebound on this very
# line (the closure form only works if the factory is called before the
# assignment completes).
env = DummyVecEnv([lambda: gym.make(environment_name)])
# PPO with an MLP policy; verbose=1 prints training progress and metrics are
# logged for TensorBoard under log_path.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [9]:
# Train for 20,000 timesteps. learn() is the last expression in the cell, so
# the returned model's repr is echoed after the training log.
model.learn(total_timesteps=20000)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Logging to Training\Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 298  |
|    iterations      | 1    |
|    time_elapsed    | 6    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 399         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009219332 |
|    clip_fraction        | 0.109       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00669    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.05        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0176     |
|    value_loss           | 57.2        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x1dc95ffc788>

# 5. Save Model

In [10]:
# Location of the saved PPO checkpoint (kept under the same Training/Logs folder).
PPO_PATH = os.path.join(
    'Training',
    'Logs',
    'PPO_MODEL_MY_CARTPOLE',
)

In [11]:
# Write the trained model to PPO_PATH.
model.save(PPO_PATH)

In [12]:
# Drop the in-memory model so the next cell demonstrates reloading it from disk.
del model

In [13]:
# Load the saved model from PPO_PATH and attach it to the current environment.
model = PPO.load(PPO_PATH , env=env)

# 6. Evaluate model


In [14]:
# Run 10 rendered evaluation episodes; evaluate_policy returns the average
# episode reward and its standard deviation.
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    render=True,
)
mean_reward, std_reward



(200.0, 0.0)

# 7. Test Model

In [19]:
# Watch the trained agent: play a few episodes with actions chosen by the
# model and print each episode's total reward.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        obs = env.reset()  # initial observation
        done = False
        score = 0

        while not done:
            env.render()  # view the environment
            action, _ = model.predict(obs)  # the model, not random sampling, picks the action
            obs, reward, done, info = env.step(action)  # apply the action to the environment
            # NOTE(review): env is expected to be the DummyVecEnv wrapper here, so
            # reward is array-valued and the score prints like "[200.]".
            score += reward
        print('Episode: {} Score: {}'.format(episode, score))
finally:
    # Close the environment even when the rollout raises or is interrupted,
    # so a render window is not left open.
    env.close()


Episode: 1 Score: [200.]
Episode: 2 Score: [200.]
Episode: 3 Score: [200.]
Episode: 4 Score: [200.]
Episode: 5 Score: [200.]


In [None]:
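# A score of 200 arises because each action earns a reward of 1 while the pole has not fallen (and no further reward once it falls), so the return (score) is the sum of the rewards over the episode.
# These are model-free environments (the agent is not given a model of the environment's dynamics).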