# Installing Stablebaseline dependencies

In [None]:
#!pip install stable-baselines3[extra]

# 1. Importing Necessary Dependencies

In [14]:
import gym
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import A2C
import os

# 2. Loading an Environment

In [3]:
environment_name = 'MountainCar-v0'
env = gym.make(environment_name)

In [4]:
environment_name

'MountainCar-v0'

# 3. Understand an Environment

In [6]:
env.action_space #this contains discreate action space 0,1,2
#0 Accelerate to left
#1 no Accelerate
#2 Accelerate to right

Discrete(3)

In [7]:
env.observation_space # contains two observation space 
# -1.2 is minimum position and 0.6 is maximum position
# -0.07 is minimum speed and 0.07 is maximum speed

Box([-1.2  -0.07], [0.6  0.07], (2,), float32)

In [8]:
env.reset() #set random value of postion and speed

array([-0.43264476,  0.        ])

# 4. Watching model without traning

In [10]:
episode = 5
for i in range(1,episode+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_states,reward,done,info = env.step(action=action)
        score+=reward
    print(f'Episode: {i}, Score: {score}')
env.close()


Episode: 1, Score: -200.0
Episode: 2, Score: -200.0
Episode: 3, Score: -200.0
Episode: 4, Score: -200.0
Episode: 5, Score: -200.0


# 5. Training_Model

In [12]:
train_path = os.path.join('Training','Logs')


In [15]:
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])#wrapped our environment 
model = A2C('MlpPolicy',env, verbose=1, tensorboard_log=train_path)

Using cpu device


In [19]:
model.learn(total_timesteps=1000000)

Logging to Training\Logs\A2C_2
------------------------------------
| time/                 |          |
|    fps                | 562      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.09    |
|    explained_variance | -0.0155  |
|    learning_rate      | 0.0007   |
|    n_updates          | 2099     |
|    policy_loss        | -0.00324 |
|    value_loss         | 1.11e-05 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 523      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.09    |
|    explained_variance | -0.0206  |
|    learning_rate      | 0.0007   |
|    n_updates          | 2199     |
|    policy_loss        | -0.0005  |
|    va

<stable_baselines3.a2c.a2c.A2C at 0x1d417fed1c8>

# 6. Saving Model

In [21]:
save_path = os.path.join('Training','Logs','Mountain_Car_A2C')

In [23]:
model.save(save_path)

In [24]:
del model

In [25]:
model = A2C.load(save_path , env=env)

# 7. Evaluate Model

In [26]:
evaluate_policy(model, env, n_eval_episodes=50, render=True)#return avearage and standard deviation
env.close()



# 8. Test Model

In [27]:
episodes = 5
for episode in range(1, episodes+1):
    obs = env.reset()#Initial observations
    done = False
    score =0

    while not done:
        env.render()#view environment
        action, _ = model.predict(obs) #we use model to do cartpole now
        obs,reward,done,info = env.step(action)#apply action to the environment 
        score +=reward
    print('Episode: {} Score: {}'.format(episode,score))
env.close()#close environment


Episode: 1 Score: [-200.]
Episode: 2 Score: [-200.]
Episode: 3 Score: [-200.]
Episode: 4 Score: [-200.]
Episode: 5 Score: [-200.]
