# Deep RL hands-on by Maxim Lapan
* conda activate gym 
  - which will work with torch 1.1, tensorflow 2.0 with CUDA 10
* this book use torch


In [1]:
import gym
e = gym.make('CartPole-v0')

In [2]:
obs = e.reset()
obs # return 4 values, x coordinate, speed, angle, angular speed

array([ 0.02557767, -0.03932847,  0.04887002,  0.02317516])

In [3]:
print(e.action_space, e.observation_space)
# action only left and right, space has 4 values with continue value [-inf, inf].

Discrete(2) Box(4,)


In [4]:
e.step(0) # left action is taken, 
# new obs, reward, done flag deal with the end of episode.
# extra information is {}

(array([ 0.0247911 , -0.23511596,  0.04933352,  0.33086784]), 1.0, False, {})

In [5]:
e.action_space.sample() # random action is taken

1

In [6]:
e.action_space.sample()

1

In [7]:
e.observation_space.sample()

array([ 3.7583354e+00, -1.6064673e+38,  7.8152247e-02,  1.7623309e+38],
      dtype=float32)

In [8]:
e.observation_space.sample()

array([-1.7173146e+00,  5.9243443e+37,  1.4845206e-01, -1.5358119e+38],
      dtype=float32)

## making randomly acting agent

In [9]:
if __name__ == "__main__":
    e = gym.make("CartPole-v0")
    total_reward = 0.0
    total_steps = 0
    obs = e.reset()
    # initialize the env

In [10]:
while True:
    action = e.action_space.sample() # random action
    obs, reward, done, _ = e.step(action) # return the results
    total_reward += reward
    total_steps += 1
    print("%d steps, %d action, total reward %.2f" %(total_steps, action, total_reward), obs)
    if done: # if done flag return True, end episode.
        break
print("Episode done in %d steps, total reward %.2f" %(total_steps, total_reward))

1 steps, 0 action, total reward 1.00 [ 0.00797062 -0.23835427  0.04959519  0.35409005]
2 steps, 0 action, total reward 2.00 [ 0.00320353 -0.43414505  0.05667699  0.66199011]
3 steps, 1 action, total reward 3.00 [-0.00547937 -0.23985559  0.06991679  0.387678  ]
4 steps, 1 action, total reward 4.00 [-0.01027648 -0.04579214  0.07767035  0.11783281]
5 steps, 1 action, total reward 5.00 [-0.01119232  0.148136    0.08002701 -0.14937027]
6 steps, 1 action, total reward 6.00 [-0.0082296   0.34202622  0.0770396  -0.41577196]
7 steps, 0 action, total reward 7.00 [-0.00138908  0.14590176  0.06872416 -0.0998306 ]
8 steps, 1 action, total reward 8.00 [ 0.00152896  0.33997491  0.06672755 -0.37006414]
9 steps, 1 action, total reward 9.00 [ 0.00832846  0.53408849  0.05932627 -0.64098317]
10 steps, 1 action, total reward 10.00 [ 0.01901023  0.72833541  0.04650661 -0.91440926]
11 steps, 1 action, total reward 11.00 [ 0.03357693  0.92279852  0.02821842 -1.19212052]
12 steps, 1 action, total reward 12.00 

## Wrapper function

In [11]:
import random

In [12]:
class RandomActionWrapper(gym.ActionWrapper):
    def __init__(self, env, epsilon=0.1):
        super(RandomActionWrapper, self).__init__(env)
        self.epsilon = epsilon
    def action(self, action):
        if random.random() < self.epsilon: # 0.1 probability make random action.
            print("Random!")
            return self.env.action_space.sample()
        return action


In [13]:
if __name__ == "__main__":
    env = RandomActionWrapper(gym.make("CartPole-v0")) # env is wrapped by ActionWrapper.

    obs = env.reset()
    total_reward = 0.0
    total_steps = 0
    while True:
        obs, reward, done, _ = env.step(0) # action will be 0, otherwise random action took with 0.1 probability.
# question, how do I know the taken action in the wrapper?
        total_reward += reward
        total_steps += 1
        print("%d steps,total reward %.2f" %(total_steps, total_reward), obs)
        if done:
            break

    print("Reward got: %.2f" % total_reward)

1 steps,total reward 1.00 [-0.02835297 -0.17819333  0.00204605  0.31641757]
2 steps,total reward 2.00 [-0.03191684 -0.37334437  0.0083744   0.60974505]
3 steps,total reward 3.00 [-0.03938373 -0.56858238  0.02056931  0.90505384]
4 steps,total reward 4.00 [-0.05075538 -0.76397675  0.03867038  1.20413035]
5 steps,total reward 5.00 [-0.06603491 -0.95957666  0.06275299  1.50867713]
6 steps,total reward 6.00 [-0.08522644 -1.15540061  0.09292653  1.82027216]
7 steps,total reward 7.00 [-0.10833446 -1.35142415  0.12933197  2.14031992]
8 steps,total reward 8.00 [-0.13536294 -1.54756516  0.17213837  2.46999196]
9 steps,total reward 9.00 [-0.16631424 -1.74366683  0.22153821  2.81015622]
Reward got: 9.00


In [14]:
if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    env = gym.wrappers.Monitor(env, "recording0") # directory name.
# it need FFmpeg to make mp4 file, also you need X11 session, forwarding for ssh.
    total_reward = 0.0
    total_steps = 0
    obs = env.reset()

    while True:
        action = env.action_space.sample()
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        total_steps += 1
        if done:
            break

    print("Episode done in %d steps, total reward %.2f" % (total_steps, total_reward))
    env.close()
    env.env.close()

Error: Trying to write to monitor directory recording0 with existing monitor files: recording0/openaigym.manifest.1.14711.manifest.json.

 You should use a unique directory for each training run, or use 'force=True' to automatically clear previous monitor files.