# Deep RL hands-on by Maxim Lapan
* Run `conda activate gym` first.
  - This environment works with torch 1.1 and tensorflow 2.0 on CUDA 10.
* The book uses torch (PyTorch).


In [1]:
import gym
# Create the CartPole environment by its registered id.
e = gym.make('CartPole-v0')

In [2]:
# reset() returns the initial observation of the environment.
obs = e.reset()
# Shown as the cell output: 4 values — x coordinate, speed, angle, angular speed.
obs

array([-0.00602764, -0.04494352, -0.01977204, -0.00313485])

In [3]:
# Two discrete actions (left and right); observations are a Box of 4 continuous values.
# NOTE(review): the original note gave the range as [-inf, inf] for all four values;
# the observation samples printed later in this notebook suggest only the two speed
# components are effectively unbounded — confirm via e.observation_space.low / .high.
print(e.action_space, e.observation_space)

Discrete(2) Box(4,)


In [4]:
# Take action 0 (left). step() returns a 4-tuple:
#   the new observation, the reward, the done flag (True at the end of the episode),
#   and an extra-information dict (empty here: {}).
e.step(0)

(array([-0.00692651, -0.23977641, -0.01983474,  0.28324477]), 1.0, False, {})

In [5]:
# Draw a random action from the action space.
e.action_space.sample()

0

In [6]:
# Sample again: repeated calls can return a different action.
e.action_space.sample()

1

In [7]:
# Draw a random observation from the observation space.
# The ~1e38 entries in the output sit in the two speed slots, whose range is huge.
e.observation_space.sample()

array([-2.9349544e+00,  1.9150158e+38, -2.2192951e-01,  2.5455205e+38],
      dtype=float32)

In [8]:
# A second random observation, for comparison with the previous one.
e.observation_space.sample()

array([4.4707634e-02, 3.1531373e+38, 3.2359022e-01, 2.5518052e+38],
      dtype=float32)

## Making a randomly acting agent

In [9]:
if __name__ == "__main__":
    # Build a fresh CartPole environment for the random agent.
    e = gym.make("CartPole-v0")
    # Episode bookkeeping: accumulated reward and number of steps taken.
    total_reward, total_steps = 0.0, 0
    # Initialize the environment and keep the first observation.
    obs = e.reset()

In [10]:
# Play one episode with random actions, logging every step.
# Relies on `e`, `total_reward` and `total_steps` set up in the previous cell.
done = False
while not done:
    action = e.action_space.sample()  # pick a random action
    obs, reward, done, _ = e.step(action)  # apply it; `done` ends the loop
    total_steps += 1
    total_reward += reward
    print("%d steps, %d action, total reward %.2f" % (total_steps, action, total_reward), obs)
print("Episode done in %d steps, total reward %.2f" % (total_steps, total_reward))

1 steps, 0 action, total reward 1.00 [ 0.03989026 -0.18203008  0.01474504  0.30295928]
2 steps, 1 action, total reward 2.00 [0.03624966 0.01287866 0.02080423 0.01496284]
3 steps, 0 action, total reward 3.00 [ 0.03650723 -0.18253537  0.02110349  0.31413647]
4 steps, 0 action, total reward 4.00 [ 0.03285652 -0.37795149  0.02738622  0.61339934]
5 steps, 0 action, total reward 5.00 [ 0.02529749 -0.57344523  0.0396542   0.91458053]
6 steps, 1 action, total reward 6.00 [ 0.01382859 -0.3788814   0.05794581  0.6346195 ]
7 steps, 1 action, total reward 7.00 [ 0.00625096 -0.18461354  0.0706382   0.36073333]
8 steps, 0 action, total reward 8.00 [ 0.00255869 -0.38066476  0.07785287  0.67482714]
9 steps, 0 action, total reward 9.00 [-0.00505461 -0.57677738  0.09134941  0.99097065]
10 steps, 1 action, total reward 10.00 [-0.01659016 -0.38298883  0.11116883  0.72831922]
11 steps, 1 action, total reward 11.00 [-0.02424993 -0.18956488  0.12573521  0.47259062]
12 steps, 0 action, total reward 12.00 [-0.

## Wrapper function

In [11]:
import random

In [42]:
class RandomActionWrapper(gym.ActionWrapper):
    """Action wrapper that occasionally swaps the requested action for a random one.

    With probability ``epsilon`` the action passed to ``step()`` is replaced by
    a sample from the wrapped environment's action space; otherwise it is
    forwarded unchanged.

    Args:
        env: the environment to wrap.
        epsilon: probability (default 0.1) of substituting a random action.

    Attributes:
        epsilon: the substitution probability given at construction.
        last_action: the action most recently returned by ``action()``, i.e.
            the one actually applied; ``None`` until ``action()`` is first
            called. Lets the caller see what the wrapper really did.
    """

    def __init__(self, env, epsilon=0.1):
        super(RandomActionWrapper, self).__init__(env)
        self.epsilon = epsilon
        self.last_action = None

    def action(self, action):
        """Return the action to apply: ``action`` itself, or a random sample.

        Prints "Random!" whenever the requested action is overridden, and
        records the returned action in ``self.last_action``.
        """
        if random.random() < self.epsilon:  # override with probability epsilon
            print("Random!")
            action = self.env.action_space.sample()
        self.last_action = action
        return action


In [45]:
if __name__ == "__main__":
    # Wrap CartPole so that some requested actions are replaced by random ones.
    env = RandomActionWrapper(gym.make("CartPole-v0"))

    total_reward = 0.0
    total_steps = 0
    try:
        obs = env.reset()
        while True:
            # Always request action 0; the wrapper substitutes a random action
            # with probability 0.1. The loop only knows what it requested: the
            # substitution happens inside RandomActionWrapper.action(), which
            # prints "Random!" when it overrides the request.
            obs, reward, done, _ = env.step(0)
            total_reward += reward
            total_steps += 1
            print("%d steps,total reward %.2f" %(total_steps, total_reward), obs)
            if done:
                break

        print("Reward got: %.2f" % total_reward)
    finally:
        # Release the environment even if the episode loop raises.
        env.close()

1 steps,total reward 1.00 [-0.00288824 -0.20683474 -0.04462909  0.28648165]
2 steps,total reward 2.00 [-0.00702494 -0.40129275 -0.03889946  0.56476162]
3 steps,total reward 3.00 [-0.01505079 -0.59584796 -0.02760423  0.84494011]
Random!
4 steps,total reward 4.00 [-0.02696775 -0.40036044 -0.01070543  0.54370594]
5 steps,total reward 5.00 [-3.49749619e-02 -5.95330329e-01  1.68692215e-04  8.32996670e-01]
6 steps,total reward 6.00 [-0.04688157 -0.79045458  0.01682863  1.12573264]
7 steps,total reward 7.00 [-0.06269066 -0.98579298  0.03934328  1.42364614]
8 steps,total reward 8.00 [-0.08240652 -1.18137868  0.0678162   1.72836159]
9 steps,total reward 9.00 [-0.10603409 -1.37720695  0.10238343  2.04135147]
10 steps,total reward 10.00 [-0.13357823 -1.57322166  0.14321046  2.36388202]
11 steps,total reward 11.00 [-0.16504267 -1.76929918  0.1904881   2.69694751]
12 steps,total reward 12.00 [-0.20042865 -1.96522989  0.24442705  3.04119299]
Reward got: 12.00


In [48]:
if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    # Record the episode into the "recording0" directory.
    # force=True clears monitor files left there by a previous run; without it
    # Monitor refuses to reuse the directory and re-running this cell raises.
    # Writing the mp4 needs FFmpeg and an X11 session (X forwarding over ssh).
    env = gym.wrappers.Monitor(env, "recording0", force=True)
    total_reward = 0.0
    total_steps = 0
    try:
        obs = env.reset()

        # Play one episode with random actions.
        while True:
            action = env.action_space.sample()
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            total_steps += 1
            if done:
                break

        print("Episode done in %d steps, total reward %.2f" % (total_steps, total_reward))
    finally:
        # Close in `finally` so the recording is flushed even if the loop raises.
        env.close()
        # Kept from the original cell: also close the wrapped env explicitly
        # (presumably Monitor.close() did not do so in the gym version used —
        # TODO confirm; the extra call is expected to be harmless otherwise).
        env.env.close()

Episode done in 22 steps, total reward 22.00
