In [7]:
import tensorflow as tf
from tensorflow.keras import layers
print(tf.__version__)
print(tf.keras.__version__)

2.0.0
2.2.4-tf


## keras 堆叠模型

In [3]:
model = tf.keras.Sequential()
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))

In [4]:
model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
              loss=tf.keras.losses.categorical_crossentropy,
              metrics=[tf.keras.metrics.categorical_accuracy])

In [7]:
import numpy as np
train_x = np.random.random((1000, 72))
train_y = np.random.random((1000, 10))
val_x = np.random.random((200, 72))
val_y = np.random.random((200, 10))
model.fit(train_x, train_y, epochs=10, batch_size=100,
          validation_data=(val_x, val_y))

Train on 1000 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fba90523278>

In [9]:
test_x = np.random.random((1000, 72))
test_y = np.random.random((1000, 10))

model.evaluate(test_x, test_y, batch_size=32)



[56.35947882080078, 0.089]

In [10]:
model.predict(test_x)

array([[1.13609815e-06, 4.39963710e-09, 6.00056246e-13, ...,
        2.10448071e-01, 1.08942135e-04, 3.01993615e-03],
       [5.94892617e-06, 4.09629379e-08, 1.05548677e-11, ...,
        2.24327385e-01, 1.72162268e-04, 6.30125776e-03],
       [8.56924089e-06, 6.37576960e-08, 1.22256727e-11, ...,
        1.98053017e-01, 2.16444823e-04, 7.91333150e-03],
       ...,
       [1.02347167e-05, 5.06844096e-08, 3.48560694e-11, ...,
        4.26875442e-01, 3.14823788e-04, 3.88961844e-03],
       [1.26333771e-06, 3.55623397e-09, 2.52151165e-13, ...,
        1.45974874e-01, 1.12772497e-04, 2.32446170e-03],
       [4.22254516e-06, 2.99279108e-08, 7.45401223e-12, ...,
        3.53763252e-01, 8.36234249e-05, 8.29309784e-03]], dtype=float32)

## keras函数式API

In [11]:
input_x = tf.keras.Input(shape=(72,))
hidden1 = layers.Dense(32, activation='relu')(input_x)
hidden2 = layers.Dense(16, activation='relu')(hidden1)
pred = layers.Dense(10, activation='softmax')(hidden2)

model = tf.keras.Model(inputs=input_x, outputs=pred)
model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
              loss=tf.keras.losses.categorical_crossentropy,
              metrics=['accuracy'])
model.fit(train_x, train_y, batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fba780ec358>

## 模型子类化

In [12]:
class MyModel(tf.keras.Model):
    def __init__(self, num_classes=10):
        super(MyModel, self).__init__(name='my_model')
        self.num_classes = num_classes
        self.layer1 = layers.Dense(32, activation='relu')
        self.layer2 = layers.Dense(num_classes, activation='softmax')
    def call(self, inputs):
        h1 = self.layer1(inputs)
        out = self.layer2(h1)
        return out
    def compute_output_shape(self, input_shape):
        shape = tf.TensorShape(input_shape).as_list()
        shape[-1] = self.num_classes
        return tf.TensorShape(shape)

model = MyModel()
model.compile(optimizer=tf.keras.optimizers.RMSprop(0.001),
              loss=tf.keras.losses.categorical_crossentropy,
              metrics=['accuracy'])
model.fit(train_x, train_y, batch_size=16, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fba107d3da0>

## 回调

In [15]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss'),
    tf.keras.callbacks.TensorBoard(log_dir='/home/yzk/logs')
]
model.fit(train_x, train_y, batch_size=16, epochs=5,
          callbacks=callbacks, validation_data=(val_x, val_y))

Train on 1000 samples, validate on 200 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


<tensorflow.python.keras.callbacks.History at 0x7fba106c6358>

## 多输入多输出模型

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds

num_batches = 1000
batch_size = 50
learning_rate = 0.001

dataset = tfds.load("tf_flowers", split=tfds.Split.TRAIN, as_supervised=True)
dataset = dataset.map(lambda img, label: (tf.image.resize(img, [224, 224]) / 255.0, label)).shuffle(1024).batch(32)
model = tf.keras.applications.MobileNetV2(weights=None, classes=5)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
for images, labels in dataset:
    with tf.GradientTape() as tape:
        labels_pred = model(images)
        loss = tf.keras.losses.sparse_categorical_crossentropy(y_true=labels, y_pred=labels_pred)
        loss = tf.reduce_mean(loss)
        print("loss %f" % loss.numpy())
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))

[1mDownloading and preparing dataset tf_flowers (218.21 MiB) to /home/yzk/tensorflow_datasets/tf_flowers/1.0.0...[0m


ImportError: IntProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [4]:
class QNetwork(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(units=24, activation=tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(units=24, activation=tf.nn.relu)
        self.dense3 = tf.keras.layers.Dense(units=2)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.dense3(x)
        return x

    def predict(self, inputs):
        q_values = self(inputs)
        return tf.argmax(q_values, axis=-1)

In [26]:
pwd

'/home/yzk/dist/BaseLib'

In [25]:
import gym
from collections import deque
import random
import numpy as np
env = gym.make('CartPole-v1')       # 实例化一个游戏环境，参数为游戏名称
model = QNetwork()
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
replay_buffer = deque(maxlen=10000) # 使用一个 deque 作为 Q Learning 的经验回放池
initial_epsilon= 0.001
final_epsilon = 0.01
epsilon = initial_epsilon
num_episodes = 100
num_exploration_episodes = 10
max_len_episode = 50
gamma = 0.01
tf.keras.backend.set_floatx('float64')
summary_writer = tf.summary.create_file_writer('./tensorboard') 
for episode_id in range(num_episodes):
    state = env.reset()             # 初始化环境，获得初始状态
    epsilon = max(                  # 计算当前探索率
        initial_epsilon * (num_exploration_episodes - episode_id) / num_exploration_episodes,
        final_epsilon)
    for t in range(max_len_episode):
#         env.render()                                # 对当前帧进行渲染，绘图到屏幕
        if random.random() < epsilon:               # epsilon-greedy 探索策略，以 epsilon 的概率选择随机动作
            action = env.action_space.sample()      # 选择随机动作（探索）
        else:
            action = model.predict(np.expand_dims(state, axis=0)).numpy()   # 选择模型计算出的 Q Value 最大的动作
            action = action[0]

        # 让环境执行动作，获得执行完动作的下一个状态，动作的奖励，游戏是否已结束以及额外信息
        next_state, reward, done, info = env.step(action)
        # 如果游戏Game Over，给予大的负奖励
        reward = -10. if done else reward
        # 将(state, action, reward, next_state)的四元组（外加 done 标签表示是否结束）放入经验回放池
        replay_buffer.append((state, action, reward, next_state, 1 if done else 0))
        # 更新当前 state
        state = next_state

        if done:                                    # 游戏结束则退出本轮循环，进行下一个 episode
            print("episode %d, epsilon %f, score %d" % (episode_id, epsilon, t))
            break

        if len(replay_buffer) >= batch_size:
            # 从经验回放池中随机取一个批次的四元组，并分别转换为 NumPy 数组
            batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
                *random.sample(replay_buffer, batch_size))
            batch_state, batch_reward, batch_next_state, batch_done = \
                [np.array(a, dtype=np.float32) for a in [batch_state, batch_reward, batch_next_state, batch_done]]
            batch_action = np.array(batch_action, dtype=np.int32)
            
            q_value = model(batch_next_state)
            y = batch_reward + (gamma * tf.reduce_max(q_value, axis=1)) * (1 - batch_done)  # 计算 y 值
            with tf.GradientTape() as tape:
                loss = tf.keras.losses.mean_squared_error(  # 最小化 y 和 Q-value 的距离
                    y_true=y,
                    y_pred=tf.reduce_sum(model(batch_state) * tf.one_hot(batch_action, depth=2), axis=1)
                )
            
            with summary_writer.as_default():
                tf.summary.scalar('loss', loss, step=episode_id)
                tf.summary.scalar('reward', batch_reward.sum(), step=episode_id)
            grads = tape.gradient(loss, model.variables)
            optimizer.apply_gradients(grads_and_vars=zip(grads, model.variables))       # 计算梯度并

W1014 09:31:48.207993 140389822482176 base_layer.py:1814] Layer q_network_18 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



episode 3, epsilon 0.010000, score 46
episode 4, epsilon 0.010000, score 41
episode 7, epsilon 0.010000, score 45
episode 11, epsilon 0.010000, score 17
episode 12, epsilon 0.010000, score 15
episode 13, epsilon 0.010000, score 40
episode 14, epsilon 0.010000, score 35
episode 18, epsilon 0.010000, score 43
episode 19, epsilon 0.010000, score 44
episode 20, epsilon 0.010000, score 37
episode 22, epsilon 0.010000, score 34
episode 23, epsilon 0.010000, score 37
episode 24, epsilon 0.010000, score 49
episode 25, epsilon 0.010000, score 36
episode 26, epsilon 0.010000, score 38
episode 27, epsilon 0.010000, score 46
episode 31, epsilon 0.010000, score 48
episode 32, epsilon 0.010000, score 36
episode 33, epsilon 0.010000, score 31
episode 34, epsilon 0.010000, score 44
episode 35, epsilon 0.010000, score 49
episode 39, epsilon 0.010000, score 48
episode 40, epsilon 0.010000, score 45
episode 41, epsilon 0.010000, score 46
episode 42, epsilon 0.010000, score 32
episode 43, epsilon 0.010000

In [1]:
import tensorflow as tf
import numpy as np
import tensorflow_probability as tfp
import gym

In [2]:
GAME = 'Pendulum-v0'
env = gym.make(GAME)
N_A = env.action_space.shape[0]
N_S = env.observation_space.shape[0]
A_BOUND = [env.action_space.low, env.action_space.high]
ENTROPY_BETA = 0.01

class ACNet(tf.keras.Model):
    def __init__(self, globalAC=None):
        super().__init__()
        self.globalAC = globalAC
        self.la = tf.keras.layers.Dense(units=200, activation=tf.nn.relu)
        self.mu = tf.keras.layers.Dense(units=N_A, activation=tf.nn.tanh)  
        self.sigma = tf.keras.layers.Dense(units=N_A, activation=tf.nn.softplus)
        self.lc = tf.keras.layers.Dense(100, activation=tf.nn.relu)
        self.v = tf.keras.layers.Dense(N_A, activation=tf.nn.softmax)
            
    def call(self, state):
        x = self.la(state)
        mu = self.mu(x)
        sigma = self.sigma(x)
        v_s = self.lc(state)
        self.v_s = self.v(v_s)
        tf.multiply(mu, A_BOUND[1])
        sigma = sigma + 1e-4
        self.norm_dist = tfp.distributions.Normal(mu, sigma)
        return mu, sigma, self.v_s
    
    def choose_action(self, state):
        self(state)
        return tf.clip_by_value(self.norm_dist.sample(1), A_BOUND[0], A_BOUND[1])
    
    def update_global(self, state, a_his, v_target):
        mu, sigma, v_s = self(state)
        td = tf.subtract(v_target, v_s)
        
        entropy = self.norm_dist.entropy()
        a_prob = self.norm_dist.log_prob(a_his)
        
        exp_v = a_prob * tf.stop_gradient(td) + entropy * ENTROPY_BETA
        
#         update global net's paramerter
        with tf.GradientTape() as tape:
            c_loss = tf.reduce_mean(tf.square(td))
        grad_c = tape.gradient(c_loss, self.variables)
        
        with tf.GradientTape() as tape:
            a_loss = tf.reduce_mean(-exp_v)
        grad_a = tape.gradient(a_loss, self.variables)
        
        OPT_C.apply_gradients(grads_and_vars=zip(grad_c, self.globalAC.variables))
        OPT_A.apply_gradients(grads_and_vars=zip(grad_a, self.globalAC.variables))
        
    def pull_global(self):
        for l_param, g_param in zip(self.variables, self.globalAC.variables):
            l_param.assign(g_param)
        

In [3]:
net = ACNet()

In [4]:
s = env.reset()
a = env.action_space.sample()
s_, r, done, info = env.step(a)
s_

array([0.4110373 , 0.91161853, 0.05072076])

In [5]:
a = net.choose_action(s_.reshape((1, -1))).numpy().ravel()
net.norm_dist.entropy()

W1014 20:23:18.809545 139628629841664 base_layer.py:1814] Layer ac_net is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



<tf.Tensor: id=177, shape=(1, 1), dtype=float32, numpy=array([[1.0364757]], dtype=float32)>

In [6]:
MAX_GLOBAL_EP = 200
MAX_EPISODE = 200
UPDATE_GLOBAL_ITER = 10
GLOABL_REWARD_R = []
GLOBAL_EP = 0
GAMMA = 0.9
        
class Worker():
    def __init__(self,name, globalAC):
        self.env = gym.make(GAME).unwrapped
        self.name = name 
        self.ACNet = ACNet(globalAC)
    def work(self):
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        global GLOBAL_EP, GLOABL_REWARD_R
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            ep_r = 0
            s = self.env.reset()
            for ep_t in range(MAX_EPISODE):
                a = self.ACNet.choose_action(s.reshape(1, -1)).numpy().ravel()
                s_ , r, done, info = self.env.step(a)
                ep_r += r
                done = True if ep_t == MAX_EPISODE -1 else False
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append((r +  8)/ 8)
                if total_step % UPDATE_GLOBAL_ITER ==0 or done:
                    if done:
                        v_s_ = 0
                    else:
                        v_s_ = self.ACNet(s_.reshape(1, -1))[-1].numpy().ravel()
                    buffer_v_target = []
                    for item in buffer_r[::-1]:
                        v_s_  = v_s_ * GAMMA + item
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()
                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
                    self.ACNet.update_global(buffer_s, buffer_a, buffer_v_target)
                    buffer_a,buffer_s,buffer_r = [],[],[]
                    self.ACNet.pull_global()

                total_step += 1
                s = s_
                if done:
                    if len(GLOABL_REWARD_R) == 0:
                        GLOABL_REWARD_R.append(ep_r)
                    else:
                        GLOABL_REWARD_R.append(GLOABL_REWARD_R[-1]*0.9 + 0.1*ep_r)
                    print(self.name, "EP:",GLOBAL_EP, "Reward:", GLOABL_REWARD_R[-1])
                    GLOBAL_EP +=1    
                    break

In [7]:
import multiprocessing
import threading
LR_A = 0.0001
LR_C = 0.001
N_WORKERS = multiprocessing.cpu_count()
summary_writer = tf.summary.create_file_writer('./tensorboard') 
with tf.device("/cpu:0"):
    OPT_A = tf.keras.optimizers.RMSprop(LR_A)
    OPT_C = tf.keras.optimizers.RMSprop(LR_C)
    GLOBAL_AC = ACNet()  # we only need its params
    workers = []
    # Create worker
    for i in range(N_WORKERS):
        i_name = 'W_%i' % i   # worker name
        workers.append(Worker(i_name, GLOBAL_AC))
COORD = tf.train.Coordinator()

worker_threads = []
for worker in workers:
    job = lambda: worker.work()
    t = threading.Thread(target=job)
    t.start()
    worker_threads.append(t)
COORD.join(worker_threads)

import matplotlib.pyplot as plt
plt.plot(np.arange(len(GLOABL_REWARD_R)), GLOABL_REWARD_R)
plt.xlabel('step')
plt.ylabel('Total moving reward')
plt.show()

W1014 20:23:34.572710 139610277517056 base_layer.py:1814] Layer ac_net_2 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

W1014 20:23:34.575170 139609791002368 base_layer.py:1814] Layer ac_net_3 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If 

W1014 20:23:34.590804 139609161844480 base_layer.py:1814] Layer ac_net_13 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

W1014 20:23:34.592258 139609153451776 base_layer.py:1814] Layer ac_net_14 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. I

W1014 20:23:34.608612 139608113280768 base_layer.py:1814] Layer ac_net_24 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

W1014 20:23:34.610073 139608104888064 base_layer.py:1814] Layer ac_net_25 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. I

W1014 20:23:34.632501 139607542839040 base_layer.py:1814] Layer ac_net_35 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

W1014 20:23:34.634467 139607534446336 base_layer.py:1814] Layer ac_net_36 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. I

W1014 20:23:34.685012 139606494275328 base_layer.py:1814] Layer ac_net_46 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

W1014 20:23:34.688280 139606485882624 base_layer.py:1814] Layer ac_net_47 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. I

W1014 20:23:34.742203 139605923833600 base_layer.py:1814] Layer ac_net_57 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



W_8 EP: 0 Reward: -1070.5248722727922
W_2 EP: 1W_44 EP: 1  Reward: -1069.6922748776738
Reward: -1065.6890099051598
W_1 EP: 3 Reward: -1069.1664554789222
W_6 EP: 4W_13 EP: 4 Reward:  Reward: -1068.8505098557632
-1075.6154390938977
W_42 EP: 6 Reward: -1087.9226660185032
W_9 EP: 7 Reward: -1141.2496030068855
W_39W_19 EP: 8  EP: 8 Reward: Reward: -1147.5942089706004
W_12 -1161.3452233087821
EP: 9 Reward: -1154.5115704761042
W_11 EP: 11 Reward: -1160.743237853619
W_18 EP: 12 Reward: -1174.6443783632574
W_22 EP: 13 Reward: -1208.6890263723794
W_17 EP: 14 Reward: -1167.7884330924521
W_7 EP: 15 Reward:W_31 EP: 15 -1156.800569707175
 Reward: -1227.086692495218
W_37 EP: 17 Reward: -1213.3848005067937
W_35 EP: 18 Reward: -1255.0107486392485
W_30 EP: 18 Reward:W_21 EP: 19 Reward: -1308.6744966263211
 -1292.2283399367968
W_25 EP: 21 Reward: -1308.6706271880396
W_26 EP: 22 Reward: -1314.5538223430262
W_3 EP: W_20 EP: 23 Reward: -1338.8944314165715
23 Reward: -1349.7309136521167
W_0 EP: 25 Reward: -1

W_50 EP: 206 Reward: -1332.0920359566846
W_12 EP: 207 Reward: -1298.794904548911
W_51 EP: 208 Reward: -1323.0902252500757
W_24 EP: 209 Reward: -1348.542054268582
W_10 EP: 210 Reward: -1302.156124081564
W_49 EP: 211 Reward: -1354.9218418907255
W_16W_39 EP:  212 Reward: -1340.2124437370992
EP: 212 Reward: -1324.3921743259004
W_15 EP: 214 Reward: -1315.4545974588511
W_5 EP: 215 Reward: -1297.5600100298323
W_27 EP: 216 Reward: -1339.4859755856326
W_28 EP: 217 Reward: -1390.5280829924397
W_4 EP: 218 Reward: -1358.2744849015687
W_19 EP: 219 Reward: -1339.8516698067808
W_48 EP: 220 Reward: -1311.8790863146173
W_43W_47 EP: 221 Reward: -1388.3118316170528 EP: 221 Reward:
 -1350.8946763031727
W_40 EP: 223 Reward: -1384.4515301832398
W_37 EP: 224 Reward: -1347.4771108350785
W_8 EP: 225 Reward: -1360.854286375281
W_35 EP: 226 Reward: -1366.200983901487
W_17 EP: 227 Reward: -1348.0125302563172
W_36 EP: 228 Reward: -1394.1270975549185
W_9 EP: 229 Reward: -1353.5929884203392
W_21 EP: 230 Reward: -135

<Figure size 640x480 with 1 Axes>

In [8]:
plt.show()