In [1]:
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import copy
import random
import gym

%matplotlib inline

In [2]:
tf.reset_default_graph()
sess = tf.Session()

In [3]:
def copy_net(name1, name2):
    variables = tf.trainable_variables()
    for var1 in variables:
        if name2+"/" in var1.name:
            trained_var = [var2 for var2 in tf.trainable_variables() if var2.op.name in str.replace(var1.name, name2+"/", name1+"/")][0]
            value = sess.run(trained_var)
            sess.run(tf.assign(var1, value))

In [4]:
# Moving average
def smoothen_curve(points, factor=0.9):
    smoothed_points = []
    for point in points:
        if smoothed_points:
            previous = smoothed_points[-1]
            smoothed_points.append(previous * factor + point * (1 - factor))
        else:
            smoothed_points.append(point)
    return smoothed_points


def plot(array, title='', axis=plt):
    axis.title.set_text(title)
    axis.plot(np.arange(len(array)), smoothen_curve(array))

In [5]:
def make_env(env_id, seed):
    def _f():
        env = gym.make(env_id)
        env.seed(seed)

        # Desync environments
        env.reset()
        for i in range(int(env.spec.max_episode_steps*seed//nproc)):
            env.step(env.action_space.sample())
        return env
    return _f

In [6]:
env_name = "MountainCar-v0"
discount = 0.97
state_queue_size = 1
learning_rate = 0.001

iterations = 10000
batch_size = 64
update_net_period = 10
history_size = 64*4
stochastic_action_likelihood = 0.9
stochastic_action_likelihood_d = (stochastic_action_likelihood - 0.1) / iterations #TODO: adaptive stoch_action_l_d; endpoint: 0.1

nproc = 30
envs = DummyVecEnv([make_env(env_name, seed) for seed in range(nproc)]) #Dummy
        
# This code gets action and observation space sizes for 1D tasks
action_space_size = envs.action_space.n if isinstance(envs.action_space, gym.spaces.discrete.Discrete) else envs.action_space.shape[0]
observation_space_size = envs.observation_space.n if isinstance(envs.observation_space, gym.spaces.discrete.Discrete) else envs.observation_space.shape[0]

plots_data = {
    "reward": [],
    "loss": [],
    "gradients_max_abs": [],
    "gradients_equal_zero": [],
    "max_action": [],
    "max_qvals": [],
    "min_qvals": [],
}

In [7]:
class QualityNet:
    def __init__(self, net_name):
        with tf.variable_scope(net_name + "/"):
            layers_config = (action_space_size + state_queue_size*observation_space_size, 32) # 32 - hidden layer size

            self.input_state = tf.placeholder(tf.float32, shape=[None, state_queue_size, observation_space_size], name="input_state")
            self.flattened_state = tf.reshape(self.input_state, [-1, observation_space_size * state_queue_size])
            self.input_action = tf.placeholder(tf.int32, shape=[None], name="input_action")
            input_action_one_hot = tf.one_hot(self.input_action, depth=action_space_size)
            
            self.input_data = tf.concat([self.flattened_state, input_action_one_hot], 1)
            
            #self.input_layer = tf.layers.dense(self.input_data, units=layers_config[0], activation='relu')
            self.hidden_layer = tf.layers.dense(self.input_data, units=layers_config[1], activation='relu', kernel_initializer=tf.random_normal_initializer(0, 0.01))
            self.output_layer = tf.layers.dense(self.hidden_layer, units=1, kernel_initializer=tf.random_normal_initializer(0, 0.01))
            
            self.exp_value = tf.placeholder(tf.float32, name="exp_value") #  shape=[None] WTF???
            self.loss = tf.losses.mean_squared_error(self.exp_value, self.output_layer)
            
            self.gradients = tf.gradients(self.loss, [t_var for t_var in tf.trainable_variables() if "prediction_net/" in t_var.name]) # For Debug!
            
            optimizer = tf.train.AdamOptimizer(learning_rate)
            self.train_op = optimizer.minimize(self.loss)

prediction_net = QualityNet("prediction_net")
train_net = QualityNet("train_net")     

sess.run(tf.local_variables_initializer())
sess.run(tf.global_variables_initializer())

W0716 22:20:23.162427  3064 deprecation.py:323] From <ipython-input-7-723713c37b24>:14: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
W0716 22:20:23.489581  3064 deprecation.py:323] From C:\Anaconda3\lib\site-packages\tensorflow\python\ops\losses\losses_impl.py:121: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [8]:
def prepare_input(states_q):
    return np.repeat(np.array(states_q), repeats=action_space_size, axis=0), np.tile(range(action_space_size), len(states_q))

In [9]:
def update_state(state_queue, val):
    state_queue.pop()
    state_queue.insert(0, val)

def batch_generator(batch_size, history_size):
    """
        history[0] = queue of four last states
        history[1] = action
        history[2] = reward
        history[3] = states queue with new state
        history[4] = discounted reward
    """
    states_q = [[np.random.sample(observation_space_size) for _ in range(state_queue_size)] for _ in range(nproc)] #observation_space_size refactor
    trajectories = [np.empty((0, 4)) for _ in range(nproc)]

    history = np.empty((0, 5))
    observations = [[0 for i in range(observation_space_size)] for j in range(nproc)]#envs.reset()
    for pid in range(nproc):
        update_state(states_q[pid], observations[pid])

    while True:
        history = history[batch_size:]
        while len(history) < history_size:
            if random.random() < stochastic_action_likelihood:
                actions = np.stack([envs.action_space.sample() for _ in range(nproc)]) # test
            else:
                ############ action prediction ############
                input_states, input_actions = prepare_input(states_q)
                qvalues = sess.run((prediction_net.output_layer), feed_dict={
                        prediction_net.input_state: input_states,
                        prediction_net.input_action: input_actions,
                    })
                qvalues = qvalues.reshape(len(qvalues) // action_space_size, action_space_size)
                plots_data["max_qvals"].append(qvalues.max(axis=1)[0])
                plots_data["min_qvals"].append(qvalues.min(axis=1)[0])
                actions = np.argmax(qvalues, axis=1)
                ###########################################
            observations, rewards, dones, _ = envs.step(actions)
            rewards /= 200

            for pid in range(nproc):
                new_state = copy.deepcopy(states_q[pid])
                update_state(new_state, observations[pid])

                trajectories[pid] = np.vstack((trajectories[pid], [states_q[pid], actions[pid], rewards[pid], new_state]))

                if dones[pid]:
                    ################ qvalues prediction ################
                    input_states, input_actions = prepare_input(trajectories[pid][:-1][:, 3])
                    qvalues = sess.run((prediction_net.output_layer), feed_dict={
                            prediction_net.input_state: np.array(input_states.tolist()),
                            prediction_net.input_action: input_actions,
                        })
                    qvalues = qvalues.reshape(len(qvalues) // action_space_size, action_space_size).max(axis=1)
                    qvalues *= discount
                    qvalues = qvalues + trajectories[pid][:-1][:, 2]
                    qvalues = np.append(qvalues, rewards[pid])
                    ####################################################

                    new_history = np.hstack((trajectories[pid], np.expand_dims(qvalues, axis=-1)))   
                    plots_data["reward"].append((trajectories[pid][:-1][:, 2].sum() + rewards[pid]) * 200)
                    trajectories[pid] = np.empty((0, 4))
                    history = np.vstack((history, new_history))

        np.random.shuffle(history)
        history = history[:history_size]
        yield history[:batch_size]

In [17]:
batch_gen = batch_generator(batch_size, history_size)

for i in range(10):
    next(batch_gen)

for i in tqdm_notebook(range(iterations)):
    batch = next(batch_gen)

    input_state = [batch[:, 0][i] for i in range(len(batch))]
    input_action = batch[:, 1]

    _, loss, gradients = sess.run((prediction_net.train_op, prediction_net.loss, prediction_net.gradients), 
                       feed_dict = {
                            prediction_net.input_state: input_state,
                            prediction_net.input_action: input_action,
                            prediction_net.exp_value: batch[:, 4],
                        })

    #****************** DEBUG ******************* 
    plots_data["loss"].append(loss)
    plots_data["gradients_max_abs"].append(np.max([np.max(np.abs(var_grad)) for var_grad in gradients]))
    plots_data["gradients_equal_zero"].append(len(gradients[gradients == 0]))
    #*********************************************

    if i % update_net_period == 0:
        pass#copy_net("train_net", "prediction_net")

    stochastic_action_likelihood -= stochastic_action_likelihood_d

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

KeyboardInterrupt: 

In [18]:
#************ Plotting debug info ****************
fig, axs = plt.subplots(2, 3, figsize=(16, 9))
plot(plots_data["reward"], "Reward", axs[0, 0])
plot(plots_data["loss"], "Loss (doesn’t measure performance)", axs[0, 1])
plot(plots_data["max_qvals"], "Max Qvals", axs[1, 0])
plot(plots_data["min_qvals"], "Min Qvals", axs[1, 1])
plot(plots_data["gradients_max_abs"], "Gradients max (not from adam)", axs[0, 2])
plot(plots_data["gradients_equal_zero"], f"Gradients equal zero out of parameters", axs[1, 2])
#*************************************************

<IPython.core.display.Javascript object>

In [19]:
x = np.linspace(-1, 1, num=21)
y = np.linspace(-1, 1, num=21)
input_state = np.expand_dims(np.transpose([np.tile(x, len(y)), np.repeat(y, len(x))]), axis=1)
input_action = np.tile([0,1,2], len(input_state))
input_state = np.repeat(input_state, 3, axis=0)

output = sess.run((prediction_net.output_layer), feed_dict={
    prediction_net.input_state: input_state,
    prediction_net.input_action: input_action,
})

print("MAX Q:", output.max())        
print("MIN Q:", output.min())

MAX Q: -0.19750738
MIN Q: -0.22167125


In [20]:
env = gym.make(env_name)
while True:
    observations = env.reset()
    done = False
    while not done:
        env.render()
        ############ action prediction ############
        input_states, input_actions = prepare_input([[observations]])
        qvalues = sess.run((prediction_net.output_layer), feed_dict={
                prediction_net.input_state: input_states,
                prediction_net.input_action: input_actions,
            })
        qvalues = qvalues.reshape(len(qvalues) // action_space_size, action_space_size)
        plots_data["max_qvals"].append(qvalues.max(axis=1)[0])
        plots_data["min_qvals"].append(qvalues.min(axis=1)[0])
        actions = np.argmax(qvalues, axis=1)
        ###########################################
        
        observations, rewards, done, _ = env.step(actions[0])

        if done:
            print('done')
            break

KeyboardInterrupt: 

# График

In [None]:
env = gym.make(env_name)
d = 20
x = np.linspace(env.observation_space.low[0], env.observation_space.high[0], d)
y = np.linspace(env.observation_space.low[1], env.observation_space.high[1], d)
q_table = np.zeros((d, d))

net = prediction_net
for i in range(len(x)):
    for j in range(len(y)):
        print(sess.run(net.output_layer, feed_dict={net.input_state: [[[x[i], y[j]]]],
                                                              net.input_action: [0]}))
        q_table[i][j] = sess.run(net.output_layer, feed_dict={net.input_state: [[[x[i], y[j]]]],
                                                              net.input_action: [0]})[0][0]


In [21]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
import numpy as np
%matplotlib notebook

fig = plt.figure(figsize=(9, 7))
ax = fig.gca(projection='3d')


# Make data
x = np.linspace(env.observation_space.low[0], env.observation_space.high[0], q_table.shape[0])
y = np.linspace(env.observation_space.low[1], env.observation_space.high[1], q_table.shape[1])
z = q_table


# Make data.
X, Y = np.meshgrid(x, y)

# Plot the surface.
surf = ax.plot_surface(X, Y, z, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)

ax.set_xlabel('X position', fontsize=20)
ax.set_ylabel('Velocity', fontsize=20)

# Add a color bar which maps values to colors.
fig.colorbar(surf, shrink=0.5, aspect=5)


<IPython.core.display.Javascript object>

<matplotlib.colorbar.Colorbar at 0x231c17197f0>

In [16]:
print(q_table)

[[-0.20969054 -0.20973945 -0.20978838 -0.20983733 -0.20988624 -0.2099352
  -0.20998411 -0.21003306 -0.21008199 -0.21013092 -0.21017985 -0.21022877
  -0.21027772 -0.21032667 -0.21037559 -0.21042453 -0.21047345 -0.21052238
  -0.21057133 -0.21062027]
 [-0.20919301 -0.20924194 -0.20929086 -0.2093398  -0.20938873 -0.20943767
  -0.2094866  -0.20953554 -0.20958449 -0.20963341 -0.20968235 -0.20973125
  -0.2097802  -0.20982912 -0.20987807 -0.20992701 -0.20997593 -0.21002486
  -0.21007381 -0.21012273]
 [-0.20869549 -0.20874442 -0.20879334 -0.20884228 -0.20889121 -0.20894016
  -0.20898908 -0.20903802 -0.20908695 -0.20913588 -0.20918487 -0.209234
  -0.20928313 -0.20933226 -0.20938137 -0.20943052 -0.20947963 -0.20952876
  -0.20957789 -0.20962702]
 [-0.20820637 -0.20825548 -0.20830461 -0.20835373 -0.20840287 -0.20845199
  -0.20850113 -0.20855024 -0.20859937 -0.20864849 -0.20869763 -0.20874675
  -0.20879588 -0.20884499 -0.20889413 -0.20894325 -0.20899236 -0.20904151
  -0.20909062 -0.20913975]
 [-0.20