# Homework 9 Group 7

In [94]:
# Main imports
import numpy as np
import tensorflow as tf
import struct
import matplotlib.pyplot as plt
import gym

# TensorFlow session settings shared by the whole notebook.
# allow_growth puts the GPU in incremental mode; the option is ignored by the
# CPU-only TensorFlow build.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
# Hand this object to every session that is opened: tf.Session(config=config)

In [95]:
# Helper functions
discount_factor = 0.99
def discount_rewards(rewards, discount_factor):
    """Turn per-step rewards into normalized discounted returns.

    For every step t the discounted return is
    ``rewards[t] + discount_factor * return[t + 1]`` (with the return after
    the last step being 0). The returns are then shifted to zero mean and
    scaled to unit standard deviation.

    Args:
        rewards: sequence of per-step rewards of one episode, in time order.
        discount_factor: multiplicative discount applied per step.

    Returns:
        A float32 numpy array with one normalized return per reward. If all
        returns are identical (e.g. a one-step episode) the standard deviation
        is zero; in that case only the mean is removed, so the result is all
        zeros instead of NaN. An empty input yields an empty array.
    """
    discounted_rewards = np.zeros_like(rewards, dtype=np.float32)
    if discounted_rewards.size == 0:
        # Nothing to discount; mean/std of an empty array would be NaN.
        return discounted_rewards

    # Walk backwards through the episode, carrying the return of the
    # following step in an explicit accumulator.
    running_return = 0.0
    for step in reversed(range(len(rewards))):
        running_return = running_return * discount_factor + rewards[step]
        discounted_rewards[step] = running_return

    centered_rewards = discounted_rewards - np.mean(discounted_rewards)
    spread = np.std(discounted_rewards)
    if spread == 0:
        # Dividing by a zero standard deviation would produce NaN/inf.
        return centered_rewards
    return centered_rewards / spread

def feed_forward_layer(x, hidden_n, activation_fn, normalize, stddev=0.02):
    """Build one fully connected layer on top of ``x``.

    The layer's variables are created with ``tf.get_variable`` under the fixed
    names "weights" and "biases", so every call has to happen inside its own
    variable scope.

    Args:
        x: 2-D input tensor; ``x.shape[1]`` is used as the number of inputs.
        hidden_n: number of units of the layer.
        activation_fn: callable applied to the pre-activation, or the string
            ``'linear'`` to return the pre-activation unchanged.
        normalize: if true, the pre-activation is passed through
            ``batch_norm`` (statistics over axis 0) before the activation.
        stddev: standard deviation of the normal weight initializer.

    Returns:
        The output tensor of the layer.
    """
    input_n = x.shape[1]
    weights = tf.get_variable(
        "weights",
        [input_n, hidden_n],
        tf.float32,
        tf.random_normal_initializer(stddev=stddev),
    )
    biases = tf.get_variable("biases", [hidden_n], tf.float32, tf.zeros_initializer())

    pre_activation = tf.matmul(x, weights) + biases
    if normalize:
        pre_activation = batch_norm(pre_activation, [0])

    # 'linear' is a sentinel meaning "no non-linearity".
    if activation_fn != 'linear':
        return activation_fn(pre_activation)
    return pre_activation



def flatten(x):
    """Collapse every dimension of ``x`` except the first into a single one.

    Args:
        x: tensor whose dimensions after the first have known sizes.

    Returns:
        A 2-D tensor of shape ``[-1, product of x.shape[1:]]``.
    """
    trailing_dims = x.shape[1:]
    flat_size = int(np.prod(trailing_dims))
    return tf.reshape(x, [-1, flat_size])


def batch_norm(x, axes):
    """Normalize ``x`` with the mean and variance of the current batch.

    Creates two variables named "offset" (initialized to 0) and "scale"
    (initialized to 1), each sized to the last dimension of ``x``. Mean and
    variance are always computed from ``x`` itself; no running averages are
    kept.

    Args:
        x: tensor to normalize.
        axes: axes over which ``tf.nn.moments`` computes mean and variance.

    Returns:
        The normalized, scaled and shifted tensor.
    """
    mean, var = tf.nn.moments(x, axes = axes)
    param_shape = [x.shape[-1]]
    offset = tf.get_variable("offset", param_shape, tf.float32, tf.constant_initializer(0.0))
    scale = tf.get_variable("scale", param_shape, tf.float32, tf.constant_initializer(1.0))
    return tf.nn.batch_normalization(
        x,
        mean=mean,
        variance=var,
        offset=offset,
        scale=scale,
        variance_epsilon=1e-6,
    )

In [96]:
# Create the CartPole environment and show its metadata, observation space
# and action space (one per line).
env = gym.make("CartPole-v0")

for description in (env.metadata, env.observation_space, env.action_space):
    print(description)

{'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 50}
Box(4,)
Discrete(2)


In [97]:
# Sanity check: play the environment with uniformly random actions.
# Note that every step is rendered; lower `episodes` for a quick visual check.
episodes = 2000
try:
    for episode in range(episodes):
        # Each episode has to start from a fresh state with `done` cleared;
        # otherwise only the very first episode is ever played.
        game_state = env.reset()
        done = False
        while not done:
            env.render()
            sampled_action = env.action_space.sample()
            game_state, reward, done, _ = env.step(sampled_action)
finally:
    # Release the render window even if the rollout is interrupted.
    env.close()

## Build the Model

In [98]:
# Placeholder for the current state of the environment: a batch of one
# observation with 4 values.
state = tf.placeholder(tf.float32, [1,4])

with tf.variable_scope("HIDDEN", reuse=tf.AUTO_REUSE) as scope:
    hidden = feed_forward_layer(state, 8, tf.tanh, False, 0.002)
    print(hidden)

with tf.variable_scope("OUT", reuse=tf.AUTO_REUSE) as scope:
    # A single sigmoid unit: `out` is the probability p of action 0,
    # 1 - p is the probability of action 1.
    out = feed_forward_layer(hidden, 1, tf.sigmoid, False, 0.002)
    print(out)
    # tf.multinomial samples from (unnormalized) log-probabilities, so the two
    # action probabilities [p, 1 - p] are passed through a log and arranged as
    # one row with two columns.
    log_out = tf.reshape(tf.log([out, 1-out]), shape=(1,2))
    print(log_out)
    action = tf.multinomial(log_out, num_samples=1)[0][0]
    # Despite its name this is the *log*-probability of the sampled action
    # (it is read from log_out).
    action_probability = log_out[:, tf.to_int32(action)]

with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE) as scope:
    optimizer = tf.train.AdamOptimizer(learning_rate=0.02)
    # Keep only variables the sampled action actually depends on; for any
    # other trainable variable in the graph compute_gradients returns None.
    gradients_and_variables = [
        (gradient, variable)
        for gradient, variable in optimizer.compute_gradients(action_probability)
        if gradient is not None
    ]
    # The optimizer minimizes, so the gradient of the log-probability is
    # negated to perform gradient ascent on it.
    gradients = [gradient * -1 for gradient, _ in gradients_and_variables]
    print(gradients)
    # The reward-weighted gradients are computed outside the graph and fed
    # back in through these placeholders.
    gradient_placeholders = [tf.placeholder(tf.float32, gradient.shape) for gradient in gradients]
    print(gradient_placeholders)
    # Pair every placeholder with the variable its gradient was computed for,
    # instead of relying on the order of tf.trainable_variables().
    policy_variables = [variable for _, variable in gradients_and_variables]
    training_step = optimizer.apply_gradients(zip(gradient_placeholders, policy_variables))
    
    


Tensor("HIDDEN_16/Tanh:0", shape=(1, 8), dtype=float32)
Tensor("OUT_16/Sigmoid:0", shape=(1, 1), dtype=float32)
Tensor("OUT_16/Reshape:0", shape=(1, 2), dtype=float32)
[<tf.Tensor 'optimizer_7/mul:0' shape=(4, 8) dtype=float32>, <tf.Tensor 'optimizer_7/mul_1:0' shape=(8,) dtype=float32>, <tf.Tensor 'optimizer_7/mul_2:0' shape=(8, 1) dtype=float32>, <tf.Tensor 'optimizer_7/mul_3:0' shape=(1,) dtype=float32>]
[<tf.Tensor 'optimizer_7/Placeholder:0' shape=(4, 8) dtype=float32>, <tf.Tensor 'optimizer_7/Placeholder_1:0' shape=(8,) dtype=float32>, <tf.Tensor 'optimizer_7/Placeholder_2:0' shape=(8, 1) dtype=float32>, <tf.Tensor 'optimizer_7/Placeholder_3:0' shape=(1,) dtype=float32>]


In [99]:
# Policy-gradient training: per epoch, play several episodes with the current
# policy, weight the per-step gradients with the normalized discounted
# returns, sum everything up and apply it in a single optimizer step.
env = gym.make("CartPole-v0")

n_epochs = 50
runs_per_epoch = 10

try:
    with tf.Session(config=config) as sess:

        sess.run(tf.global_variables_initializer())

        for epoch in range(n_epochs):
            # Running sum of reward-weighted gradients, one array per variable.
            total_gradients = None
            scores = []
            for run in range(runs_per_epoch):
                game_state = env.reset()
                done = False
                gradient_list = []
                reward_list = []
                while not done:
                    # Only the first episode of every epoch is rendered.
                    if run == 0:
                        env.render()
                    sampled_action, gradient = sess.run(
                        [action, gradients],
                        feed_dict={state: np.reshape(game_state, (1,4))},
                    )
                    game_state, reward, done, _ = env.step(sampled_action)
                    gradient_list.append(gradient)
                    reward_list.append(reward)
                # The score of an episode is the number of steps it lasted.
                scores.append(len(reward_list))
                norm_rewards = discount_rewards(reward_list, discount_factor)

                # The gradients of different variables have different shapes,
                # so they are reduced per variable: for each variable compute
                # sum_t norm_rewards[t] * gradient[t].
                episode_gradients = [
                    np.tensordot(norm_rewards, np.stack(per_variable), axes=1)
                    for per_variable in zip(*gradient_list)
                ]
                if total_gradients is None:
                    total_gradients = episode_gradients
                else:
                    total_gradients = [
                        total + episode
                        for total, episode in zip(total_gradients, episode_gradients)
                    ]

            feed_dict = dict(zip(gradient_placeholders, total_gradients))
            sess.run(training_step, feed_dict = feed_dict)
            print("Epoch: {} Score: {}".format(epoch, np.mean(scores)))
finally:
    # Close the environment even if training is interrupted.
    env.close()
                
            

Epoch: 0 Score: 20.4
Epoch: 1 Score: 25.7
Epoch: 2 Score: 23.2
Epoch: 3 Score: 19.6
Epoch: 4 Score: 24.2
Epoch: 5 Score: 28.0
Epoch: 6 Score: 21.7
Epoch: 7 Score: 20.9
Epoch: 8 Score: 25.7
Epoch: 9 Score: 23.3
Epoch: 10 Score: 15.1
Epoch: 11 Score: 20.5
Epoch: 12 Score: 23.1
Epoch: 13 Score: 21.4
Epoch: 14 Score: 18.9
Epoch: 15 Score: 27.6
Epoch: 16 Score: 19.3
Epoch: 17 Score: 23.7
Epoch: 18 Score: 19.2
Epoch: 19 Score: 29.4
Epoch: 20 Score: 17.6
Epoch: 21 Score: 34.3
Epoch: 22 Score: 40.6
Epoch: 23 Score: 33.0
Epoch: 24 Score: 35.4
Epoch: 25 Score: 31.0
Epoch: 26 Score: 37.3
Epoch: 27 Score: 40.0
Epoch: 28 Score: 45.7
Epoch: 29 Score: 36.8
Epoch: 30 Score: 43.7
Epoch: 31 Score: 48.8
Epoch: 32 Score: 38.5
Epoch: 33 Score: 50.8
Epoch: 34 Score: 52.3
Epoch: 35 Score: 45.7
Epoch: 36 Score: 56.4
Epoch: 37 Score: 66.9
Epoch: 38 Score: 66.1
Epoch: 39 Score: 87.7
Epoch: 40 Score: 62.5
Epoch: 41 Score: 99.5
Epoch: 42 Score: 99.2
Epoch: 43 Score: 92.6
Epoch: 44 Score: 123.7
Epoch: 45 Score: 13