# Chapter 2: Reinforcement Learning Algorithms

## Sample Code p. 25

### Notes regarding the objects used below

1. keras.layers.Input(): Calling .Input() function will instantiate a symbolic Keras tensor (i.e. a placeholder). Specifying a shape tuple indicates the expected input, while None refers to an unknow shape. 
2. keras.layers.Dense()(): First set of parentheses specifies the layer attributes, the second is where the prior layers' output goes. Alternatively the layers can be arranged by using tf.keras.models.Sequential() whicht takes care of what serves as input/output to what and in wht shape.


In [None]:
# %load neural_networks/policy_gradient_utilities.py
#!/usr/bin/env python2
"""
Created on Mon Mar 25 15:22:27 2019

@author: tawehbeysolow
@notes: gianrothfuchs
"""

import keras.layers as layers
from keras import backend
from keras.models import Model
from keras.optimizers import Adam
from keras.initializers import glorot_uniform

class PolicyGradient():
    
    def __init__(self, n_units, n_layers, n_columns, n_outputs, learning_rate, hidden_activation, output_activation, loss_function):
        self.n_units = n_units
        self.n_layers = n_layers
        self.n_columns = n_columns
        self.n_outputs = n_outputs
        self.hidden_activation = hidden_activation
        self.output_activation = output_activation
        self.learning_rate = learning_rate
        self.loss_function = loss_function
        
    def methodlog_likelihood_loss(actual_labels, predicted_labels):
        log_likelihood = backend.log(actual_labels * (actual_labels - predicted_labels) + 
                              (1 - actual_labels) * (actual_labels + predicted_labels))
        return backend.mean(log_likelihood * advantages, keepdims=True)

    def create_policy_model(self, input_shape):
        input_layer = layers.Input(shape=input_shape)
        advantages = layers.Input(shape=[1])
        
        hidden_layer = layers.Dense(units=self.n_units, 
                                    activation=self.hidden_activation,
                                    use_bias=False,
                                    kernel_initializer=glorot_uniform(seed=42))(input_layer)
        
        output_layer = layers.Dense(units=self.n_outputs, 
                                    activation=self.output_activation,
                                    use_bias=False,
                                    kernel_initializer=glorot_uniform(seed=42))(hidden_layer)
        

        
        if self.loss_function == 'log_likelihood':
            self.loss_function = self.methodlog_likelihood_loss
        else:
            self.loss_function = 'categorical_crossentropy'
                
        policy_model = Model(inputs=[input_layer, advantages], outputs=output_layer)
        policy_model.compile(loss=self.loss_function, optimizer=Adam(self.learning_rate))
        model_prediction = Model(input=[input_layer], outputs=output_layer)
        return policy_model, model_prediction


In [1]:
# %load chapter2/cart_pole_example.py
#!/usr/bin/env python2
"""
Created on Wed Feb 20 13:50:58 2019

@author: tawehbeysolow
"""
import keras


import gym, numpy as np, matplotlib.pyplot as plt
from neural_networks.policy_gradient_utilities import PolicyGradient

#Parameters 
n_units = 5
gamma = .99
batch_size = 50
learning_rate = 1e-3
n_episodes = 500
render = False
goal = 195
n_layers = 2
n_classes = 2
environment = gym.make('CartPole-v1')
environment_dimension = len(environment.reset())
            
def calculate_discounted_reward(reward, gamma=gamma):
    output = [reward[i] * gamma**i for i in range(0, len(reward))]
    return output[::-1]

def score_model(model, n_tests, render=render):
    scores = []    
    for _ in range(n_tests):
        environment.reset()
        observation = environment.reset()
        reward_sum = 0
        while True:
            if render:
                environment.render()
                
            state = np.reshape(observation, [1, environment_dimension])
            predict = model.predict([state])[0]
            action = np.argmax(predict)
            observation, reward, done, _ = environment.step(action)
            reward_sum += reward
            if done:
                break
        scores.append(reward_sum)
        
    environment.close()
    return np.mean(scores)

def cart_pole_game(environment, policy_model, model_predictions):
    loss = []
    n_episode, reward_sum, score, episode_done = 0, 0, 0, False
    n_actions = environment.action_space.n
    observation = environment.reset()
    
    states = np.empty(0).reshape(0, environment_dimension)
    actions = np.empty(0).reshape(0, 1)
    rewards = np.empty(0).reshape(0, 1)
    discounted_rewards = np.empty(0).reshape(0, 1)
    
    while n_episode < n_episodes: 
         
        state = np.reshape(observation, [1, environment_dimension])        
        prediction = model_predictions.predict([state])[0]
        action = np.random.choice(range(environment.action_space.n), p=prediction)
        states = np.vstack([states, state])
        actions = np.vstack([actions, action])
        
        observation, reward, episode_done, info = environment.step(action)
        reward_sum += reward
        rewards = np.vstack([rewards, reward])

        if episode_done == True:
            
            discounted_reward = calculate_discounted_reward(rewards)
            discounted_rewards = np.vstack([discounted_rewards, discounted_reward])
            rewards = np.empty(0).reshape(0, 1)
            
            if (n_episode + 1) % batch_size == 0:
                
                discounted_rewards -= discounted_rewards.mean()
                discounted_rewards /= discounted_rewards.std()
                discounted_rewards = discounted_rewards.squeeze()
                actions = actions.squeeze().astype(int)
                
                train_actions = np.zeros([len(actions), n_actions])
                train_actions[np.arange(len(actions)), actions] = 1
                
                error = policy_model.train_on_batch([states, discounted_rewards], train_actions)
                loss.append(error)
                
                states = np.empty(0).reshape(0, environment_dimension)
                actions = np.empty(0).reshape(0, 1)
                discounted_rewards = np.empty(0).reshape(0, 1)
                                
                #score = score_model(model=model_predictions, n_tests=100)
                
                #print('\nEpisode: {} \nAverage Reward: {}  \nScore: {} \nError: {}'.format(n_episode+1, reward_sum/float(batch_size), score, np.mean(loss[-batch_size:])))
    
                #if score >= goal:
                #    break 
                
                reward_sum = 0
                
            n_episode += 1
            observation = environment.reset()
            
    plt.title('Policy Gradient Error plot over %s Episodes'%(n_episode+1))
    plt.xlabel('N batches')
    plt.ylabel('Error Rate')
    plt.plot(loss)
    plt.show()
    
if __name__ == '__main__':
        
    
    mlp_model = PolicyGradient(n_units=n_units, 
                              n_layers=n_layers, 
                              n_columns=environment_dimension, 
                              n_outputs=n_classes, 
                              learning_rate=learning_rate, 
                              hidden_activation='selu', 
                              output_activation='softmax',
                              loss_function='log_likelihood')
        
    policy_model, model_predictions = mlp_model.create_policy_model(input_shape=(environment_dimension, ))
    
    policy_model.summary()
    
    cart_pole_game(environment=environment, 
                   policy_model=policy_model, 
                   model_predictions=model_predictions)
    
    model_predictions.save('savings/chapter2/cartpole/e50k.h5')
    print('done')

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 20        
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 10        
Total params: 30
Trainable params: 30
Non-trainable params: 0
_________________________________________________________________



  model_prediction = Model(input=[input_layer], outputs=output_layer)


<Figure size 640x480 with 1 Axes>

done


## Whatch the Reinforcement-Learner Play
    

In [None]:
from IPython.display import Video
Video("videos/chapter2/cartpole/vid.mp4",width=600, height=400)

In [None]:
import keras
import keras.layers as layers

from keras import backend
from keras.models import Model
from keras.optimizers import Adam
from keras.initializers import glorot_uniform

import gym, numpy as np, matplotlib.pyplot as plt
from neural_networks.policy_gradient_utilities import PolicyGradient



n_units = 5
gamma = .99
batch_size = 50
learning_rate = 1e-3
n_episodes = 500
render = False
goal = 193
n_layers = 2
n_classes = 2
environment = gym.make('CartPole-v1')
environment_dimension = len(environment.reset())

mlp_model = PolicyGradient(n_units=n_units, 
                          n_layers=n_layers, 
                          n_columns=environment_dimension, 
                          n_outputs=n_classes, 
                          learning_rate=learning_rate, 
                          hidden_activation='selu', 
                          output_activation='softmax',
                          loss_function='log_likelihood')

model_predictions = keras.models.load_model('savings/chapter2/cartpole/e50k.h5',compile=False)

In [None]:
import VideoRec as vr
video_recorder = None
video_recorder = vr.VideoRecorder(
        environment, "videos/chapter2/cartpole/vid.mp4", enabled=True)


In [None]:
loss = []

n_episode, reward_sum, score, episode_done = 0, 0, 0, False
n_actions = environment.action_space.n
observation = environment.reset()


states_log = np.empty(0).reshape(0, environment_dimension)
actions_log = np.empty(0).reshape(0, 1)
rewards_log = np.empty(0).reshape(0, 1)
discounted_rewards = np.empty(0).reshape(0, 1)

while not episode_done: 
    
    #environment.render()
    environment.unwrapped.render()
    video_recorder.capture_frame()
    state = np.reshape(observation, [1, environment_dimension])        
    prediction = model_predictions.predict([state])[0]
    action = np.random.choice(range(environment.action_space.n), p=prediction)
    states_log = np.vstack([states_log, state])
    actions_log = np.vstack([actions_log, action])

    observation, reward, episode_done, info = environment.step(action)
    reward_sum += reward
    rewards_log = np.vstack([rewards_log, reward])
    
if video_recorder.enabled:
    video_recorder.close()
    video_recorder.enabled = False

environment.close()