In [None]:
!pip3 uninstall gym
!pip3 install box2d-py
!pip3 install gym[box_2D]
!pip3 install gym[box2d]

import numpy as np
import tensorflow as tf
import gym
from gym.spaces import Box
from PIL import Image as im

import datetime
from statistics import mean
from gym import wrappers

from tensorflow.keras import Model
from tensorflow.keras.layers import Layer
import tensorflow_probability as tfp

In [5]:
class Actor(Model):
    """Convolutional Gaussian policy with three action components.

    The network maps an image batch to three sigmoid outputs, used as the
    means of independent normal distributions with fixed standard deviations.
    The first component (steering) is rescaled from (0, 1) to (-1, 1); the
    other two (gas, brake) stay in (0, 1).
    """

    # Fixed standard deviations of the policy, ordered (steering, gas, brake).
    ACTION_STD = (0.5, 0.25, 0.25)

    def __init__(self):
        super().__init__()
        # Define the layers.
        self.conv_1 = tf.keras.layers.Conv2D(filters = 16, kernel_size = 3,
                                               activation=tf.keras.activations.relu, input_shape = (96,96,3)
                                               )
        self.max_pool_1 = tf.keras.layers.MaxPool2D()
        self.conv_2 = tf.keras.layers.Conv2D(filters = 32, kernel_size = 3,
                                               activation=tf.keras.activations.relu
                                               )
        self.max_pool_2 = tf.keras.layers.MaxPool2D()

        self.global_pool = tf.keras.layers.GlobalAveragePooling2D()

        # Sigmoid keeps every raw mean inside (0, 1).
        self.output_layer = tf.keras.layers.Dense(3,
                                                  activation = tf.keras.activations.sigmoid
                                                  )

    def call(self, x):
        """Run the forward pass and return the three sigmoid outputs per sample."""
        x = self.conv_1(x)
        x = self.max_pool_1(x)
        x = self.conv_2(x)
        x = self.max_pool_2(x)
        x = self.global_pool(x)
        x = self.output_layer(x)

        return x

    def sample_prob(self, state, action = False):
        """Return the log-probability of an action under the policy for ``state``.

        Args:
            state: Network input holding a single state (batch of one).
            action: Action ``[steering, gas, brake]`` to evaluate. When omitted
                (``False``, the default, or ``None``) an action is sampled
                from the policy instead.

        Returns:
            ``(log_prob, action)``: the per-component log-probabilities and the
            action they refer to. A sampled action is a list of three NumPy
            scalars; it is not clipped, so components may fall outside the
            range of their mean.
        """
        # get network output for one state
        mu = self(state)
        # Rescale the steering mean from (0, 1) to (-1, 1); gas and brake are
        # multiplied by 1 and shifted by 0, i.e. left unchanged.
        mu = mu * tf.constant([2.0, 1.0, 1.0]) - tf.constant([1.0, 0.0, 0.0])

        steering_std, gas_std, brake_std = self.ACTION_STD

        # Compare against the sentinels explicitly: the truth value of a
        # multi-element array or tensor is ambiguous and raises.
        if action is None or action is False:
            steering = tf.random.normal([1], mu[0][0], steering_std)
            gas = tf.random.normal([1], mu[0][1], gas_std)
            brake = tf.random.normal([1], mu[0][2], brake_std)

            action = [steering.numpy()[0], gas.numpy()[0], brake.numpy()[0]]

        log_prob = tfp.distributions.Normal(mu, list(self.ACTION_STD)).log_prob(action)
        return log_prob, action




class Critic(Model):
    """Convolutional value network producing one linear output per input image.

    Two convolution / max-pooling stages are followed by global average
    pooling and a single-unit dense layer with linear activation.
    """

    def __init__(self):
        super().__init__()
        kl = tf.keras.layers
        relu = tf.keras.activations.relu

        # Feature extractor: 16 then 32 filters, both with 3x3 kernels.
        self.conv_1 = kl.Conv2D(filters=16, kernel_size=3, activation=relu,
                                input_shape=(96, 96, 3))
        self.max_pool_1 = kl.MaxPool2D()
        self.conv_2 = kl.Conv2D(filters=32, kernel_size=3, activation=relu)
        self.max_pool_2 = kl.MaxPool2D()

        # Collapse the spatial dimensions before the output layer.
        self.global_pool = kl.GlobalAveragePooling2D()

        # Single unbounded output.
        self.output_layer = kl.Dense(1, activation=tf.keras.activations.linear)

    def call(self, x):
        """Apply the layers in order and return the network output."""
        pipeline = (
            self.conv_1,
            self.max_pool_1,
            self.conv_2,
            self.max_pool_2,
            self.global_pool,
            self.output_layer,
        )
        for stage in pipeline:
            x = stage(x)
        return x

In [6]:
def preprocess_state(state):
    """Convert a raw observation into a scaled float32 batch of one.

    The input is cast to ``tf.float32``, divided by 255 and given a new
    leading axis of size 1. No colour conversion is applied.
    """
    as_float = tf.cast(state, tf.float32)
    return tf.expand_dims(as_float / 255, axis=0)

In [7]:
def _reset_env(env):
    """Reset ``env`` and return only the observation, for either gym reset API."""
    result = env.reset()
    # Newer gym releases return ``(observation, info_dict)``; older ones
    # return the observation alone.
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(env, action):
    """Step ``env`` and return ``(observation, reward, done)`` for either gym step API."""
    result = env.step(action)
    if len(result) == 5:
        # 5-tuple API: the episode is over when it terminated or was truncated.
        observation, reward, terminated, truncated, _ = result
        return observation, reward, bool(terminated or truncated)
    observation, reward, done, _ = result
    return observation, reward, done


def create_trajectories(model, env, defined_steps = False):
    """Roll out one episode (or a capped prefix of it) with ``model``.

    Args:
        model: Object exposing ``sample_prob(state)`` that returns
            ``(log_prob, action)``.
        env: Environment exposing ``reset()`` and ``step(action)``; both the
            4-tuple and the 5-tuple ``step`` return conventions are accepted.
        defined_steps: Maximum number of steps to take. A falsy value
            (the default) means "run until the environment reports done".

    Returns:
        ``(states, actions, rewards, next_states, log_probs)``: five lists
        with one entry per step taken. States are stored after
        ``preprocess_state`` has been applied.
    """
    states = []
    actions = []
    rewards = []
    next_states = []
    log_probs = []

    state = preprocess_state(_reset_env(env))
    done = False
    steps_taken = 0

    while not done and (not defined_steps or defined_steps > steps_taken):
        log_prob, action = model.sample_prob(state)

        next_state, reward, done = _step_env(env, action)
        next_state = preprocess_state(next_state)

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        next_states.append(next_state)
        log_probs.append(log_prob)

        state = next_state
        steps_taken += 1

    # Return the whole list of log-probabilities, not just the last step's.
    return states, actions, rewards, next_states, log_probs

In [None]:
#Vanilla Policy Gradient
# The actor is trained on the log-probability of each taken action weighted by
# (discounted return - critic value); the critic is regressed onto the
# discounted return with a Huber loss.
actor = Actor()
critic = Critic()
env = gym.make("CarRacing-v2")

episodes = 10
gamma = 0.99
steps_per_episode = 100  # step cap handed to create_trajectories
learning_rate = 0.001

# One optimizer per network. Sharing a single instance mixes the step counter
# of both networks, and Keras optimizers that build their state from the
# first variable list they see reject a second, different list.
# `optimizer` keeps its original name and now serves the actor only.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

for episode in range(episodes):
    states, actions, rewards, _, _ = create_trajectories(actor, env, steps_per_episode)

    # Discounted return for every step, accumulated from the last step
    # backwards. `reversed` leaves the `rewards` list itself untouched.
    running_return = 0.0
    discnt_rewards = []
    for r in reversed(rewards):
        running_return = r + gamma * running_return
        discnt_rewards.append(running_return)
    discnt_rewards.reverse()

    # One gradient step per visited state for each network.
    for state, reward, action in zip(states, discnt_rewards, actions):
        with tf.GradientTape() as actor_tape, tf.GradientTape() as critic_tape:
            log_prob, _ = actor.sample_prob(state, action)

            values = critic(state)
            target = tf.constant([[reward]], dtype=tf.float32)

            # The advantage only weights the actor update; stop_gradient makes
            # explicit that the actor loss does not train the critic.
            advantage = tf.stop_gradient(target - values)
            actor_loss = -tf.math.reduce_sum(log_prob * advantage)

            # Keras losses take (y_true, y_pred).
            critic_loss = huber_loss(target, values)

        actor_grads = actor_tape.gradient(actor_loss, actor.trainable_variables)
        optimizer.apply_gradients(zip(actor_grads, actor.trainable_variables))

        critic_grads = critic_tape.gradient(critic_loss, critic.trainable_variables)
        critic_optimizer.apply_gradients(zip(critic_grads, critic.trainable_variables))
