In [2]:
import tensorflow as tf
import tensorflow_probability as tfp
import pandas as pd
import numpy as np
import gym

# Build the CartPole-v0 environment that the agent defined below is trained on.
env = gym.make('CartPole-v0')
# Set the private `_max_episode_steps` attribute on the returned env object to 200.
# NOTE(review): presumably this is the TimeLimit wrapper's per-episode step cap -
# confirm against the installed gym version. 200 matches `Reinforce.max_reward`.
env._max_episode_steps=200

In [3]:
class Reinforce(tf.keras.Model):
    """REINFORCE (Monte-Carlo policy gradient) agent with a softmax MLP policy.

    The policy is three 128-unit ReLU dense layers followed by a softmax layer
    with one output per discrete action. Each training iteration samples
    ``MAX_TRAJ`` full episodes from ``self.env``, computes the discounted
    return for every step and takes one Adam step on the policy-gradient loss.

    The environment is expected to follow the classic gym API used throughout
    this class: ``reset()`` returns an observation and ``step(a)`` returns
    ``(observation, reward, done, info)``.
    """

    def __init__(self,env):
        """Store the environment and build the policy network and optimizer.

        Args:
            env: environment exposing ``observation_space.shape``,
                ``action_space.n``, ``reset``, ``step``, ``render`` and ``close``.
        """
        super(Reinforce, self).__init__()
        self.env=env
        # Number of episodes sampled per gradient step.
        self.MAX_TRAJ=200
        # Training stops once the average episode reward reaches this value.
        self.max_reward=200
        self.space_size=env.observation_space.shape
        self.action_size=env.action_space.n
        self.learning_rate=0.01
        # Discount factor applied to future rewards.
        self.gamma=0.999
        self.optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        self.layer1=tf.keras.layers.Dense(128,"relu",input_shape=self.space_size)
        self.layer2=tf.keras.layers.Dense(128,"relu")
        self.layer3=tf.keras.layers.Dense(128,"relu")
        self.layer4=tf.keras.layers.Dense(self.action_size,"softmax")

    def call_model(self,state):
        """Run the policy network and return the per-action probabilities.

        Args:
            state: batch of observations, one row per state.

        Returns:
            Tensor of softmax probabilities with ``action_size`` columns.
        """
        net=self.layer1(state)
        net=self.layer2(net)
        net=self.layer3(net)
        net=self.layer4(net)
        return net

    def action(self,state):
        """Sample one action for a single observation from the current policy.

        Args:
            state: a single observation (array-like); it is reshaped to one row.

        Returns:
            The sampled action index.
        """
        s=np.asarray(state).reshape(1,-1)
        probs=self.call_model(s)
        sampled=tfp.distributions.Categorical(probs=probs).sample()
        return sampled.numpy()[0]

    def get_log_prob(self,state,action):
        """Return log pi(action_i | state_i) for every row ``i`` of the batch.

        Args:
            state: batch of observations, one row per step.
            action: the action taken at each step; may be shaped ``(N,)`` or
                ``(N, 1)``.

        Returns:
            1-D tensor of length N with the log-probability of each action.
        """
        probs=self.call_model(state)
        dist=tfp.distributions.Categorical(probs=probs)
        # The distribution has batch shape (N,). Passing actions shaped (N, 1)
        # would broadcast to an (N, N) matrix pairing every action with every
        # state, so flatten to (N,) to get one log-prob per step.
        action=tf.reshape(action,[-1])
        return dist.log_prob(action)

    def get_return_G(self,rewards):
        """Compute the discounted return-to-go for every step of one episode.

        ``G[i] = sum_k gamma**k * rewards[i + k]``, accumulated backwards in a
        single pass.

        Args:
            rewards: sequence of per-step rewards for one episode.

        Returns:
            List with one single-element list ``[G_i]`` per step (empty when
            ``rewards`` is empty).
        """
        G=[]
        running=0
        for r in reversed(rewards):
            running=r+self.gamma*running
            G.append([running])
        G.reverse()
        return G

    def get_trajectory(self):
        """Roll out one full episode in ``self.env`` with the current policy.

        Returns:
            Tuple ``(states, actions, rewards)`` of equal-length lists; each
            action is stored as a single-element list ``[a]``.
        """
        states=[]
        actions=[]
        rewards=[]

        done=False
        # Use the environment given to the constructor, not a module global.
        curr_state=self.env.reset()

        while not done:
            act=self.action(curr_state)
            next_state,reward,done,_=self.env.step(act)
            states.append(curr_state)
            actions.append([act])
            rewards.append(reward)
            curr_state=next_state
        return states,actions,rewards

    def get_full_experience(self):
        """Sample ``MAX_TRAJ`` episodes and concatenate their steps.

        Returns:
            Tuple ``(states, actions, returns, mean_reward)`` where the first
            three are numpy arrays with one row per collected step and
            ``mean_reward`` is the average undiscounted episode reward.
        """
        all_states=[]
        all_actions=[]
        all_G=[]
        current_value=0
        for _ in range(self.MAX_TRAJ):
            states,actions,rewards=self.get_trajectory()
            G=self.get_return_G(rewards)
            all_states.extend(states)
            all_actions.extend(actions)
            all_G.extend(G)
            current_value+=sum(rewards)
        current_value/=self.MAX_TRAJ
        return np.array(all_states),np.array(all_actions),np.array(all_G),current_value

    def loss(self,states,actions,returns):
        """Policy-gradient loss: ``-mean_i(log pi(a_i | s_i) * G_i)``.

        Args:
            states: array of observations, one row per step.
            actions: action taken at each step, shaped ``(N,)`` or ``(N, 1)``.
            returns: discounted return of each step, shaped ``(N,)`` or ``(N, 1)``.

        Returns:
            Scalar tensor.
        """
        m=states.shape[0]
        log_prob=self.get_log_prob(states,actions)
        # Match shape and dtype of log_prob so the product is element-wise
        # per step rather than an accidental (N, N) broadcast.
        returns=tf.reshape(tf.cast(returns,log_prob.dtype),[-1])
        return -1* tf.reduce_sum(tf.multiply(log_prob,returns))/m

    def train(self,max_iterations=None):
        """Train until the average episode reward reaches ``max_reward``.

        Each iteration collects a fresh batch of episodes, prints the average
        reward, and applies one gradient step.

        Args:
            max_iterations: optional cap on the number of iterations; ``None``
                (the default) keeps looping until the reward target is met.
        """
        max_value=self.max_reward
        curr_value=0

        counter=1

        while curr_value < max_value:
            if max_iterations is not None and counter > max_iterations:
                break
            states,actions,returns,curr_value=self.get_full_experience()
            print(f"At iteration {counter} value_function: {curr_value}")

            with tf.GradientTape() as t:
                J=self.loss(states,actions,returns)
            grads=t.gradient(J,self.trainable_variables)
            self.optimizer.apply_gradients(zip(grads,self.trainable_variables))

            counter+=1

    def test(self,m=1):
        """Render ``m`` episodes with the current policy and print their rewards.

        Args:
            m: number of episodes to play.
        """
        for i in range(m):
            curr_state=self.env.reset()
            done=False
            rewd=0
            while not done:
                self.env.render()
                act=self.action(curr_state)
                curr_state,reward,done,_=self.env.step(act)
                rewd+=reward
            print(f"{i}: Total Reward:{rewd}")
        # Release the render window / simulator resources.
        self.env.close()



      
    

In [4]:
# Instantiate the agent on the environment created above; the constructor
# builds the policy layers and the Adam optimizer.
agent=Reinforce(env)

In [None]:
# Run training. Each iteration samples a batch of episodes, prints the average
# episode reward and applies one gradient step; the loop ends once that average
# reaches `agent.max_reward`, so this cell can run for a long time.
agent.train()