In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

In [12]:
class Agent(tf.keras.Model):
    """Stochastic two-action policy: three 128-unit ReLU layers and one output unit.

    The single output parameterises a Bernoulli distribution over the
    actions {0, 1}.  Internally the last layer is linear and produces a
    *logit*; the distribution is built from that logit rather than from a
    sigmoid probability so that ``log_prob`` (and its gradient) stays finite
    when the policy becomes very confident.  ``call`` still returns the
    sigmoid probability, as before.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = tf.keras.layers.Dense(128, activation="relu", input_shape=(4,))
        self.layer2 = tf.keras.layers.Dense(128, activation="relu")
        self.layer3 = tf.keras.layers.Dense(128, activation="relu")
        # Linear head: the sigmoid is applied in call(); the distributions
        # consume the raw logit for numerical stability.
        self.layer4 = tf.keras.layers.Dense(1)

    def _forward_logits(self, s):
        """Run the network on a batch of states and return the action logits."""
        net = self.layer1(s)
        net = self.layer2(net)
        net = self.layer3(net)
        return self.layer4(net)

    def call(self, s):
        """Return P(action == 1 | s) for every row of the batch ``s``."""
        return tf.sigmoid(self._forward_logits(s))

    def action(self, s):
        """Sample one action for a single state.

        Args:
            s: one observation; it is flattened into a batch of size 1.

        Returns:
            The first row of the sampled batch as a NumPy array, i.e. a
            length-1 array holding the sampled action (0 or 1).
        """
        batch = np.asarray(s).reshape(1, -1)
        dist = tfp.distributions.Bernoulli(logits=self._forward_logits(batch))
        return dist.sample().numpy()[0]

    def log_prob(self, s, a):
        """Return log pi(a | s) for a batch of states ``s`` and actions ``a``.

        The result has the broadcast shape of the network output and ``a``.
        """
        dist = tfp.distributions.Bernoulli(logits=self._forward_logits(s))
        return dist.log_prob(a)
    
    

In [13]:
# Module-level policy instance; the functions defined in later cells read this
# global directly instead of taking the agent as a parameter.
agent = Agent()

In [22]:
def loss(S, A, G):
    """Policy-gradient surrogate loss for the global ``agent``.

    Computes ``-sum(log pi(A | S) * G) / S.shape[0]``.

    Args:
        S: batch of states; its first dimension is used as the divisor.
        A: actions taken in those states, passed to ``agent.log_prob``.
        G: per-sample weights multiplied element-wise with the
            log-probabilities (the caller passes discounted returns).

    Returns:
        A scalar tensor; minimising it increases the log-probability of
        actions that have a positive weight.
    """
    n_samples = S.shape[0]
    weighted_log_probs = tf.multiply(agent.log_prob(S, A), G)
    total = tf.reduce_sum(weighted_log_probs)
    return -1 * total / n_samples

In [23]:
import gym

# Shared CartPole environment used by the rollout code in later cells.
env = gym.make("CartPole-v0")
# Set the per-episode step limit on the environment wrapper to 200.
env._max_episode_steps = 200

In [24]:
def get_returns_from_rewards(rewards, gamma=0.99):
    """Compute the discounted return-to-go for every step of one episode.

    ``G[t] = rewards[t] + gamma * G[t + 1]``, with the return after the
    last step taken as 0.

    Args:
        rewards: sequence of per-step rewards, in time order.
        gamma: discount factor (defaults to 0.99).

    Returns:
        1-D NumPy array with one return per reward; empty if ``rewards``
        is empty.
    """
    returns = []
    running = 0
    # Walk backwards so each return reuses the one that follows it.  Appending
    # and reversing once is linear, unlike inserting at the front on every step.
    for r in reversed(rewards):
        running = gamma * running + r
        returns.append(running)
    returns.reverse()
    return np.array(returns)

In [25]:
def get_trijectory():
    """Roll out one full episode in the global ``env`` with the global ``agent``.

    Works with both gym step APIs: the older one (``reset()`` returns the
    observation, ``step()`` returns a 4-tuple) and the newer one
    (``reset()`` returns ``(observation, info)``, ``step()`` returns a
    5-tuple with separate ``terminated`` / ``truncated`` flags).

    Returns:
        states: list of the observations the agent acted on.
        actions: list of the arrays returned by ``agent.action``.
        G: list of one-element lists holding the discounted return-to-go
            of each step.
        R: undiscounted sum of the episode's rewards.
    """
    states = []
    actions = []
    rewards = []

    reset_out = env.reset()
    # Newer gym versions return (observation, info); older ones return the
    # bare observation, which is an array here and therefore never a tuple.
    curr_state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    done = False

    while not done:
        act = agent.action(curr_state)
        # Hand the environment a plain Python int rather than a NumPy scalar.
        step_out = env.step(int(act[0]))
        if len(step_out) == 5:
            next_state, reward, terminated, truncated, _ = step_out
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_out

        states.append(curr_state)
        actions.append(act)
        rewards.append(reward)

        curr_state = next_state

    G = get_returns_from_rewards(rewards).reshape(-1, 1).tolist()
    R = sum(rewards)
    return states, actions, G, R

In [26]:
def get_experience_on_theta(m):
    """Collect ``m`` episodes with the current policy and pool their samples.

    Args:
        m: number of episodes to roll out via ``get_trijectory``.

    Returns:
        A 4-tuple ``(states, actions, returns, mean_reward)``: the first
        three are NumPy arrays built from the concatenated per-step data of
        all episodes, the last is the average total reward per episode.
    """
    all_states, all_actions, all_returns = [], [], []
    reward_total = 0
    for _ in range(m):
        ep_states, ep_actions, ep_returns, ep_reward = get_trijectory()
        all_states += ep_states
        all_actions += ep_actions
        all_returns += ep_returns
        reward_total += ep_reward
    mean_reward = reward_total / m
    return np.array(all_states), np.array(all_actions), np.array(all_returns), mean_reward

In [29]:
def train(goal_vtheta=200, batch_size=200, learning_rate=0.001, max_iterations=None):
    """Train the global ``agent`` with batched policy-gradient updates.

    Each iteration rolls out ``batch_size`` episodes with the current
    policy, prints the average episode reward, and applies one Adam step on
    ``loss``.  The loop stops once the average reward of the most recent
    batch reaches ``goal_vtheta``.

    Args:
        goal_vtheta: average episode reward at which training stops.
        batch_size: number of episodes collected per update.
        learning_rate: Adam learning rate.
        max_iterations: optional cap on the number of updates; ``None``
            (the default) keeps iterating until the goal is reached.

    Returns:
        None.  Progress is reported via ``print``.
    """
    v_theta = 0
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    i = 1
    while v_theta < goal_vtheta:
        if max_iterations is not None and i > max_iterations:
            break

        S, A, G, v_theta = get_experience_on_theta(batch_size)
        # v_theta describes the policy *before* the update applied below.
        print(f"{i}: V_theta:{v_theta}")

        with tf.GradientTape() as t:
            J = loss(S, A, G)
        grads = t.gradient(J, agent.trainable_variables)
        optimizer.apply_gradients(zip(grads, agent.trainable_variables))

        i += 1
  

In [None]:
# Start training; train() keeps looping until the average episode reward
# reaches its goal, so this cell can run for a long time.
train()