## **Policy-Based Methods (REINFORCE)**

In policy-based methods, the agent directly learns a policy function that maps states to actions, rather than learning a value function. REINFORCE is a Monte Carlo policy gradient method that updates the policy by using the rewards received in an episode to adjust the likelihood of actions that led to higher rewards.


**Imports**

In [3]:
import numpy as np
import gym
import tensorflow as tf
from tensorflow.keras import layers


**Environment and Policy Network**

In [None]:
# Create the CartPole-v1 environment; its observation and action spaces
# size the policy network defined below.
env = gym.make('CartPole-v1')

# Define the policy network
def create_policy_network():
    """Build the policy network for the module-level ``env``.

    The network is a stack of two 24-unit ReLU layers followed by a softmax
    layer with one unit per discrete action, so its output is a probability
    distribution over actions for each input observation.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    obs_shape = env.observation_space.shape
    n_actions = env.action_space.n

    network = tf.keras.Sequential()
    network.add(layers.Dense(24, activation='relu', input_shape=obs_shape))
    network.add(layers.Dense(24, activation='relu'))
    # Softmax head: one probability per available action.
    network.add(layers.Dense(n_actions, activation='softmax'))
    return network


**Model Building**

In [None]:
# Hyperparameters
learning_rate = 0.01  # Step size used by the Adam optimizer
gamma = 0.99  # Discount factor applied to future rewards

# Instantiate the policy network and the optimizer that updates its weights
policy_model = create_policy_network()
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

def compute_loss(log_probs, rewards, gamma=gamma):
    """Return the REINFORCE loss ``-sum_t(log_prob_t * G_t)`` for one episode.

    ``G_t`` is the discounted return from step ``t`` onward, computed with the
    recurrence ``G_t = r_t + gamma * G_{t+1}``.

    The final reduction is done with TensorFlow ops so that, when called
    inside a ``tf.GradientTape``, the loss stays connected to the policy
    network and gradients can flow back through ``log_probs``.

    Args:
        log_probs: Log-probabilities of the actions taken, one per step. May
            be a tensor, a list of scalar tensors, or plain numbers; a scalar
            is treated as a one-step episode.
        rewards: Rewards in chronological order (sequence or scalar).
        gamma: Discount factor.

    Returns:
        A scalar ``tf.Tensor`` holding the loss.

    Raises:
        ValueError: If the number of log-probabilities and rewards differ.
    """
    # Returns are constants w.r.t. the policy parameters, so plain NumPy is
    # fine here. Filling the array back-to-front avoids repeated insert(0).
    rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)
    returns = np.empty_like(rewards)
    running_return = 0.0
    for t in range(len(rewards) - 1, -1, -1):
        running_return = rewards[t] + gamma * running_return
        returns[t] = running_return

    log_probs = tf.reshape(
        tf.cast(tf.convert_to_tensor(log_probs), tf.float32), [-1]
    )
    n_steps = log_probs.shape[0]
    # Guard against silent broadcasting when the two sequences disagree.
    if n_steps is not None and n_steps != len(returns):
        raise ValueError(
            f"log_probs has {n_steps} entries but rewards has {len(returns)}"
        )
    return -tf.reduce_sum(log_probs * returns.astype(np.float32))


def train_step(state, action, reward):
    """Apply one policy-gradient update to the module-level ``policy_model``.

    Args:
        state: Observations for the steps being trained on, one row per step.
            A single 1-D observation is treated as a batch of one.
        action: Integer index (or indices) of the action taken at each step.
        reward: Reward(s) received at each step, in chronological order.

    Returns:
        The scalar loss tensor for this update.
    """
    states = tf.cast(tf.convert_to_tensor(state), tf.float32)
    if states.shape.rank == 1:
        states = states[tf.newaxis, :]
    actions = tf.reshape(tf.cast(tf.convert_to_tensor(action), tf.int32), [-1])

    with tf.GradientTape() as tape:
        # The network ends in a softmax, so these are probabilities.
        action_probs = policy_model(states)
        # Pick out, per step, the probability of the action actually taken.
        taken_mask = tf.one_hot(actions, depth=action_probs.shape[-1])
        taken_probs = tf.reduce_sum(action_probs * taken_mask, axis=1)
        # Floor the probability so log() never yields -inf.
        log_probs = tf.math.log(tf.maximum(taken_probs, 1e-8))
        loss = compute_loss(log_probs, reward)
    grads = tape.gradient(loss, policy_model.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy_model.trainable_variables))
    return loss


**Training Loop**

In [None]:
    def reinforce(env, n_episodes=1000):
        """Run REINFORCE training: one full episode, then one policy update.

        For every episode the agent samples actions from the module-level
        ``policy_model``, records the visited states, chosen actions and
        rewards, and hands them to ``train_step`` once the episode ends.

        Args:
            env: A Gym environment with a discrete action space. Both the
                pre-0.26 API (``reset() -> obs``, 4-tuple ``step``) and the
                newer API (``reset() -> (obs, info)``, 5-tuple ``step``) are
                accepted.
            n_episodes: Number of episodes to train for.
        """
        n_actions = env.action_space.n
        for episode in range(n_episodes):
            reset_out = env.reset()
            # Newer Gym versions return (observation, info) from reset().
            if (isinstance(reset_out, tuple) and len(reset_out) == 2
                    and isinstance(reset_out[1], dict)):
                state = reset_out[0]
            else:
                state = reset_out

            done = False
            states = []
            actions = []
            rewards = []
            while not done:
                batched_state = np.expand_dims(state, axis=0)  # Add batch dimension
                action_probs = policy_model(batched_state)[0].numpy()
                # float32 softmax output can miss np.random.choice's
                # sum-to-one tolerance; renormalize in float64 first.
                sampling_probs = action_probs.astype(np.float64)
                sampling_probs /= sampling_probs.sum()
                action = int(np.random.choice(n_actions, p=sampling_probs))

                step_out = env.step(action)
                if len(step_out) == 5:
                    next_state, reward, terminated, truncated, _ = step_out
                    done = terminated or truncated
                else:
                    next_state, reward, done, _ = step_out

                states.append(state)
                actions.append(action)
                rewards.append(reward)

                state = next_state

            # train_step takes (state, action, reward); pass the whole episode.
            train_step(np.array(states, dtype=np.float32), np.array(actions), rewards)

    # Train on the CartPole environment for the default number of episodes.
    reinforce(env)
