<a href="https://colab.research.google.com/github/Jhansipothabattula/Machine_Learning/blob/main/Day102.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Policy Gradient Methods

**Policy Gradient Methods**

Policy Gradient Methods are a class of reinforcement learning algorithms that learn a policy directly by optimizing the parameters of a policy network. Instead of learning Q-values as Q-Learning or DQN do, policy gradient methods search directly for the action-selection strategy that maximizes cumulative reward. A popular approach is the REINFORCE algorithm: actions are sampled from the policy's probability distribution, and after each episode the policy parameters are updated along the gradient of the log-probabilities of the chosen actions, weighted by the discounted returns that followed them.

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import gym

# Compatibility shim for the numpy.bool8 AttributeError raised by older gym
# versions on newer NumPy releases: re-create the alias only when NumPy no
# longer provides it, so existing installations are left untouched.
if not hasattr(np, 'bool8'):
    np.bool8 = bool

# Set up the environment
env = gym.make("CartPole-v1")
state_shape = env.observation_space.shape[0]  # Length of one observation vector
num_actions = env.action_space.n  # Number of discrete actions the policy chooses from

# Parameters
learning_rate = 0.01  # Step size handed to the Adam optimizer in build_policy_model
gamma = 0.99  # Discount factor

# Policy network
def build_policy_model():
    """Create and compile the policy network.

    The network is two 24-unit ReLU hidden layers followed by a softmax layer
    with one output per action, so a forward pass yields a probability
    distribution over the ``num_actions`` actions for each input state.

    Returns:
        A compiled ``tf.keras.Sequential`` model using Adam with the
        module-level ``learning_rate``.
    """
    network = tf.keras.Sequential()
    network.add(layers.Dense(24, activation='relu', input_shape=(state_shape,)))
    network.add(layers.Dense(24, activation='relu'))
    # Softmax turns the final layer's outputs into action probabilities.
    network.add(layers.Dense(num_actions, activation='softmax'))

    adam = tf.keras.optimizers.Adam(learning_rate)
    network.compile(optimizer=adam, loss='categorical_crossentropy')
    return network

# Build the single policy network instance used by both action selection and training
policy_model = build_policy_model()

# Function to select an action based on policy
def choose_action(state):
    """Sample an action index from the policy's distribution for ``state``.

    Args:
        state: A single observation holding ``state_shape`` values.

    Returns:
        An action index in ``[0, num_actions)``, drawn at random according to
        the probabilities that ``policy_model`` outputs for ``state``.
    """
    # The network expects a batch dimension, so present the state as (1, state_shape).
    batch = np.asarray(state, dtype=np.float32).reshape(1, state_shape)
    # Call the model directly rather than through `predict()`: `predict` is
    # designed for large batches, and invoking it once per environment step adds
    # substantial overhead and prints a progress bar on every step.
    probabilities = policy_model(batch, training=False).numpy()
    # Keep the float32 array as-is; np.random.choice relaxes its sum-to-one
    # tolerance based on the dtype of `p`.
    return np.random.choice(num_actions, p=probabilities[0])

# Function to calculate returns (discounted rewards)
def discount_rewards(rewards, discount_factor=None):
    """Compute mean-centred discounted returns for one episode.

    For each step ``t`` the return is ``r_t + d * r_{t+1} + d^2 * r_{t+2} + ...``
    where ``d`` is the discount factor. The mean of the returns is then
    subtracted so that it acts as a simple baseline.

    Args:
        rewards: Sequence or 1-D array of per-step rewards, in time order.
        discount_factor: Discount applied per step. When ``None`` (the
            default) the module-level ``gamma`` is used.

    Returns:
        A NumPy array with the same length as ``rewards`` holding the
        mean-centred discounted returns. Floating-point inputs keep their
        dtype; any other input yields ``float64``.
    """
    if discount_factor is None:
        discount_factor = gamma

    rewards = np.asarray(rewards)
    # Always accumulate in a floating dtype: an integer buffer would silently
    # truncate the fractional discounted values.
    if np.issubdtype(rewards.dtype, np.floating):
        out_dtype = rewards.dtype
    else:
        out_dtype = np.float64
    discounted = np.zeros(rewards.shape, dtype=out_dtype)

    # An empty episode has no returns; skip the mean to avoid a NaN warning.
    if discounted.size == 0:
        return discounted

    cumulative = 0.0
    # Walk backwards so each return reuses the one computed for the next step.
    for i in reversed(range(len(rewards))):
        cumulative = cumulative * discount_factor + rewards[i]
        discounted[i] = cumulative
    return discounted - np.mean(discounted)  # Baseline: subtract the mean return

# Training function
def train_on_episode(states, actions, rewards):
    """Apply one REINFORCE gradient step using a completed episode.

    The loss is the negative mean of ``log pi(a_t | s_t)`` weighted by the
    mean-centred discounted return from ``discount_rewards``.

    Args:
        states: Batch of visited observations, one row per step, in a form
            accepted by ``policy_model``.
        actions: Sequence of the integer action indices taken at each step.
        rewards: Sequence of the rewards received at each step.

    Returns:
        None. The trainable variables of ``policy_model`` are updated in place
        through the optimizer attached to the model.
    """
    if len(actions) == 0:
        return  # Nothing was experienced, so there is nothing to learn from.

    # Make dtypes explicit instead of relying on implicit conversion when
    # mixing NumPy arrays (int64 / float64) with float32 / int32 tensors.
    returns = tf.cast(discount_rewards(rewards), tf.float32)
    actions = tf.cast(actions, tf.int32)

    with tf.GradientTape() as tape:
        action_probs = policy_model(tf.convert_to_tensor(states, dtype=tf.float32))
        # Pair each row index with the action taken there so gather_nd picks
        # the probability of the chosen action for every step.
        action_indices = tf.stack([tf.range(tf.shape(actions)[0]), actions], axis=1)
        selected_action_probs = tf.gather_nd(action_probs, action_indices)
        # Clip before the log so a probability that underflows to 0 cannot
        # produce -inf / NaN and poison the gradients.
        log_probs = tf.math.log(tf.clip_by_value(selected_action_probs, 1e-8, 1.0))
        loss = -tf.reduce_mean(log_probs * returns)
    gradients = tape.gradient(loss, policy_model.trainable_variables)
    policy_model.optimizer.apply_gradients(zip(gradients, policy_model.trainable_variables))

# Main training loop
# Each iteration plays one full episode with the current policy, then performs
# a single policy-gradient update from the collected trajectory.
num_episodes = 1000
for episode in range(num_episodes):
    # gym >= 0.26 returns (observation, info) from reset(); older releases
    # return the observation alone. Accept both.
    reset_result = env.reset()
    if (
        isinstance(reset_result, tuple)
        and len(reset_result) == 2
        and isinstance(reset_result[1], dict)
    ):
        state = reset_result[0]
    else:
        state = reset_result
    episode_states, episode_actions, episode_rewards = [], [], []

    done = False
    while not done:
        action = choose_action(state)
        step_result = env.step(action)
        if len(step_result) == 5:
            # Newer API: termination and truncation are reported separately.
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            # Older API: a single `done` flag covers both cases.
            next_state, reward, done, _ = step_result

        # Record the state the action was taken in, not the resulting state.
        episode_states.append(state)
        episode_actions.append(action)
        episode_rewards.append(reward)

        state = next_state

    # Episode finished: stack the states into one (steps, state_shape) batch
    # and update the policy once from the whole trajectory.
    episode_states = np.vstack(episode_states)
    train_on_episode(episode_states, np.array(episode_actions), np.array(episode_rewards))
    print(f"Episode {episode + 1}, Reward: {np.sum(episode_rewards)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34