In [None]:
# ----------------------------
# Import required libraries
# ----------------------------
import numpy as np
np.bool8 = np.bool_  # Fix for NumPy >= 1.24

import torch  # For building and training neural networks
import torch.nn as nn
import torch.optim as optim
import gym  # OpenAI Gym for RL environments

# ----------------------------
# Create the CartPole environment
# ----------------------------
env = gym.make("CartPole-v1")

# ----------------------------
# Set random seeds for reproducibility
# ----------------------------
# Seeds the global PyTorch RNG (used when the network weights are initialised)
# and the global NumPy RNG (used by np.random.choice when sampling actions).
# NOTE(review): no seed is passed to the environment in this section, so the
# environment's own randomness is presumably not fixed by these calls -- confirm.
torch.manual_seed(42)
np.random.seed(42)

# ----------------------------
# Get state and action space sizes
# ----------------------------
# These two sizes are used as the input and output widths of the policy network.
state_dim = env.observation_space.shape[0]  # 4 for CartPole
num_actions = env.action_space.n            # 2 actions: left or right

# ----------------------------
# Define the policy network π(a|s)
# ----------------------------
class PolicyNetwork(nn.Module):
    """Small MLP policy π(a|s).

    Architecture: Linear(input_dim -> 32) -> ReLU -> Linear(32 -> output_dim)
    -> Softmax over the last dimension.

    Args:
        input_dim: Number of features in a state vector.
        output_dim: Number of discrete actions.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 32
        self.fc1 = nn.Linear(input_dim, hidden_units)   # state -> hidden features
        self.relu = nn.ReLU()                           # hidden non-linearity
        self.fc2 = nn.Linear(hidden_units, output_dim)  # hidden features -> action logits
        self.softmax = nn.Softmax(dim=-1)               # logits -> probabilities

    def forward(self, x):
        """Return action probabilities for `x`; the last dimension sums to 1."""
        hidden = self.relu(self.fc1(x))
        return self.softmax(self.fc2(hidden))

# ----------------------------
# Initialize network and optimizer
# ----------------------------
# Policy maps a state vector (state_dim features) to a distribution over num_actions.
policy_net = PolicyNetwork(input_dim=state_dim, output_dim=num_actions)
# Adam optimises every trainable parameter of the policy with step size 1e-2.
optimizer = optim.Adam(params=policy_net.parameters(), lr=1e-2)

# ----------------------------
# Reward-to-go function
# ----------------------------
def compute_reward_to_go(rewards, gamma=0.99):
    """
    Return the discounted reward-to-go for each timestep.

    Evaluates G_t = R_t + γR_{t+1} + γ²R_{t+2} + ... through the backward
    recursion G_t = R_t + γ·G_{t+1}, starting from zero past the last step.

    Args:
        rewards: Sequence of per-step rewards, indexed by timestep.
        gamma: Discount factor applied per step.

    Returns:
        A float32 NumPy array shaped like `rewards`; entry t holds G_t.
    """
    returns = np.zeros_like(rewards, dtype=np.float32)
    discounted_tail = 0
    # Walk from the final step back to the first so each G_t reuses G_{t+1}.
    for t in range(len(rewards) - 1, -1, -1):
        discounted_tail = rewards[t] + gamma * discounted_tail
        returns[t] = discounted_tail
    return returns

# ----------------------------
# Number of training episodes
# ----------------------------
num_episodes = 10  # Increase to 500+ for better training

# ----------------------------
# Training loop
# ----------------------------
# Each iteration rolls out one complete episode with the current policy and
# then performs a single REINFORCE gradient step, weighting the log-probability
# of every taken action by its normalized reward-to-go.
for episode in range(num_episodes):
    # Reset environment (handle both new and old Gym versions).
    # The environment is seeded on the very first reset only, so its initial
    # states are reproducible together with the torch/numpy seeds; later resets
    # simply continue the environment's seeded RNG stream.
    if episode == 0:
        try:
            reset_output = env.reset(seed=42)  # Newer Gym: seed passed to reset()
        except TypeError:
            env.seed(42)                       # Older Gym: separate seed() method
            reset_output = env.reset()
    else:
        reset_output = env.reset()

    if isinstance(reset_output, tuple):
        state, _ = reset_output  # New Gym API: (observation, info)
    else:
        state = reset_output     # Old Gym API: observation only

    done = False  # Whether episode is complete

    episode_states = []   # States observed before each action
    episode_actions = []  # Actions sampled from the policy
    episode_rewards = []  # Rewards received after each action

    # ----------------------------
    # Run one episode
    # ----------------------------
    while not done:
        state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)  # shape: [1, state_dim]

        # Get action probabilities from the policy. No gradient is needed while
        # acting: the probabilities are recomputed for the whole episode below.
        with torch.no_grad():
            action_probs = policy_net(state_tensor).numpy().ravel()

        # Sample action from the probability distribution
        action = np.random.choice(num_actions, p=action_probs)

        # Take the action in the environment
        step_output = env.step(action)
        if len(step_output) == 5:  # New Gym API
            next_state, reward, terminated, truncated, _ = step_output
            done = terminated or truncated
        else:  # Old Gym API
            next_state, reward, done, _ = step_output

        # Save transition
        episode_states.append(state)
        episode_actions.append(action)
        episode_rewards.append(reward)

        # Move to the next state
        state = next_state

    # ----------------------------
    # Compute normalized reward-to-go
    # ----------------------------
    # Zero-mean / unit-variance scaling; the epsilon guards against a zero
    # standard deviation (e.g. a single-step episode).
    reward_to_go = compute_reward_to_go(episode_rewards)
    reward_to_go -= np.mean(reward_to_go)
    reward_to_go /= (np.std(reward_to_go) + 1e-8)
    reward_to_go_tensor = torch.as_tensor(reward_to_go, dtype=torch.float32)

    # Convert episode data to tensors. The list of per-step arrays is stacked
    # into one ndarray first: building a tensor directly from a list of
    # ndarrays is very slow and triggers a PyTorch UserWarning.
    states_tensor = torch.as_tensor(np.asarray(episode_states, dtype=np.float32))
    actions_tensor = torch.LongTensor(episode_actions)

    # Get action probabilities for all states, shape: [T, num_actions]
    action_probs = policy_net(states_tensor)

    # Get log-probabilities of the actions that were actually taken.
    # squeeze(1) drops only the gathered column, so the result stays shape [T]
    # even when the episode has a single step.
    selected_probs = action_probs.gather(1, actions_tensor.unsqueeze(1)).squeeze(1)
    selected_log_probs = torch.log(selected_probs)

    # Compute policy gradient loss: REINFORCE with reward-to-go
    loss = -torch.mean(selected_log_probs * reward_to_go_tensor)

    # Backpropagation step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # ----------------------------
    # Logging
    # ----------------------------
    print(f"Episode {episode + 1}, Return: {np.sum(episode_rewards):.2f}")


  states_tensor = torch.FloatTensor(episode_states)


Episode 1, Return: 38.00
Episode 2, Return: 17.00
Episode 3, Return: 17.00
Episode 4, Return: 26.00
Episode 5, Return: 12.00
Episode 6, Return: 14.00
Episode 7, Return: 13.00
Episode 8, Return: 46.00
Episode 9, Return: 39.00
Episode 10, Return: 28.00
