In [None]:
# Import NumPy for numerical computations
import numpy as np

# Import PyTorch modules
import torch
import torch.nn as nn
import torch.optim as optim

# Import OpenAI Gym for reinforcement learning environments
import gym

# Build the CartPole-v1 environment the agent will interact with
env = gym.make("CartPole-v1")

# Fix the NumPy and PyTorch global seeds so repeated runs are comparable
np.random.seed(42)
torch.manual_seed(42)

# Number of discrete actions exposed by the environment
# (CartPole has 2: push left or push right)
num_actions = env.action_space.n

# Length of the observation vector
# (CartPole has 4: cart position, cart velocity, pole angle, pole angular velocity)
state_dim = env.observation_space.shape[0]

# Define the policy network as a PyTorch class
class PolicyNetwork(nn.Module):
    """Two-layer MLP policy that maps a state to action probabilities.

    Architecture: Linear(input_dim, 32) -> ReLU -> Linear(32, output_dim)
    -> Softmax over the last dimension.

    Args:
        input_dim: Number of features in a state vector.
        output_dim: Number of discrete actions.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Hidden layer: 32 units followed by a ReLU non-linearity
        self.fc1 = nn.Linear(input_dim, 32)
        self.relu = nn.ReLU()
        # Output layer: one logit per action, normalized by softmax
        self.fc2 = nn.Linear(32, output_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return action probabilities for `x` (softmax over the last dim)."""
        hidden = self.relu(self.fc1(x))
        return self.softmax(self.fc2(hidden))

# Build the policy: one input per state feature, one output per action
policy_net = PolicyNetwork(input_dim=state_dim, output_dim=num_actions)

# Adam optimizer over all policy parameters (learning rate 0.01)
optimizer = optim.Adam(policy_net.parameters(), lr=1e-2)
# NOTE: the helper below is disabled -- it is wrapped in a bare string literal,
# so `compute_discounted_rewards` is NOT defined at runtime. It differs from
# `compute_reward_to_go` only in that it normalizes the returns to zero mean
# and unit standard deviation.
'''
# Function to compute discounted and normalized rewards
def compute_discounted_rewards(rewards, gamma=0.99):
    discounted = np.zeros_like(rewards, dtype=np.float32)  # Initialize array
    running_add = 0  # Initialize the cumulative sum
    # Traverse rewards in reverse to apply the discount
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        discounted[t] = running_add
    # Normalize rewards for better training stability
    discounted -= np.mean(discounted)
    discounted /= (np.std(discounted) + 1e-8)
    return discounted
'''
def compute_reward_to_go(rewards, gamma=0.99):
    """Return the discounted reward-to-go for every step of an episode.

    Entry ``t`` of the result is ``sum_{k >= t} gamma**(k - t) * rewards[k]``.
    The values are not normalized.

    Args:
        rewards: Sequence of per-step rewards for one episode.
        gamma: Discount factor applied to future rewards.

    Returns:
        A float32 NumPy array with the same shape as ``rewards``.
    """
    returns = np.zeros_like(rewards, dtype=np.float32)
    future_return = 0
    step = len(rewards) - 1
    # Walk backwards so each step reuses the already-discounted tail
    while step >= 0:
        future_return = rewards[step] + gamma * future_return
        returns[step] = future_return
        step -= 1
    return returns

# Number of training episodes
num_episodes = 10

# REINFORCE training loop: roll out one full episode with the current policy,
# then take a single policy-gradient step weighted by the reward-to-go.
for episode in range(num_episodes):
    # Reset the environment at the beginning of each episode.
    # Gym >= 0.26 returns (observation, info); older versions return only
    # the observation, so accept both shapes.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False  # Termination flag for the episode

    # Buffers to store the trajectory
    episode_states = []
    episode_actions = []
    episode_rewards = []

    # Generate a full episode
    while not done:
        # Convert the state to a tensor and add a batch dimension
        state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)  # [1, state_dim]

        # Action sampling needs no gradients; the episode is re-evaluated
        # with gradients enabled once it has finished.
        with torch.no_grad():
            action_probs = policy_net(state_tensor).numpy().ravel()

        # Sample an action from the probability distribution
        action = np.random.choice(num_actions, p=action_probs)

        # Step the environment with the chosen action.
        # Gym >= 0.26 returns (obs, reward, terminated, truncated, info);
        # older versions return (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        # Store the transition
        episode_states.append(state)
        episode_actions.append(action)
        episode_rewards.append(reward)

        # Move to the next state
        state = next_state

    # Compute the discounted reward-to-go for every step of the episode.
    # `compute_reward_to_go` is the return helper that is actually defined in
    # this file; calling `compute_discounted_rewards` raised NameError because
    # that function only exists inside a string literal.
    discounted_rewards = compute_reward_to_go(episode_rewards)
    discounted_rewards_tensor = torch.as_tensor(discounted_rewards, dtype=torch.float32)

    # Convert episode data to tensors. Stacking into one ndarray first avoids
    # the slow list-of-ndarrays conversion path in PyTorch.
    states_tensor = torch.as_tensor(
        np.asarray(episode_states), dtype=torch.float32
    )                                                         # [batch_size, state_dim]
    actions_tensor = torch.as_tensor(
        np.asarray(episode_actions), dtype=torch.long
    )                                                         # [batch_size]

    # Compute action probabilities for the whole episode (with gradients)
    action_probs = policy_net(states_tensor)                  # [batch_size, num_actions]

    # Log-probabilities of the actions actually taken. squeeze(1) removes only
    # the gathered column, so a one-step episode keeps its [1] shape.
    selected_probs = action_probs.gather(1, actions_tensor.unsqueeze(1)).squeeze(1)
    selected_log_probs = torch.log(selected_probs)

    # Compute the policy loss: -log(pi(a|s)) * reward-to-go
    loss = -torch.mean(selected_log_probs * discounted_rewards_tensor)

    # Backpropagation step
    optimizer.zero_grad()  # Clear old gradients
    loss.backward()        # Compute new gradients
    optimizer.step()       # Update network parameters

    # Print progress every 10 episodes
    if (episode + 1) % 10 == 0:
        print(f"Episode {episode + 1}, Return: {np.sum(episode_rewards):.2f}")


  deprecation(
  deprecation(


Episode 10, Return: 21.00
Episode 20, Return: 35.00
Episode 30, Return: 22.00
Episode 40, Return: 117.00
Episode 50, Return: 176.00
Episode 60, Return: 159.00
Episode 70, Return: 344.00
Episode 80, Return: 391.00
Episode 90, Return: 438.00
Episode 100, Return: 179.00
Episode 110, Return: 312.00
Episode 120, Return: 437.00
Episode 130, Return: 399.00
Episode 140, Return: 500.00
Episode 150, Return: 115.00
Episode 160, Return: 494.00
Episode 170, Return: 500.00
Episode 180, Return: 471.00
Episode 190, Return: 500.00
Episode 200, Return: 500.00
Episode 210, Return: 403.00
Episode 220, Return: 500.00
Episode 230, Return: 500.00
Episode 240, Return: 330.00
Episode 250, Return: 161.00
Episode 260, Return: 295.00
Episode 270, Return: 159.00
Episode 280, Return: 500.00
Episode 290, Return: 500.00
Episode 300, Return: 500.00
Episode 310, Return: 500.00
Episode 320, Return: 500.00
Episode 330, Return: 500.00
Episode 340, Return: 500.00
Episode 350, Return: 500.00
Episode 360, Return: 500.00
Epis