In [4]:
!pip install numpy==1.23.5


Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
bigframes 2.11.0 requires numpy>=1.24.0, but you have numpy 1.23.5 which is incompatible.
blosc2 3.6.1 requires numpy>=1.26, but you have numpy 1.23.5 which is incompatible.
opencv-python-headless 4.12.0

In [2]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Detect Gym version
import gym
from packaging import version
# Gym 0.26 changed the reset()/step() signatures; this flag selects the matching call style below.
is_new_gym = version.parse(gym.__version__) >= version.parse("0.26.0")

# Hyperparameters
learning_rate = 3e-4
gamma = 0.99
num_episodes = 1000

# Reproducibility: seed every source of randomness used by the training run
# (network initialisation / action sampling in torch, NumPy, and the environment).
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

env = gym.make("CartPole-v1")

# Seed the environment's RNG once; later reset() calls without a seed keep drawing from it.
if is_new_gym:
    env.reset(seed=SEED)
else:
    env.seed(SEED)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Actor network: maps a state to action probabilities, i.e. how likely each action is in a given state.
class Actor(nn.Module):
    """Policy network: maps a state vector to a probability distribution over actions.

    Args:
        input_dim: Size of the state vector. When omitted, the module-level
            ``state_dim`` is used, so a bare ``Actor()`` behaves as before.
        output_dim: Number of discrete actions. When omitted, the module-level
            ``action_dim`` is used.
        hidden_dim: Width of the single hidden layer (default 128).
    """

    def __init__(self, input_dim=None, output_dim=None, hidden_dim=128):
        super().__init__()
        # Fall back to the notebook-level sizes only when the caller gives none.
        if input_dim is None:
            input_dim = state_dim
        if output_dim is None:
            output_dim = action_dim

        self.policy = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),   # state vector -> hidden features
            nn.ReLU(),                          # non-linearity
            nn.Linear(hidden_dim, output_dim),  # hidden features -> one score per action
            nn.Softmax(dim=-1),                 # scores -> probabilities over the last dimension
        )

    def forward(self, state):
        """Return the action probabilities produced by ``self.policy`` for ``state``."""
        return self.policy(state)

# Calling actor(state) invokes forward(): the state is passed through the
# self.policy layers and the resulting action probabilities are returned.

# Critic network: learns to estimate the state-value function.
# It outputs a single number, V(s), representing how good the current state is.
# Formally, V(s; φ) is the predicted expected return from state s.

class Critic(nn.Module):
    """Value network: maps a state vector to a single scalar estimate V(s).

    Args:
        input_dim: Size of the state vector. When omitted, the module-level
            ``state_dim`` is used, so a bare ``Critic()`` behaves as before.
        hidden_dim: Width of the single hidden layer (default 128).
    """

    def __init__(self, input_dim=None, hidden_dim=128):
        super().__init__()
        # Fall back to the notebook-level state size only when the caller gives none.
        if input_dim is None:
            input_dim = state_dim

        self.value = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),  # state vector -> hidden features
            nn.ReLU(),                         # non-linearity
            nn.Linear(hidden_dim, 1),          # hidden features -> one scalar value
        )

    def forward(self, state):
        """Return the value estimate produced by ``self.value`` (last dimension has size 1)."""
        return self.value(state)
# forward() defines how the critic processes an input state to estimate V(s).


# Build the two networks: the policy (actor) first, then the value estimator (critic).
actor = Actor()
critic = Critic()

# Give each network its own Adam optimizer, both using the shared learning rate.
# The actor's optimizer applies the policy-gradient update; the critic's optimizer
# applies the value-regression update (MSE against the TD target).
actor_optimizer, critic_optimizer = (
    optim.Adam(network.parameters(), lr=learning_rate)
    for network in (actor, critic)
)



# Training loop: one-step (TD(0)) actor-critic, updating both networks after every
# environment step.
for episode in range(num_episodes):
    # reset() returns (observation, info) on gym >= 0.26 and just the observation before.
    if is_new_gym:
        state, _ = env.reset()
    else:
        state = env.reset()

    state = torch.tensor(state, dtype=torch.float32)
    total_reward = 0
    done = False

    while not done:
        # Sample an action from the stochastic policy given by the actor's probabilities.
        probs = actor(state)
        dist = torch.distributions.Categorical(probs)
        action = dist.sample()

        # Take the action. `done` ends the episode loop; `terminated` marks a true
        # terminal state and is the only case in which bootstrapping must be cut off.
        if is_new_gym:
            next_state, reward, terminated, truncated, _ = env.step(action.item())
            done = terminated or truncated
        else:
            next_state, reward, done, info = env.step(action.item())
            # The old API folds time-limit truncation into `done`; the TimeLimit
            # wrapper flags that case in `info`.
            terminated = bool(done) and not info.get("TimeLimit.truncated", False)

        next_state = torch.tensor(next_state, dtype=torch.float32)

        value = critic(state)
        # The bootstrap value is only used as a constant target, so skip graph construction.
        with torch.no_grad():
            next_value = critic(next_state)

        # One-step TD target. A time-limit cut-off is not a terminal state, so V(s')
        # is still bootstrapped there; it is zeroed only on real termination.
        target_value = reward + (1.0 - float(terminated)) * gamma * next_value

        # Advantage estimate = TD error, used as a constant weight on the log-probability.
        advantage = (target_value - value.detach()).item()
        actor_loss = -dist.log_prob(action) * advantage

        # Actor update: minimising -log pi(a|s) * advantage is gradient ascent on the policy objective.
        actor_optimizer.zero_grad()
        actor_loss.backward()
        actor_optimizer.step()

        # Critic update: regress V(s) towards the TD target.
        critic_loss = nn.functional.mse_loss(value, target_value)
        critic_optimizer.zero_grad()
        critic_loss.backward()
        critic_optimizer.step()

        state = next_state
        total_reward += reward

    if (episode + 1) % 10 == 0:
        print(f"Episode {episode + 1}, Total Reward: {total_reward}")


  deprecation(
  deprecation(


Episode 10, Total Reward: 27.0
Episode 20, Total Reward: 22.0
Episode 30, Total Reward: 9.0
Episode 40, Total Reward: 20.0
Episode 50, Total Reward: 10.0
Episode 60, Total Reward: 9.0
Episode 70, Total Reward: 9.0
Episode 80, Total Reward: 22.0
Episode 90, Total Reward: 29.0
Episode 100, Total Reward: 11.0
Episode 110, Total Reward: 14.0
Episode 120, Total Reward: 18.0
Episode 130, Total Reward: 42.0
Episode 140, Total Reward: 19.0
Episode 150, Total Reward: 13.0
Episode 160, Total Reward: 15.0
Episode 170, Total Reward: 33.0
Episode 180, Total Reward: 10.0
Episode 190, Total Reward: 9.0
Episode 200, Total Reward: 13.0
Episode 210, Total Reward: 19.0
Episode 220, Total Reward: 10.0
Episode 230, Total Reward: 19.0
Episode 240, Total Reward: 15.0
Episode 250, Total Reward: 18.0
Episode 260, Total Reward: 13.0
Episode 270, Total Reward: 23.0
Episode 280, Total Reward: 10.0
Episode 290, Total Reward: 13.0
Episode 300, Total Reward: 29.0
Episode 310, Total Reward: 43.0
Episode 320, Total Re