# Imports

In [1]:
import sys
import os
sys.path.append('..')
from dqn_agent import DQNAgent
from ppo_agent import PPOAgent

# Demonstration

## Parameters

In [2]:
# Hyperparameters for the two agents demonstrated in this notebook.

# --- Environment dimensions (example: CartPole) ---
STATE_SIZE = 4
ACTION_SIZE = 2

# --- Settings used by both agents ---
HIDDEN_SIZE = 64        # neurons in each hidden layer
LEARNING_RATE = 1e-3    # learning rate
GAMMA = 0.99            # discount factor
BATCH_SIZE = 32         # mini-batch size for training

# --- Settings used only by the DQN agent ---
MEMORY_SIZE = 10000     # replay buffer size
EPSILON_START = 1.0     # initial exploration epsilon
EPSILON_MIN = 0.01      # lower bound for epsilon
EPSILON_DECAY = 0.995   # epsilon decay rate

# --- Settings used only by the PPO agent ---
CLIP_EPSILON = 0.2      # clipping parameter of the PPO objective
UPDATE_EPOCHS = 4       # policy update epochs per training iteration
C1 = 0.5                # value-loss coefficient
C2 = 0.01               # entropy-bonus coefficient

# Keyword arguments passed to DQNAgent.
DQN_PARAMS = dict(
    state_size=STATE_SIZE,
    action_size=ACTION_SIZE,
    hidden_size=HIDDEN_SIZE,
    lr=LEARNING_RATE,
    gamma=GAMMA,
    batch_size=BATCH_SIZE,
    memory_size=MEMORY_SIZE,
    epsilon_start=EPSILON_START,
    epsilon_min=EPSILON_MIN,
    epsilon_decay=EPSILON_DECAY,
)

# Keyword arguments passed to PPOAgent.
PPO_PARAMS = dict(
    state_size=STATE_SIZE,
    action_size=ACTION_SIZE,
    hidden_size=HIDDEN_SIZE,
    lr=LEARNING_RATE,
    gamma=GAMMA,
    clip_epsilon=CLIP_EPSILON,
    update_epochs=UPDATE_EPOCHS,
    batch_size=BATCH_SIZE,
    c1=C1,
    c2=C2,
)

## Initialization

In [3]:
# Instantiate one agent of each kind; each parameter dictionary is unpacked
# into keyword arguments of the corresponding constructor.
dqn_agent = DQNAgent(**DQN_PARAMS)
ppo_agent = PPOAgent(**PPO_PARAMS)

## Loading environment

In [4]:
import numpy as np

# Monkey patch: alias np.bool8 to np.bool_ if it's missing.
# The alias is installed before gym is imported below.
# NOTE(review): presumably needed because the installed gym version still
# references np.bool8 while the installed NumPy no longer provides it — confirm.
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

import gym

# Create the CartPole environment (or replace with your desired environment)
env = gym.make("CartPole-v1")
# Print the id from the environment's spec as a quick confirmation.
print("Environment loaded:", env.spec.id)

Environment loaded: CartPole-v1


## Training

### DQN

In [5]:
# Number of training episodes; the PPO training cell reads this value as well.
num_episodes = 100

print("Training DQN Agent:")
for episode in range(num_episodes):
    # reset() returns (observation, info); only the observation is kept.
    state, _ = env.reset()
    done = False
    total_reward = 0

    while not done:
        action = dqn_agent.predict(state)
        # In Gym 0.26+, step returns (observation, reward, terminated, truncated, info)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The rollout stops on either signal.
        done = terminated or truncated
        # Store `terminated`, not `done`: a time-limit truncation is not a
        # terminal state of the task, so marking it terminal would tell the
        # agent that no reward can follow from that state.
        # NOTE(review): assumes remember()'s last argument is the terminal flag
        # used to stop bootstrapping in the Q-target — confirm in dqn_agent.py.
        dqn_agent.remember(state, action, reward, next_state, terminated)
        state = next_state
        total_reward += reward
        # One training call per environment step.
        dqn_agent.train()

    print(f"DQN Episode {episode+1}: Total Reward = {total_reward}")

Training DQN Agent:
DQN Episode 1: Total Reward = 17.0
DQN Episode 2: Total Reward = 14.0
DQN Episode 3: Total Reward = 17.0
DQN Episode 4: Total Reward = 38.0
DQN Episode 5: Total Reward = 18.0
DQN Episode 6: Total Reward = 20.0
DQN Episode 7: Total Reward = 13.0
DQN Episode 8: Total Reward = 19.0
DQN Episode 9: Total Reward = 11.0
DQN Episode 10: Total Reward = 11.0
DQN Episode 11: Total Reward = 17.0
DQN Episode 12: Total Reward = 10.0
DQN Episode 13: Total Reward = 20.0
DQN Episode 14: Total Reward = 17.0
DQN Episode 15: Total Reward = 9.0
DQN Episode 16: Total Reward = 10.0
DQN Episode 17: Total Reward = 10.0
DQN Episode 18: Total Reward = 9.0
DQN Episode 19: Total Reward = 11.0
DQN Episode 20: Total Reward = 12.0
DQN Episode 21: Total Reward = 11.0
DQN Episode 22: Total Reward = 16.0
DQN Episode 23: Total Reward = 29.0
DQN Episode 24: Total Reward = 16.0
DQN Episode 25: Total Reward = 30.0
DQN Episode 26: Total Reward = 77.0
DQN Episode 27: Total Reward = 78.0
DQN Episode 28: Tot

### PPO

In [6]:
print("\nTraining PPO Agent:")
for episode in range(num_episodes):
    # Start a new episode; keep the observation and discard the info dict.
    state, _ = env.reset()
    total_reward = 0
    done = False

    while True:
        action = ppo_agent.predict(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # Either signal ends the episode.
        done = terminated or truncated
        # Record the transition using the values the agent exposes as
        # last_action / last_log_prob / last_value after predict().
        ppo_agent.store_transition(
            state,
            ppo_agent.last_action,
            ppo_agent.last_log_prob,
            reward,
            done,
            ppo_agent.last_value,
        )
        state = next_state
        total_reward += reward
        if done:
            break

    # One training call per completed episode.
    ppo_agent.train()
    print(f"PPO Episode {episode+1}: Total Reward = {total_reward}")


Training PPO Agent:
PPO Episode 1: Total Reward = 18.0
PPO Episode 2: Total Reward = 13.0
PPO Episode 3: Total Reward = 32.0
PPO Episode 4: Total Reward = 20.0
PPO Episode 5: Total Reward = 28.0
PPO Episode 6: Total Reward = 16.0
PPO Episode 7: Total Reward = 13.0
PPO Episode 8: Total Reward = 16.0
PPO Episode 9: Total Reward = 32.0
PPO Episode 10: Total Reward = 32.0
PPO Episode 11: Total Reward = 18.0
PPO Episode 12: Total Reward = 13.0
PPO Episode 13: Total Reward = 9.0
PPO Episode 14: Total Reward = 35.0
PPO Episode 15: Total Reward = 15.0
PPO Episode 16: Total Reward = 14.0
PPO Episode 17: Total Reward = 18.0
PPO Episode 18: Total Reward = 17.0
PPO Episode 19: Total Reward = 19.0
PPO Episode 20: Total Reward = 19.0
PPO Episode 21: Total Reward = 17.0
PPO Episode 22: Total Reward = 71.0
PPO Episode 23: Total Reward = 19.0
PPO Episode 24: Total Reward = 15.0
PPO Episode 25: Total Reward = 17.0
PPO Episode 26: Total Reward = 13.0
PPO Episode 27: Total Reward = 12.0
PPO Episode 28: T

  # Total loss with entropy bonus.


PPO Episode 43: Total Reward = 60.0
PPO Episode 44: Total Reward = 21.0
PPO Episode 45: Total Reward = 22.0
PPO Episode 46: Total Reward = 29.0
PPO Episode 47: Total Reward = 25.0
PPO Episode 48: Total Reward = 12.0
PPO Episode 49: Total Reward = 19.0
PPO Episode 50: Total Reward = 16.0
PPO Episode 51: Total Reward = 11.0
PPO Episode 52: Total Reward = 17.0
PPO Episode 53: Total Reward = 19.0
PPO Episode 54: Total Reward = 20.0
PPO Episode 55: Total Reward = 14.0
PPO Episode 56: Total Reward = 17.0
PPO Episode 57: Total Reward = 12.0
PPO Episode 58: Total Reward = 13.0
PPO Episode 59: Total Reward = 44.0
PPO Episode 60: Total Reward = 20.0
PPO Episode 61: Total Reward = 24.0
PPO Episode 62: Total Reward = 17.0
PPO Episode 63: Total Reward = 10.0
PPO Episode 64: Total Reward = 10.0
PPO Episode 65: Total Reward = 12.0
PPO Episode 66: Total Reward = 39.0
PPO Episode 67: Total Reward = 35.0
PPO Episode 68: Total Reward = 17.0
PPO Episode 69: Total Reward = 11.0
PPO Episode 70: Total Reward

## Evaluation

### DQN

In [7]:
print("Evaluating DQN Agent...")

# Begin a fresh episode; reset() yields (observation, info) and info is unused.
state, _ = env.reset()
total_reward = 0
done = False

# Play a single episode, summing the rewards along the way.
while True:
    action = dqn_agent.predict(state)
    # Gym 0.26+ step API: (observation, reward, terminated, truncated, info).
    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    total_reward += reward
    state = next_state
    if done:
        break

print("Total evaluation reward:", total_reward)

Evaluating DQN Agent...
Total evaluation reward: 500.0


### PPO

In [8]:
print("Evaluating PPO Agent...")

# New episode: only the observation from reset() is needed.
state, _ = env.reset()
total_reward = 0
done = False

# Run one episode with the PPO agent picking each action.
while not done:
    action = ppo_agent.predict(state)
    # step() returns five values in Gym 0.26+; the trailing info dict is ignored.
    next_state, reward, terminated, truncated, _ = env.step(action)
    total_reward += reward
    # The episode is over once it terminates or is truncated.
    done = terminated or truncated
    state = next_state

print("Total evaluation reward:", total_reward)

Evaluating PPO Agent...
Total evaluation reward: 51.0


## Save models

In [9]:
dqn_agent.save("../saved_models/dqn_agent.pth")
ppo_agent.save("../saved_models/ppo_agent.pth")