# Imports

In [1]:
import sys
import os
import random
sys.path.append('..')
from dqn_agent import DQNAgent
from ppo_agent import PPOAgent

# Demonstration

## Parameters

In [2]:
# Define parameters for the agents

# ================================
# Reproducibility
# ================================
# Seed Python's `random` module, which the training cells use to draw the
# simulated states and rewards.
# NOTE(review): any RNG used inside DQNAgent / PPOAgent is not visible from
# this notebook and is not seeded here, so full run-to-run reproducibility is
# not guaranteed -- confirm whether the agents need their own seeding.
SEED = 42
random.seed(SEED)

# ================================
# Shared hyperparameters
# ================================
STATE_SIZE = 4 # length of the state vectors built in the training cells
ACTION_SIZE = 5 # passed to both agents as `action_size`
HIDDEN_SIZE = 64 # number of neurons in hidden layers
LEARNING_RATE = 1e-3 # learning rate
GAMMA = 0.99 # discount factor
BATCH_SIZE = 32 # mini-batch size for training

# DQN-only settings
MEMORY_SIZE = 10000 # replay buffer size
EPSILON_START = 1.0 # starting epsilon for exploration
EPSILON_MIN = 0.01 # minimum epsilon value
EPSILON_DECAY = 0.995 # decay rate for epsilon

# PPO-only settings
CLIP_EPSILON = 0.2 # clipping parameter for PPO objective
UPDATE_EPOCHS = 4 # number of epochs to update the policy per training iteration
C1 = 0.5 # coefficient for the value loss
C2 = 0.01 # coefficient for the entropy bonus
ACTION_STD = 0.5 # initial standard deviation for continuous actions

# ================================
# DQN Agent Parameters
# ================================
# Keyword arguments unpacked into DQNAgent(**DQN_PARAMS).
DQN_PARAMS = {
    'state_size': STATE_SIZE,
    'action_size': ACTION_SIZE,
    'hidden_size': HIDDEN_SIZE,
    'lr': LEARNING_RATE,
    'gamma': GAMMA,
    'batch_size': BATCH_SIZE,
    'memory_size': MEMORY_SIZE,
    'epsilon_start': EPSILON_START,
    'epsilon_min': EPSILON_MIN,
    'epsilon_decay': EPSILON_DECAY
}

# ================================
# PPO Agent Parameters
# ================================
# Keyword arguments unpacked into PPOAgent(**PPO_PARAMS).
PPO_PARAMS = {
    'state_size': STATE_SIZE,
    'action_size': ACTION_SIZE,
    'hidden_size': HIDDEN_SIZE,
    'lr': LEARNING_RATE,
    'gamma': GAMMA,
    'clip_epsilon': CLIP_EPSILON,
    'update_epochs': UPDATE_EPOCHS,
    'batch_size': BATCH_SIZE,
    'c1': C1,
    'c2': C2,
    'action_std': ACTION_STD
}

## Initialization

In [3]:
# Instantiate one agent of each kind by unpacking the keyword dictionaries
# from the parameters cell. The DQN agent is created first, then the PPO agent.
dqn_agent = DQNAgent(**DQN_PARAMS)
ppo_agent = PPOAgent(**PPO_PARAMS)

## Training

### DQN

In [4]:
# Steps per simulated episode, and number of episodes to run.
num_timesteps, num_episodes = 10, 25

print("Training DQN Agent:")
for episode in range(num_episodes):
    # Each episode starts from a fresh random binary vector of STATE_SIZE entries.
    state = [random.randint(0, 1) for _ in range(STATE_SIZE)]
    total_reward = 0.0

    for timestep in range(num_timesteps):
        # Episodes have a fixed length; the last step is flagged as terminal.
        done = (timestep + 1 == num_timesteps)
        action = dqn_agent.predict(state)

        # There is no real environment here: the next state and the reward are
        # drawn at random, independently of the chosen action.
        next_state = [random.randint(0, 1) for _ in range(STATE_SIZE)]
        reward = random.random()
        if done:
            reward += 50  # bonus added on the terminal step

        print(f"State: {state}, Action: {action}, Reward: {reward}, Next State: {next_state}, Done: {done}")

        # Hand the transition to the agent, then advance the episode bookkeeping.
        dqn_agent.learn(state, action, reward, next_state, done)
        total_reward += reward
        state = next_state

        # Running total is reported after every step.
        print(f"DQN Episode {episode+1} - Timestep {timestep+1} - Total Reward = {total_reward}")

Training DQN Agent:
State: [1, 1, 1, 1], Action: [0, 1, 0, 0, 0], Reward: 0.3102065098548463, Next State: [0, 0, 0, 1], Done: False
DQN Episode 1 - Timestep 1 - Total Reward = 0.3102065098548463
State: [0, 0, 0, 1], Action: [0, 0, 1, 0, 0], Reward: 0.13683342143610244, Next State: [0, 1, 1, 1], Done: False
DQN Episode 1 - Timestep 2 - Total Reward = 0.44703993129094877
State: [0, 1, 1, 1], Action: [0, 0, 0, 0, 1], Reward: 0.28731259779642127, Next State: [0, 0, 1, 0], Done: False
DQN Episode 1 - Timestep 3 - Total Reward = 0.73435252908737
State: [0, 0, 1, 0], Action: [0, 0, 1, 0, 0], Reward: 0.1502450402069624, Next State: [0, 1, 1, 1], Done: False
DQN Episode 1 - Timestep 4 - Total Reward = 0.8845975692943324
State: [0, 1, 1, 1], Action: [1, 0, 0, 1, 0], Reward: 0.9994662876023936, Next State: [1, 1, 0, 0], Done: False
DQN Episode 1 - Timestep 5 - Total Reward = 1.884063856896726
State: [1, 1, 0, 0], Action: [0, 1, 0, 1, 0], Reward: 0.14068862514936986, Next State: [1, 1, 0, 0], Done

  return F.mse_loss(input, target, reduction=self.reduction)


DQN Episode 4 - Timestep 2 - Total Reward = 0.6988621539586053
State: [0, 0, 0, 1], Action: [1, 0, 1, 0, 1], Reward: 0.7350405063152616, Next State: [1, 0, 1, 0], Done: False
DQN Episode 4 - Timestep 3 - Total Reward = 1.433902660273867
State: [1, 0, 1, 0], Action: [0, 1, 1, 1, 0], Reward: 0.6621487166736729, Next State: [1, 0, 0, 0], Done: False
DQN Episode 4 - Timestep 4 - Total Reward = 2.09605137694754
State: [1, 0, 0, 0], Action: [1, 1, 1, 1, 1], Reward: 0.8378702271923432, Next State: [1, 0, 0, 1], Done: False
DQN Episode 4 - Timestep 5 - Total Reward = 2.9339216041398832
State: [1, 0, 0, 1], Action: [1, 1, 1, 0, 1], Reward: 0.033955670233627, Next State: [0, 0, 0, 0], Done: False
DQN Episode 4 - Timestep 6 - Total Reward = 2.9678772743735102
State: [0, 0, 0, 0], Action: [1, 0, 1, 1, 0], Reward: 0.9136849535545926, Next State: [1, 0, 1, 1], Done: False
DQN Episode 4 - Timestep 7 - Total Reward = 3.881562227928103
State: [1, 0, 1, 1], Action: [1, 1, 0, 1, 0], Reward: 0.78161058585

### PPO

In [5]:
# Steps per simulated episode, and number of episodes to run.
num_timesteps, num_episodes = 10, 25

print("Training PPO Agent:")
for episode in range(num_episodes):
    # Each episode starts from a fresh random binary vector of STATE_SIZE entries.
    state = [random.randint(0, 1) for _ in range(STATE_SIZE)]
    total_reward = 0.0

    for timestep in range(num_timesteps):
        # Episodes have a fixed length; the last step is flagged as terminal.
        done = (timestep + 1 == num_timesteps)
        action = ppo_agent.predict(state)

        # There is no real environment here: the next state and the reward are
        # drawn at random, independently of the chosen action.
        next_state = [random.randint(0, 1) for _ in range(STATE_SIZE)]
        reward = random.random()
        if done:
            reward += 50  # bonus added on the terminal step

        print(f"State: {state}, Action: {action}, Reward: {reward}, Next State: {next_state}, Done: {done}")

        # Hand the transition to the agent, then advance the episode bookkeeping.
        ppo_agent.learn(state, action, reward, next_state, done)
        total_reward += reward
        state = next_state

        # Running total is reported after every step.
        print(f"PPO Episode {episode+1} - Timestep {timestep+1} - Total Reward = {total_reward}")

Training PPO Agent:
State: [0, 0, 1, 0], Action: [-0.904605507850647, 1.5355199575424194, 1.0553532838821411, -2.008817195892334, 0.5343154668807983], Reward: 0.6057832228795297, Next State: [1, 0, 0, 1], Done: False
PPO Episode 1 - Timestep 1 - Total Reward = 0.6057832228795297
State: [1, 0, 0, 1], Action: [0.3092474937438965, 0.501794695854187, -1.7823467254638672, -0.1784544587135315, -0.8232919573783875], Reward: 0.42747730555110064, Next State: [0, 1, 1, 0], Done: False
PPO Episode 1 - Timestep 2 - Total Reward = 1.0332605284306302
State: [0, 1, 1, 0], Action: [-0.01436489075422287, -1.659661889076233, 0.41393041610717773, -1.5377953052520752, 1.6252856254577637], Reward: 0.7268883360096928, Next State: [0, 0, 1, 1], Done: False
PPO Episode 1 - Timestep 3 - Total Reward = 1.760148864440323
State: [0, 0, 1, 1], Action: [-1.0295226573944092, 1.491027593612671, -0.4562627971172333, 1.2717344760894775, 1.5072431564331055], Reward: 0.09972681765297642, Next State: [1, 1, 0, 1], Done: F

## Save models

In [6]:
# Persist both trained agents under ../saved_models (relative to the
# notebook's working directory).
SAVE_DIR = "../saved_models"

# Create the target directory up front so the cell also works on a fresh
# checkout. `exist_ok=True` makes this a no-op when the directory is already
# there.
# NOTE(review): whether the agents' `save` methods create missing directories
# themselves is not visible from this notebook; this call is harmless either way.
os.makedirs(SAVE_DIR, exist_ok=True)

dqn_agent.save(f"{SAVE_DIR}/dqn_agent.pth")
ppo_agent.save(f"{SAVE_DIR}/ppo_agent.pth")

Model saved to ../saved_models/dqn_agent.pth
Model saved to ../saved_models/ppo_agent.pth
