In [1]:
from agents.ppo    import PPO
from agents.agent            import Agent


from discriminators.eairl    import EAIRL
from utils.utils             import RunningMeanStd, Dict, make_transition

from configparser            import ConfigParser
from argparse                import ArgumentParser

import os
import gymnasium as gym
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable


In [2]:
# Build the Taxi-v3 environment through gymnasium; `env` is read by the
# configuration cell (observation/action space sizes) and passed to train_ppo.
env = gym.make("Taxi-v3")

In [3]:
# Dimensions handed to the EAIRL discriminator and PPO agent constructors.
action_dim = 1
# Number of discrete observations reported by the environment.
state_dim = env.observation_space.n
# NOTE(review): action_dim is printed twice here (recorded output: "1 500 1").
print(action_dim, state_dim, action_dim)
print()
# NOTE(review): `parser` is rebound to a ConfigParser in a later cell; this
# ArgumentParser is not used anywhere in the visible cells.
parser = ArgumentParser('parameters')
# Taxi-specific helpers: decoding and encoding of integer states
def decode_positions(states):
    """
    Unpack integer Taxi states into their positional components.

    Every state is read as a mixed-radix number: the least significant
    digit (base 4) is the destination index, the next digit (base 5) is the
    passenger code, the next digit (base 5) is the taxi column and whatever
    remains is the taxi row.

    :param states: a list of states represented as integers [0-499]
    :return: taxi_row, taxi_col, pass_code, dest_idx -- four lists, each
             holding one entry per input state, in input order
    """
    taxi_row, taxi_col, pass_code, dest_loc = [], [], [], []
    for state in states:
        # Peel the digits off from least to most significant.
        remaining, destination = divmod(state, 4)
        remaining, passenger = divmod(remaining, 5)
        row, col = divmod(remaining, 5)

        taxi_row.append(row)
        taxi_col.append(col)
        pass_code.append(passenger)
        dest_loc.append(destination)
    return taxi_row, taxi_col, pass_code, dest_loc
def _one_hot(indices, depth):
    """Return a (len(indices), depth) float array with a single 1 in each row."""
    indices = np.asarray(indices, dtype=np.intp)
    encoded = np.zeros((indices.shape[0], depth))
    encoded[np.arange(indices.shape[0]), indices] = 1
    return encoded


def encode_states(states, encode_method, state_dim):
    """
        Gets a list of integers and returns their encoding
        as 1 of 2 possible encoding methods:
            - one-hot encoding (array)
            - position encoding

        Position encoding represents the important game positions as a
        19-dimensional vector:
            - 5 dimensions: one-hot encoding of the taxi's row (0-4)
            - 5 dimensions: one-hot encoding of the taxi's col (0-4)
            - 5 dimensions: one-hot encoding of the passenger's position
              (0 is 'R', 1 is 'G', 2 is 'Y', 3 is 'B', 4 is "in the taxi")
            - 4 dimensions: one-hot encoding of the destination location
              (0 is 'R', 1 is 'G', 2 is 'Y', 3 is 'B')
        The four vectors are concatenated, so every row holds exactly 4 ones.

    :param states: list of integers in [0,num_states-1]
    :param encode_method: one of 'one_hot', 'positions'; any value other than
        'positions' selects one-hot encoding
    :param state_dim: dimension of state (used for 'one_hot' encoding)
    :return: states_encoded: float array of shape (len(states), 19) for
        'positions', or (len(states), state_dim) for one-hot
    """
    # Compare by value: `is` checks object identity and only matched by
    # accident when both strings happened to be the same interned object.
    if encode_method == 'positions':
        taxi_row, taxi_col, pass_code, dest_loc = decode_positions(states)
        return np.concatenate(
            [
                _one_hot(taxi_row, 5),    # taxi's row
                _one_hot(taxi_col, 5),    # taxi's col
                _one_hot(pass_code, 5),   # passenger position / in-taxi flag
                _one_hot(dest_loc, 4),    # destination location
            ],
            axis=1,
        )

    # one-hot over the full state space
    return _one_hot(states, state_dim)
# Define all the model parameters
hidden_units = 32               # num. units in hidden layer
replay_buffer_size = 200000     # buffer size
start_learning = 50000          # num. transitions before start learning
target_update_freq = 10000      # num. transitions between Q_target network updates
eps = 0.1                       # final epsilon for epsilon-greedy action selection
schedule_timesteps = 350000     # num. transitions for epsilon annealing
batch_size = 32                 # size of batch size for training
gamma = 0.99                    # discount factor of MDP
eps_optim = 0.01                # epsilon parameter for optimization (improves stability of optimizer)
alpha = 0.95                    # alpha parameter of RMSprop optimizer
learning_rate = 0.00025         # step size for optimization process


encode_method = 'one_hot'     # state encoding method ('one_hot' or 'positions')

# Compare by value: `is` tests object identity, which is not guaranteed for
# equal strings and triggers a SyntaxWarning when used with a literal.
if encode_method == 'positions':
    state_dim = 19  # explained in utils.encode_states
else:   # one-hot
    state_dim = env.observation_space.n

regularization = None           # regularization may be 'regularization'
save_fig = True     # whether to save figure of accumulated reward
save_model = True      # whether to save the DQN model

# Define 2-layered architecture
architecture = {"state_dim": state_dim,
                "hidden_units": hidden_units,
                "num_actions": env.action_space.n}
# Pack the epsilon greedy exploration parameters
exploration_params = {"timesteps": schedule_timesteps, "final_eps": eps}

# Legacy tensor type alias: CUDA float tensors when a GPU is available.
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor


1 500 1



  if encode_method is 'positions':
  if encode_method is 'positions':


In [9]:
# Load experiment settings from config.ini; the sections read below are
# 'demonstrations_location', 'ppo' and 'eairl'.
# NOTE(review): ConfigParser.read() silently skips a missing file, so a wrong
# working directory only surfaces later when the sections are accessed.
parser = ConfigParser()
parser.read('config.ini')
demonstrations_location_args = Dict(parser,'demonstrations_location',True)
agent_args = Dict(parser,"ppo")
discriminator_args = Dict(parser,"eairl")

# Named toggles replace the previous unconditional `device='cpu'` override and
# the `if False:` branch; the resulting values are unchanged.
FORCE_CPU = True            # set to False to use CUDA when it is available
USE_TENSORBOARD = False     # set to True to log through a SummaryWriter

device = 'cuda' if (torch.cuda.is_available() and not FORCE_CPU) else 'cpu'

if USE_TENSORBOARD:
    from torch.utils.tensorboard import SummaryWriter
    writer = SummaryWriter()
else:
    writer = None

discriminator = EAIRL(writer, device, state_dim, action_dim, discriminator_args)
algorithm = PPO(device, state_dim, action_dim, agent_args)

In [5]:
# Bare expression: the notebook displays the PPO module's repr (its actor and
# critic layers) as the cell output.
algorithm

PPO(
  (actor): Actor(
    (layers): ModuleList(
      (0): Linear(in_features=500, out_features=64, bias=True)
      (1): Linear(in_features=64, out_features=64, bias=True)
    )
    (last_layer): Linear(in_features=64, out_features=1, bias=True)
  )
  (critic): Critic(
    (layers): ModuleList(
      (0): Linear(in_features=500, out_features=64, bias=True)
      (1): Linear(in_features=64, out_features=64, bias=True)
    )
    (last_layer): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [23]:
def train_ppo(env, agent, writer, max_episodes=500, log_every=1):
    """
    Collect one full episode at a time and train the agent on it.

    For every episode the environment is reset, the agent acts until the
    episode ends, and the gathered transitions are handed to
    ``agent.train_network`` as a single batch.

    Relies on notebook-level names defined in other cells: ``encode_states``,
    ``encode_method``, ``state_dim`` and ``device``.

    :param env: gymnasium-style environment; ``reset()`` must return
        ``(observation, info)`` and ``step(action)`` must return
        ``(observation, reward, terminated, truncated, info)``
    :param agent: object exposing ``get_action(state_tensor)`` returning
        ``(action, log_prob)`` and ``train_network(writer, episode, states,
        actions, rewards, next_states, done_masks, log_probs)``
    :param writer: TensorBoard-style writer with ``add_scalar``, or ``None``
        to skip reward logging here (it is still forwarded to
        ``agent.train_network`` unchanged)
    :param max_episodes: number of episodes to run
    :param log_every: print the episode reward every ``log_every`` episodes;
        a falsy value disables printing
    :return: None
    """
    for episode in range(max_episodes):
        state, _ = env.reset()
        state = encode_states([state], encode_method, state_dim)
        states, actions, rewards, next_states, done_masks, log_probs = [], [], [], [], [], []
        done = False
        episode_reward = 0

        while not done:
            # Add a leading axis exactly as the original `torch.tensor([state])`
            # did, but go through NumPy to avoid the slow list-of-arrays path.
            state_tensor = torch.tensor(np.asarray(state)[None], dtype=torch.float32, device=device)

            # NOTE(review): assumes agent.get_action returns an action that
            # env.step accepts directly -- confirm against the PPO class.
            action, log_prob = agent.get_action(state_tensor)

            # gymnasium returns (obs, reward, terminated, truncated, info).
            # The episode is over when EITHER flag is set; ignoring
            # `truncated` lets the loop run past the environment's time limit.
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_state = encode_states([next_state], encode_method, state_dim)
            done = terminated or truncated

            # Collect transition data
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            # The mask is zero only on true termination, so a time-limit
            # truncation is still treated as a non-terminal transition.
            done_masks.append(0 if terminated else 1)
            log_probs.append(log_prob.item())

            state = next_state
            episode_reward += reward

        # Store episode reward in TensorBoard (skipped when no writer is given)
        if writer is not None:
            writer.add_scalar("Episode Reward", episode_reward, episode)

        # Process batch data; stacking with NumPy first keeps the same shapes
        # while avoiding per-element conversion of a list of ndarrays.
        states = torch.tensor(np.array(states), dtype=torch.float32, device=device)
        actions = torch.tensor(actions, dtype=torch.int64, device=device).unsqueeze(-1)
        rewards = torch.tensor(rewards, dtype=torch.float32, device=device).unsqueeze(-1)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32, device=device)
        done_masks = torch.tensor(done_masks, dtype=torch.float32, device=device).unsqueeze(-1)
        log_probs = torch.tensor(log_probs, dtype=torch.float32, device=device).unsqueeze(-1)

        # Train the PPO agent with collected data
        agent.train_network(writer, episode, states, actions, rewards, next_states, done_masks, log_probs)

        if log_every and episode % log_every == 0:
            print(f"Episode {episode}, Reward: {episode_reward}")

In [None]:
# Run PPO training on the Taxi environment with train_ppo's default episode count.
train_ppo(env,algorithm,writer)