# Code for Sumo Enviroment

In [11]:
# Custom Environment
import gym
import numpy as np
import traci
import math
from os import path


class SumoEnv(gym.Env):
  def __init__(self):
    
    traci.start(["sumo", "-c", path.abspath("../SUMO/test.sumocfg")])

    ## SUMO VARIABLES ##
    self.bus_stop_positions = [
        [123, 974, 1872, 2764], [123, 827, 1742, 2702, 3592]]
    self.bus_locations = {"-overlap": "123", "-R2": "259", "-R1": "125",
                          "-R0": "267", "-L3": "117", "-L2": "110", "-L1": "123", "-L0": "120"}
    self.bus_ids = ["bus_r_0_0", "bus_r_0_1",
                    "bus_r_0_2", "bus_r_0_3", "bus_r_0_4"]

    # self.route_names = [["-overlap", "-R2", "-R1", "-R0"], ["-overlap", "-L3", "-L2", "-L1", "-L0"]]
    self.route_lengths = [3591, 4697]
    # self.route_junctions={"J1": ["-L0", "-R0", "-overlap"], "J2": ["-R0", "-R1"], "J3": ["-R1", "-R2"], "J4": ["-L3", "-R2", "-overlap"], "J5": ["-L0", "-L1"], "J7": ["-L1", "-L2"], "J8": ["-L2", "-L3"]}

    self.wait_time = 0
    self.delta_speed = 0.1
    self.min_speed_before_change = 30
    self.action_delta_speed = {
        0: (1-self.delta_speed), 2: (1+self.delta_speed)}

    ## GYM VARIABLES ##
    self.bus_num = 5
    bus_stops_num = 4
    bus_speed_max = 50

    # actions: [b1, b2 (...)] # each action is either 0 = slow down, 1 = keep speed, 2 = speed up
    self.action_space = gym.spaces.Box(low=np.array(
        [0]*self.bus_num), high=np.array([2]*self.bus_num), shape=(self.bus_num,), dtype=np.int32)

    # states: [avg_wait_time, b1_speed, b1_pos, b2_speed, b2_pos, (...),  bs1_pos, bs2_pos, bs3_pos, bs4_pos]
    wait_max = 100000
    low_obs = np.zeros([1 + 2*self.bus_num + bus_stops_num])
    high_obs = np.array([wait_max] + [bus_speed_max, self.route_lengths[0]]
                        * self.bus_num + [self.route_lengths[0]]*bus_stops_num)
    self.observation_space = gym.spaces.Box(low=low_obs, high=high_obs, shape=(
        1 + 2*self.bus_num + bus_stops_num,), dtype=np.float32)

    self.max_steps = 500
    self.current_step = 0

  def reset(self):
      traci.close()
      self.wait_time = 0
      self.current_step = 0
      traci.start(["sumo", "-c", path.abspath("../SUMO/test.sumocfg")])
      return self.wait_time, {}

  def step(self, action):
    print("Action from env: ", action)
    try:
      next_state = self.sumo_step()

      # set action for each bus: 0 = slow down, 1 = keep speed, 2 = speed up
      vehicles_length = len(traci.vehicle.getIDList())
    
      for i, bus_action in enumerate(action):
        if bus_action == 1 or i >= vehicles_length:
          break
        bus_id = self.bus_ids[i]
        bus_distance_driven = traci.vehicle.getDistance(bus_id)

        if np.sign(bus_distance_driven) == -1:
          break  # if bus hasnt driven yet, skip

        bus_route = traci.vehicle.getRouteID(bus_id)
        bus_position = round(bus_distance_driven % (
            self.route_lengths[0] if (bus_route == "r_0") else self.route_lengths[1]), 3)
        nearest_bus_stop_position = self._find_nearest(
            self.bus_stop_positions[0 if bus_route == "r_0" else 1], bus_position)
        bus_speed_km_t = traci.vehicle.getSpeed(bus_id)*3.6  # m/s to km/h

        interval = [-22, 3]
        # change speed if speed > min_speed_before_change and bus is not at a bus stop
        if bus_speed_km_t > self.min_speed_before_change and not (bus_position > nearest_bus_stop_position + interval[0] and bus_position < nearest_bus_stop_position + interval[1]):
          # speed is in m/s
          new_speed = self.action_delta_speed[bus_action] * \
              traci.vehicle.getSpeed(bus_id)
          # smoothly changes to new speed over 1 second
          traci.vehicle.slowDown(bus_id, new_speed, 1)

      # reward are given if the new waiting time is strictly lower, otherwise punished
      reward = 1 if next_state[0] < self.wait_time else -1

      # set the wait time to the current wait time
      self.wait_time = next_state[0]

      # check if done
      self.current_step += 1
      done = False
      if (self.current_step >= self.max_steps):
        done = True

      return next_state, reward, done, {}

    except Exception as e:  # if there is an error, close the simulation
      print("An error occurred. Closing simulation.")
      print("Error: ", e)
      traci.close()

  def render(self):
    pass

  def close(self):
    pass

  # SUMO FUNCTIONS
  def sumo_step(self):
    # state [avg_wait_time, b1_speed, b1_pos, b2_speed, b2_pos, (...),  bs1_pos, bs2_pos, bs3_pos, bs4_pos]
    new_state = [0] * (1 + 2 * self.bus_num) + self.bus_stop_positions[0]
    personsWaitingTimeList = []
    traci.simulationStep()

    vehicles = traci.vehicle.getIDList()
    persons = traci.person.getIDList()

    # finds the average waiting time
    for i in range(0, len(persons)):
      personWaitingTime = traci.person.getWaitingTime(persons[i])
      personsWaitingTimeList.append(personWaitingTime)

    persons_waiting_num = len(personsWaitingTimeList)
    new_state[0] = round(sum(personsWaitingTimeList) /
                         persons_waiting_num, 3) if persons_waiting_num > 0 else 0.0

    # finds bus speed and position
    for j in range(0, len(vehicles)):
      vehicleId = vehicles[j]
      if traci.vehicle.getRouteID(vehicleId) != "r_0":
        continue

      vehicleSpeed = traci.vehicle.getSpeed(vehicleId)*3.6  # m/s to km/h
      vehiclePosition = traci.vehicle.getDistance(vehicleId) % (self.route_lengths[0]
                                                                if (traci.vehicle.getRouteID(vehicleId) == "r_0") else self.route_lengths[1])
      new_state[1 + 2*j] = round(vehicleSpeed, 2)
      new_state[2 + 2*j] = round(vehiclePosition, 2)
    return new_state

  def _find_nearest(self, array, value):  # BUG FIND NEAREST BUS STOP
    idx = np.searchsorted(array, value, side="left")
    if (idx == len(array) and math.fabs(value - (array[0] + array[idx-1])) < math.fabs(value - array[idx-1])):
      return array[0]
    elif idx > 0 and math.fabs(value - array[idx-1]) < math.fabs(value - array[idx]):
      return array[idx-1]
    else:
      return array[idx]

### Register of Custom Enviroment

In [12]:
# Register the environment
import gym.envs.registration

gym.envs.register(
    id='SumoEnv-v0',
    entry_point='Simulation/Python:SumoEnv',
    max_episode_steps=500, 
)

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


# RL (DQN) Implementation

### Setting Up

In [13]:
import gymnasium as gym
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import matplotlib
from collections import namedtuple
import random

# Use Cuda (GPU) if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

<contextlib.ExitStack at 0x105698ed0>

### Replay Buffer

In [14]:
# Transition
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

# Replay Buffer
class ReplayBuffer: 

    # Initialize the buffer
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    # Save a transition to the buffer
    def push(self, *args):
        ''' Save a transition to the buffer '''
        self.buffer.append(Transition(*args))

    # Sample a batch of transitions
    def sample(self, batch_size):
        ''' Sample a batch of transitions '''
        return random.sample(self.buffer, batch_size)
    
    # Returns the length of the buffer
    def __len__(self):
        return len(self.buffer)

### DQN Networks

In [15]:
# Deep Q Network
class DQN (nn.Module): 
    def __init__(self, n_obersavations, n_actions):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_obersavations, 128) # Input Layer
        self.layer2 = nn.Linear(128, 128) # Hidden Layer
        self.layer3 = nn.Linear(128, n_actions) # Output Layer


    def forward(self, x):
        x = torch.relu(self.layer1(x.float()))
        x = torch.relu(self.layer2(x.float()))
        x = self.layer3(x)
        return x


### Hyperparameters
1. BATCH_SIZE is the number of transitions sampled from the replay buffer
2. GAMMA is the discount factor as mentioned in the previous section
3. EPS_START is the starting value of epsilon
4. EPS_END is the final value of epsilon
5. EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay
6. TAU is the update rate of the target network
7. LR is the learning rate of the ``AdamW`` optimizer
8. The size of memory for the Replay Buffer

In [16]:
# Hyperparameters
# The discount factor, batch size, learning rate, target update frequency, and memory size
BATCH_SIZE = 128
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000
TAU = 0.005
LR = 1e-4
MEMORY_SIZE = 10000

### Initialize environment and DQN

In [17]:
traci.close()

Step #0.00 (0ms ?*RT. ?UPS, TraCI: 6394ms, vehicles TOT 0 ACT 0 BUF 0)                   


In [18]:
# Initialize the environment
env = SumoEnv()

steps_done = 0

# Get number of actions and observations from gym action space
n_actions = env.action_space.shape[0]

# This is most likely wrong
n_observations = env.observation_space.shape[0]

# Initialize the DQN and target network
policy_net = DQN(n_observations, n_actions)
target_net = DQN(n_observations, n_actions)

# Set the target network to have the same weights as the policy network
target_net.load_state_dict(policy_net.state_dict())

# Set the target network to evaluation mode
target_net.eval()

 Retrying in 1 seconds




DQN(
  (layer1): Linear(in_features=15, out_features=128, bias=True)
  (layer2): Linear(in_features=128, out_features=128, bias=True)
  (layer3): Linear(in_features=128, out_features=5, bias=True)
)

### Initialize optimizer and replay buffer

In [19]:
# Initialize the optimizer
optimizer = optim.Adam(policy_net.parameters(), lr=LR)

# memory is initialized 
memory = ReplayBuffer(MEMORY_SIZE)

### Select Action

In [20]:
# def select_action(state): 
#     global steps_done
    
#     ''' Select an action using an epsilon greedy policy '''

#     sample = random.random()
#     eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
#     steps_done += 1

#     if sample > eps_threshold:
#         with torch.no_grad(): 

#             # Get predicted Q-values for all possible actions for all agents
#             q_values = policy_net(state)

#             # Reshape to get individual actions for each agent
#             # Assuming 5 agents with 3 possible actions each
#             q_values = q_values.view(-1, 5, 3)

#             # Select the action with the highest Q-value for each agent
#             actions = torch.argmax(q_values, dim=2)

#             # Return the selected actions as a single tensor
#             return actions.view(1, -1)
        
#     else:
#         # Sample actions for all agents during exploration
#         actions = torch.tensor([[(env.action_space.sample())
#         for _ in range(5)]], device=device, dtype=torch.long)
#     print("Exploration: Sampled Actions:", actions)  # Remove .item() here
#     return actions


def select_action(state):
  global steps_done

  sample = random.random()
  eps_threshold = EPS_END + (EPS_START - EPS_END) * \
      math.exp(-1.0 * steps_done / EPS_DECAY)
  steps_done += 1

  if sample > eps_threshold:
    with torch.no_grad():
      # Get predicted Q-values for all possible actions for all agents
      q_values = policy_net(state)

      # Reshape to get individual actions for each agent (bus)
      q_values = q_values.view(-1, 5, 3)  # Assuming 5 buses and 3 actions each

      # Select the action with the highest Q-value for each bus
      actions = torch.argmax(q_values, dim=2)

      # Return the list of selected actions
      # Convert tensor to list and remove outer list
      return actions.tolist()[0]

  else:
    # Sample actions for all buses during exploration
    actions = [env.action_space.sample() for _ in range(5)]
    print("Exploration: Sampled Actions:", actions)
    return actions





            




    # if sample > eps_threshold:
    #     with torch.no_grad(): 
    #         return policy_net(state).max(1).indices.view(1, 1)
    # else:
    #     print("env.action_space.sample()", env.action_space.sample())
    #     return torch.tensor([[int(env.action_space.sample())]], device=device, dtype=torch.long)
        

### Plot Duration

In [21]:
episode_durations = []
def plot_durations(show_result=False):
    plt.figure(1)
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    if show_result:
        plt.title('Result')
    else:
        plt.clf()
        plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated
    if is_ipython:
        if not show_result:
            display.display(plt.gcf())
            display.clear_output(wait=True)
        else:
            display.display(plt.gcf())

### Optimize Model

In [22]:
# Function that performs a single step of the optimization
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return

    transitions = memory.sample(BATCH_SIZE)

    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor((tuple(
        map(lambda s: s is not None, batch.next_state))), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(BATCH_SIZE, device=device)

    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(
            non_final_next_states).max(1).values

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values,
                     expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()

    # In-place graident clipping
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 100)
    optimizer.step()

### Traning Loop

In [23]:
from itertools import count

num_episodes = 500 
episode_durations = []

f = open("results.csv", "w")

for i_episode in range(num_episodes):
    
    wait_time, info = env.reset()

    print("Wait Time: ", wait_time)

    wait_time = torch.tensor([wait_time], device=device)

    for t in count():
        print("Action Selection", env.action_space.sample())
        action = select_action(wait_time)
        print("action: ", action)
        print("env.action_space.sample()", env.action_space.sample())

        print(f"Step: {t}, wait_time: {wait_time}, Action: {action}\n")

        wait_time, reward, done, _ = env.step(action)
        reward = torch.tensor([reward], device=device)

        

Step #0.00 (0ms ?*RT. ?UPS, TraCI: 656ms, vehicles TOT 0 ACT 0 BUF 0)                    
 Retrying in 1 seconds




Wait Time:  0
Action Selection [2 1 2 2 1]
Exploration: Sampled Actions: [array([2, 2, 0, 1, 2], dtype=int32), array([2, 1, 1, 2, 0], dtype=int32), array([0, 1, 1, 2, 2], dtype=int32), array([1, 0, 0, 1, 2], dtype=int32), array([1, 2, 2, 1, 0], dtype=int32)]
action:  [array([2, 2, 0, 1, 2], dtype=int32), array([2, 1, 1, 2, 0], dtype=int32), array([0, 1, 1, 2, 2], dtype=int32), array([1, 0, 0, 1, 2], dtype=int32), array([1, 2, 2, 1, 0], dtype=int32)]
env.action_space.sample() [2 1 0 0 1]
Step: 0, wait_time: tensor([0]), Action: [array([2, 2, 0, 1, 2], dtype=int32), array([2, 1, 1, 2, 0], dtype=int32), array([0, 1, 1, 2, 2], dtype=int32), array([1, 0, 0, 1, 2], dtype=int32), array([1, 2, 2, 1, 0], dtype=int32)]

Action from env:  [array([2, 2, 0, 1, 2], dtype=int32), array([2, 1, 1, 2, 0], dtype=int32), array([0, 1, 1, 2, 2], dtype=int32), array([1, 0, 0, 1, 2], dtype=int32), array([1, 2, 2, 1, 0], dtype=int32)]
An error occurred. Closing simulation.
Error:  The truth value of an array w

TypeError: cannot unpack non-iterable NoneType object