In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import traci
import sumolib
import math

# Initialization for SUMO environment
# SUMO configuration file, given relative to the notebook's working directory.
environment = "intersection/sumo_config.sumocfg"

# Lane IDs grouped per controllable green phase: group i is the set of lanes
# looked up for action i (see Environment.apply_action and get_avg_waiting).
# dtype=object keeps every group as a plain Python list even though the groups
# have different lengths.
phase_lane_control = np.array([
    ["N2TL_0", "N2TL_1", "N2TL_2", "S2TL_0", "S2TL_1", "S2TL_2"],
    ["N2TL_3", "S2TL_3"],
    ["W2TL_0", "W2TL_1", "W2TL_2", "E2TL_0", "E2TL_1", "E2TL_2"],
    ["W2TL_3", "E2TL_3"]
], dtype=object)

# Locate the GUI build of SUMO and open the TraCI connection used by every
# cell below.
sumobin = sumolib.checkBinary('sumo-gui')
traci.start([sumobin, '-c', environment, '--start'])

# Collision IDs are read back through this subscription in
# Environment.check_done_condition.
traci.simulation.subscribe([traci.constants.VAR_COLLIDING_VEHICLES_IDS])

# Subscribe to vehicle accelerations for all vehicles
# NOTE(review): only vehicles that already exist at this point are covered; right
# after start-up the ID list is presumably empty, so this loop likely does
# nothing -- confirm whether acceleration data is actually needed.
for veh_id in traci.vehicle.getIDList():
    # varIDs must be a sequence of variable IDs, not a bare int.
    traci.vehicle.subscribe(veh_id, [traci.constants.VAR_ACCELERATION])

# The notebook controls the first (and presumably only) traffic light in the network.
trafficlight_id = traci.trafficlight.getIDList()[0]
controlled_lanes = traci.trafficlight.getControlledLanes(trafficlight_id)
TIME_STEP = 0.8  # Simulation time step in seconds



In [None]:
# Utility functions
def get_avg_waiting():
    """Return one value per lane group in ``phase_lane_control``.

    Each value is the number of halting vehicles reported for the group
    (via ``get_lane_num_waiting``) divided by the number of lanes in it.
    The result is a fresh Python list, so callers may append to it.
    """
    averages = []
    for group in phase_lane_control:
        halted = get_lane_num_waiting(group)
        averages.append(halted / len(group))
    return averages

def get_lane_num_waiting(lanes):
    """Return the total number of halting vehicles over the given lanes.

    Args:
        lanes: iterable of SUMO lane IDs.

    Returns:
        Sum of ``traci.lane.getLastStepHaltingNumber`` over ``lanes``
        (0 for an empty iterable).
    """
    # Use the builtin sum() instead of a local variable that shadows it.
    return sum(traci.lane.getLastStepHaltingNumber(lane_id) for lane_id in lanes)

def get_waiting_ids(lanes):
    """Collect the IDs of vehicles on ``lanes`` whose speed is below 0.1.

    Lanes are visited in the given order and vehicles in the order TraCI
    reports them. The IDs are returned as a NumPy array.
    """
    stopped = []
    for lane in lanes:
        for vehicle in traci.lane.getLastStepVehicleIDs(lane):
            if traci.vehicle.getSpeed(vehicle) < 0.1:
                stopped.append(vehicle)
    return np.array(stopped)

def pct_served(waiting_ids):
    """Return the fraction of previously waiting vehicles that got served.

    A vehicle counts as served when it is no longer in TraCI's loaded-vehicle
    list, or when it is still loaded and its speed is above 0.5.

    Args:
        waiting_ids: sequence of vehicle IDs recorded as waiting earlier.

    Returns:
        0 when ``waiting_ids`` is empty, otherwise served / total as a float.
    """
    total = len(waiting_ids)
    if total == 0:
        return 0

    # Fetch the loaded-vehicle list once: it does not change between the
    # membership checks, and a set makes each lookup O(1) instead of issuing
    # a TraCI request plus a list scan for every waiting vehicle.
    loaded = set(traci.vehicle.getLoadedIDList())

    served = 0
    for veh_id in waiting_ids:
        if veh_id not in loaded or traci.vehicle.getSpeed(veh_id) > 0.5:
            served += 1
    return served / total
    
def get_total_waiting_time():
    """Return the summed ``getWaitingTime`` of every vehicle TraCI currently lists."""
    return sum(
        traci.vehicle.getWaitingTime(veh_id)
        for veh_id in traci.vehicle.getIDList()
    )


In [3]:

class Environment:
    """RL-style wrapper around the running SUMO simulation for one traffic light.

    An action is an index into ``phase_lane_control``. Action ``a`` is applied
    as traffic-light phase ``2 * a`` and phase ``2 * a + 1`` is used as the
    yellow phase when leaving it.
    """

    # Simulation time after which an episode is considered finished.
    max_episode_time = 3300

    def __init__(self):
        # Action index matching the phase the light is currently in.
        self.prev_action = self._current_action()
        self.yellow_duration = 3    # duration simulated for the yellow phase
        self.green_duration = 25    # duration simulated right after switching
        self.static_action = 0      # 1 when the last action kept the current phase
        self.waiting_ids = []       # vehicles halted on the lanes of the last switch
        self.pct_served = 0         # share of waiting_ids served by the last switch

    @staticmethod
    def _current_action():
        """Map the light's current phase index to the action index that owns it."""
        return traci.trafficlight.getPhase(trafficlight_id) // 2

    def reset_sumo_environment(self, environment):
        """Reload the simulation and return the initial state.

        Args:
            environment: path of the SUMO configuration to load.

        Returns:
            The state vector produced by ``get_state`` after the reload.
        """
        # Every entry of the option list must be a string; passing the float
        # TIME_STEP directly fails when TraCI serialises the list.
        traci.load(['-c', environment, '--start', '--step-length', str(TIME_STEP)])
        # NOTE(review): subscriptions appear to be dropped by a reload, so the
        # collision subscription is registered again -- harmless if it survived.
        traci.simulation.subscribe([traci.constants.VAR_COLLIDING_VEHICLES_IDS])
        traci.trafficlight.setProgram(trafficlight_id, '0')
        # Drop per-episode bookkeeping so nothing leaks from the previous episode.
        self.prev_action = self._current_action()
        self.static_action = 0
        self.waiting_ids = []
        self.pct_served = 0
        return self.get_state()

    def step_in_sumo(self, action):
        """Apply ``action``, advance one step and return ``(next_state, reward, done)``."""
        self.apply_action(action)
        traci.simulationStep()
        # The state must be read before the reward: calculate_reward clears pct_served.
        next_state = self.get_state()
        reward = self.calculate_reward()
        done = self.check_done_condition()
        return next_state, reward, done

    def get_state(self):
        """Return the per-group average waiting counts followed by pct_served and prev_action."""
        state = get_avg_waiting()
        state.append(self.pct_served)
        state.append(self.prev_action)
        return np.array(state)

    def apply_action(self, action):
        """Switch the light to ``action``, or just flag it when the phase is unchanged."""
        if action == self.prev_action:
            self.static_action = 1
            return
        # Yellow for the outgoing phase first.
        self.simulate_phase(2 * self.prev_action + 1, self.yellow_duration)
        # Score the previous switch before the waiting list is overwritten.
        self.pct_served = pct_served(self.waiting_ids)
        self.waiting_ids = get_waiting_ids(phase_lane_control[action])
        self.simulate_phase(2 * action, self.green_duration)
        self.prev_action = action

    def simulate_phase(self, action, duration):
        """Set traffic-light phase ``action`` and simulate it for ``duration``.

        The number of steps is ``duration / TIME_STEP`` rounded up.
        """
        traci.trafficlight.setPhase(trafficlight_id, action)
        for _ in range(math.ceil(duration / TIME_STEP)):
            traci.simulationStep()

    def calculate_reward(self):
        """Return the step reward and clear the per-step flags.

        reward = static_action + exp(4 * pct_served) - exp(0.2 * sum of average waiting)
        """
        reward = self.static_action + math.exp(4 * self.pct_served) - math.exp(0.2 * sum(get_avg_waiting()))
        self.static_action = 0
        self.pct_served = 0
        return reward

    def check_done_condition(self):
        """Return True when a collision was reported or the time limit has passed."""
        results = traci.simulation.getSubscriptionResults()
        # An active subscription reports the variable on every step, normally
        # with an empty list, so test the list's content rather than the key.
        colliding = results.get(traci.constants.VAR_COLLIDING_VEHICLES_IDS) if results else None
        if colliding:
            print('takkar BC')
            return True
        return traci.simulation.getTime() > self.max_episode_time


In [4]:

# Define the Double DQN agent
class DQN(nn.Module):
    """Fully connected Q-network: state vector in, one value per action out.

    Two hidden layers of 64 units with ReLU activations are followed by a
    linear output layer of size ``n_actions``.
    """

    def __init__(self, n_state_params, n_actions):
        super().__init__()
        hidden_units = 64
        self.fc1 = nn.Linear(n_state_params, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, n_actions)

    def forward(self, x):
        """Return the network output for ``x`` (no activation on the last layer)."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = torch.relu(hidden_layer(activations))
        return self.fc3(activations)

class DoubleDQNAgent:
    """Double-DQN agent with a replay memory and an epsilon-greedy policy.

    ``model`` is the network that is trained and used to pick actions;
    ``target_model`` is a copy that is only refreshed through
    ``update_target_network`` and supplies the bootstrap values.
    """

    def __init__(self, n_state_params, n_actions):
        """Build both networks, the optimizer and an empty replay memory.

        Args:
            n_state_params: length of the state vector fed to the networks.
            n_actions: number of discrete actions (size of the network output).
        """
        self.n_state_params = n_state_params
        self.n_actions = n_actions
        self.memory = deque(maxlen=3300)
        self.gamma = 0.95  # Discount rate
        self.epsilon = 0.05  # Exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999

        # Primary and Target Networks
        self.model = DQN(n_state_params, n_actions)
        self.target_model = DQN(n_state_params, n_actions)
        self.update_target_network()

        # Optimizer and loss
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.0001)
        self.criterion = nn.MSELoss()

    @staticmethod
    def _as_tensor(values):
        """Convert a state (array or sequence of numbers) to a float32 tensor."""
        return torch.as_tensor(values, dtype=torch.float32)

    def update_target_network(self):
        """Copy weights from the primary network to the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def remember(self, state, action, reward, next_state, done):
        """Store experience in replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Select an action using an epsilon-greedy policy.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest Q-value predicted by the primary network.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.n_actions)
        # Pure inference: no autograd graph is needed here.
        with torch.no_grad():
            q_values = self.model(self._as_tensor(state))
        return int(torch.argmax(q_values))

    def replay(self, batch_size):
        """Sample a minibatch from replay memory and update the primary network.

        Does nothing while fewer than ``batch_size`` transitions are stored.
        Each sampled transition triggers its own optimizer step; epsilon is
        decayed once per call.
        """
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            if not 0 <= action < self.n_actions:
                # Nothing can be learned from this transition; skip it rather
                # than running an optimizer step on an unchanged target.
                print(f"Invalid action: {action}")
                continue

            state_tensor = self._as_tensor(state)

            # Double DQN target: the primary network chooses the next action,
            # the target network provides its value.
            target = float(reward)
            if not done:
                next_state_tensor = self._as_tensor(next_state)
                with torch.no_grad():
                    next_action = int(torch.argmax(self.model(next_state_tensor)))
                    target += self.gamma * self.target_model(next_state_tensor)[next_action].item()

            # Regression target: current predictions with only the taken
            # action's entry replaced, so the other outputs contribute no error.
            with torch.no_grad():
                target_q = self.model(state_tensor).clone()
            target_q[action] = target

            self.optimizer.zero_grad()
            loss = self.criterion(self.model(state_tensor), target_q)
            loss.backward()
            self.optimizer.step()

        # Decay epsilon after each replay
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def train_target_network(self, update_frequency, episode):
        """Update target network every 'update_frequency' episodes."""
        if episode % update_frequency == 0:
            self.update_target_network()



In [5]:

# Simulation interaction loop
def run_simulation(agent, env, num_episodes, batch_size, update_frequency=10):
    """Run ``num_episodes`` episodes of agent/environment interaction.

    Every episode resets the environment with the module-level ``environment``
    path, then alternates ``agent.act`` and ``env.step_in_sumo`` until the
    environment reports ``done``, storing each transition in the agent's
    memory. After the episode the total reward is printed, one replay pass of
    ``batch_size`` is run, and the target network is refreshed on episodes
    whose zero-based index is a multiple of ``update_frequency``.
    """
    e = 0
    while e < num_episodes:
        state = env.reset_sumo_environment(environment)
        total_reward = 0

        # At least one step is always taken before ``done`` is inspected.
        while True:
            action = agent.act(state)
            transition = env.step_in_sumo(action)
            next_state, reward, done = transition
            agent.remember(state, action, reward, next_state, done)
            total_reward += reward
            state = next_state
            if done:
                break

        print(f"Episode: {e+1}/{num_episodes}, Total Reward: {total_reward}")
        agent.replay(batch_size)
        agent.train_target_network(update_frequency, e)
        e += 1


In [6]:

# Initialize environment and agent
env = Environment()

# The traffic-light program pairs every action with two phases, so the number
# of actions is half the number of phases in the first program logic.
program = traci.trafficlight.getAllProgramLogics(trafficlight_id)[0]
n_actions = len(program.phases) // 2

# The network input size is taken from an actual state vector.
n_state_params = len(env.get_state())
agent = DoubleDQNAgent(n_state_params=n_state_params, n_actions=n_actions)

# Run simulation
run_simulation(agent, env, num_episodes=200, batch_size=32)


Episode: 1/200, Total Reward: 2474.8337734904635
Episode: 2/200, Total Reward: 1195.4366010638541
Episode: 3/200, Total Reward: -4419.21374449738
Episode: 4/200, Total Reward: -47399.04810852992
Episode: 5/200, Total Reward: -11639.539085169688
Episode: 6/200, Total Reward: -18622.393175228473
Episode: 7/200, Total Reward: 2318.4199700932463
Episode: 8/200, Total Reward: -1567.0429387653076
Episode: 9/200, Total Reward: -5972.931829616328
Episode: 10/200, Total Reward: -2958.738113521114
Episode: 11/200, Total Reward: 4400.855099147097
Episode: 12/200, Total Reward: 3510.566859945309
Episode: 13/200, Total Reward: 3008.443402726259
Episode: 14/200, Total Reward: 3902.6937350699786
Episode: 15/200, Total Reward: 4508.836169436125
Episode: 16/200, Total Reward: 4471.553262708733
Episode: 17/200, Total Reward: 3948.0644291061008
Episode: 18/200, Total Reward: 4570.635917599656
Episode: 19/200, Total Reward: 4284.789996151451
Episode: 20/200, Total Reward: 3840.748753557468
Episode: 21/200