# In [None]:
#DEEP Q LEARNING ALGORITHM
#We are implementing a Deep Q-Learning (DQN) algorithm to optimize traffic light control at an intersection.
#The agent observes the number of vehicles in two directions (East-West and North-South) and the current traffic light phase.
#It then selects actions to either switch or maintain the traffic light phase.
#The goal is to minimize traffic congestion and waiting times by learning the best sequence of traffic light changes.
#The agent learns through interactions with the environment, receiving rewards based on the traffic flow efficiency.
#This approach uses deep neural networks to approximate the Q-values for state-action pairs.

import numpy as np               # For numerical operations, especially for array manipulation
import random                    # For generating random numbers for initial actions or states
import tensorflow as tf          # Framework for building and training neural networks
from tensorflow import keras     # High-level neural network API
from tensorflow.keras import layers  # Used to define different layers in a neural network model
from collections import deque     # Data structure to efficiently store and sample from memory

# Q-learning hyperparameters

# Alpha: step size used for learning updates
learning_rate = 0.1
# Gamma: weight given to future rewards relative to immediate ones
discount_factor = 0.9
# Epsilon: initial probability of picking a random (exploratory) action
exploration_rate = 1.0
# Multiplicative decay applied to epsilon so exploration shrinks over time
exploration_decay = 0.99
# Total number of learning episodes
num_episodes = 10_000
# Upper bound on the number of actions taken within one episode
max_steps_per_episode = 100
# Number of stored experiences used in each training step
batch_size = 32

# Environment description

# Max number of vehicles per lane (applies to both EW and NS)
num_vehicles_per_lane = 12
# Two possible traffic light phases: EW green or NS green
num_traffic_phases = 2
# Size of the discrete state space: (EW count) x (NS count) x (light phase)
total_states = num_traffic_phases * num_vehicles_per_lane ** 2
# Actions: 0 = keep current phase, 1 = switch phase
num_actions = num_traffic_phases

# Neural Network for Deep Q-Learning
class DQNAgent:
    """Deep Q-Network agent with experience replay and a target network.

    The online network maps a one-hot state vector of width ``total_states``
    to one Q-value per action.  States may be handed to the agent either as a
    discrete state index (scalar or size-1 array) or as an already encoded
    vector holding ``total_states`` elements; both are normalized to a
    ``(1, total_states)`` row before they reach the network or the replay
    memory.

    Args:
        total_states: Number of discrete states (width of the network input).
        num_actions: Number of discrete actions (width of the network output).
    """

    def __init__(self, total_states, num_actions):
        self.total_states = total_states  # Total possible states
        self.num_actions = num_actions    # Total possible actions
        self.memory = deque(maxlen=2000)   # Replay memory; oldest entries are dropped first
        self.gamma = discount_factor      # Discount factor for future rewards
        self.epsilon = exploration_rate   # Current exploration rate
        self.epsilon_decay = exploration_decay  # Multiplicative decay of the exploration rate
        self.epsilon_min = 0.01           # Floor for the exploration rate
        self.model = self.create_model()  # Online network that is trained
        self.target_model = self.create_model()  # Target network used for bootstrapped targets
        self.update_target_model()        # Start with both networks holding the same weights

    def create_model(self):
        """Build and compile the Q-network (two hidden ReLU layers, linear output)."""
        model = keras.Sequential()
        model.add(layers.Dense(24, activation='relu', input_dim=self.total_states))  # First hidden layer
        model.add(layers.Dense(24, activation='relu'))  # Second hidden layer
        model.add(layers.Dense(self.num_actions, activation='linear'))  # One Q-value per action
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss='mse')
        return model

    def _encode_state(self, state):
        """Return ``state`` as a ``(1, total_states)`` float32 network input.

        Args:
            state: Either a vector with exactly ``total_states`` elements
                (any shape; it is reshaped) or a single discrete state index
                given as a scalar or size-1 array.

        Returns:
            A ``(1, total_states)`` float32 array.

        Raises:
            ValueError: If ``state`` is neither of the accepted forms or the
                index lies outside ``[0, total_states)``.
        """
        values = np.asarray(state)
        # A full-width input is taken to be an already encoded state vector.
        if values.size == self.total_states:
            return values.reshape(1, self.total_states).astype(np.float32)
        if values.size != 1:
            raise ValueError(
                f"state must be a single index or hold {self.total_states} "
                f"elements, got shape {values.shape}"
            )
        index = int(values.reshape(-1)[0])
        if not 0 <= index < self.total_states:
            raise ValueError(
                f"state index {index} is outside [0, {self.total_states})"
            )
        encoded = np.zeros((1, self.total_states), dtype=np.float32)
        encoded[0, index] = 1.0
        return encoded

    def act(self, state):
        """Pick an action for ``state`` using an epsilon-greedy policy.

        Args:
            state: State index or encoded state vector (see ``_encode_state``).

        Returns:
            The chosen action as an ``int`` in ``[0, num_actions)``.
        """
        state = self._encode_state(state)
        if np.random.rand() <= self.epsilon:  # Exploration: random action
            return random.choice(range(self.num_actions))
        # Exploitation: greedy action; verbose=0 keeps Keras from printing a
        # progress bar on every single decision.
        q_values = self.model.predict(state, verbose=0)
        return int(np.argmax(q_values[0]))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        Both states are stored in encoded ``(1, total_states)`` form so that
        ``replay`` can stack them into a single batch.
        """
        self.memory.append(
            (self._encode_state(state), action, reward,
             self._encode_state(next_state), done)
        )

    def replay(self, batch_size):
        """Train the online network on a random minibatch from memory.

        Does nothing until at least ``batch_size`` transitions are stored.
        The whole minibatch is evaluated and fitted in one pass (two
        ``predict`` calls and one ``fit``) rather than sample by sample.
        After training, epsilon is decayed but never below ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample.
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)

        states = np.vstack([experience[0] for experience in minibatch])
        actions = np.array([experience[1] for experience in minibatch], dtype=np.intp)
        rewards = np.array([experience[2] for experience in minibatch], dtype=np.float32)
        next_states = np.vstack([experience[3] for experience in minibatch])
        finished = np.array([experience[4] for experience in minibatch], dtype=bool)

        # Bootstrapped target: r + gamma * max_a' Q_target(s', a'), with the
        # future term dropped for transitions that ended the episode.
        best_next_q = np.amax(self.target_model.predict(next_states, verbose=0), axis=1)
        targets = rewards + self.gamma * np.where(finished, 0.0, best_next_q)

        # Only the Q-value of the action actually taken is moved toward the
        # target; the other outputs keep their current predictions.
        q_targets = np.array(self.model.predict(states, verbose=0))
        q_targets[np.arange(batch_size), actions] = targets
        self.model.fit(states, q_targets, batch_size=batch_size, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so a decay step cannot undershoot the configured floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

# Initialize the DQN agent
agent = DQNAgent(total_states, num_actions)  # Create agent with state and action space sizes


def _one_hot_state(state_index):
    """Encode a discrete state index as a (1, total_states) one-hot row vector.

    The Q-network's input layer is ``total_states`` wide, so a bare index
    cannot be reshaped into a valid input; it has to be expanded first.

    Raises:
        ValueError: If ``state_index`` is outside ``[0, total_states)``.
    """
    if not 0 <= state_index < total_states:
        raise ValueError(f"state index {state_index} is outside [0, {total_states})")
    encoded = np.zeros((1, total_states), dtype=np.float32)
    encoded[0, state_index] = 1.0
    return encoded


# Simulate the environment and train the agent
for episode in range(num_episodes):
    # Initialize the environment for the episode (random number of vehicles in each lane).
    # randrange excludes the upper bound, giving num_vehicles_per_lane distinct
    # counts per lane, which is what total_states is sized for.
    # NOTE(review): assumes get_state_index expects counts in
    # [0, num_vehicles_per_lane) - confirm against its definition.
    vehicles_ew = random.randrange(num_vehicles_per_lane)
    vehicles_ns = random.randrange(num_vehicles_per_lane)
    traffic_phase = random.choice([0, 1])  # Random initial traffic phase (0 for NS green, 1 for EW green)
    state = get_state_index(vehicles_ew, vehicles_ns, traffic_phase)  # Discrete index of the current state

    # Simulate the environment over multiple steps (episode)
    for step in range(max_steps_per_episode):
        # Choose an action using the DQN agent (the network consumes one-hot states)
        state_vector = _one_hot_state(state)
        action = agent.act(state_vector)

        # Update environment state based on the chosen action
        prev_vehicles_ew, prev_vehicles_ns = vehicles_ew, vehicles_ns  # Store previous vehicle counts
        # TODO: Apply traffic simulation logic here (vehicle arrival/departure based on traffic phase);
        # until then the vehicle counts stay constant for the whole episode.
        if action == 1:
            # Action 1 means "switch": advance to the other phase rather than forcing phase 1.
            next_traffic_phase = (traffic_phase + 1) % num_traffic_phases
        else:
            next_traffic_phase = traffic_phase
        next_state = get_state_index(vehicles_ew, vehicles_ns, next_traffic_phase)  # Get next state index

        # Calculate the reward based on state transition
        reward = calculate_reward(vehicles_ew, vehicles_ns, prev_vehicles_ew, prev_vehicles_ns, traffic_phase, action)

        # The episode ends only by reaching the step limit
        done = (step == max_steps_per_episode - 1)

        # Store experience in the agent's memory
        agent.remember(state_vector, action, reward, _one_hot_state(next_state), done)

        # Move to the next state
        state = next_state
        traffic_phase = next_traffic_phase

        if done:
            agent.update_target_model()  # Sync the target model once per episode
            break

    # Train the model after each episode
    agent.replay(batch_size=batch_size)