In [158]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

In [159]:
def reset_sumo_environment():
    return np.floor(np.random.rand(10,4) * 10)

def step_in_sumo(action):
    if action == 1: return (np.random.rand(10,4), np.random.rand(), True)
    return (np.random.rand(10,4), np.random.rand(), False)

In [160]:
# Define the neural network for the Q-function
class DQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 24)
        self.fc2 = nn.Linear(24, 24)
        self.fc3 = nn.Linear(24, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [161]:
# Define the RL agent
class RLAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.criterion = nn.MSELoss()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state)
        q_values = self.model(state)
        return np.argmax(q_values.detach().numpy())

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target += self.gamma * np.amax(self.model(torch.FloatTensor(next_state)).detach().numpy())
            target_f = self.model(torch.FloatTensor(state))
            # Check if action index is valid
            if 0 <= action < self.action_size:
                target_f[action] = target
            else:
                print(f"Invalid action: {action}")

            # Convert back to tensor for loss calculation
            target_f_tensor = torch.FloatTensor(target_f)
            self.model.zero_grad()
            loss = self.criterion(target_f_tensor, self.model(torch.FloatTensor(state)))
            loss.backward()
            self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [162]:
# Simulation interaction loop
def run_simulation(agent, num_episodes, batch_size):
    for e in range(num_episodes):
        state = reset_sumo_environment()  # Reset the SUMO environment and get the initial state
        done = False
        total_reward = 0

        while not done:
            action = agent.act(state)
            next_state, reward, done = step_in_sumo(action)  # Step through the SUMO simulation
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

        print(f"Episode: {e+1}/{num_episodes}, Total Reward: {total_reward}")
        agent.replay(batch_size)

In [163]:
# Sample parameters
state_size = 4  # Example state size, adjust based on your simulation
action_size = 10  # Example action size, adjust based on your simulation
agent = RLAgent(state_size, action_size)
run_simulation(agent, num_episodes=1000, batch_size=32)

Episode: 1/1000, Total Reward: 4.328444536022692
Episode: 2/1000, Total Reward: 1.2714364760071626
Episode: 3/1000, Total Reward: 1.2600046283768607
Episode: 4/1000, Total Reward: 2.5433055995741105
Episode: 5/1000, Total Reward: 12.319360461396014
Episode: 6/1000, Total Reward: 24.184672210105877
Episode: 7/1000, Total Reward: 10.585140195617608
Episode: 8/1000, Total Reward: 1.265143369249005
Episode: 9/1000, Total Reward: 5.171384339535756
Episode: 10/1000, Total Reward: 2.347870318586777
Invalid action: 37
Episode: 11/1000, Total Reward: 3.7740128090574743
Episode: 12/1000, Total Reward: 1.6389701382248965
Invalid action: 37
Episode: 13/1000, Total Reward: 1.8203534030591821
Invalid action: 37
Episode: 14/1000, Total Reward: 0.47245134164152536
Episode: 15/1000, Total Reward: 3.9438818542591525
Episode: 16/1000, Total Reward: 3.105546480123383
Invalid action: 60
Episode: 17/1000, Total Reward: 1.669335228917141
Episode: 18/1000, Total Reward: 4.377588009760505
Episode: 19/1000, Tot