In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random
from collections import deque

In [3]:
class NetworkEnvironment:
    def __init__(self, topology, source, destination):
        self.topology = topology
        self.source = source
        self.destination = destination
        self.reset()

    def reset(self):
        self.current_position = self.source
        return self.update_state()

    def step(self, action):
        next_position = tuple(map(sum, zip(self.current_position, action)))
        if next_position in self.get_valid_actions():
            self.current_position = next_position
            reward = -1  # Default penalty
            done = False
            if self.current_position == self.destination:
                reward = 100
                done = True
            return self.update_state(), reward, done, {}
        else:
            return self.update_state(), -10, False, {}

    def update_state(self):
        state = np.zeros(self.topology.shape)
        state[self.current_position] = 1
        return state

    def get_valid_actions(self):
        actions = [(0, 1), (1, 0), (0, -1), (-1, 0)]
        valid_actions = []
        for action in actions:
            new_position = tuple(map(sum, zip(self.current_position, action)))
            if (0 <= new_position[0] < self.topology.shape[0] and
                0 <= new_position[1] < self.topology.shape[1] and
                self.topology[new_position] == 0):
                valid_actions.append(action)
        return valid_actions


In [4]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=1000)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.99
        self.learning_rate = 0.001
        self.batch_size = 16
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(16, input_dim=self.state_size, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, valid_actions):
        if np.random.rand() <= self.epsilon:
            return random.choice(valid_actions)
        act_values = self.model.predict(state, verbose=0)[0]
        action_indices = [self.get_action_index(action) for action in valid_actions]
        filtered_act_values = act_values[action_indices]
        best_action_index = np.argmax(filtered_act_values)
        return valid_actions[best_action_index]

    def replay(self):
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(next_state, verbose=0)[0])
            target_f = self.model.predict(state, verbose=0)
            target_f[0][self.get_action_index(action)] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def get_action_index(self, action):
        return [(0, 1), (1, 0), (0, -1), (-1, 0)].index(action)

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

    def preprocess_state(self, state):
        return np.reshape(state, [1, self.state_size])

    def train_dqn_agent(self, env, episodes=500):
        for e in range(episodes):
            state = env.reset()
            state = self.preprocess_state(state)
            for time in range(100):
                valid_actions = env.get_valid_actions()
                action = self.act(state, valid_actions)
                next_state, reward, done, _ = env.step(action)
                next_state = self.preprocess_state(next_state)
                self.remember(state, action, reward, next_state, done)
                state = next_state
                if done:
                    print(f"Episode {e+1}/{episodes} - Time: {time} - Epsilon: {self.epsilon:.4f}")
                    break
                self.replay()

    def test_dqn_agent(self, env):
        state = env.reset()
        state = self.preprocess_state(state)
        done = False
        while not done:
            valid_actions = env.get_valid_actions()
            action = self.act(state, valid_actions)
            next_state, reward, done, _ = env.step(action)
            state = self.preprocess_state(next_state)
            print(f"Action taken: {action}, Reward: {reward}, Done: {done}")


In [5]:
if __name__ == "__main__":
    topology = np.array([
        [0, 1, 0, 0],
        [0, 1, 0, 1],
        [0, 0, 0, 0],
        [1, 1, 1, 0]
    ])
    source = (0, 0)
    destination = (3, 3)
    env = NetworkEnvironment(topology, source, destination)
    state_size = np.prod(topology.shape)
    action_size = 4
    agent = DQNAgent(state_size, action_size)
    
    agent.train_dqn_agent(env)  # Train the agent
    agent.save("dqn_model.h5")  # Save the trained model
    agent.load("dqn_model.h5")  # Load and test the agent
    agent.test_dqn_agent(env)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
