In [1]:
import gym
from gym import Env
from gym.spaces import Discrete, Box, Dict
import numpy as np
import random
import os

from simulation.simulation import Simulation

In [2]:
class JunctionEnv(Env):
    """Gym environment that drives a junction traffic ``Simulation``.

    Each call to :meth:`step` applies one traffic-light action, advances the
    simulation by a single iteration and returns a fixed-length observation
    built from the light states and from vehicles on the monitored paths.

    Actions (``Discrete(5)``):
        0 - do nothing
        1 - turn light 0 red (only if it is currently green)
        2 - turn light 1 red (only if it is currently green)
        3 - turn light 0 green (only if it is currently red)
        4 - turn light 1 green (only if it is currently red)
    """

    def __init__(self):
        self.junction_file_path = "cross_road.junc"
        self.config_file_path = "cross_road.config"

        self.simulation = Simulation(self.junction_file_path, self.config_file_path)
        self.observation_space_size = 10

        # NOTE(review): the declared shape is (1, size) while the observations
        # actually returned by reset()/step() are flat (size,) vectors, and the
        # 0..10 bounds are probably too tight for distances/speeds. Left
        # unchanged because other cells read `observation_space.shape`.
        self.observation_space = Box(0, 10, shape=(1, self.observation_space_size), dtype=float)
        self.action_space = Discrete(5)

        self.state = np.zeros(self.observation_space_size, dtype='float32')
        self.total_reward = 0
        self.iteration = 0

        # Bookkeeping used by compute_simulation_metrics(). These attributes
        # were previously never initialised, so calling that method raised
        # AttributeError. The limit of 100 is a default chosen here - tune it.
        self.wait_time = []
        self.wait_time_vehicle_limit = 100

    def step(self, action):
        """Apply ``action``, advance the simulation one iteration.

        Returns:
            tuple: ``(state, reward, done, info)`` where ``state`` is a float32
            vector of length ``observation_space_size``.
        """
        self.take_action(action)
        self.simulation.compute_single_iteration()

        # NOTE(review): the reward is a constant +1 per step, so the
        # termination threshold below can never be reached; callers must cap
        # the episode length themselves until a real reward signal is wired in.
        reward = 1
        self.total_reward += reward

        done = self.total_reward < -500000
        if done:
            print(f"Steps: {self.iteration}")

        info = {}
        self.iteration += 1
        self.state = np.asarray(self.get_continuous_state(), dtype='float32')

        return self.state, reward, done, info

    def get_vehicle_state(self, vehicle):
        """Return ``[path distance travelled, length, speed, acceleration]`` for ``vehicle``."""
        return [
            vehicle.get_path_distance_travelled(),
            vehicle.get_length(),
            vehicle.get_speed(),
            vehicle.get_acceleration()
        ]

    def get_traffic_light_state(self, light):
        """Return ``[state, time remaining]`` for ``light``."""
        # NOTE(review): assumes light.get_state() is numeric, since the result
        # is later cast to float32 - confirm against the Simulation API.
        return [light.get_state(), light.get_time_remaining()]

    def get_continuous_state(self):
        """Build the flat observation list of length ``observation_space_size``.

        Light features come first, followed by the features of every vehicle
        whose current path uid is 1 or 4. Only the most recent
        ``observation_space_size`` values are kept; shorter observations are
        right-padded with 0.0.
        """
        inputs = []
        for light in self.simulation.model.get_lights():
            inputs += self.get_traffic_light_state(light)

        for vehicle in self.simulation.model.get_vehicles():
            route = self.simulation.model.get_route(vehicle.get_route_uid())
            if route.get_path_uid(vehicle.get_path_index()) in [1, 4]:
                inputs += self.get_vehicle_state(vehicle)

        # Match the length used by reset() and by the Q-network input instead
        # of a hard-coded 100, and pad with zeros: NaN padding would propagate
        # through the network and turn every Q-value and gradient into NaN.
        size = self.observation_space_size
        inputs = inputs[-size:]
        inputs += [0.0] * (size - len(inputs))
        return inputs

    def take_action(self, action_index):
        """Switch a traffic light according to ``action_index``.

        A light is only switched when it is currently in the opposite colour;
        otherwise the action is a no-op. Unknown indices are ignored.
        Always returns 0.
        """
        if action_index == 0:
            pass
        elif action_index == 1:
            light = self.simulation.model.lights[0]
            if light.colour == "green":
                light.set_red()
        elif action_index == 2:
            light = self.simulation.model.lights[1]
            if light.colour == "green":
                light.set_red()
        elif action_index == 3:
            light = self.simulation.model.lights[0]
            if light.colour == "red":
                light.set_green()
        elif action_index == 4:
            light = self.simulation.model.lights[1]
            if light.colour == "red":
                light.set_green()
        return 0

    def compute_simulation_metrics(self):
        """Accumulate per-vehicle wait time and record it for finished vehicles.

        A vehicle slower than 5 (simulation speed units) is counted as waiting
        for one tick. When a vehicle has reached the end of the last path of
        its route, its total wait time is appended to ``self.wait_time``, which
        is trimmed to the newest ``wait_time_vehicle_limit`` entries.
        """
        for vehicle in self.simulation.model.vehicles:
            if vehicle.get_speed() < 5:
                vehicle.add_wait_time(self.simulation.model.tick_time)

            route = self.simulation.model.get_route(vehicle.get_route_uid())
            path = self.simulation.model.get_path(route.get_path_uid(vehicle.get_path_index()))
            if vehicle.get_path_distance_travelled() >= path.get_length():
                if vehicle.get_path_index() + 1 == len(route.get_path_uids()):
                    self.wait_time.append(vehicle.get_wait_time())
                    self.wait_time = self.wait_time[-self.wait_time_vehicle_limit:]

    def render(self, mode="human"):
        """Rendering is not implemented; ``mode`` is accepted for Gym API compatibility."""
        pass

    def reset(self):
        """Rebuild the simulation, clear per-episode counters and return a zero state."""
        self.simulation = Simulation(self.junction_file_path, self.config_file_path)
        self.iteration = 0
        # Per-episode accumulators must not leak into the next episode.
        self.total_reward = 0
        self.wait_time = []
        self.state = np.zeros(self.observation_space_size, dtype='float32')
        return self.state
        

In [3]:
env = JunctionEnv()

In [4]:
# Sanity check: drive the environment with uniformly random actions.
num_episodes = 5
# Hard cap on steps per episode. The environment's own termination condition
# may never fire, so without this cap `while not done` can spin forever.
max_random_steps = 10000

for episode in range(1, num_episodes + 1):
    state = env.reset()
    done = False
    score = 0
    steps = 0

    while not done and steps < max_random_steps:
        steps += 1
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        score += reward
    print(f"Episode: {episode} Score: {score} Steps: {steps}")

TypeError: compute_single_iteration() takes 1 positional argument but 2 were given

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers.legacy import Adam

In [None]:
# JunctionEnv.__init__ takes no arguments and constructs its own Simulation,
# so passing a Simulation instance here raised a TypeError.
env = JunctionEnv()
states = env.observation_space.shape
actions = env.action_space.n

In [None]:
# def build_model(states, actions):
#     model = Sequential()
#     model.add(Dense(24, activation='relu', input_shape=states))
#     model.add(Dense(24, activation='relu', input_shape=states))
#     model.add(Flatten())
#     model.add(Dense(actions , activation='linear'))
#     return model

In [None]:
# del model

In [None]:
# model = build_model(states, actions)

In [None]:
# model.summary()

In [None]:
# from rl.agents import DQNAgent
# from rl.policy import BoltzmannQPolicy
# from rl.memory import SequentialMemory

In [None]:
# def build_agent(model, actions):
#     policy = BoltzmannQPolicy()
#     memory = SequentialMemory(limit=100000, window_length=1)
#     dqn = DQNAgent(model=model, memory=memory, policy=policy, nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
#     return dqn

In [None]:
# dqn = build_agent(model, actions)
# dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
# dqn.fit(env, nb_steps=5000, visualize=False, verbose=1)

In [None]:
# dqn.save_weights('dqn_weights.h5f', overwrite=True)

In [None]:
# del model
# del dqn
# del env

# env = JunctionEnv(Simulation("cross_road.junc", visualise=False))
# states = env.observation_space.shape
# actions = env.action_space.n

# model = build_model(states, actions)
# dqn = build_agent(model, actions)
# dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
# dqn.load_weights('dqn_weights.h5f')

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Configuration parameters for the whole setup
seed = 42
# Apply the seed to every RNG the training code draws from (Python, NumPy and
# TensorFlow); previously `seed` was defined but never used, so runs were not
# reproducible.
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

gamma = 0.9  # Discount factor applied to future rewards
epsilon = 1.0  # Epsilon greedy parameter (current exploration probability)
epsilon_min = 0.1  # Minimum epsilon greedy parameter
epsilon_max = 1.0  # Maximum epsilon greedy parameter
epsilon_interval = (
    epsilon_max - epsilon_min
)  # Total amount by which epsilon is annealed over the exploration frames
batch_size = 32  # Size of batch taken from replay buffer
max_steps_per_episode = 10000

In [None]:
num_actions = 5

# Earlier Sequential variant, kept for reference (not used):
# def create_q_model():
#     l2_norm = tf.keras.regularizers.L2(l2=0.0005)
#     model = tf.keras.Sequential()
#     model.add(tf.keras.layers.Dense(12, input_dim = 10, activation = 'relu')) # input layer requires input_dim param
# #     model.add(tf.keras.layers.Dense(24, activation = 'relu'))
#     model.add(tf.keras.layers.Dense(num_actions, activation='softmax'))
#     return model

def create_q_model(state_dim=10, n_actions=None, hidden_units=(24, 24, 24)):
    """Build a fully connected Q-network.

    Args:
        state_dim: Length of the flat observation vector fed to the network.
        n_actions: Number of output Q-values; defaults to the module-level
            ``num_actions`` when omitted.
        hidden_units: Width of each hidden ReLU layer, in order.

    Returns:
        An uncompiled Keras model mapping ``(batch, state_dim)`` inputs to
        ``(batch, n_actions)`` linear Q-value outputs.
    """
    if n_actions is None:
        n_actions = num_actions

    inputs = layers.Input(shape=(state_dim,))
    hidden = inputs
    for units in hidden_units:
        hidden = layers.Dense(units, activation="relu")(hidden)
    # Linear output: Q-values are unbounded regression targets, not probabilities.
    outputs = layers.Dense(n_actions, activation="linear")(hidden)
    return keras.Model(inputs=inputs, outputs=outputs)


# The online model predicts the Q-values that are used to choose an action.
model = create_q_model()
# The target model predicts future rewards when building training targets.
# Its weights are only refreshed every `update_target_network` frames, so the
# target Q-values stay stable while the loss is computed.
model_target = create_q_model()
# Start the target network as an exact copy of the online network; otherwise
# the first training targets come from an unrelated random initialisation
# until the first periodic sync.
model_target.set_weights(model.get_weights())

In [None]:
# The DeepMind paper uses RMSProp; Adam is used here because it improves
# training time. Gradients are clipped to a norm of 1.0.
optimizer = keras.optimizers.legacy.Adam(learning_rate=0.0005, clipnorm=1.0)

# Huber loss is used for stability
loss_function = keras.losses.Huber()

# Experience replay buffers: parallel lists indexed by transition
(
    action_history,
    state_history,
    state_next_history,
    rewards_history,
    done_history,
) = ([] for _ in range(5))

# Per-episode returns and progress counters
episode_reward_history = []
running_reward = episode_count = frame_count = 0

# Number of initial frames during which a random action is always taken
epsilon_random_frames = 10_000

# Number of frames over which epsilon is annealed
epsilon_greedy_frames = 50_000

# Maximum replay length
# Note: The DeepMind paper suggests 1000000, however this causes memory issues
max_memory_length = 100_000

# Train the model once every 20 actions
update_after_actions = 20

# How often (in frames) to copy the online weights into the target network
update_target_network = 10_000

# Main DQN training loop: one outer iteration per episode, one inner iteration
# per environment step (frame).
while True:  # Run until solved
    state = np.array(env.reset())
    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        # Calling env.render() here would visualise the agent's attempts,
        # provided the environment implements rendering.
        frame_count += 1

        # Use epsilon-greedy for exploration: always random for the first
        # `epsilon_random_frames` frames, afterwards random with probability epsilon
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            # Take random action, weighted 80% towards action 0 and 5% for each other action
            action = np.random.choice(num_actions, p=[0.8, 0.05, 0.05, 0.05, 0.05])
        else:
            # Predict action Q-values
            # From environment state (a batch dimension is added first)
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = model(state_tensor, training=False)
            # Take best action (highest predicted Q-value)
            action = tf.argmax(action_probs[0]).numpy()

        # Decay probability of taking random action, never below epsilon_min
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        # Apply the sampled action in our environment
        state_next, reward, done, _ = env.step(action)
        state_next = np.array(state_next)
        episode_reward += reward

        # Save actions and states in replay buffer
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)
        state = state_next

        # Update every `update_after_actions` frames, once the buffer holds
        # more than `batch_size` transitions
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:

            # Get indices of samples for replay buffers (sampled with replacement)
            indices = np.random.choice(range(len(done_history)), size=batch_size)

            # Using list comprehension to sample from replay buffer
            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            rewards_sample = [rewards_history[i] for i in indices]
            action_sample = [action_history[i] for i in indices]
            done_sample = tf.convert_to_tensor(
                [float(done_history[i]) for i in indices]
            )

            # Build the updated Q-values for the sampled future states
            # Use the target model for stability
            future_rewards = model_target.predict(state_next_sample, verbose=0)
            # Q value = reward + discount factor * expected future reward
            updated_q_values = rewards_sample + gamma * tf.reduce_max(
                future_rewards, axis=1
            )

            # For transitions that ended an episode, replace the target with -1
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            # Create a mask so we only calculate loss on the updated Q-values
            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                # Train the model on the states and updated Q-values
                q_values = model(state_sample)

                # Apply the masks to the Q-values to get the Q-value for action taken
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                # Calculate loss between new Q-value and old Q-value
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if frame_count % update_target_network == 0:
            # Update the target network with the online network's weights
            model_target.set_weights(model.get_weights())
            # Log details
            template = "running reward: {:.2f} at episode {}, frame count {}"
            print(template.format(running_reward, episode_count, frame_count))

        # Limit the state and reward history by dropping the oldest transition
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        if done:
            break

    # Update running reward (mean over at most the last 50 episodes) to check
    # the condition for solving
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 50:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1

    if running_reward > 1400:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

In [None]:
# Persist the trained online Q-network to the 'saved_model' path.
model.save(filepath='saved_model')

In [None]:
# model = keras.models.load_model('saved_model')
# env = Simulation("cross_road.junc", visualise=True, play=False, saved_model=model)