# Traffic Limit with Deep Reinforcement Learning (4 inputs, 12 outputs) — João Ferreira, demo v02

In [2]:
#---------Imports-------------

import tensorflow as tf

from tensorflow import reshape, shape
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers
from keras.layers import Dropout

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

from gym import Env
from gym.spaces import Discrete, Box

import numpy as np
import random
import math
import matplotlib.pyplot as plt

# --- Experiment configuration: every tunable value of the notebook lives here ---
# NOTE(review): Dropout_value is not referenced by any code visible in this notebook.
Dropout_value = 0.3
# Tag embedded in every weight-checkpoint and reward-log file name.
JSP = "Glorot"
Max_fit_rep = 20 # maximum number of fit iterations per training repetition
# Best mean test score seen so far; the training loop overwrites it.
Max_value = 0
# Number of independent training repetitions.
Num_trains = 5
# Number of Dense layers build_model adds in its loop, on top of the one it always adds.
Num_hidden_layers = 1
# Units in each hidden Dense layer.
Num_neurons = 336
# One optimizer instance, passed to every dqn.compile() call.
Optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-3)
Load_weights = 1 # set to 0 the first time the notebook is run, 1 on every later run
# Non-zero prints model.summary() for each model that is built.
Show_model = 0

In [7]:
class TrafLimitEnv(Env):
    """Gym environment in which an agent nudges four traffic-limit values.

    The state is a length-4 float32 array:
        index 0 - benign ICMP traffic
        index 1 - benign other traffic
        index 2 - malicious ICMP traffic
        index 3 - malicious other traffic

    The 12 discrete actions form four triples, one per state component. Inside
    a triple the first action lowers the component by 0.25, the second leaves
    it unchanged and the third raises it by 0.25.

    An episode always lasts 100 steps.
    """

    def __init__(self):
        """Declare the action/observation spaces and the per-episode step budget."""
        self.action_space = Discrete(12)
        low = np.array([-10, -10, -10, -10], dtype=np.float32)
        high = np.array([10, 10, 10, 10], dtype=np.float32)
        self.observation_space = Box(low, high, dtype=np.float32)
        self.state = None  # populated by reset()
        self.Traf_length = 100  # steps remaining in the current episode

    def step(self, action):
        """Apply one action and advance the episode by one step.

        Args:
            action: value in 0..11 selecting a state component and a direction.
                Values outside that range leave the state untouched.

        Returns:
            Tuple ``(state, reward, done, info)`` where ``state`` is the
            (mutated in place) state array, ``reward`` is 1 or -1, ``done``
            is True once the step budget is exhausted and ``info`` is an
            empty dict.
        """
        # Each component owns three consecutive actions centred on 1, 4, 7
        # and 10: centre-1 decreases the limit, centre is a no-op and
        # centre+1 increases it.
        for index, centre in enumerate((1, 4, 7, 10)):
            if centre - 1 <= action <= centre + 1:
                self.state[index] += 0.25 * (action - centre)
                break

        # Accepted band per component. The bounds are written as 27-30..33-30
        # and 17-20..23-20 (both evaluate to -3..3); presumably targets of
        # 30 +/- 3 and 20 +/- 3 shifted to a zero-centred scale - TODO confirm.
        bands = (
            (27 - 30, 33 - 30),  # benign ICMP traffic limit
            (27 - 30, 33 - 30),  # benign other traffic limit
            (17 - 20, 23 - 20),  # malicious ICMP traffic limit
            (17 - 20, 23 - 20),  # malicious other traffic limit
        )
        # NOTE(review): the reward is +1 as soon as ANY single component is
        # inside its band, and -1 only when none is. Confirm that requiring
        # all four components to be in band was not the intent.
        in_band = any(lo <= value <= hi for value, (lo, hi) in zip(self.state, bands))
        reward = 1 if in_band else -1

        # NOTE(review): the state is not clipped to the +/-10 bounds declared
        # in observation_space.
        self.Traf_length -= 1  # one step of the episode budget consumed
        done = self.Traf_length <= 0

        info = {}  # placeholder, nothing extra is reported
        return self.state, reward, done, info

    def render(self, mode='human'):
        """No-op renderer.

        Accepts ``mode`` so that callers using the gym convention
        ``env.render(mode='human')`` do not fail with a TypeError.
        """
        pass

    def reset(self):
        """Start a new episode and return the initial state.

        Each component is drawn uniformly from [-4, 4) and the step budget is
        restored to 100. The array is cast to float32 so that it matches the
        dtype declared by ``observation_space``.
        """
        self.state = self.np_random.uniform(-4, 4, size=(4,)).astype(np.float32)
        self.Traf_length = 100
        return self.state
    
def build_model(states, actions):
    """Assemble the fully connected Q-network.

    Args:
        states: number of values in one observation; the input shape is
            ``(1, states)`` and is flattened first.
        actions: number of output units (one linear output per action).

    Returns:
        A Keras ``Sequential`` model made of a Flatten layer, one ReLU Dense
        layer, ``Num_hidden_layers`` further ReLU Dense layers (all with
        ``Num_neurons`` units) and a linear Dense output layer.
    """
    # Layers are instantiated in the order they appear in the network.
    layer_stack = [Flatten(input_shape=(1, states))]
    layer_stack.append(Dense(Num_neurons, activation='relu'))
    layer_stack.extend(
        Dense(Num_neurons, activation='relu') for _ in range(Num_hidden_layers)
    )
    layer_stack.append(Dense(actions, activation='linear'))
    return Sequential(layer_stack)

def build_agent(model, actions):
    """Wrap a Q-network in a keras-rl DQN agent.

    Args:
        model: the Keras model used to estimate Q-values.
        actions: number of discrete actions the agent can choose from.

    Returns:
        A ``DQNAgent`` (not yet compiled) that uses a Boltzmann Q policy and
        a ``SequentialMemory`` limited to 50000 entries with window length 1.
    """
    replay_memory = SequentialMemory(limit=50000, window_length=1)
    exploration_policy = BoltzmannQPolicy()
    return DQNAgent(
        model=model,
        memory=replay_memory,
        policy=exploration_policy,
        nb_actions=actions,
        nb_steps_warmup=50,
        target_model_update=1e-2,
    )

In [8]:
# Run Num_trains training repetitions. Each repetition:
#   1. builds a fresh environment, model and DQN agent,
#   2. fits in chunks of 100 steps, checkpointing the weights whenever the
#      mean episode reward is at least the best value recorded so far,
#   3. rebuilds the agent, loads the best checkpoint and scores it over
#      10 test episodes.
for n in range(0, Num_trains):
    print("________________________________________________________________________________")
    print ("Repetição Score nº ", n)
    env = TrafLimitEnv()
    states = env.observation_space.shape[0]
    actions = env.action_space.n

    model = build_model(states, actions)

    if Show_model:
        model.summary()

    dqn = build_agent(model, actions)
    dqn.compile(optimizer=Optimizer, metrics=['mae'])

    cont = 1  # 1-based count of fit iterations in this repetition

    # Files shared by the steps below.
    max_w_path = "Reward_"+JSP+"_max_dqn_weights.txt"   # best fit reward so far
    rewards_path = "Rewards_"+JSP+".txt"                # log of every fit reward
    weights_path = JSP+"_dqn_weights"                   # best-weights checkpoint

    if Load_weights:
        # Resume: recover the best reward recorded by an earlier run.
        with open(max_w_path, 'r') as f:
            max_w = float(str(f.read()))
    else:
        # First run: create the file with a baseline of 0.
        with open(max_w_path, 'w') as f:
            f.write("0")
            max_w = 0

    # A blank line in the reward log separates repetitions.
    with open(rewards_path, 'a') as f:
        f.write('\n')

    with open(max_w_path, 'w') as f:
        f.write(str(np.round(max_w,2)))

    while True:
        if Load_weights:
            # Every iteration restarts from the checkpoint on disk, so the
            # result of an iteration that did not reach max_w is discarded.
            dqn.load_weights(weights_path)
        hist = dqn.fit(env, nb_steps=100, visualize=False, verbose=0)
        w = np.mean(hist.history['episode_reward'])
        print("Iteração: ", cont, " - Pontuação para 10 episódios = ", w)
        with open(rewards_path, 'a') as f:
            f.write('\n'+str(np.round(w,2)))
        if w >= max_w:
            # New best (or tie): checkpoint the weights and record the reward.
            max_w = w
            dqn.save_weights(weights_path, overwrite=True)
            with open(max_w_path, 'w') as f:
                f.write(str(np.round(w,2)))
        # Stop on a perfect score or once the iteration budget is used up.
        # The budget constant is spelled Max_fit_rep; the previous spelling
        # Max_Fit_rep raised NameError whenever w was below 100.
        if w >= 100 or cont >= Max_fit_rep:
            break
        cont +=1

    # Reward of the last fit iteration of this repetition.
    Reward_fit = np.mean(hist.history['episode_reward'])
    print("                                                                   Reward Fit "+JSP+"=", Reward_fit)  
    with open("Reward_Fit_"+JSP+".txt", 'a') as f:
        f.write('\n'+str(np.round(Reward_fit,2)))
    print("max_w=",max_w)  
    print(hist.history)   
    # Training is done: discard the trained objects, rebuild everything from
    # scratch and load the best checkpoint so the test measures those weights.
    del model
    del dqn
    del env
    env = TrafLimitEnv()
    states = env.observation_space.shape[0]
    actions = env.action_space.n
    model = build_model(states, actions)
    dqn = build_agent(model, actions)
    dqn.compile(optimizer=Optimizer, metrics=['mae'])
    dqn.load_weights(weights_path)  # path is given without the .h5 suffix
    scores = dqn.test(env, nb_episodes=10, visualize=False)
    scor = np.mean(scores.history['episode_reward'])
    print("                                                                   Reward Scores "+JSP+"=", scor)  
    with open("Reward_Scores_"+JSP+".txt", 'a') as f:
        f.write('\n'+str(np.round(scor,2)))
    if scor >= Max_value:
        # Best test score across all repetitions so far: keep a separate copy.
        Max_value = scor
        dqn.save_weights(JSP+"_dqn_weights_maxmax", overwrite=True)
        with open("Reward_"+JSP+"_score_dqn_weights_maxmax.txt", 'w') as f:
            f.write(str(np.round(scor,2))) 

________________________________________________________________________________
Repetição Score nº  0
Iteração:  1  - Pontuação para 10 episódios =  100.0
                                                                   Reward Fit Glorot= 100.0
max_w= 100.0
{'episode_reward': [100.0], 'nb_episode_steps': [100], 'nb_steps': [100]}
Testing for 10 episodes ...
Episode 1: reward: 100.000, steps: 100
Episode 2: reward: 100.000, steps: 100
Episode 3: reward: 100.000, steps: 100
Episode 4: reward: 100.000, steps: 100
Episode 5: reward: 100.000, steps: 100
Episode 6: reward: 100.000, steps: 100
Episode 7: reward: 100.000, steps: 100
Episode 8: reward: 100.000, steps: 100
Episode 9: reward: 100.000, steps: 100
Episode 10: reward: 100.000, steps: 100
                                                                   Reward Scores Glorot= 100.0
________________________________________________________________________________
Repetição Score nº  1
Iteração:  1  - Pontuação para 10 episódios =  1