<a href="https://colab.research.google.com/github/GirolamoOddo/AppliedMath_Notebooks/blob/main/DeepRL_for_CruseControl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***DDPG Based Control for Speed Profile Tracking***
This notebook develops a controller based on DDPG (Deep Deterministic Policy Gradient) reinforcement learning that tracks an assigned speed profile for a very simply modelled racing car. To do so, the controller acts on the throttle and brake pedals.

Below are some reference articles and the Keras code upon which the development is based:

>  Deep Deterministic Path Following  
>  https://arxiv.org/abs/2104.06014

>  Path Following with Deep Reinforcement Learning for Autonomous Cars
>  https://www.scitepress.org/PublishedPapers/2021/107154/

>  Path following for Autonomous Ground Vehicle Using DDPG Algorithm: A Reinforcement Learning Approach  
>  https://www.mdpi.com/2076-3417/13/11/6847

>  Keras Implementation  
>  https://keras.io/examples/rl/ddpg_pendulum/







# 0 - Initialization

In [None]:
# @title Clear Variables

# Wipe the interactive namespace through IPython rather than emptying
# globals() by hand: clearing that dict directly removes every name in it,
# including interpreter bookkeeping entries such as __name__.
%reset -f
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel.
%pip install -q simple-pid




In [None]:
# @title Run-Time Info
import platform
import sys

import cpuinfo
import psutil

# CPU information. 'brand_raw' is read with a fallback so that a missing key
# does not abort the cell.
cpu_info = cpuinfo.get_cpu_info()
cpu_model = cpu_info.get('brand_raw', platform.processor() or 'unknown')
cpu_architecture = platform.architecture()[0]

# RAM information
ram_info = psutil.virtual_memory()
ram_total = ram_info.total

# Name of the execution environment. The label used to be hard-coded to
# "Google Colab"; it is now only reported when the google.colab package is
# actually loaded in this kernel.
if "google.colab" in sys.modules:
    runtime_info = "Google Colab"
else:
    runtime_info = "Local / other Jupyter runtime"

print("CPU Model:", cpu_model)
print("CPU Architecture:", cpu_architecture)
print("Total RAM:", round((ram_total / (1024 ** 3)), 2), "GB")
print("Runtime info:", runtime_info)

CPU Model: Intel(R) Xeon(R) CPU @ 2.20GHz
CPU Architecture: 64bit
Total RAM: 12.67 GB
Runtime info: Google Colab


# 3 - Define Sim. Environment

In [None]:
import random as random
import numpy as np
from matplotlib import pyplot as plt

class SimEnv:
    """Minimal longitudinal car model used as the reinforcement-learning environment.

    The car is reduced to a single state, its longitudinal speed, driven by one
    combined throttle/brake command. The speed to track is `target_speed` plus
    an integer offset redrawn by `reset()`, plus fresh uniform noise in
    [-1.5, 1.5] m/s at every step.

    Args:
        target_speed: Nominal speed to track, in m/s.
        verbose: When True (the default, matching the previous behaviour) a
            log block is printed at every step.
    """

    TIME_STEP = 1e-1      # simulated seconds advanced by each step()
    EPISODE_END = 10.0    # simulated time at which step() reports done
    GRAVITY = 9.807       # m/s^2, converts the acceleration from g to m/s^2

    def __init__(self, target_speed=20.0, verbose=True):
        self.target_speed = target_speed  # m/s
        self.verbose = verbose
        self.action_list = [0.0]
        self.current_time = 0.0
        self.longitudinalVelocity = 0.0
        # Also defined here so that step() works when the caller has not
        # called reset() yet (it used to raise AttributeError).
        self.random_target_drift = 0
        self.target_speed_follow = float(target_speed)

    def reset(self):
        """Restart the episode and draw a new integer target-speed offset."""
        self.action_list = [0.0]
        self.longitudinalVelocity = 0.0
        self.current_time = 0.0

        half_span = int(self.target_speed / 2)
        # np.random.randint raises when low >= high, which happens for target
        # speeds below 2 m/s; use no drift in that case.
        if half_span > 0:
            self.random_target_drift = np.random.randint(-half_span, half_span)
        else:
            self.random_target_drift = 0
        self.target_speed_follow = self.target_speed + self.random_target_drift

    def step(self, ThrottleBrake_action):
        """Advance the simulation by one time step.

        Args:
            ThrottleBrake_action: Combined pedal command, converted with
                `float()`. The training code feeds values in [-1, 1].

        Returns:
            A tuple `([speed, target_speed_follow], reward, done)` with both
            speeds in m/s.
        """
        action = float(ThrottleBrake_action)
        self.action_list.append(action)

        self.current_time += self.TIME_STEP

        # Quadratic pedal-to-acceleration map, expressed in g and limited to
        # [-2, 1], then converted to m/s^2.
        G_acceleration = np.clip(-0.5 * action**2 + 1.5 * action, -2, 1)  # [-]
        acceleration = G_acceleration * self.GRAVITY

        # Explicit Euler integration of the speed, kept inside [0, 80] m/s.
        self.longitudinalVelocity += self.TIME_STEP * acceleration
        self.longitudinalVelocity = np.clip(self.longitudinalVelocity, 0, 80.0)

        # The time is accumulated in 0.1 increments and therefore never hits
        # 10 exactly; the previous `== 10` test could not become True. Compare
        # with a small tolerance instead.
        done = self.current_time >= self.EPISODE_END - 1e-9

        self.target_speed_follow = (self.target_speed + self.random_target_drift) + np.random.uniform(-1.5, 1.5)

        # Reward: inside a 2 m/s band it grows as the error shrinks (at most
        # 1 / 0.1 = 10); outside the band it is the negative absolute error.
        speed_error = abs(self.longitudinalVelocity - self.target_speed_follow)
        if speed_error > 2:
            reward_step = -speed_error
        else:
            reward_step = 1 / (speed_error + 0.1)

        # Late in a long run, a very large error is heavily penalised and
        # terminates the episode.
        if self.current_time > 60 and speed_error >= 30:
            reward_step -= 1000.0
            done = True

        if self.verbose:
            print('Current_time_s:     ', round(self.current_time, 3))
            print('CalledAccel_in_ms2: ', round(acceleration, 5))
            print('CurrentSpeed_in_kph:', round(self.longitudinalVelocity*3.6, 3))
            print('TargetSpeed_in_kph: ', round(self.target_speed_follow*3.6, 3))
            print('Action_%:           ', round(action*100,  3))
            print('Reward:             ', round(reward_step, 3))
            if done:
                print('======== DONE: ', done, ' ========')
            print()

        return [self.longitudinalVelocity,  self.target_speed_follow], reward_step, done

    def close(self):
        """No resources to release; returns 0."""
        return 0


  and should_run_async(code)


# 4 - Model Section

In [None]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt

# Fix the NumPy and TensorFlow global seeds to the same constant.
np.random.seed(89)
tf.random.set_seed(89)

In [None]:

# Environment instance plus the problem dimensions that the networks and the
# replay buffer are built from.
env = SimEnv()

num_states = 2    # observation: (current_speed, target_speed)
num_actions = 1   # single combined throttle/brake command
upper_bound = 1   # largest admissible action
lower_bound = -1  # smallest admissible action

print("Size of State Space ->  {}".format(num_states))
print("Size of Action Space ->  {}".format(num_actions))
print("Max Value of Action ->  {}".format(upper_bound))
print("Min Value of Action ->  {}".format(lower_bound))

Size of State Space ->  2
Size of Action Space ->  1
Max Value of Action ->  1
Min Value of Action ->  -1


In [None]:
class OUActionNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck process).

    Each call advances the process by one step of size `dt` and returns the
    new value, so consecutive samples depend on each other.

    Args:
        mean: Array the process reverts to; its shape sets the noise shape.
        std_deviation: Scale of the random increment.
        theta: Strength of the pull towards `mean`.
        dt: Step size of the discretised process.
        x_initial: Optional starting value; zeros shaped like `mean` if None.
    """

    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta, self.mean, self.std_dev = theta, mean, std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        # Discretised update, see
        # https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.
        pull_to_mean = self.theta * (self.mean - self.x_prev) * self.dt
        random_kick = self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        # Remember the new value: the next sample starts from it.
        self.x_prev = self.x_prev + pull_to_mean + random_kick
        return self.x_prev

    def reset(self):
        """Put the process back at `x_initial`, or at zero if none was given."""
        self.x_prev = np.zeros_like(self.mean) if self.x_initial is None else self.x_initial

In [None]:
class Buffer:
    """Fixed-size replay buffer that also performs the DDPG actor/critic update.

    Transitions are kept in pre-allocated NumPy arrays, one per tuple element,
    and overwritten in ring order once `buffer_capacity` records were written.

    The class relies on module-level names defined elsewhere in the notebook:
    `num_states`, `num_actions`, `gamma`, `actor_model`, `critic_model`,
    `target_actor`, `target_critic`, `actor_optimizer`, `critic_optimizer`.

    Args:
        buffer_capacity: Maximum number of transitions kept.
        batch_size: Number of transitions sampled per learning step.
    """

    def __init__(self, buffer_capacity=100000, batch_size=64):
        # Number of "experiences" to store at max
        self.buffer_capacity = buffer_capacity
        # Num of tuples to train on.
        self.batch_size = batch_size

        # Number of times record() was called.
        self.buffer_counter = 0

        # Instead of a list of tuples, one array per tuple element is used.
        self.state_buffer = np.zeros((self.buffer_capacity, num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states))

    def record(self, obs_tuple):
        """Store one `(state, action, reward, next_state)` transition."""
        # Wrap around once buffer_capacity is exceeded, replacing old records.
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.buffer_counter += 1

    # This method runs eagerly as written.
    # NOTE(review): the Keras DDPG example this notebook is based on decorates
    # it with `@tf.function` for speed; confirm whether the decorator was
    # dropped on purpose before adding it back.
    def update(
        self, state_batch, action_batch, reward_batch, next_state_batch,
    ):
        """Run one gradient step on the critic, then one on the actor."""
        # Critic: regress Q(s, a) onto r + gamma * Q'(s', mu'(s')).
        # NOTE(review): no terminal flag is stored, so the target bootstraps
        # from the next state on every transition, including the last one of
        # an episode.
        with tf.GradientTape() as tape:
            target_actions = target_actor(next_state_batch, training=True)
            y = reward_batch + gamma * target_critic(
                [next_state_batch, target_actions], training=True
            )
            critic_value = critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )

        with tf.GradientTape() as tape:
            actions = actor_model(state_batch, training=True)
            critic_value = critic_model([state_batch, actions], training=True)
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )

    def learn(self):
        """Sample a random batch (with replacement) and call `update` on it.

        Does nothing while the buffer is still empty.
        """
        # Only the filled part of the arrays may be sampled.
        record_range = min(self.buffer_counter, self.buffer_capacity)
        if record_range == 0:
            # np.random.choice(0, n) raises; there is nothing to learn from yet.
            return
        # Randomly sample indices
        batch_indices = np.random.choice(record_range, self.batch_size)

        # Convert to tensors
        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])

        self.update(state_batch, action_batch, reward_batch, next_state_batch)


# Soft update of the target networks: every call moves each target variable
# a fraction `tau` (much less than one) of the way towards its online
# counterpart.
def update_target(target_weights, weights, tau):
    """Blend `weights` into `target_weights` in place.

    Args:
        target_weights: Variables of the target network; each is overwritten
            through its `assign` method.
        weights: Matching variables of the online network.
        tau: Blend factor applied to the online variables.
    """
    retain = 1 - tau
    for target_var, online_var in zip(target_weights, weights):
        blended = online_var * tau + target_var * retain
        target_var.assign(blended)

In [None]:
def get_actor():
    """Build the actor network that maps a state to a bounded action.

    Reads the module-level `num_states`, `num_actions` and `upper_bound`.

    Returns:
        A `tf.keras.Model` with two hidden ReLU layers of 256 units and a tanh
        output scaled by `upper_bound`.
    """
    # The output-layer kernel is drawn uniformly from [0.01, 0.9].
    # NOTE(review): with non-negative ReLU features, strictly positive weights
    # and the default zero bias, a freshly built actor can only output
    # non-negative actions. The DDPG reference implementation uses a small
    # symmetric range (+/-3e-3); confirm that this range is intended.
    last_init = tf.random_uniform_initializer(minval=0.01, maxval=0.9)

    inputs = layers.Input(shape=(num_states,))
    out = layers.Dense(256, activation="relu")(inputs)
    out = layers.Dense(256, activation="relu")(out)
    # One tanh unit per action dimension (was hard-coded to 1), consistent
    # with the action input of the critic.
    outputs = layers.Dense(num_actions, activation="tanh", kernel_initializer=last_init)(out)

    # tanh yields values in [-1, 1]; scale them to the action bound.
    outputs = outputs * upper_bound
    model = tf.keras.Model(inputs, outputs)
    return model


def get_critic():
    """Build the critic network estimating Q(state, action).

    Reads the module-level `num_states` and `num_actions`.

    Returns:
        A `tf.keras.Model` taking `[state, action]` and producing one value
        per sample.
    """
    # State branch. The shape is passed as a tuple: `(num_states)` without a
    # trailing comma is a plain int, which is not a valid shape.
    state_input = layers.Input(shape=(num_states,))
    state_out1 = layers.Dense(16, activation="relu")(state_input)
    state_out = layers.Dense(32, activation="tanh")(state_out1)

    # Action branch
    action_input = layers.Input(shape=(num_actions,))
    action_out = layers.Dense(32, activation="tanh")(action_input)

    # Both branches go through their own layers before being concatenated.
    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(256, activation="relu")(concat)
    out = layers.Dense(256, activation="tanh")(out)
    outputs = layers.Dense(1)(out)

    # Outputs a single value for the given state-action pair.
    model = tf.keras.Model([state_input, action_input], outputs)

    return model

In [None]:
def policy(state, noise_object):
    """Return the actor's action for `state` with exploration noise added.

    Args:
        state: Input passed directly to the module-level `actor_model`.
        noise_object: Callable returning the noise to add to the action.

    Returns:
        A one-element list holding the noisy action clipped to
        `[lower_bound, upper_bound]`.
    """
    deterministic_action = tf.squeeze(actor_model(state)).numpy()
    # Perturb the actor output for exploration.
    exploratory_action = deterministic_action + noise_object()
    # Keep the perturbed action inside the admissible range.
    bounded_action = np.clip(exploratory_action, lower_bound, upper_bound)
    return [np.squeeze(bounded_action)]

In [None]:
# ---- Hyper-parameters ------------------------------------------------------
std_dev = 0.3      # scale of the exploration noise
critic_lr = 0.002  # learning rate of the critic
actor_lr = 0.001   # learning rate of the actor
gamma = 0.99       # discount factor for future rewards
tau = 0.005        # blend factor used when updating the target networks

# ---- Exploration noise -----------------------------------------------------
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))

# ---- Networks --------------------------------------------------------------
# The four models are created in the same order as before (online actor,
# online critic, target actor, target critic).
actor_model = get_actor()
critic_model = get_critic()
target_actor = get_actor()
target_critic = get_critic()

# The target networks start as exact copies of the online ones.
target_critic.set_weights(critic_model.get_weights())
target_actor.set_weights(actor_model.get_weights())

# ---- Optimizers and replay buffer ------------------------------------------
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)

buffer = Buffer(50000, 64)

In [None]:

from simple_pid import PID

target_speed_ms = 15.0
env = SimEnv(target_speed = target_speed_ms)
total_episodes = 500
# Speeds are divided by this value before being shown to the networks.
normalization_scalar = 80.0
# Simulated seconds per env.step(); must match the 0.1 s increment applied
# inside SimEnv.step(). Passed to the PID so it integrates in simulated time
# rather than wall-clock time.
sim_time_step = 1e-1

# Gains of the PID "teacher"; it acts on the normalised speeds.
K_P = 10.0
K_I = 5.00
K_D = 0.10

# To store reward history of each episode
ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []
# Total number of environment steps taken over all episodes ("Age" in the log).
sp = 0

for ep in range(total_episodes):

    env.reset()
    prev_state = [x / normalization_scalar for x in [0.0, target_speed_ms]]

    episodic_reward = 0.0

    # Every fifth episode is driven by a PID controller instead of the actor,
    # which seeds the replay buffer with sensible transitions.
    use_pid = ep % 5 == 0
    if use_pid:
        # One controller per episode. It used to be rebuilt at every step,
        # which discarded its integral and derivative memory, so K_I and K_D
        # had no effect.
        pid = PID(K_P, K_I, K_D, setpoint=prev_state[1],
                  output_limits=(lower_bound, upper_bound))

    while env.current_time < 120:
        sp += 1
        # dtype is fixed explicitly: after the first step the list holds
        # NumPy float64 values, which would otherwise yield a float64 tensor.
        tf_prev_state = tf.expand_dims(
            tf.convert_to_tensor(prev_state, dtype=tf.float32), 0
        )

        print()
        print('Episode:            ', ep)
        print('Age:                ', sp)
        action = policy(tf_prev_state, ou_noise)

        if use_pid:
            print('PID:                ', use_pid)
            # The target is noisy, so the setpoint is refreshed at every step.
            pid.setpoint = prev_state[1]
            pid_output = pid(prev_state[0], dt=sim_time_step)
            action = np.clip(np.array([pid_output]), a_min=lower_bound, a_max=upper_bound)

        # Receive state and reward from environment.
        state, reward, done = env.step(action[0])

        # Store the next state on the same normalised scale as the previous
        # one. The raw state used to be recorded here, so the critic saw two
        # different scales for the same quantity.
        next_state = [x / normalization_scalar for x in state]

        buffer.record((prev_state, action, reward, next_state))
        episodic_reward += reward

        buffer.learn()
        update_target(target_actor.variables, actor_model.variables, tau)
        update_target(target_critic.variables, critic_model.variables, tau)

        # End this episode when `done` is True
        if done:
            break

        prev_state = next_state


    ep_reward_list.append(episodic_reward)

    # Mean of last 40 episodes
    avg_reward = np.mean(ep_reward_list[-40:])
    print()
    print("Episode: {}, Avg Reward is ==> {}".format(ep, avg_reward))
    print()
    avg_reward_list.append(avg_reward)

# Plotting graph
# Episodes versus Avg. Rewards
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Epsiodic Reward")
plt.show()

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Reward:              -11.436


Episode:             17
Age:                 18945
Current_time_s:      33.1
CalledAccel_in_ms2:  -12.75268
CurrentSpeed_in_kph: 0.0
TargetSpeed_in_kph:  35.4
Action_%:            -70.244
Reward:              -9.833


Episode:             17
Age:                 18946
Current_time_s:      33.2
CalledAccel_in_ms2:  -12.69942
CurrentSpeed_in_kph: 0.0
TargetSpeed_in_kph:  35.199
Action_%:            -69.997
Reward:              -9.777


Episode:             17
Age:                 18947
Current_time_s:      33.3
CalledAccel_in_ms2:  -13.20121
CurrentSpeed_in_kph: 0.0
TargetSpeed_in_kph:  35.838
Action_%:            -72.311
Reward:              -9.955


Episode:             17
Age:                 18948
Current_time_s:      33.4
CalledAccel_in_ms2:  -14.71689
CurrentSpeed_in_kph: 0.0
TargetSpeed_in_kph:  34.806
Action_%:            -79.157
Reward:              -9.668


Episode:             17
Age

KeyboardInterrupt: 

In [None]:
# Persist the weights of the four networks, online models first and then
# their target copies.
# NOTE(review): recent Keras releases appear to require weight files to end
# in `.weights.h5`; confirm against the installed version before renaming,
# since other code may load these exact file names.
for weights_path, network in (
    ("pendulum_actor.h5", actor_model),
    ("pendulum_critic.h5", critic_model),
    ("pendulum_target_actor.h5", target_actor),
    ("pendulum_target_critic.h5", target_critic),
):
    network.save_weights(weights_path)