<a href="https://colab.research.google.com/github/JZ76/Training-Overtaking-Algorithm/blob/main/Colab_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Note: Colab Pro is recommended because it gives you more memory (about 35 GB) and a longer maximum runtime.
# Training needs a lot of memory, depending on the size of the dataset, rather than a very powerful GPU.
# You will find that training is quite slow, because SimpleRNN cannot use CUDA cores for acceleration.
# The free Colab tier is enough for the Australia dataset, but not for larger datasets:
# one problem is the memory limit, the other is the runtime limit.

# This notebook should also work on a Windows machine after changing the folder paths and installing the correct library versions.
# It needs A LOT of memory; 32 GB is recommended.

In [None]:
Copyright 2022 Jiancheng Zhang

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [None]:
# Please use numpy version 1.18.5; other versions will cause an error.
# After installing it you must RESTART the runtime (do not DELETE the runtime).

In [None]:
pip install -U numpy==1.18.5

In [None]:
import tensorflow
print(tensorflow.__version__)

In [None]:
import numpy as np
import pandas as pd

from tensorflow.python.keras.models import Model, load_model
from tensorflow.python.keras.layers import LSTM, Dense, Input, CuDNNLSTM, Masking, Embedding, SimpleRNN, concatenate

import os
import re
import gc

In [None]:
def loading_data(folder, max_speed=16.0, max_steering_angle=0.192):
    """Load every ``ML*`` CSV file in *folder* as a normalized float array.

    Files are visited in name-sorted order. Operating systems list
    directories in different orders, and the caller relies on a stable,
    explicit ordering to pair these files with the other recordings.

    Each file is read without its header row, because the header names only
    three fields (Speed, Steering_angle, LiDAR_scan) while the data rows hold
    many more columns. The last column is dropped because it contains no
    values. Column 0 (speed) and column 1 (steering angle) are divided by
    their maximum magnitude so they land in [-1, 1]; remember to multiply the
    model output by the same factors when using the model in the simulator.

    Args:
        folder: Directory that contains the CSV files.
        max_speed: Divisor applied to the speed column (column 0).
        max_steering_angle: Divisor applied to the steering-angle column
            (column 1).

    Returns:
        A list with one 2-D float ``numpy`` array per matching file, in
        name-sorted order. Arrays may have different numbers of rows.
    """
    # only regular files whose name starts with 'ML' are loaded
    check_name = re.compile('^ML')

    datasets = []

    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder, filename)

        if not (check_name.match(filename) and os.path.isfile(path)):
            continue

        # skip the (too short) header row and drop the empty trailing column
        frame = pd.read_csv(path, skiprows=1, header=None).iloc[:, :-1]

        # np.array() makes a fresh float copy, so the in-place divisions
        # below never touch pandas-owned memory
        data = np.array(frame, dtype=float)

        # vectorized normalization of speed and steering angle
        data[:, 0] /= max_speed
        data[:, 1] /= max_steering_angle

        datasets.append(data)

    # the intermediate DataFrames are unreferenced by now; reclaim them
    # eagerly because this notebook is usually memory-bound
    gc.collect()

    return datasets

In [None]:
def loading_speed_steering_data(folder):
    """Load two columns from every ``car_state_blue*`` CSV file in *folder*.

    Files are visited in name-sorted order. From each file the columns at
    positions 3 and 5 are kept; these are the Velocity_X and Steering_angle
    columns of the car_state_blue recordings.

    Returns:
        A list with one float array of shape (rows, 2) per matching file,
        in name-sorted order.
    """
    # file's name must start with 'car_state_blue'
    pattern = re.compile('^car_state_blue')

    collected = []

    for name in sorted(os.listdir(folder)):
        if not re.match(pattern, name):
            continue

        path = os.path.join(folder, name)
        if not os.path.isfile(path):
            continue

        frame = pd.read_csv(path)
        wanted = frame.iloc[:, [3, 5]]
        collected.append(np.array(wanted, dtype=float))

    # release the temporary DataFrames before returning
    gc.collect()
    return collected

In [None]:
# The dataset is public; the link can be found on GitHub: https://github.com/JZ76/Training-Overtaking-Algorithm
# Here, Google Drive is used to store the data.
# After you connect to a runtime, mount your drive from the Files panel on the left-hand side (there is a Mount Drive button at the top).
# The drive is then available under /content/drive/MyDrive/
# Replace the last component of the path below with the dataset folder you want to use,
# OR, if you are using a personal computer, feel free to replace the whole path.
folder = "/content/drive/MyDrive/Australia_dataset"

# At this point, datasets holds part of the input (LiDAR) together with all of the output (speed, steering angle),
# while speed_steering holds the remaining part of the input (Velocity_X, Steering_angle).
datasets = loading_data(folder)
speed_steering = loading_speed_steering_data(folder)

In [None]:
# To train on only part of the data, slice both lists here, e.g. datasets[0:100] and speed_steering[0:100].
# Use the same slice for both so that the two lists still line up file by file.
new_datasets = datasets
new_speed_steering = speed_steering

In [None]:
# The first two columns (speed and steering angle) are the outputs, i.e. the y values;
# the remaining columns are LiDAR data and form part of the input, i.e. the X values.
# Comprehensions are used so that no loop variable is left behind in the notebook's
# global namespace, where it would keep the last array alive and share its name
# with the `x` tensor used when building the model.
X = [sample[:, 2:] for sample in new_datasets]
y = [sample[:, :2] for sample in new_datasets]

In [None]:
"""
As you can see, different csv files have different sizes, i.e. different numbers of rows.
However, the input matrix must have the same shape for every instance (each csv file), for example 5000 * 1083.
So we need to pad the instances that have fewer rows than the largest instance.
For example, if the largest instance is 5000 * 1083 and the remaining instances are x * 1083, where 0 < x < 5000,
then we append padding rows to those instances so that all of them have shape 5000 * 1083.
The padding value must not occur anywhere in the real data, so let's list the data ranges in new_datasets:
  speed: [-1, 1]
  steering angle: [-1, 1]
  LiDAR: [0, 10]
new_speed_steering:
  Velocity_X: [-16, 16]
  Steering_angle: [-0.192, 0.192]
Any value outside these ranges is fine, such as -100.0.
All of this can be done with tensorflow.keras.preprocessing.sequence.pad_sequences().
"""

In [None]:
# Padding value; it is also passed to the Masking layers as mask_value,
# so it must lie outside the range of every real data value.
special_value = -100.0

In [None]:
# Sidenote: this is where out-of-memory failures happen most often.
# pad_sequences defaults to dtype="int32", which would silently truncate the
# float LiDAR readings to integers, so a float dtype is requested explicitly.
Xpad_A = tensorflow.keras.preprocessing.sequence.pad_sequences(
    X, padding="post", value=special_value, dtype="float32"
)

In [None]:
# pad_sequences defaults to dtype="int32"; the targets are normalized into
# [-1, 1], so an integer dtype would truncate almost all of them to 0.
# Request a float dtype explicitly.
ypad = tensorflow.keras.preprocessing.sequence.pad_sequences(
    y, padding="post", value=special_value, dtype="float32"
)

In [None]:
# Free some memory: rebind every name that still refers to the unpadded data
# to an empty list, then ask the garbage collector to reclaim it.
datasets = []
new_datasets = []
X = []
y = []
gc.collect()

In [None]:
# pad_sequences defaults to dtype="int32"; the steering angles lie in
# [-0.192, 0.192] and would all be truncated to 0, so request a float dtype.
Xpad_B = tensorflow.keras.preprocessing.sequence.pad_sequences(
    new_speed_steering, padding="post", value=special_value, dtype="float32"
)

In [None]:
# Free some memory: the unpadded speed/steering arrays are no longer needed once Xpad_B exists.
speed_steering = []
new_speed_steering = []
gc.collect()

In [None]:
# Start with an Input layer. None means the length (number of rows) of each
# instance is variable, while the number of columns is fixed at 1081.
inputA = Input(shape=(None, 1081))

# The Masking layer tells the following layers to ignore the time steps that
# are filled with special_value. Padding and masking are usually used together:
# the padded values carry no information, so they must not affect the result.
# No input_shape is passed to the layers below; in the functional API the
# shape comes from the tensor each layer is called on.
A = Masking(mask_value=special_value)(inputA)

# This Dense layer plays a role similar to an Embedding layer.
x = Dense(500, activation="relu")(A)

# The SimpleRNN receives the 500-wide Dense output, not the raw 1081 columns.
x = SimpleRNN(150, return_sequences=True)(x)


# Second Input layer: two columns per time step.
inputB = Input(shape=(None, 2))

# The second input needs its own Masking layer.
B = Masking(mask_value=special_value)(inputB)

# The output of the SimpleRNN is concatenated with the data in Xpad_B, which
# comes from the car_state_blue csv files.
# For the rows of the two sources to match, the files must already be matched
# when the dataset is read; it is impossible to sort them afterwards once they
# have been loaded as DataFrames.
# This is the reason why every csv file must be read in name-sorted order.
# When the dataset was created, the current time was used as part of each file
# name, so sorting by name is unambiguous: a newer csv always comes after an older one.
# Although the exact creation times of an ML file and its car_state_blue file
# probably differ, their positions in the sorted file lists are the same.
combined = concatenate([x, B], axis=2)

z = Dense(256, activation="relu")(combined)
z = Dense(128, activation="relu")(z)
z = Dense(32, activation="tanh")(z)
# Two tanh outputs per time step, matching the two target columns.
z = Dense(2, activation="tanh")(z)

# build the model
RNN_model = Model(inputs=[inputA, inputB], outputs=z)

In [None]:
# OR, instead of building a new model, load an existing one.
# Running this cell replaces any RNN_model that was built before it.
# Change the path or model name as needed.
RNN_model = load_model("/content/drive/MyDrive/models/model_RNN_x")

In [None]:
# Mean squared error is used both as the training loss and as the reported metric.
RNN_model.compile(
    optimizer="RMSprop",
    loss="mean_squared_error",
    metrics=['mean_squared_error'],
)

RNN_model.summary()

# shape of Xpad_A: [number of csv files, None, 1081]
# shape of Xpad_B: [number of csv files, None, 2]
# shape of ypad:   [number of csv files, None, 2]
# epochs is the number of passes over all csv files; the right value depends on the dataset.
# A freshly created model probably needs a larger number of epochs,
# whereas a small number is enough when continuing from an existing model.
# Again, be aware of the runtime limit in Colab.
# batch_size is the number of instances processed before the parameters are updated;
# here, one instance is one csv file.
RNN_model.fit(
    x=[Xpad_A, Xpad_B],
    y=ypad,
    batch_size=5,
    epochs=10,
)

In [None]:
# Save the trained model; change the path or model name as needed.
RNN_model.save("/content/drive/MyDrive/models/new_model_RNN_x")

In [None]:
"""
You may ask: where is the model testing code? How do we evaluate the model?
Well, since this is for autonomous racing, higher accuracy doesn't necessarily mean better behaviour in racing.
Our dataset is also quite small, to be honest, so I decided to use all of the data for training
and to evaluate the model by putting it into the simulator, rather than splitting the data into training and testing sets.
"""