<a href="https://colab.research.google.com/github/NachoDave/Automated-Vehicle-Anomalous-Driving-Behaviour/blob/main/CAMs_Autoencoder_project_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Autoencoder project

## What do we need to do?


1.   Get and format the data
2.   Split the data into training, test and validation sets - initially will just add the data one measure at a time, but may need to combine them somehow
3. Define the model
4. Crea



### Import Libraries and connect to drive

In [1]:
import os
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

from google.colab import drive

drive.mount('/content/drive')
dataset_dir = '/content/drive/MyDrive/datasets/car_hacking'

Mounted at /content/drive


### Load the data

In [14]:
# Load standard dataset
rpm_df = pd.read_csv(f"{dataset_dir}/RPM_dataset.csv", header=None)
rpm_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1478191000.0,0316,8,05,22,68,09,22,20,00,75,R
1,1478191000.0,018f,8,fe,3b,00,00,00,3c,00,00,R
2,1478191000.0,0260,8,19,22,22,30,ff,8f,6e,3f,R
3,1478191000.0,02a0,8,60,00,83,1d,96,02,bd,00,R
4,1478191000.0,0329,8,dc,b8,7e,14,11,20,00,14,R


### Format the data

In [15]:
def format_CAN_bus_data(df:pd.DataFrame) -> pd.DataFrame:
  # Rows with only two bytes have nans
  df.replace(to_replace = np.nan, value = '00', inplace=True) # replace the NaNs
  df[5].replace(to_replace = 'R', value = '00', inplace=True) # replace the R Values in the 5 column for the 2 byte channels

  # Get the bus Ids and convert to numbers then to one hot encoding
  bus_id = df[1].apply(int,base=16)
  bus_id_one_hot = pd.get_dummies(bus_id).astype(int)

  # Get the payload and convert to int
  payload = df.iloc[:,3:11].applymap(int,base=16)/255

  # Concat the bus_id and payload to give the final data frame
  return pd.concat([bus_id_one_hot, payload], axis=1)

rpm_df = format_CAN_bus_data(rpm_df)
rpm_df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[5].replace(to_replace = 'R', value = '00', inplace=True) # replace the R Values in the 5 column for the 2 byte channels
  payload = df.iloc[:,3:11].applymap(int,base=16)/255


Unnamed: 0,2,160,161,304,305,320,339,399,497,608,...,1520,1680,3,4,5,6,7,8,9,10
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0.019608,0.133333,0.407843,0.035294,0.133333,0.12549,0.0,0.458824
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0.996078,0.231373,0.0,0.0,0.0,0.235294,0.0,0.0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0.098039,0.133333,0.133333,0.188235,1.0,0.560784,0.431373,0.247059
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0.376471,0.0,0.513725,0.113725,0.588235,0.007843,0.741176,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0.862745,0.721569,0.494118,0.078431,0.066667,0.12549,0.0,0.078431


### Get data ready for loading

In [16]:
X = rpm_df.values.astype("float32")   # your dataframe of 34 features
X_train, X_temp = train_test_split(X, test_size=0.3, random_state=42)
X_val, X_test = train_test_split(X_temp, test_size=0.5, random_state=42)

In [5]:
X_train.shape

(3235191, 34)

### Create the autoencoder

In [17]:
input_dim = X_train.shape[1]   # should be 34
encoding_dim = 16              # latent size (tune this)

input_layer = layers.Input(shape=(input_dim,)) # this is the
encoder = layers.Dense(64, activation="relu")(input_layer)
encoder = layers.Dense(32, activation="relu")(encoder)
latent = layers.Dense(encoding_dim, activation="relu")(encoder)

decoder = layers.Dense(32, activation="relu")(latent)
decoder = layers.Dense(64, activation="relu")(decoder)
output_layer = layers.Dense(input_dim, activation="sigmoid")(decoder)

autoencoder = models.Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer="adam", loss="mse")
autoencoder.summary()

In [7]:
X_train[0:100]

array([[  1.,   0.,   0., ...,   8.,   5.,  57.],
       [  0.,   1.,   0., ...,  31.,   2.,   0.],
       [  0.,   0.,   0., ...,  36.,   0., 255.],
       ...,
       [  0.,   0.,   0., ...,  36.,   0., 255.],
       [  0.,   0.,   0., ...,  36.,   0., 255.],
       [  0.,   0.,   0., ...,   2., 189.,   0.]], dtype=float32)

### Train the model

In [18]:
callback = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
x = 200000
history = autoencoder.fit(
    X_train[0:x], X_train[0:x],
    epochs=20,
    batch_size=128,
    shuffle=True,
    validation_data=(X_val[0:x], X_val[0:x]),
    #callbacks=[callback]
)

Epoch 1/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - loss: 0.0392 - val_loss: 0.0070
Epoch 2/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - loss: 0.0069 - val_loss: 0.0069
Epoch 3/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 0.0069 - val_loss: 0.0069
Epoch 4/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - loss: 0.0069 - val_loss: 0.0069
Epoch 5/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - loss: 0.0065 - val_loss: 0.0062
Epoch 6/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - loss: 0.0062 - val_loss: 0.0062
Epoch 7/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - loss: 0.0054 - val_loss: 0.0049
Epoch 8/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - loss: 0.0048 - val_loss: 0.0049
Epoch 9/20
[1m1563/

In [22]:
X_val[0] - autoencoder.predict(X_val[0:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step


array([[ 3.75032425e-04, -6.31413005e-20, -4.76683172e-11,
        -1.19724684e-27, -6.23911205e-07, -4.13691259e-09,
        -3.49473231e-34, -4.95110442e-20, -1.33580137e-12,
        -1.14108538e-13, -6.05412553e-10, -1.80433723e-04,
        -9.48745138e-14, -3.88414171e-17, -1.96964329e-08,
        -1.57185452e-04, -1.34581400e-11, -6.54869876e-27,
        -3.89161712e-13, -1.37685705e-11, -1.38074280e-20,
        -1.37555051e-10, -3.72776893e-10, -6.27334434e-11,
        -4.27120284e-10, -2.96111855e-12, -3.69213708e-08,
        -3.99012864e-03, -2.41975727e-06, -2.33875835e-06,
        -8.11789709e-04,  2.33866740e-02,  4.70507294e-02,
         9.98006761e-03]], dtype=float32)