# Data Cleaning

In this notebook I will import and inspect the image and telemetry data taken from Donkey Sim, and organize it for modeling steps.

* Import telemetry .csv
* Import image data
* Convert image data
* Create dataframe with: 
  * steering inputs, 
  * throttle inputs
  * converted images
* Save as binary NumPy `.npy` files for modeling

In [7]:
## Imports
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt

# https://stackoverflow.com/questions/57318892/convert-base64-encoded-image-to-a-numpy-array
# from PIL import Image
# import PIL
# import base64
# import io

# import tensorflow as tf
from tensorflow.keras.utils import image_dataset_from_directory, img_to_array, load_img


## Telemetry Data
### TODO: ### 
* code up a way to get most recent?
* **change this when it's time to submit**

In [8]:
## Constants
## Folder holding one recorded Donkey Sim session (data.csv plus images/).
## Set the DONKEY_SESSION_DIR environment variable to point at a different
## recording; when it is unset the original session path is used unchanged.
session_dir = os.environ.get(
    'DONKEY_SESSION_DIR',
    '/home/grant/projects/donkeysim-client/data/11_06_2021/16_31_26',
)
telemetry_filename = os.path.join(session_dir, 'data.csv')
## The empty last component keeps the trailing separator the original value had.
image_folder = os.path.join(session_dir, 'images', '')

### Load CSV file as DataFrame

In [9]:
## Read the telemetry CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer=telemetry_filename)
df.head(n=5)

Unnamed: 0,steering_angle,throttle,speed,image,hit,time,accel_x,accel_y,accel_z,gyro_x,...,totalNodes,pos_x,pos_y,pos_z,vel_x,vel_y,vel_z,on_road,progress_on_shortest_path,lap
0,0.0,0.060606,0.014416,2_63297.png,none,2.63297,-0.008254,0.113567,-0.027561,-0.000144,...,307,14.03953,0.563753,-68.18574,0.00098,-0.01051,0.009818,0,0,0
1,0.0,0.39785,0.005633,2_682824.png,none,2.682824,-0.008932,0.125598,-1.216979,-0.000223,...,307,14.03894,0.563582,-68.18635,0.000712,-0.00554,0.000735,0,0,0
2,0.0,1.0,0.12134,2_732961.png,none,2.732961,-0.008724,0.13386,2.346596,0.000915,...,307,14.0342,0.563059,-68.19118,0.000248,0.002305,0.121318,0,0,0
3,0.0,1.0,0.257272,2_782386.png,none,2.782386,-0.018795,0.083559,3.245206,0.001853,...,307,14.02756,0.562652,-68.1982,-0.00044,0.00621,0.257197,0,0,0
4,0.0,1.0,0.402505,2_832628.png,none,2.832628,-0.049414,0.023296,1.974267,0.001858,...,307,14.01181,0.562419,-68.21478,-0.001762,0.008644,0.402409,0,0,0


In [10]:
## Summarise the raw telemetry: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7644 entries, 0 to 7643
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   steering_angle             7644 non-null   float64
 1   throttle                   7644 non-null   float64
 2   speed                      7644 non-null   float64
 3   image                      7644 non-null   object 
 4   hit                        7644 non-null   object 
 5   time                       7644 non-null   float64
 6   accel_x                    7644 non-null   float64
 7   accel_y                    7644 non-null   float64
 8   accel_z                    7644 non-null   float64
 9   gyro_x                     7644 non-null   float64
 10  gyro_y                     7644 non-null   float64
 11  gyro_z                     7644 non-null   float64
 12  gyro_w                     7644 non-null   float64
 13  pitch                      7644 non-null   float

In [11]:
## Remove the telemetry columns that are not needed downstream, leaving
## steering_angle, throttle, image and lap.
## Rebinding `df` (instead of inplace=True) together with errors='ignore'
## makes this cell safe to re-run: labels that were already removed are
## skipped rather than raising KeyError.
unused_columns = [
    'speed', 'hit', 'time',
    'accel_x', 'accel_y', 'accel_z',
    'gyro_x', 'gyro_y', 'gyro_z', 'gyro_w',
    'pitch', 'yaw', 'roll',
    'cte', 'activeNode', 'totalNodes',
    'pos_x', 'pos_y', 'pos_z',
    'vel_x', 'vel_y', 'vel_z',
    'on_road', 'progress_on_shortest_path',
]
df = df.drop(columns=unused_columns, errors='ignore')

In [12]:
## List the distinct lap numbers present in the recording
df['lap'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [13]:
## Size of the trimmed frame as reported by sys.getsizeof, measured before any dtype conversion
sys.getsizeof(df)

710165

In [14]:
## Dtypes of the columns that remain after the drop
df.dtypes

steering_angle    float64
throttle          float64
image              object
lap                 int64
dtype: object

### Convert datatypes (?)

In [15]:
## Downcast both float target columns, then re-measure the frame's size
for target_col in ('steering_angle', 'throttle'):
    df[target_col] = pd.to_numeric(df[target_col], downcast='float')
sys.getsizeof(df)

649013

In [16]:
## Confirm the column dtypes after the downcast
df.dtypes

steering_angle    float32
throttle          float32
image              object
lap                 int64
dtype: object

### Remove "Extra" Lap

In [17]:
## Count telemetry rows per lap to spot partial laps
df.lap.value_counts()

1     865
6     853
8     848
7     848
9     831
5     830
3     829
2     825
4     822
10     60
0      33
Name: lap, dtype: int64

In [18]:
## Drop the rows recorded during the highest-numbered lap (the short tail after
## the training session ended) and keep only the targets plus the image name
final_lap = df['lap'].max()
kept_columns = ['steering_angle', 'throttle', 'image']
df_y = df.loc[df['lap'].lt(final_lap), kept_columns]
df_y

Unnamed: 0,steering_angle,throttle,image
0,0.0,0.060606,2_63297.png
1,0.0,0.397850,2_682824.png
2,0.0,1.000000,2_732961.png
3,0.0,1.000000,2_782386.png
4,0.0,1.000000,2_832628.png
...,...,...,...
7579,0.0,0.000000,381_5842.png
7580,0.0,0.000000,381_6345.png
7581,0.0,0.000000,381_6844.png
7582,0.0,0.000000,381_7345.png


## Image Data

### Verify Files

In [19]:
## Verify Files
## Peek at five entries of the image folder; os.listdir returns names in
## arbitrary order, so these are not necessarily the earliest frames.
os.listdir(image_folder)[:5]

['17_13449.png', '261_9896.png', '313_4438.png', '148_5905.png', '32_5345.png']

## Create Image Array

In [20]:
## Using keras, load each image named in df_y as a grayscale array and collect
## the arrays in a list, in the same row order as df_y.
## os.path.join builds the file path, so the result is correct whether or not
## image_folder ends with a path separator.
img_array_list = [
    img_to_array(load_img(os.path.join(image_folder, img), color_mode='grayscale'))
    for img in df_y['image']
]

In [21]:
## Combine the per-image arrays into one NumPy array with the images along the first axis
X = np.asarray(img_array_list)

## Create Datasets

### Create Targets

In [23]:
## Targets: every df_y column except the image name, i.e. steering_angle in
## column 0 and throttle in column 1, copied into a standalone array
target_frame = df_y.drop(columns='image')
y = target_frame.to_numpy().copy()

## Print both shapes so the row counts of X and y can be compared
print(f'X.shape: {X.shape}')
print(f'y.shape: {y.shape}')

X.shape: (7584, 32, 32, 1)
y.shape: (7584, 2)


### Scale Image Data

In [24]:
## Scale pixel values by 1/255. X is rebuilt from img_array_list instead of
## being divided in place (`X /= 255`), which keeps this cell idempotent:
## running it again can no longer divide already-scaled values a second time.
X = np.array(img_array_list) / 255

### Save Datasets
### TODO: Remove Hardcode

In [38]:
## Save as binary NumPy .npy format
output_dir = '../data'
## Create the target folder when it is missing so the save does not fail
## with FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
with open(f'{output_dir}/X.npy', 'wb') as X_out:
    np.save(file=X_out, arr=X)
with open(f'{output_dir}/y.npy', 'wb') as y_out:
    np.save(file=y_out, arr=y)