# Import the required libraries

In [1]:
try:
  # %tensorflow_version only exists in Colab; there it requests the TensorFlow 2.x runtime.
  %tensorflow_version 2.x
except Exception:
  # Intended for non-Colab kernels, where the magic is unavailable: ignore the
  # failure and keep whichever TensorFlow version is already installed.
  pass

In [2]:
from cv2 import cv2
from tqdm import tqdm
from glob import glob
import zipfile
import os
import time
import pandas as pd
from keras.preprocessing import image

# Download dataset to google colab instance

In [3]:
!wget --no-check-certificate --no-clobber https://raw.githubusercontent.com/Karenw1004/Deeppicar-v3/main/Dataset.zip

--2021-02-12 17:29:57--  https://raw.githubusercontent.com/Karenw1004/Deeppicar-v3/main/Dataset.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 69313921 (66M) [application/zip]
Saving to: ‘Dataset.zip’


2021-02-12 17:29:59 (114 MB/s) - ‘Dataset.zip’ saved [69313921/69313921]



In [4]:
# Locate the downloaded archive(s) in the working directory.
list_zip = glob("./*.zip")
print(list_zip)
if not list_zip:
  # Fail with an actionable message instead of a bare IndexError on list_zip[0].
  raise FileNotFoundError("No .zip archive found in the current directory; run the download cell first")
local_zip_file_path = list_zip[0]
print(local_zip_file_path)

# Extract into a folder named after the archive ("./Dataset.zip" -> "./Dataset").
# splitext strips only the trailing extension, whereas str.replace would also
# remove a ".zip" occurring elsewhere in the path.
data_dir = os.path.splitext(local_zip_file_path)[0]
print(data_dir)
# The context manager closes the archive handle even if extraction fails.
with zipfile.ZipFile(local_zip_file_path) as zip_ref:
  zip_ref.extractall(data_dir)
print("The Dataset folder has been added to the colab instance")

['./Dataset.zip']
./Dataset.zip
./Dataset
The Dataset folder has been added to the colab instance


# Preprocess dataset video to frames

In [5]:
# Config
NFRAMES = 1000  # upper bound on the number of frames extracted from each video

# Target frame geometry fed to preprocess(): height, width, colour channels
img_height, img_width, img_channels = 66, 200, 3
input_shape = [img_height, img_width, img_channels]

In [6]:
# Create the output folder for the extracted frames.
# exist_ok=True keeps the cell idempotent: re-running it no longer fails with
# "File exists" the way a plain `mkdir` does.
os.makedirs(f"{data_dir}/train_images", exist_ok=True)

In [7]:
vid_ext = ".avi"
all_video_files = glob(f"{data_dir}/*{vid_ext}")

# Dump up to NFRAMES frames of every video to
# <data_dir>/train_images/<video name>_frame_<n>.jpg, with n starting at 1.
# No cv2.waitKey / cv2.destroyAllWindows here: this cell never opens a window,
# so there is no key press to poll for and nothing to tear down.
for path in tqdm(all_video_files):

  # Open the video file
  assert os.path.isfile(path)
  cap = cv2.VideoCapture(path)

  # Video name without directory or extension, e.g. "out-video-9"
  vid_name = path.split("/")[-1]
  vid_name = vid_name.split(".")[0]

  try:
    curFrame = 0
    while cap.isOpened() and curFrame < NFRAMES:
      ret, img = cap.read()
      if not ret:
        # End of stream or decode failure: img is None here, so stop instead of
        # handing an empty frame to cv2.imwrite.
        break

      curFrame += 1
      filename = f"train_images/{vid_name}_frame_{curFrame}.jpg"
      # cv2.imwrite signals failure (e.g. a missing output folder) by returning
      # False rather than raising, so check it explicitly.
      if not cv2.imwrite(f"{data_dir}/{filename}", img):
        raise OSError(f"Could not write {data_dir}/{filename}")
  finally:
    # Release every capture as soon as it is done, not only the last one opened.
    cap.release()

100%|██████████| 11/11 [00:24<00:00,  2.18s/it]


<font color='blue'>NOTE</font><br/>
You can click on the files on the left.<br/>
> Files -> Dataset -> train_images
<br/>

It will contain 11,000 frame images: each of the 11 videos produces 1,000 frames (not yet preprocessed).


# Load Data


## Load the csv files 

In [8]:
# The steering logs live in the extracted dataset folder; reuse data_dir so this
# cell follows the same location as the rest of the notebook instead of a
# hard-coded "./Dataset".
all_csv_files = glob(f"{data_dir}/*.csv")
df_dict = {}  # recording number -> DataFrame loaded from that recording's CSV
print(all_csv_files)
for filepath in all_csv_files:
  # File names look like "out-key-<n>.csv"; <n> becomes the dictionary key.
  filename = os.path.basename(filepath)
  out_number = int(filename.split("-")[-1].split(".")[0])
  df_dict[out_number] = pd.read_csv(filepath, index_col=None, header=0)

['./Dataset/out-key-2.csv', './Dataset/out-key-0.csv', './Dataset/out-key-9.csv', './Dataset/out-key-4.csv', './Dataset/out-key-1.csv', './Dataset/out-key-5.csv', './Dataset/out-key-6.csv', './Dataset/out-key-3.csv', './Dataset/out-key-8.csv', './Dataset/out-key-10.csv', './Dataset/out-key-7.csv']


In [9]:
# Show which recording numbers were parsed from the CSV file names.
# The order follows the order glob returned the files in, so it is not sorted.
df_dict.keys()

dict_keys([2, 0, 9, 4, 1, 5, 6, 3, 8, 10, 7])

Looks great! We have all 11 CSV files!

### Let's check out the csv data 

In [10]:
# Inspect the CSV of recording 0: column names, dtypes and non-null counts.
df = df_dict[0]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ts_micro  1000 non-null   int64  
 1   frame     1000 non-null   int64  
 2   wheel     1000 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 23.6 KB


**Very clean** data: no null values and no missing values.

In [11]:
# Distinct steering values that occur in the `wheel` column
df["wheel"].unique()

array([ 0.        ,  0.52359878, -0.52359878])


| Column | Explanation |
| --- | --- |
| ts_micro | TODO: EXPLAIN THIS |
| frame | the current frame number  |
| wheel | **0.52359878 rad == 30 degrees** representing turning **right**|
|| **0 rad == 0 degrees** representing **center** |
|| **-0.52359878 rad == -30 degrees** representing turning **left**  |


# Preprocess the frames images

In [12]:
def preprocess(img):
  """Resize a frame to the configured input size and scale its values by 1/255.

  Args:
    img: image array accepted by cv2.resize (presumably H x W x 3 with values
      in 0-255 — confirm against the caller).

  Returns:
    The frame resized to (img_width, img_height), the (width, height) order
    cv2.resize takes, with every value divided by 255.
  """
  # The notebook-level config must describe a colour image for now.
  assert img_channels == 3
  target_size = (img_width, img_height)
  resized = cv2.resize(img, target_size)
  return resized / 255.

In [13]:
images = glob(f"{data_dir}/train_images/*.jpg")
train_image_path = []
train_image = []
train_wheel_val = []

# Build three parallel lists: frame path, preprocessed pixels, steering label.
for image_path in tqdm(images):
  train_image_path.append(image_path)

  img = image.load_img(image_path)
  img = image.img_to_array(img)
  img = preprocess(img)
  train_image.append(img)

  # File names look like "out-video-<recording>_frame_<frame>.jpg"; parse the
  # bare file name so "-" or "_" in the directory part cannot interfere.
  stem = os.path.splitext(os.path.basename(image_path))[0]
  out_number = int(stem.split("-")[-1].split("_")[0])
  frame_number = int(stem.split("_")[-1])

  # Look up the steering value logged for this frame of this recording.
  labels = df_dict[out_number]
  wheel = labels.loc[labels['frame'] == frame_number, 'wheel']
  if len(wheel) != 1:
    raise ValueError(
      f"Expected exactly one CSV row for frame {frame_number} of recording "
      f"{out_number}, found {len(wheel)} ({image_path})"
    )
  # float() on a one-element Series is deprecated in pandas; extract the scalar first.
  train_wheel_val.append(float(wheel.iloc[0]))


100%|██████████| 11000/11000 [00:38<00:00, 285.86it/s]


In [14]:
# Combine the three parallel lists into one frame (columns in this order) and preview it
train_df = pd.DataFrame().assign(
  image_path=train_image_path,
  wheel=train_wheel_val,
  image=train_image,
)
train_df.head()

Unnamed: 0,image_path,wheel,image
0,./Dataset/train_images/out-video-9_frame_463.jpg,0.523599,"[[[0.08078431, 0.057254903, 0.0], [0.0873975, ..."
1,./Dataset/train_images/out-video-1_frame_646.jpg,0.523599,"[[[0.328467, 0.25748664, 0.030481283], [0.3776..."
2,./Dataset/train_images/out-video-5_frame_434.jpg,0.523599,"[[[0.22509804, 0.13490197, 0.0097860955], [0.2..."
3,./Dataset/train_images/out-video-10_frame_32.jpg,0.0,"[[[0.57012475, 0.6054189, 0.585811], [0.559037..."
4,./Dataset/train_images/out-video-1_frame_202.jpg,0.0,"[[[0.5686275, 0.5921569, 0.58431375], [0.56862..."


**The "Save preprocessed data" section below (and the matching "load preprocessed data" step) can be skipped if you continue in this same notebook by copying in the rest of the cells from Training.ipynb.**

# Save preprocessed data
## Save the image col (np.array) to npy and the image_path and wheel col to csv

In [15]:
import numpy as np
# Write the `image` column to preprocess_image.npy (np.save appends the .npy suffix).
# NOTE(review): each row of the column holds a whole image array, so this is presumably
# stored as an object array that needs np.load(..., allow_pickle=True) to read back —
# confirm against the loading code in Training.ipynb before changing the format.
np.save("preprocess_image", train_df["image"])

preprocess_image.npy will be stored at a Google Drive link because GitHub's maximum file size is 100 MB.

In [16]:
# Persist only the lightweight columns; the image arrays are saved separately as .npy
path_and_wheel = train_df[["image_path", "wheel"]]
path_and_wheel.to_csv('preprocess_path.csv', header=True, index=False)

<font color='blue'>NOTE</font><br/>
The reason why we **DO NOT** do ```train_df.to_pickle``` 
is because we will get this error below: <br/>
> ValueError: malformed node or string:array([[<br/>
  [4.2959005e-02, 5.5971481e-02, 1.4438502e-02],<br/>
  [4.2959005e-02, 5.5971481e-02, 1.4438502e-02],<br/>
  [4.2959005e-02, 5.5971481e-02, 1.4438502e-02],<br/>
  ...,<br/>
  [5.3012478e-01, 5.9928697e-01, 5.7575756e-01],<br/>
because `to_pickle` saves the image `np.array` as an object. <br/>

The same applies to ```train_df.to_csv```: we do not want the image array saved as an object (a string) with missing commas and embedded \n characters. The table below shows what the image column looks like after saving with to_csv.<br/>




|Index| Image|
| --- | --- |
|0 |      [[[4.2959005e-02 5.5971481e-02 1.4438502e-02]\... |
|1 |       [[[0.447451   0.47098035 0.46313724]\n  [0.425...|
|2 |       [[[0.48516932 0.4930125  0.47340462]\n  [0.479...|