<a href="https://colab.research.google.com/github/MapleWolfe/Anomaly_detection_waymo_open/blob/main/yolo_detection_model/yolo_detection_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building Model for Waymo open dataset
- please see the detection_model_readme.txt

##### Environment the notebook was built in:
- Kindly run this notebook in Google Colab

##### Hardware needs:

- Kindly use a minimum of 12 GB CPU ram
- the standard Nvidia T4 GPU that comes with the free version of Google Colab.
- 50 GB disk Minimum (less should work but this was the minimum during notebook creation)

##### following files should be in local directory if using the local notebook
- these files should be in the same folder as notebook

##### instructions to run notebook,
- Kindly fill in the code cell below to use the local parquet files stored in the same folder as notebook or the parquet files stored in a GCS bucket

In [None]:
# Data source selector. Later cells compare this against the exact strings 'GCS' and 'LOCAL'.
file_store = 'GCS' # kindly use either 'GCS' for Google cloud storage or 'LOCAL' for files stored in the same folder as notebook

# GCS settings: leave both as None if using LOCAL, otherwise replace them with the path to
# your GCS API key JSON file and the name of your bucket. start_gcs() uses these as its defaults.
gcs_json_key = 'put_google_json_key.json' # please remember to start file path here with '/content/' + your json key file name
bucket_name = 'waymo_sample_bucket' # remember to replace this

# Download limits for hyper-parameter tuning: the maximum number of parquet files processed
# for training / validation / test. The GCS loops stop once the counter reaches the limit.
train_limit = 2 #-1 means all relevant files in your google bucket
val_limit = 1 #-1 means all relevant files in your google bucket
test_limit = 1 #-1 means all relevant files in your google bucket

# Same kind of limit, applied when training the single selected model at the end of the notebook.
final_model_train_limit = 40 #-1 means all relevant files in your google bucket

# Paths to the sample files used when file_store is 'LOCAL': there are 4 parquets in total
# (box + image for training, box + image for validation). You just need to upload the
# waymo_sample_data folder from the github repo next to the notebook.
train_box_data_file_path  = 'waymo_sample_data/training/camera_box/training_camera_box_10017090168044687777_6380_000_6400_000.parquet'
train_image_data_file_path = 'waymo_sample_data/training/camera_image/training_camera_image_10017090168044687777_6380_000_6400_000.parquet'
val_box_data_file_path = 'waymo_sample_data/validation/camera_box/validation_camera_box_10203656353524179475_7625_000_7645_000.parquet'
val_image_data_file_path = 'waymo_sample_data/validation/camera_image/validation_camera_image_10203656353524179475_7625_000_7645_000.parquet'


## installs, imports, pre-sets

- kindly open and run only the cell blocks that match the environment you are running in, to save computational resources.

### Using detect_model_requirements.txt
- please uncomment to use

In [None]:
# uncomment to create the notebook package requirements file called detect_model_requirements.txt
# !pip freeze > detect_model_requirements.txt

# use the code below to use detect_model_requirements.txt to install all necessary packages
#!pip install -r detect_model_requirements.txt

### Necessary installs on top of google colab
- uncomment and run this cell if you aren't using detect_model_requirements.txt and are operating in the GPU google colab environment

In [None]:
#!pip install google-cloud-storage
#!pip install ultralytics
#!pip install altair

### Imports

In [None]:
# installs for google cloud storage
from google.cloud import storage

# general tool installs
import os, io, shutil, warnings
from tqdm.notebook import tqdm

#image processing and plotting libraries
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import altair as alt

# data processing librarires
import pandas as pd
import numpy as np
import dask.dataframe as dd

# model evaluation
from sklearn.model_selection import ParameterGrid

# model libraries
import tensorflow as tf
from ultralytics import YOLO

# pre-sets: install a global "ignore" filter so Python warnings are suppressed from here on
warnings.filterwarnings("ignore")

## utility functions
- functions that may be used at a later point

### delete file function

In [None]:
# remove a batch of files from disk
def delete_file(file_path_list):
  """Delete every file whose path appears in *file_path_list*.

  Paths are removed one at a time, in order. A path that does not exist
  makes the underlying OS call raise (e.g. FileNotFoundError); files
  already removed before that point stay removed.

  Returns None.
  """
  for doomed_path in file_path_list:
    os.unlink(doomed_path)

### move copy function

In [None]:
def move_copy(old_location, new_location):
  """Copy the file at *old_location* to *new_location*.

  Despite the name, nothing is moved: the source file is left in place
  and only a copy is written to the destination.

  Returns None.
  """
  shutil.copy(src=old_location, dst=new_location)

### download blob function

In [None]:
# save a blob object to a local file
def download_blob(a_blob,file_name):
  """Write the contents of *a_blob* to the local path *file_name*.

  *a_blob* only needs to expose a ``download_to_filename(path)`` method;
  whatever that method returns is discarded.

  Returns None.
  """
  save_to_disk = a_blob.download_to_filename
  save_to_disk(file_name)

### Let's create folder directory for YOLO
- built as per .yaml file

In [None]:
# build a (possibly nested) folder from its path components
def make_directory(folder_name_list):
  """Create the directory described by *folder_name_list* if it is missing.

  The list items are joined into one path with ``os.path.join`` and all
  missing intermediate folders are created. If anything already exists at
  that path the function does nothing.

  Returns None.
  """
  target_dir = os.path.join(*folder_name_list)
  if os.path.exists(target_dir):
    return None

  os.makedirs(target_dir)
  return None

# lay out the dataset folders the YOLO data config points at
def build_yolo_directory():
  """Create datasets/<split>/<kind> for every split and kind.

  Splits are 'train', 'test' and 'eval'; kinds are 'images' and 'labels'.
  Folders that already exist are left alone by make_directory.

  Returns None.
  """
  wanted_folders = [
      ['datasets', split_name, content_kind]
      for split_name in ('train', 'test', 'eval')
      for content_kind in ('images', 'labels')
  ]
  for path_parts in wanted_folders:
    make_directory(path_parts)



In [None]:
# create the datasets/<train|test|eval>/<images|labels> folders before any data is written
build_yolo_directory()

## Google Cloud Storage section

- Run these cells if you are using your private google cloud storage.
- kindly ensure that your bucket has the same structure and file names as the waymo_open_dataset_v_2_0_0 bucket

In [None]:
def start_gcs(gcs_json_key = gcs_json_key, bucket_name = bucket_name):
    """Connect to a GCS bucket and list the names of everything in it.

    Parameters
    ----------
    gcs_json_key : path to the API JSON key for your private Google Cloud
        Storage; it is exported as GOOGLE_APPLICATION_CREDENTIALS.
        Defaults to the value the notebook-level ``gcs_json_key`` had when
        this function was defined.
    bucket_name : name of the bucket holding the parquet files. Defaults to
        the value the notebook-level ``bucket_name`` had when this function
        was defined.

    Returns
    -------
    (bucket, files_list) : the bucket object and a list with the name of
        every blob returned by ``bucket.list_blobs()``.
    """
    # the storage client reads its credentials from this environment variable
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = gcs_json_key
    bucket = storage.Client().get_bucket(bucket_name)

    # collect the blob names one by one
    files_list = []
    for a_file in bucket.list_blobs():
        files_list.append(a_file.name)
    return bucket, files_list


### GCS Iterative File Download function
(Bare Image parquet and Bounding box parquet files)

In [None]:
# in the following function we are creating blobs: one blob object for images parquet and another blob for box coordinates parquet
def open_gcs_file(files_list,bucket, storage_folder = 'training'):
    """Yield matching (box, image) parquet blobs for one dataset split.

    For every name in *files_list* that lives under
    ``<storage_folder>/camera_box/`` the corresponding
    ``<storage_folder>/camera_image/`` name is derived by swapping the folder
    part. A pair is produced only when that image name is also present in
    *files_list*, so callers never receive a blob that is missing from the
    listing.

    Parameters
    ----------
    files_list : iterable of blob names (as returned by start_gcs).
    bucket : object exposing ``blob(name)``; used to build the blob handles.
    storage_folder : one of 'training', 'testing', 'validation'.

    Yields
    ------
    (box_blob, box_base_name, image_blob, image_base_name)
        Note that both base names are identical, because only the folder
        differs between the two files.

    If *storage_folder* is not one of the accepted values a message is
    printed and nothing is yielded. Because this is a generator, that
    message appears when iteration starts, not when the function is called.
    """
    # lets ensure that we have the correct file type
    if storage_folder not in ['training','testing','validation']:
      print('''please retype storage_folder it should either be ('training','testing','validation') in the second parameter of function)''')
      return None

    image_box_str = storage_folder +'/'+ 'camera_box'+'/'
    bare_image_str = storage_folder +'/'+ 'camera_image'+'/'
    print('files_list: ', files_list)

    # materialise once so the listing can be both scanned and used for lookups
    file_names = list(files_list)
    known_files = set(file_names)

    for box_file_name in file_names:
        if image_box_str not in box_file_name:
            continue

        bare_image_file_name = box_file_name.replace(image_box_str, bare_image_str)
        # bucket.blob() always returns a handle, so existence has to be
        # checked against the listing rather than by testing for None
        if bare_image_file_name not in known_files:
            continue

        # no try/except around the yield: a bare except here would swallow
        # GeneratorExit and make closing the generator raise RuntimeError
        yield (bucket.blob(box_file_name), os.path.basename(box_file_name),
               bucket.blob(bare_image_file_name), os.path.basename(bare_image_file_name))


## Storing files for yolo model
- functions to unpack and store from parquet files

#### save path and create unique key function

In [None]:
# creating a save path
def create_path(file_id,store_type, images_labels = 'images'):
  """Return the dataset path for one image file or one label file.

  Parameters
  ----------
  file_id : identifier used as the file stem (converted with str()).
  store_type : dataset split, one of 'train', 'eval' or 'test'.
  images_labels : 'images' selects the image path
      (``datasets/<split>/images/<id>.jpg``); any other value selects the
      label path (``datasets/<split>/labels/<id>.txt``).

  Returns
  -------
  The path as a string, or None (after printing an error message) when
  *store_type* is not a known split.
  """
  # 'test' is handled here as well; it used to fall through to the error branch
  if store_type not in ('train', 'eval', 'test'):
    print('error type parameter: train / eval / test')
    return None

  if images_labels == 'images':
    return 'datasets/' + store_type + '/images/' + str(file_id) + '.jpg'
  return 'datasets/' + store_type + '/labels/' + str(file_id) + '.txt'

# build one string key per row from the three key.* columns
def create_key_column(select_df):
  """Return a Series joining segment name, frame timestamp and camera name.

  Each of the three ``key.*`` columns is converted to string and the
  results are concatenated, in that order, with no separator. The input
  frame is not modified.
  """
  segment_part = select_df['key.segment_context_name'].astype(str)
  timestamp_part = select_df['key.frame_timestamp_micros'].astype(str)
  camera_part = select_df['key.camera_name'].astype(str)
  return segment_part + timestamp_part + camera_part


#### unpacking, scaling, storing images and annotations separately

In [None]:
# decode an image from bytes, resize it to 640 x 640 and write it to disk
def build_save_image(byte_string,image_path):
  """Decode *byte_string*, resize to 640 x 640 and save to *image_path*.

  The image is only saved when both of its original dimensions are
  multiples of 32.

  Returns
  -------
  ('success', original_size) when the image was written, where
  original_size is the size of the image before resizing; the bare string
  'fail' otherwise. Callers test ``result[0] == 'success'``, which is also
  safe on the string result.
  """
  image = Image.open(io.BytesIO(byte_string))
  original_image_size = image.size

  first_dim_ok = original_image_size[0] % 32 == 0
  second_dim_ok = original_image_size[1] % 32 == 0
  if not (first_dim_ok and second_dim_ok):
    return 'fail'

  image.resize((640,640)).save(image_path)
  return ('success', original_image_size)

# building bounding box labels
# here we are also scaling the bounding box positions to the new image of 640 x 640
def build_save_labels(df,txt_file_path,image_size):
  """Write the bounding boxes in *df* as a space-separated label file.

  Parameters
  ----------
  df : DataFrame with the columns '[CameraBoxComponent].type' and
      '[CameraBoxComponent].box.{center,size}.{x,y}'.
  txt_file_path : destination of the label file.
  image_size : (width, height) of the image before it was resized.

  Each output row is ``type center_x center_y size_x size_y`` with no
  header and no index. Coordinates are first scaled to the 640 x 640
  resized image and then divided by 640, i.e. expressed as fractions of
  the image size.

  The label table is built on a copy, so *df* is left untouched (it is
  usually a slice of a larger frame, and writing into it would modify the
  caller's data and trigger pandas chained-assignment warnings).

  Returns None.
  """
  width, height = image_size
  x_scale = 640/width
  y_scale = 640/height

  labels = df[['[CameraBoxComponent].type']].copy()
  labels['center_x'] = (df['[CameraBoxComponent].box.center.x'] * x_scale)/640
  labels['center_y'] = (df['[CameraBoxComponent].box.center.y'] * y_scale)/640
  labels['size_x'] = (df['[CameraBoxComponent].box.size.x'] * x_scale)/640
  labels['size_y'] = (df['[CameraBoxComponent].box.size.y'] * y_scale)/640

  labels.to_csv(txt_file_path, sep=' ', header=False, index=False)
  return None



In [None]:
# read one image parquet + one box parquet and write the YOLO image / label files
def unpack_store_images(image_parquet,box_parquet, store_type = 'train'):
  """Unpack a pair of parquet files into image and label files.

  Parameters
  ----------
  image_parquet : path of the camera image parquet file.
  box_parquet : path of the camera box parquet file.
  store_type : dataset split passed on to create_path.

  Both files are loaded, a key column is added to each, and every key
  present in both is processed: the image is saved through
  build_save_image and, only if that succeeds, the labels are written
  through build_save_labels. Output files are numbered from 0 on every
  call; the number only advances after a successful save.

  Returns
  -------
  A flat list of written paths, image path followed by label path for
  each stored frame.
  """
  image_df = pd.read_parquet(image_parquet)
  box_df = pd.read_parquet(box_parquet)

  image_df.loc[:,'key_column'] = create_key_column(image_df)
  box_df.loc[:,'key_column'] = create_key_column(box_df)

  # only frames that have both an image and box annotations are used
  shared_keys = list(set(box_df['key_column'].tolist()) & set(image_df['key_column'].tolist()))

  written_paths = []
  next_id = 0
  for shared_key in tqdm(shared_keys):
    frame_rows = image_df.loc[image_df['key_column'] == shared_key]
    image_bytes = frame_rows.reset_index().loc[0, '[CameraImageComponent].image']
    image_path = create_path(next_id,store_type, images_labels = 'images')

    box_rows = box_df.loc[box_df['key_column'] == shared_key]
    label_path = create_path(next_id,store_type, images_labels = 'text')

    save_confirm = build_save_image(image_bytes,image_path)
    if save_confirm[0] != 'success':
      # image rejected: skip its labels and reuse the same id for the next frame
      continue

    build_save_labels(box_rows,label_path,save_confirm[1])

    next_id += 1
    written_paths.extend([image_path, label_path])

  return written_paths

## building model

In [None]:
def train_model(file_counter, prev_model_list):
  """Train one YOLO model per hyper-parameter combination, or resume them.

  Parameters
  ----------
  file_counter : 0 on the first round. A fresh model is then trained for
      every combination in the parameter grid. Any other value continues
      training the models listed in *prev_model_list*.
  prev_model_list : list of dicts with the keys 'parameters' and
      'model_path', as returned by an earlier call. Ignored when
      *file_counter* is 0.

  Returns
  -------
  A list of {'parameters': ..., 'model_path': ...} dicts: newly built on
  the first round, otherwise *prev_model_list* itself.

  After each training run the model is exported and the file found by
  replacing 'torchscript' with 'pt' in the export path is copied to the
  model path ('model_<n>.pt' on the first round). NOTE(review): presumably
  that is the weights file saved next to the export; confirm against the
  ultralytics output layout.
  """
  # keyword arguments forwarded to model.train(), in call order
  train_keys = ('data', 'epochs', 'device', 'patience', 'workers', 'batch',
                'optimizer', 'lr0', 'lrf', 'momentum', 'weight_decay')

  if file_counter != 0:
    # later rounds: keep training every stored model with its own parameters
    for a_model_dict in prev_model_list:
      stored_params = a_model_dict['parameters']
      model = YOLO(a_model_dict['model_path'])
      model.train(**{key: stored_params[key] for key in train_keys})

      output_location = model.export()
      updated_location = output_location.replace('torchscript','pt')
      # replace the previous checkpoint with the newly trained one
      delete_file([a_model_dict['model_path']])
      move_copy(updated_location, a_model_dict['model_path'])
    return prev_model_list

  # first round: one fresh model per grid combination
  param_grid = {'data':['yolov8_waymo.yaml'], 'epochs' : [20],
                'device' : [0], 'patience' : [10],
                'workers' : [0],'batch': [64],
                'optimizer': ['SGD','Adamax'],
                'lr0': [0.01,0.05], 'lrf': [0.01,0.05],
                'momentum': [0.90,0.95], 'weight_decay': [0.0001,0.0005]}

  model_list = []
  for model_number, a_grid in enumerate(ParameterGrid(param_grid)):
    model = YOLO('yolov8n.yaml')
    model.train(**{key: a_grid[key] for key in train_keys})

    model_path = 'model_' + str(model_number) + '.pt'
    exported_path = model.export()
    move_copy(exported_path.replace('torchscript','pt'), model_path)

    model_list.append({'parameters': a_grid,'model_path': model_path})

  return model_list


## Creating validation data

In [None]:
# Build the evaluation split (datasets/eval) from the validation parquet files.
# NOTE: unpack_store_images numbers its output files from 0 on every call, so when more
# than one validation parquet is processed the later ones reuse (overwrite) file names
# written by the earlier ones.
if file_store == 'GCS':
  bucket, files_list = start_gcs()
  gcs_iterator = open_gcs_file(files_list,bucket, storage_folder = 'validation')
  file_counter = 0
  eval_file_path_list = []
  for a_file in gcs_iterator:
    # evaluation data
    box_file_blob, box_file_name,bare_image_blob, bare_image_file_name = a_file

    # the box and image parquet share the same base name, so each download is
    # renamed with a prefix before the next one would overwrite it
    download_blob(box_file_blob,box_file_name)
    new_box_file_name = 'box_file_'+ box_file_name
    os.rename(box_file_name, new_box_file_name)

    download_blob(bare_image_blob,bare_image_file_name)
    new_image_file_name = 'image_file_'+ bare_image_file_name
    os.rename(bare_image_file_name, new_image_file_name)

    eval_file_path_list = unpack_store_images(new_image_file_name,new_box_file_name, store_type = 'eval')
    # the downloaded parquet files are no longer needed once unpacked
    delete_file([new_box_file_name,new_image_file_name])

    print('completed_file_number: ',file_counter)
    file_counter += 1
    if (file_counter >= val_limit) and (val_limit != -1):
      break

elif file_store == 'LOCAL':
  # keyword arguments make the order explicit: the image parquet comes first in
  # unpack_store_images, and passing the box file there fails on the missing image column
  eval_file_path_list = unpack_store_images(image_parquet = val_image_data_file_path,
                                            box_parquet = val_box_data_file_path,
                                            store_type = 'eval')
else:
  print('''check file_store variable, it should be in capital 'GCS' or 'LOCAL' ''')


files_list:  ['testing/', 'training/', 'training/camera_box/10017090168044687777_6380_000_6400_000.parquet', 'training/camera_box/10023947602400723454_1120_000_1140_000.parquet', 'training/camera_box/1005081002024129653_5313_150_5333_150.parquet', 'training/camera_box/10061305430875486848_1080_000_1100_000.parquet', 'training/camera_box/10072140764565668044_4060_000_4080_000.parquet', 'training/camera_box/10072231702153043603_5725_000_5745_000.parquet', 'training/camera_box/10075870402459732738_1060_000_1080_000.parquet', 'training/camera_box/10082223140073588526_6140_000_6160_000.parquet', 'training/camera_box/10094743350625019937_3420_000_3440_000.parquet', 'training/camera_box/10096619443888687526_2820_000_2840_000.parquet', 'training/camera_box/10107710434105775874_760_000_780_000.parquet', 'training/camera_box/10153695247769592104_787_000_807_000.parquet', 'training/camera_box/10206293520369375008_2796_800_2816_800.parquet', 'training/camera_box/10212406498497081993_5300_000_5320_

  0%|          | 0/959 [00:00<?, ?it/s]

completed_file_number:  0


## Main run to create train data store and train different hyperparam models

In [None]:
# Build the training split one parquet file at a time and train every
# hyper-parameter candidate on it (first file: fresh models, later files: resume).
if file_store == 'GCS':
  bucket, files_list = start_gcs()
  gcs_iterator = open_gcs_file(files_list,bucket, storage_folder = 'training')
  file_counter = 0
  model_list = []
  train_file_path_list = []
  for a_file in gcs_iterator:
    # train data
    box_file_blob, box_file_name,bare_image_blob, bare_image_file_name = a_file

    # the box and image parquet share the same base name, so each download is
    # renamed with a prefix before the next one would overwrite it
    download_blob(box_file_blob,box_file_name)
    new_box_file_name = 'box_file_'+ box_file_name
    os.rename(box_file_name, new_box_file_name)

    download_blob(bare_image_blob,bare_image_file_name)
    new_image_file_name = 'image_file_'+ bare_image_file_name
    os.rename(bare_image_file_name, new_image_file_name)

    train_file_path_list = unpack_store_images(new_image_file_name,new_box_file_name, store_type = 'train')
    model_list = train_model(file_counter, model_list)
    print('completed_file_number: ',file_counter)
    file_counter += 1

    # free disk space: drop the downloaded parquet files and the unpacked
    # images / labels before the next file is fetched
    delete_file([new_box_file_name,new_image_file_name]+train_file_path_list)
    if (file_counter >= train_limit) and (train_limit != -1):
      break

elif file_store == 'LOCAL':
  file_counter = 0
  model_list = []
  # keyword arguments make the order explicit: the image parquet comes first in
  # unpack_store_images, and passing the box file there fails on the missing image column
  train_file_path_list = unpack_store_images(image_parquet = train_image_data_file_path,
                                             box_parquet = train_box_data_file_path,
                                             store_type = 'train')
  model_list = train_model(file_counter, model_list)
  # only the generated images / labels are removed; the sample parquet files are
  # kept because the final-model cell reads them again
  delete_file(train_file_path_list)
else:
  print('''check file_store variable, it should be in capital 'GCS' or 'LOCAL' ''')


## Model evaluation and comparison

In [None]:
# lets get evaluation metrics in our dicts
# One entry per trained model: its hyper-parameters plus the validation score.
parameter_dict_list =[]
for model_dict in model_list:
  # reload the saved weights of this candidate
  model = YOLO(model_dict['model_path'])
  # run validation on the CPU with a batch size of 64
  metrics = model.val(batch = 64, device	= 'cpu')
  # NOTE(review): presumably box.map is mAP averaged over IoU 0.50:0.95 (hence the name) - confirm in the ultralytics docs
  map_50_95 = metrics.box.map
  # the score is written into the model's own parameter dict (in place), so
  # model_list entries carry 'map_50_95' from here on
  model_dict['parameters']['map_50_95'] = map_50_95
  parameter_dict_list.append(model_dict['parameters'])

In [None]:
# turn the per-model parameter dicts into a comparison table
model_eval_df = pd.DataFrame(parameter_dict_list)

# columns that are identical for every model carry no information for the comparison
dropped_df = model_eval_df.drop(columns=['data','device','patience','workers'])

# the CSV is written before the plotting helper column below is added
dropped_df.to_csv('yolo_hyperparameter_evaluation.csv', index=False)

# combined "start : end" learning-rate label used as a chart axis
dropped_df['lr0:lrf'] = dropped_df['lr0'].astype(str).str.cat(dropped_df['lrf'].astype(str), sep=' : ')


In [None]:
# let's create a plot
# Three tick charts sharing the same x axis (map_50_95) and colour (optimizer);
# each one puts a different hyper-parameter on the y axis. The y channel is built
# with alt.Y (it was alt.X before, i.e. the wrong channel class for the y encoding).

learning_rate_chart = alt.Chart(dropped_df).mark_tick(size = 25, thickness =3).encode(
    x= alt.X('map_50_95:Q',title = 'mean_average_percision (50:95)' ),
    y= alt.Y('lr0:lrf:N',title = 'start : end (learning rate)' ),
    color=alt.Color("optimizer:N",legend=alt.Legend(title="Optimizer function"))
).properties(title='Impact of learning rate on yolo model performance', width=600,height=200
)

# the legend is only drawn on the first chart; the other two hide it
momentum = alt.Chart(dropped_df).mark_tick(size = 25, thickness =3).encode(
    x= alt.X('map_50_95:Q',title = 'mean_average_percision (50:95)' ),
    y= alt.Y('momentum:N',title = 'momentum' ),
    color=alt.Color("optimizer:N",legend=None)
).properties(title='Impact of momentum on yolo model performance', width=600,height=200 )

weight_decay = alt.Chart(dropped_df).mark_tick(size = 25, thickness =3).encode(
    x= alt.X('map_50_95:Q',title = 'mean_average_percision (50:95)' ),
    y= alt.Y('weight_decay:N',title = 'weight_decay' ),
    color=alt.Color("optimizer:N",legend=None)
).properties(title='Impact of weight decay on yolo model performance', width=600,height=200 )

# stack the three charts vertically and apply shared axis / view styling
evaluation_chart = (learning_rate_chart & momentum & weight_decay).configure_axisY(titleAngle=0, titleAnchor='start').configure_axis(ticks=False
).configure_view(stroke=None)

# last expression of the cell: renders the chart in the notebook
evaluation_chart

## Selected Model Train
- here we only train one model, based on the top parameters identified above

In [None]:
def updated_train_model(file_counter, prev_model_list):
  """Train (or keep training) the single selected YOLO model.

  Parameters
  ----------
  file_counter : 0 on the first round. A fresh model is then trained with
      the selected hyper-parameters. Any other value continues training
      the models listed in *prev_model_list*.
  prev_model_list : list of dicts with the keys 'parameters' and
      'model_path', as returned by an earlier call. Ignored when
      *file_counter* is 0.

  Returns
  -------
  A list of {'parameters': ..., 'model_path': ...} dicts: newly built on
  the first round (model path 'final_model.pt'), otherwise
  *prev_model_list* itself.

  'pretrained' is only forwarded to model.train() on the first round; the
  resume rounds do not pass it.
  """
  if file_counter != 0:
    # keyword arguments forwarded when resuming, in call order
    resume_keys = ('data', 'epochs', 'device', 'patience', 'workers', 'batch',
                   'optimizer', 'lr0', 'lrf', 'momentum', 'weight_decay')
    for a_model_dict in prev_model_list:
      stored_params = a_model_dict['parameters']
      model = YOLO(a_model_dict['model_path'])
      model.train(**{key: stored_params[key] for key in resume_keys})

      output_location = model.export()
      updated_location = output_location.replace('torchscript','pt')
      # replace the previous checkpoint with the newly trained one
      delete_file([a_model_dict['model_path']])
      move_copy(updated_location, a_model_dict['model_path'])
    return prev_model_list

  # first round: the grid holds exactly one value per parameter
  param_grid = {'data':['yolov8_waymo.yaml'], 'epochs' : [20],
                'device' : [0], 'patience' : [10],
                'workers' : [0],'batch': [64],
                'optimizer': ['SGD'],
                'lr0': [0.05], 'lrf': [0.05],
                'momentum': [0.95], 'weight_decay': [0.0001], 'pretrained' : [True]}
  first_round_keys = ('data', 'epochs', 'device', 'patience', 'workers', 'batch',
                      'optimizer', 'lr0', 'pretrained', 'lrf', 'momentum', 'weight_decay')

  model_list = []
  for a_grid in ParameterGrid(param_grid):
    model = YOLO('yolov8n.yaml')
    model.train(**{key: a_grid[key] for key in first_round_keys})

    model_path = 'final_model' + '.pt'
    exported_path = model.export()
    move_copy(exported_path.replace('torchscript','pt'), model_path)

    model_list.append({'parameters': a_grid,'model_path': model_path})

  return model_list


In [None]:
# @title
# Train the single selected model, one training parquet file at a time
# (first file: fresh model, later files: resume from final_model.pt).
if file_store == 'GCS':
  bucket, files_list = start_gcs()
  gcs_iterator = open_gcs_file(files_list,bucket, storage_folder = 'training')
  file_counter = 0
  model_list = []
  train_file_path_list = []
  for a_file in gcs_iterator:
    # train data
    box_file_blob, box_file_name,bare_image_blob, bare_image_file_name = a_file

    # the box and image parquet share the same base name, so each download is
    # renamed with a prefix before the next one would overwrite it
    download_blob(box_file_blob,box_file_name)
    new_box_file_name = 'box_file_'+ box_file_name
    os.rename(box_file_name, new_box_file_name)

    download_blob(bare_image_blob,bare_image_file_name)
    new_image_file_name = 'image_file_'+ bare_image_file_name
    os.rename(bare_image_file_name, new_image_file_name)

    train_file_path_list = unpack_store_images(new_image_file_name,new_box_file_name, store_type = 'train')
    model_list = updated_train_model(file_counter, model_list)
    print('completed_file_number: ',file_counter)
    file_counter += 1

    # free disk space: drop the downloaded parquet files and the unpacked
    # images / labels before the next file is fetched
    delete_file([new_box_file_name,new_image_file_name]+train_file_path_list)
    if (file_counter >= final_model_train_limit) and (final_model_train_limit != -1):
      break

elif file_store == 'LOCAL':
  file_counter = 0
  model_list = []
  # keyword arguments make the order explicit: the image parquet comes first in
  # unpack_store_images, and passing the box file there fails on the missing image column
  train_file_path_list = unpack_store_images(image_parquet = train_image_data_file_path,
                                             box_parquet = train_box_data_file_path,
                                             store_type = 'train')
  # this cell trains the selected model, so it must use updated_train_model
  # (train_model would rerun the whole hyper-parameter grid)
  model_list = updated_train_model(file_counter, model_list)
  # only the generated images / labels are removed; the sample parquet files are kept
  delete_file(train_file_path_list)
else:
  print('''check file_store variable, it should be in capital 'GCS' or 'LOCAL' ''')
