# Import Libraries

In [1]:
%%capture
#!pip install --upgrade fastapi ffmpeg google-cloud-storage uvicorn python-multipart tensorflow-gpu scikit-image imutils wandb tensorflow_hub Pillow pyyaml

from fastapi import FastAPI
from skimage.metrics import structural_similarity as compare_ssim

from collections import Counter

import tensorflow_hub as hub
import numpy as np
import tensorflow as tf
from PIL import Image
import ffmpeg, shutil, glob

import multiprocessing as mp

import pandas as pd

import re

from google.cloud import storage
import nest_asyncio, uvicorn, os, pathlib
import yaml
import cv2#, wandb

# Set Up Google Cloud Parameters


## Set Up Google Cloud Project and Model Location

In [2]:
# Name of the Google Cloud Storage bucket every download/upload helper below works against.
# This is a module-level default; set_bucket() rebinds it at runtime.
location = 'james-mlsys' # Model Storage Bucket

## Create Storage Bucket

In [3]:
# Single Cloud Storage client shared by all helpers in this notebook.
# NOTE(review): presumably authenticates via application-default credentials -- confirm for the deployment target.
storage_client = storage.Client()

# Handle to the bucket named by `location`; rebound together with `location` by set_bucket().
bucket = storage_client.bucket(location)

# WandB Functions

In [4]:
#def init_wandb(project_name):
#   global wandb_project
#   wandb_project = str(project_name)
#   wandb.init(project=wandb_project, sync_tensorboard=True)
#   return True

# Set Bucket

In [5]:
def set_bucket(in_location):
    """Switch the notebook to a different Cloud Storage bucket.

    Rebinds the module-level ``location`` (the bucket name, coerced to ``str``)
    and ``bucket`` (a handle obtained from the shared ``storage_client``).

    Args:
        in_location: Name of the bucket to use from now on.

    Returns:
        Always ``True``.
    """
    global location, bucket
    location = str(in_location)
    bucket = storage_client.bucket(location)
    return True

# Download Scripts

## Download Images and Labels

In [6]:
def _download_prefix_to_local(prefix):
    """Download every blob whose name starts with ``prefix`` to the same relative local path."""
    for blob in storage_client.list_blobs(location, prefix = prefix):
        # Names ending in "/" are folder placeholders, not files; there is nothing to write.
        if blob.name.endswith('/'):
            continue
        print(f'Downloading: {blob.name}                       ', end = '\r')
        try:
            # Blob names may contain sub-folders below the prefix; create them first,
            # otherwise the download fails because the parent directory is missing.
            os.makedirs(os.path.dirname(blob.name) or '.', exist_ok=True)
            blob.download_to_filename(blob.name)
        except Exception as error:
            # Best-effort: one bad blob must not abort the whole folder, but say why it failed.
            print(f'{blob.name} failed to download, skipping ({error})')


def download_images_and_labels(project_folder, image_folder, label_folder):
    """Mirror a project's image and label folders from the bucket to local disk.

    A folder that already exists locally is treated as already downloaded and
    skipped. Individual blob failures are reported and skipped.

    Args:
        project_folder: Top-level folder, used as both bucket prefix and local directory.
        image_folder: Image sub-folder, relative to ``project_folder``.
        label_folder: Label sub-folder, relative to ``project_folder``.

    Returns:
        None.
    """
    print(f'Downloading project folder: {project_folder}.                       ')
    print("Checking if folder already downloaded", end="\r")
    img_dir = f'{project_folder}/{image_folder}'
    label_dir = f'{project_folder}/{label_folder}'

    # Decide what to fetch before downloading anything, so that fetching one
    # folder can never influence the "already downloaded" check of the other.
    pending = []
    for kind, local_dir in (('Image', img_dir), ('Label', label_dir)):
        if os.path.exists(local_dir):
            print(f'{kind} folder {local_dir} already downloaded                       ')
        else:
            pending.append((kind, local_dir))

    for kind, local_dir in pending:
        os.makedirs(local_dir, exist_ok=True)
        print(f'Downloading {kind} Folder: {local_dir}                       ')
        _download_prefix_to_local(local_dir)

    print(f'Downloading project folder: {project_folder} complete.                       ')

## Download Video to Local Instance

In [7]:
def download_video(video_name, extension, origin_folder, dest_folder):
    """Download one video file from the current bucket unless it is already on disk.

    Args:
        video_name: File name without extension.
        extension: File extension without the leading dot.
        origin_folder: Folder (prefix) inside the bucket that holds the video.
        dest_folder: Local directory to download into; created if missing.

    Returns:
        None.
    """
    print("Checking if video already downloaded", end="\r")
    file_name = f'{video_name}.{extension}'
    local_path = f'{dest_folder}/{file_name}'
    if os.path.exists(local_path):
        print(f'Video: {file_name} already downloaded. Skipping Download.')
        return

    # Check explicitly instead of swallowing every mkdir error as "already exists";
    # makedirs also handles nested destinations, and real failures now surface here.
    if os.path.isdir(str(dest_folder)):
        print(f'Folder: {dest_folder} already exists.          ')
    else:
        os.makedirs(str(dest_folder), exist_ok=True)

    remote_path = f'{origin_folder}/{file_name}'
    print(f'Downloading: {remote_path}')
    bucket.blob(remote_path).download_to_filename(local_path)
    print('Download Complete')

# Split Video to Frames and Upload

## Break down video to frames

In [8]:
def split_video_frames(video_name, extension, source_folder, dest_folder):
    """Write every frame of a video to ``dest_folder`` as ``frameNNNNN.png``.

    Frames are numbered from 1 with five-digit zero padding. If
    ``frame00001.png`` already exists the video is treated as already split.

    Args:
        video_name: File name without extension.
        extension: File extension without the leading dot.
        source_folder: Local directory containing the video.
        dest_folder: Local directory that receives the frames; created if missing.

    Returns:
        None.
    """
    print("Checking if video already split", end="\r")
    first_frame = f'{dest_folder}/frame00001.png'
    if os.path.exists(first_frame):
        print(f'Video: {video_name} already split. Skipping split.')
        return

    # Explicit check instead of a bare except that labelled every failure "already exists".
    if os.path.isdir(str(dest_folder)):
        print(f'Folder: {dest_folder} already exists.')
    else:
        os.makedirs(str(dest_folder), exist_ok=True)

    video_location = f'{source_folder}/{video_name}.{extension}'
    video_capture = cv2.VideoCapture(video_location)
    if not video_capture.isOpened():
        # Previously this case silently produced zero frames with no hint as to why.
        print(f'Could not open video: {video_location}')

    saved_frame_name = 1
    try:
        while True:
            print("Frame: " + format(saved_frame_name, '05d'), end="\r")
            success, frame = video_capture.read()
            if not success:
                break
            cv2.imwrite(f"{str(dest_folder)}/frame{format(saved_frame_name, '05d')}.png", frame)
            saved_frame_name += 1
    finally:
        # Always release the capture handle, even if writing a frame raises.
        video_capture.release()
    print("Done                       ")

## Upload Video Frames

In [9]:
def upload_frames(folder_name, extension):
    """Upload every ``*.extension`` file in ``folder_name`` to the current bucket.

    Each file is stored under a blob name equal to its local relative path.

    Args:
        folder_name: Local directory holding the frames.
        extension: File extension without the leading dot.

    Returns:
        None.
    """
    frame_paths = sorted(glob.glob(f'{folder_name}/*.{extension}'))

    print("Uploading Frames")
    for frame_path in frame_paths:
        print(frame_path + "             ", end="\r")
        # glob already returns "<folder_name>/<file>". Prefixing folder_name again
        # produced "<folder>/<folder>/<file>", which does not exist on disk.
        bucket.blob(frame_path).upload_from_filename(frame_path)

    print("Done Uploading               ", end="\r")

# SSIM Compare Video Frames for Novel Frames

## Remove Blurry Images from Set

### Remove Blurry Images

In [30]:
def parallel_laplacian_variance(file):
    """Return the variance of the Laplacian of one image file.

    The image is read with OpenCV, converted to greyscale, and the variance of
    its 64-bit float Laplacian is returned. remove_blurry_images() treats
    values below its cutoff as blurry.

    Args:
        file: Path of the image to score.

    Returns:
        The Laplacian variance of the greyscale image.
    """
    print(file + "             ", end="\r")
    grayscale = cv2.cvtColor(cv2.imread(file), cv2.COLOR_BGR2GRAY)
    return cv2.Laplacian(grayscale, cv2.CV_64F).var()

def remove_blurry_images(folder_name, extension, delete_frames = False):
    """Flag (and optionally delete) frames that are blurrier than the rest of the folder.

    Every ``*.extension`` file is scored with parallel_laplacian_variance().
    Frames scoring below 95% of the folder median are reported as blurry.

    Args:
        folder_name: Directory containing the frames.
        extension: File extension without the leading dot.
        delete_frames: Only when exactly ``True`` are the flagged files removed from disk.

    Returns:
        dict with the total frame count, the flagged frames and their count,
        median/min/max Laplacian variance, the flagged ratio, and the score
        quantiles at 5% steps under ``'Laplacian Cutoffs'``.

    Raises:
        ValueError: If the folder contains no matching files.
    """
    files = sorted(glob.glob(f'{folder_name}/*.{extension}'))
    if not files:
        # Fail early and clearly; numpy would otherwise raise an opaque
        # "zero-size array" ValueError after a worker pool was already started.
        raise ValueError(f'No *.{extension} files found in {folder_name}; cannot compute blur statistics.')

    print("Calculating Average Blurriness")

    # pool.map distributes the files over all workers. The previous
    # `[pool.apply(...) for ...]` blocked on every call and so ran serially.
    # The context manager also cleans the pool up if a worker raises.
    with mp.Pool(mp.cpu_count()) as pool:
        blurriness = pool.map(parallel_laplacian_variance, files)

    median_blur = float(np.median(blurriness))
    min_blur = float(np.min(blurriness))
    max_blur = float(np.max(blurriness))
    #wandb.log({'Individual Laplacian': blurriness, 'Batch Median Laplacian': median_blur})
    print("Median Blur (Laplacian Variance): " + str(median_blur))
    blur_cutoff = median_blur*0.95
    print("Blur Cutoff (Laplacian Variance): " + str(blur_cutoff))

    print("Removing Noisy Images")

    # Keys are the raw float step values formatted as strings (kept as-is for YAML compatibility).
    cutoffs = {f'{percentage}': float(np.quantile(blurriness, percentage)) for percentage in np.arange(0, 1, 0.05)}

    removed_files = [path for path, score in zip(files, blurriness) if score < blur_cutoff]
    # Strict comparison on purpose: a truthy non-bool (e.g. the string 'false') must not delete data.
    if delete_frames == True:
        for path in removed_files:
            os.remove(path)

    count = len(removed_files)
    blur_ratio = count/len(files)
    #wandb.log({'Noisy Frame Ratio': blur_ratio})
    print(f"Done Checking Frames, {count} frames removed.                 ")
    return {'Total Original Frames': len(files), 'Removed Blurry Frame Count': count, 'Removed Blurry Frames': removed_files, 'Median Laplacian Variance': median_blur, 'Minimum Laplacian Variance': min_blur, 'Maximum Laplacian Variance': max_blur, 'Noisy Frame Ratio': blur_ratio, 'Laplacian Cutoffs': cutoffs}

# Deduplicate Similar Frames

## Calculate Similarity Between Images

In [11]:
def compare_images(image1, image2):
    """Return the structural similarity (SSIM) score of two BGR images.

    Both images are converted to greyscale. If their sizes differ, the second
    is resized to the first before comparison.

    Args:
        image1: Reference image, as read by ``cv2.imread``.
        image2: Image to compare against ``image1``.

    Returns:
        The SSIM score reported by ``compare_ssim``.
    """
    image_gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
    image_gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)
    if image_gray2.shape != image_gray1.shape:
        height, width = image_gray1.shape
        # cv2.resize takes dsize as (width, height), the reverse of numpy's
        # (rows, cols) shape. Passing `.shape` directly resized to the transposed
        # size, so the retry failed for non-square images.
        image_gray2 = cv2.resize(image_gray2, (width, height), interpolation = cv2.INTER_AREA)
    diff, _ = compare_ssim(image_gray1, image_gray2, full=True)
    return diff

## Remove Duplicates

In [32]:
def parallel_compare_images(i, files):
    """Return the SSIM score between ``files[i]`` and ``files[i+1]``.

    Written to run in a multiprocessing worker: it reads both images itself.
    If the two frames differ in size, the second is resized to the first.

    Args:
        i: Index of the first frame of the pair within ``files``.
        files: Sequence of image paths; ``files[i+1]`` must exist.

    Returns:
        The SSIM score reported by ``compare_ssim``.
    """
    image_gray1 = cv2.cvtColor(cv2.imread(files[i]), cv2.COLOR_BGR2GRAY)
    image_gray2 = cv2.cvtColor(cv2.imread(files[i+1]), cv2.COLOR_BGR2GRAY)
    # Compare shapes explicitly instead of using a bare `except:` as the size
    # check, which also swallowed unrelated errors and KeyboardInterrupt.
    if image_gray2.shape != image_gray1.shape:
        image_gray2 = cv2.resize(image_gray2, (image_gray1.shape[1], image_gray1.shape[0]), interpolation = cv2.INTER_AREA)
    diff, _ = compare_ssim(image_gray1, image_gray2, full=True)
    print(f'Similarity between {files[i]} and {files[i+1]}: {diff}', end="\r")
    return diff
    

def remove_duplicates(folder_name, extension, delete_frames = False, similarity_threshold = 0.99):
    """Flag (and optionally delete) frames that are near-identical to the next frame.

    Consecutive frames, in sorted filename order, are compared with
    parallel_compare_images(). When a pair scores above ``similarity_threshold``
    the earlier frame of the pair is reported as a duplicate.

    Args:
        folder_name: Directory containing the frames.
        extension: File extension without the leading dot.
        delete_frames: Only when exactly ``True`` are the flagged files removed from disk.
        similarity_threshold: SSIM score above which a frame counts as a duplicate.
            Defaults to 0.99, the value that was previously hard-coded.

    Returns:
        dict with the flagged frames and their count, the median pair
        similarity, the flagged ratio, and the similarity quantiles at 5% steps
        under ``'Similarity Cutoffs'``. With fewer than two frames there is
        nothing to compare: the counts are zero, the median is NaN and the
        cutoffs are empty.
    """
    files = sorted(glob.glob(f'{folder_name}/*.{extension}'))
    print("Removing Duplicate and Highly Similar Frames\nCalculating Frame Similarities")

    if len(files) < 2:
        # No pairs exist; the statistics below would divide by zero or reduce an empty array.
        print("Done Checking Frames, 0 frames removed.")
        return {'Removed Duplicate Frame Count': 0, 'Removed Duplicate Frames': [], 'Median Frame Similarity': float('nan'), 'Duplicate Frame Ratio': 0.0, 'Similarity Cutoffs': {}}

    # Send each worker only its own two paths (index 0 into a 2-element slice)
    # rather than pickling the whole file list for every task. starmap runs the
    # pairs concurrently; the old `[pool.apply(...) for ...]` ran them one by one.
    pair_args = [(0, files[i:i+2]) for i in range(len(files)-1)]
    with mp.Pool(mp.cpu_count()) as pool:
        diff = pool.starmap(parallel_compare_images, pair_args)

    median_diff = float(np.median(diff))
    #wandb.log({'Individual Frame Similarities': diff, 'Batch Median Frame Similarity': median_diff})

    # NOTE(review): this adaptive cutoff is only printed. Removal below uses
    # `similarity_threshold`. Confirm which of the two is intended.
    diff_cutoff = max(median_diff*1.05, 0.95)

    print(f'Similarity Cutoff (OpenCV Compare Images): {diff_cutoff}')
    print('Removing Duplicate Images')

    # Keys are the raw float step values formatted as strings (kept as-is for YAML compatibility).
    cutoffs = {f'{percentage}': float(np.quantile(diff, percentage)) for percentage in np.arange(0, 1, 0.05)}

    # diff[i] compares files[i] with files[i+1]; the earlier frame of a similar pair is dropped.
    removed_files = [files[i] for i, score in enumerate(diff) if score > similarity_threshold]
    # Strict comparison on purpose: a truthy non-bool must not delete data.
    if delete_frames == True:
        for path in removed_files:
            os.remove(path)

    count = len(removed_files)
    duplicate_ratio = count/len(files)
    #wandb.log({'Batch Duplicate Remove Ratio': duplicate_ratio})
    print("Done Checking Frames, " + str(count) + " frames removed.")
    return {'Removed Duplicate Frame Count': count, 'Removed Duplicate Frames': removed_files, 'Median Frame Similarity': median_diff, 'Duplicate Frame Ratio': duplicate_ratio, 'Similarity Cutoffs': cutoffs}

# Get Insight into the Dataset using Faster RCNN Resnet50 COCO 2018/01/28

In [13]:
# Archive of the pre-trained Faster R-CNN ResNet-50 COCO (2018-01-28) object detector.
model_url = 'http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_coco_2018_01_28.tar.gz'
# Split the URL into its directory part and archive file name for get_file() below.
base_url = os.path.dirname(model_url)+"/"
model_file = os.path.basename(model_url)
# Two splitext passes strip ".gz" and then ".tar", leaving the bare model name.
model_name = os.path.splitext(os.path.splitext(model_file)[0])[0]
# Fetch and unpack the archive; get_file returns the local path of the result.
# NOTE(review): get_file presumably reuses a cached download on later runs -- confirm.
model_dir = tf.keras.utils.get_file(fname=model_name, origin=base_url + model_file, untar=True)
model_dir = pathlib.Path(model_dir)/"saved_model"
# Load the SavedModel and keep only its default serving signature, which detect_objects() calls.
model = tf.saved_model.load(str(model_dir))
model = model.signatures['serving_default']

2021-10-05 21:42:18.034466: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-05 21:42:18.186718: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-05 21:42:18.187641: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-05 21:42:18.189003: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [14]:
# Maps the numeric class ids found in the detector output to human-readable names.
# The ids are not contiguous (e.g. 12, 26, 29 and 30 are absent), so lookups must go through this dict.
CLASS_LABELS = {1: "person", 2: "bicycle", 3: "car", 4: "motorcycle", 5: "airplane", 6: "bus", 7: "train", 8: "truck", 9: "boat", 10: "traffic light", 11: "fire hydrant", 13: "stop sign", 14: "parking meter", 15: "bench", 16: "bird", 17: "cat", 18: "dog", 19: "horse", 20: "sheep", 21: "cow", 22: "elephant", 23: "bear", 24: "zebra", 25: "giraffe", 27: "backpack", 28: "umbrella", 31: "handbag", 32: "tie", 33: "suitcase", 34: "frisbee", 35: "skis", 36: "snowboard", 37: "sports ball", 38: "kite", 39: "baseball bat", 40: "baseball glove", 41: "skateboard", 42: "surfboard", 43: "tennis racket", 44: "bottle", 46: "wine glass", 47: "cup", 48: "fork", 49: "knife", 50: "spoon", 51: "bowl", 52: "banana", 53: "apple", 54: "sandwich", 55: "orange", 56: "broccoli", 57: "carrot", 58: "hot dog", 59: "pizza", 60: "donut", 61: "cake", 62: "chair", 63: "couch", 64: "potted plant", 65: "bed", 67: "dining table", 70: "toilet", 72: "tv", 73: "laptop", 74: "mouse", 75: "remote", 76: "keyboard", 77: "cell phone", 78: "microwave", 79: "oven", 80: "toaster", 81: "sink", 82: "refrigerator", 84: "book", 85: "clock", 86: "vase", 87: "scissors", 88: "teddy bear", 89: "hair drier", 90: "toothbrush"}

In [15]:
def detect_objects(image):
    """Run the loaded detector on one image file and tally the detected classes.

    Args:
        image: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        dict with ``'Objects Detected'`` (a ``Counter`` of class name to
        occurrences) and ``'Number of Detections'`` (int). Class ids missing
        from ``CLASS_LABELS`` are reported as ``'unknown_<id>'``.
    """
    # Read the pixels inside a context manager so the file handle is closed before inference.
    with Image.open(image) as img:
        # NOTE(review): the detector presumably expects 3-channel uint8 input;
        # convert() normalises greyscale / RGBA / palette files to RGB.
        pixels = np.array(img.convert('RGB'))
    # Add the leading batch dimension the serving signature is called with.
    input_tensor = tf.convert_to_tensor(pixels)[tf.newaxis, ...]
    output_dict = model(input_tensor)
    num_detections = int(output_dict.pop('num_detections'))
    # Drop the batch dimension and keep only the first `num_detections` entries of each output.
    output_dict = {key: value[0, :num_detections].numpy() for key, value in output_dict.items()}
    class_ids = output_dict['detection_classes'].astype(np.int64)
    # .get() keeps one unmapped id from aborting a whole folder run with a KeyError.
    class_names = [CLASS_LABELS.get(int(class_id), f'unknown_{int(class_id)}') for class_id in class_ids]
    #wandb.log({'Detections per Image': num_detections})
    return {'Objects Detected': Counter(class_names), 'Number of Detections': num_detections}

In [16]:
def detect_file(folder_name, extension):
    """Run detect_objects() on every ``*.extension`` file in a folder, in sorted order.

    Args:
        folder_name: Directory containing the images.
        extension: File extension without the leading dot.

    Returns:
        dict with one key, ``'Classification Information'``: a list holding, per
        image, a dict of its path under ``'Frame'`` plus the detect_objects() result.
    """
    def _describe(path):
        # One record per frame: the path, then whatever detect_objects reports.
        print(f'Detecting on Image: {path}', end = '\r')
        record = {'Frame': path}
        record.update(detect_objects(path))
        return record

    frame_paths = sorted(glob.glob(f'{folder_name}/*.{extension}'))
    return {'Classification Information': [_describe(path) for path in frame_paths]}

# Full Video Analysis and Upload

In [17]:
def clean_unlabeled_video(video_name, bucket_location, extension, upload_frames = False):
    """Download a video, split it into frames, analyse them and upload a YAML report.

    Steps: download the video, split it into PNG frames, run the blur,
    duplicate and object-detection passes, write ``<video_name>.yaml``, upload
    it to the current bucket, optionally upload the frames, then delete the
    local video and frame folders.

    Args:
        video_name: Video file name without extension; also used for local folder names.
        bucket_location: Folder (prefix) inside the bucket that holds the video.
        extension: Video file extension without the leading dot.
        upload_frames: When exactly ``True``, the extracted frames are uploaded too.

    Returns:
        None.
    """
    folder_name = video_name
    frames_folder = f'{video_name}_frames'
    yaml_path = f'{video_name}.yaml'
    yaml_val = {'Video': f'{video_name}.{extension}'}

    download_video(video_name, extension, bucket_location, folder_name)
    split_video_frames(video_name, extension, folder_name, frames_folder)
    # NOTE(review): delete_frames is left at its default, so flagged frames stay on
    # disk and 'Clean Frames' below lists every extracted frame -- confirm intent.
    yaml_val.update(remove_blurry_images(frames_folder, 'png'))
    yaml_val.update(remove_duplicates(frames_folder, 'png'))
    yaml_val.update(detect_file(frames_folder, 'png'))
    # Frames are always written as PNG, so glob for 'png'. Globbing with the
    # video's own extension always produced an empty list.
    remaining_frames = sorted(glob.glob(f'{frames_folder}/*.png'))
    yaml_val.update({'Clean Frames': remaining_frames})

    with open(yaml_path, "w") as f:
        yaml.dump(yaml_val, f, default_flow_style=False)
    # Upload yaml
    bucket.blob(yaml_path).upload_from_filename(yaml_path)

    if upload_frames == True:
        # The `upload_frames` parameter shadows the module-level function of the
        # same name, so fetch the function explicitly from the module namespace.
        globals()['upload_frames'](frames_folder, 'png')
    shutil.rmtree(folder_name)
    shutil.rmtree(frames_folder)

# Image Sequence Analysis

## Download Images

In [18]:
def download_images(folder_name):
    """Download every blob under ``folder_name`` from the current bucket.

    ``<folder_name>/images`` and ``<folder_name>/labels`` are always created
    locally. Each blob is written to the local path equal to its blob name.
    Individual failures are reported and skipped. No "already downloaded"
    check is performed, despite the first progress message.

    Args:
        folder_name: Bucket prefix, which is also the local top-level directory.

    Returns:
        None.
    """
    print("Checking if folder already downloaded", end="\r")
    print(f'Downloading: {folder_name}                               ')
    os.makedirs(str(folder_name)+'/images', exist_ok=True)
    os.makedirs(str(folder_name)+'/labels', exist_ok=True)
    for blob in storage_client.list_blobs(location, prefix = folder_name):
        # Names ending in "/" are folder placeholders, not files.
        if blob.name.endswith('/'):
            continue
        print(f'Downloading: {blob.name}                           ', end = '\r')
        try:
            # Create any nested sub-folder first; otherwise the download fails on a missing parent.
            os.makedirs(os.path.dirname(blob.name) or '.', exist_ok=True)
            blob.download_to_filename(blob.name)
        except Exception as error:
            # Best-effort: keep going, but report why the blob was skipped.
            print(f'{blob.name} failed to download, skipping ({error})')
    print(f'Downloading {folder_name} complete.                                 ')

## KITTI Label Workflow

In [None]:
def kitti_label_check(image_dir, label_dir, image_extension, label_extension):
    """Compare detector output with per-image KITTI label files and classify each scene.

    For every image, the label file with the same base name is read from
    ``label_dir``, its object types are counted, and the frame gets a coarse
    scene label ('City', 'Freeway', 'Rural', 'Pedestrian Traffic' or 'Unknown')
    derived from those counts.

    Args:
        image_dir: Directory containing the images.
        label_dir: Directory containing one label file per image.
        image_extension: Image extension without the leading dot.
        label_extension: Label-file extension without the leading dot.

    Returns:
        dict with ``'Sequence Classification'`` (``Counter.most_common(1)`` over
        the per-frame scene labels; empty list if there are no images) and
        ``'Classification Information'`` (list of per-frame dicts).
    """
    kitti_columns = ['Object', 'Trunc', 'Occ', 'Alpha', 'min_x', 'min_y', 'max_x', 'max_y', 'dim1', 'dim2', 'dim3', 'loc1', 'loc2', 'loc3', 'rot_y']
    ext_len = len(image_extension)+1
    detect_list = []
    for image_file in sorted(glob.glob(f'{image_dir}/*.{image_extension}')):
        print(f'Detecting on Image: {image_file}', end = '\r')
        output = {'Frame': image_file}
        detections = detect_objects(image_file)

        # Pair the image with the label file that shares its base name. Going
        # through basename keeps this correct when image_dir has a trailing slash.
        stem = os.path.basename(image_file)[:-ext_len]
        label_file = os.path.join(label_dir, f'{stem}.{label_extension}')
        print(f'Reading GT File: {label_file}              ', end = '\r')

        # sep=r'\s+' replaces the deprecated delim_whitespace=True.
        label = pd.read_csv(label_file, sep=r'\s+', header = None)
        label.columns = kitti_columns

        gt_output = Counter(label['Object'].tolist())

        vehicles = gt_output['Car'] + gt_output['Van'] + gt_output['Truck']
        people =  gt_output['Pedestrian'] + gt_output['Person_sitting'] + gt_output['Cyclist']
        urban_vehicles = gt_output['Tram']

        if urban_vehicles > 0:
            if vehicles > 0 or people > 0:
                gt_classification = 'City'
            else:
                gt_classification = 'Unknown'
        else:
            if vehicles == 0 and people > 0:
                gt_classification = 'Pedestrian Traffic'
            elif vehicles >= 2 and people == 0:
                gt_classification = 'Freeway'
            elif vehicles < 2 and people < 3:
                gt_classification = 'Rural'
            elif vehicles >= 2 and people > 0:
                gt_classification = 'City'
            else:
                gt_classification = 'Unknown'

        # NOTE(review): detector names come from CLASS_LABELS (e.g. "car") while the
        # label names are compared as 'Car', 'Pedestrian', ... The keys never match,
        # so this difference equals the raw detections -- confirm intent.
        gt_detection_difference = detections['Objects Detected']-gt_output
        gt_total_labels = sum(gt_output.values())

        ground_truth = {'GT Classification': gt_classification, 'GT Objects': dict(gt_output), 'Number of GT Labels': gt_total_labels, 'Difference between Detections and GT': dict(gt_detection_difference)}

        detections['Objects Detected'] = dict(detections['Objects Detected'])

        output.update(detections)
        output.update(ground_truth)

        detect_list.append(output)

    # Tally scene labels across all frames. The old code passed only the last
    # frame's label string to Counter, which counted its characters.
    sequence_classification = Counter(entry['GT Classification'] for entry in detect_list).most_common(1)

    print('Label Check Completed')
    return {'Sequence Classification': sequence_classification, 'Classification Information': detect_list}

## KITTI Sequential Label Workflow

In [None]:
def kitti_sequence_label_check(image_dir, label_file, image_extension, label_extension):
    """Compare detector output with one sequence-wide label file and classify each frame.

    The label file holds rows for all frames. Each image is matched to its rows
    through the first run of digits in its file name. Object types are counted
    per frame and mapped to a coarse scene label ('City', 'Freeway', 'Rural',
    'Pedestrian Traffic' or 'Unknown').

    Args:
        image_dir: Directory containing the sequence images.
        label_file: Path of the label file WITHOUT extension.
        image_extension: Image extension without the leading dot.
        label_extension: Label-file extension without the leading dot.

    Returns:
        dict with ``'Sequence Classification'`` (``Counter.most_common(1)`` over
        the per-frame scene labels; empty list if there are no images) and
        ``'Classification Information'`` (list of per-frame dicts).
    """
    detect_list = []

    # Import Label File
    label_file = f'{label_file}.{label_extension}'
    print(f'Reading GT File: {label_file}              ', end = '\r')
    # sep=r'\s+' replaces the deprecated delim_whitespace=True.
    labels = pd.read_csv(label_file, sep=r'\s+', header = None)
    labels.columns = ['Frame', 'Track_ID', 'Object', 'Trunc', 'Occ', 'Alpha', 'min_x', 'min_y', 'max_x', 'max_y', 'dim1', 'dim2', 'dim3', 'loc1', 'loc2', 'loc3', 'rot_y']

    for image_file in sorted(glob.glob(f'{image_dir}/*.{image_extension}')):
        print(f'Detecting on Image: {image_file}', end = '\r')
        output = {'Frame': image_file}
        detections = detect_objects(image_file)

        # The frame number is the first run of digits in the image's base name.
        frame_num = int(re.search(r'\d+', os.path.basename(image_file)).group(0))
        frame_label = labels.loc[labels['Frame'] == frame_num]

        gt_output = Counter(frame_label['Object'].tolist())

        vehicles = gt_output['Car'] + gt_output['Van'] + gt_output['Truck']
        people =  gt_output['Pedestrian'] + gt_output['Person_sitting'] + gt_output['Cyclist']
        urban_vehicles = gt_output['Tram']

        if urban_vehicles > 0:
            if vehicles > 0 or people > 0:
                gt_classification = 'City'
            else:
                gt_classification = 'Unknown'
        else:
            if vehicles == 0 and people > 0:
                gt_classification = 'Pedestrian Traffic'
            elif vehicles >= 2 and people == 0:
                gt_classification = 'Freeway'
            elif vehicles < 2 and people < 3:
                gt_classification = 'Rural'
            elif vehicles >= 2 and people > 0:
                gt_classification = 'City'
            else:
                gt_classification = 'Unknown'

        # NOTE(review): detector names come from CLASS_LABELS (e.g. "car") while the
        # label names are compared as 'Car', 'Pedestrian', ... The keys never match,
        # so this difference equals the raw detections -- confirm intent.
        gt_detection_difference = detections['Objects Detected']-gt_output
        gt_total_labels = sum(gt_output.values())

        ground_truth = {'GT Classification': gt_classification, 'GT Objects': dict(gt_output), 'Number of GT Labels': gt_total_labels, 'Difference between Detections and GT': dict(gt_detection_difference)}

        detections['Objects Detected'] = dict(detections['Objects Detected'])

        output.update(detections)
        output.update(ground_truth)

        detect_list.append(output)

    # Tally scene labels across all frames. The old code passed only the last
    # frame's label string to Counter, which counted its characters.
    sequence_classification = Counter(entry['GT Classification'] for entry in detect_list).most_common(1)

    print('Label Check Completed')
    return {'Sequence Classification': sequence_classification, 'Classification Information': detect_list}

## Images Workflow

In [39]:
def clean_images_labeled(project_folder, image_folder, image_extension, label_folder, label_extension, dataset_format, upload_frames=False):
    """Download a labelled image set, analyse it and upload a YAML report.

    Steps: download images and labels, run the blur and duplicate passes, run
    the label check matching ``dataset_format``, write the YAML report, upload
    it to the current bucket, optionally upload the images, then delete the
    local project folder.

    Args:
        project_folder: Top-level folder (bucket prefix and local directory).
        image_folder: Image sub-folder relative to ``project_folder``.
        image_extension: Image extension without the leading dot.
        label_folder: Label sub-folder relative to ``project_folder``; for
            'KITTI_sequence' this is the label file path without extension.
        label_extension: Label-file extension without the leading dot.
        dataset_format: One of 'KITTI', 'OCT', 'JAAD', 'KITTI_sequence'.
        upload_frames: When exactly ``True``, the images are uploaded too.

    Returns:
        ``False`` if ``dataset_format`` is unsupported, otherwise ``True``.
    """
    current_formats = ['KITTI', 'OCT', 'JAAD', 'KITTI_sequence'] # Current List of supported dataset formats
    if dataset_format not in current_formats: # Check to make sure dataset_format is supported
        # The braces were missing, so the literal text "dataset_format" was printed.
        print(f'{dataset_format} dataset format not supported. Cancelling')
        return False

    image_dir = f'{project_folder}/{image_folder}'
    label_dir = f'{project_folder}/{label_folder}'
    yaml_val = {'Image Folder': image_dir, 'Label Folder': label_dir}
    download_images_and_labels(project_folder, image_folder, label_folder)
    yaml_val.update(remove_blurry_images(image_dir, image_extension))
    yaml_val.update(remove_duplicates(image_dir, image_extension))
    if dataset_format == 'JAAD':
        yaml_val.update(detect_file(image_dir, image_extension))
    elif dataset_format == 'KITTI':
        yaml_val.update(kitti_label_check(image_dir, label_dir, image_extension, label_extension))
    elif dataset_format == 'OCT':
        # NOTE(review): oct_label_check is not defined in the visible part of this
        # notebook -- confirm it exists before advertising 'OCT' as supported.
        yaml_val.update(oct_label_check(image_dir, label_dir, image_extension, label_extension))
    elif dataset_format == 'KITTI_sequence':
        yaml_val.update(kitti_sequence_label_check(image_dir, label_dir, image_extension, label_extension))
    remaining_frames = sorted(glob.glob(f'{image_dir}/*.{image_extension}'))
    yaml_val.update({'Clean Frames': remaining_frames})

    # Flatten the image sub-path so the report name is a single file name.
    image_folder_unslashed = image_folder.replace('/', '-')
    yaml_dir = f'{project_folder}_{image_folder_unslashed}.yaml'
    with open(yaml_dir, "w") as f:
        yaml.dump(yaml_val, f, default_flow_style=False)
    print(f'YAML File: {yaml_dir} created')
    # Upload yaml
    bucket.blob(yaml_dir).upload_from_filename(yaml_dir)
    # The report goes to the bucket named by `location`, not to a bucket called project_folder.
    print(f'YAML File: {yaml_dir} uploaded to gs://{location}/{yaml_dir}')
    if upload_frames == True:
        # The `upload_frames` parameter shadows the module-level function of the
        # same name; fetch the function explicitly. Upload from the image
        # directory with the image extension (`extension` was an undefined name).
        globals()['upload_frames'](image_dir, image_extension)
    shutil.rmtree(project_folder)
    print('Process Complete')
    return True

### Old Version

In [22]:
def clean_images(folder_name, extension, upload_frames=False):
    """Download an image folder, analyse ``<folder_name>/images`` and upload a YAML report.

    Steps: download everything under ``folder_name``, run the blur, duplicate
    and object-detection passes on the ``images`` sub-folder, write and upload
    ``<folder_name>.yaml``, optionally upload the images, then delete the local
    folder.

    Args:
        folder_name: Bucket prefix and local directory name (coerced to ``str``).
        extension: Image extension without the leading dot.
        upload_frames: When exactly ``True``, the images are uploaded too.

    Returns:
        None.
    """
    folder_name = str(folder_name)
    yaml_path = f'{folder_name}.yaml'
    yaml_val = {'Folder': folder_name}
    download_images(folder_name)
    image_folder = f'{folder_name}/images'
    yaml_val.update(remove_blurry_images(image_folder, extension))
    yaml_val.update(remove_duplicates(image_folder, extension))
    yaml_val.update(detect_file(image_folder, extension))
    with open(yaml_path, "w") as f:
        yaml.dump(yaml_val, f, default_flow_style=False)
    print(f'YAML File: {folder_name}.yaml created')
    # Upload yaml
    bucket.blob(yaml_path).upload_from_filename(yaml_path)
    print(f'YAML File: {folder_name}.yaml uploaded to gs://{location}/{folder_name}.yaml')
    if upload_frames == True:
        # The `upload_frames` parameter shadows the module-level function of the
        # same name; fetch the function explicitly. The images live in
        # <folder_name>/images, so that is the folder to upload from.
        globals()['upload_frames'](image_folder, extension)
    shutil.rmtree(folder_name)

### Checking Script

In [None]:
# Run the labelled-image workflow over KITTI tracking sequences 0000 through 0020,
# then over the KITTI object-detection dataset.
for sequence_index in range(21):
    sequence_id = f'{sequence_index:04d}'
    clean_images_labeled('kitti_sequential', f'images/training/image_02/{sequence_id}', 'png', f'labels/training/label_02/{sequence_id}', 'txt', 'KITTI_sequence')
clean_images_labeled('kitti', 'images', 'png', 'labels', 'txt', 'KITTI')

Downloading project folder: kitti_sequential.                       
Downloading Image Folder: kitti_sequential/images/training/image_02/0000                       
Downloading Label Folder: kitti_sequential/labels/training/label_02/0000                       
Downloading project folder: kitti_sequential complete.                                
Calculating Average Blurriness
Median Blur (Laplacian Variance): 665.005707949245853.png             
Blur Cutoff (Laplacian Variance): 631.7554225517835
Removing Noisy Images
Done Checking Frames, 72 frames removed.                 
Removing Duplicate and Highly Similar Frames
Calculating Frame Similarities
Similarity Cutoff (OpenCV Compare Images): 0.95ning/image_02/0000/000152.png and kitti_sequential/images/training/image_02/0000/000153.png: 0.53423207863272665
Removing Duplicate Images
Done Checking Frames, 0 frames removed.
Label Check Completeditti_sequential/images/training/image_02/0000/000153.png    
YAML File: kitti_sequential_images

In [None]:
# Run only the KITTI object-detection dataset (the same call that ends the previous cell).
clean_images_labeled('kitti', 'images', 'png', 'labels', 'txt', 'KITTI')

# Analyze Entire Bucket (Deprecated)

In [None]:
#def clean_entire_bucket_videos():
#    blobs = storage_client.list_blobs(location)
#    for blob in blobs:
#        clean_video(blob.name)

# FastAPI Deployment

In [None]:
# ASGI application object; the route decorators below register their handlers on it
# and uvicorn.run() serves it in the final cell.
app = FastAPI()

#@app.on_event("startup")
#def start_wandb():
#    init_wandb(location)
#    return {'message': ('Weights and Balances Started as project: ' + wandb_project)}

@app.get('/')
def index():
    # Landing route: points the caller at the interactive documentation page.
    homepage_message = 'This is the homepage of the model, add \'/docs\' to the end of the URL to access FastAPI to make predictions with the model'
    return {'message': homepage_message}

@app.get('/set_gcp_bucket')
def set_gcp_bucket(string_input):
    # Rebind the module-level bucket, then echo the name that is now active.
    set_bucket(str(string_input))
    confirmation = 'GCP Location Set to: ' + location
    return {'message': confirmation}

@app.get('/clean_single_video_unlabelled')
def app_clean_unlabeled_video(video_name: str, bucket_location: str, extension: str, upload_frames: bool = False):
    # `upload_frames` is declared as bool so the framework converts the query
    # string (e.g. "true"). Left untyped it arrived as a str and `== True`
    # never matched, so frames were never uploaded.
    # A plain `def` is used because the work is long-running and blocking;
    # inside `async def` it would stall the server's event loop.
    clean_unlabeled_video(video_name, bucket_location, extension, upload_frames)
    if upload_frames:
        return {'message': (f'Video: {video_name}.{extension} cleaned and yaml and frames uploaded to gs://{location}/')}
    return {'message': (f'Video: {video_name}.{extension} cleaned and yaml uploaded as gs://{location}/{video_name}.yaml')}

@app.get('/clean_folder_image_sequence')
def single_folder_imageseq(folder_name: str, extension: str, upload_frames: bool = False):
    # `upload_frames` is declared as bool so the query string is converted.
    # As an untyped str it never compared equal to True downstream.
    # A plain `def` keeps this long, blocking job off the server's event loop.
    clean_images(folder_name, extension, upload_frames)
    return {'message': (f'Images in folder: {folder_name} cleaned and yaml uploaded to gs://{location}')}

#@app.get('/clean_bucket_video')
#async def full_clean_video():
#    clean_entire_bucket_video()
#    return {'message': ('Bucket: ' + location + ' cleaned and uploaded to gs://' + location)}

# Main Function

In [None]:
# NOTE(review): presumably needed so uvicorn can start its event loop inside the
# notebook's already-running loop -- confirm.
nest_asyncio.apply()
#wandb.login(relogin=True)
# Serve `app` on all network interfaces, port 8000.
uvicorn.run(app, host='0.0.0.0', port=8000)