# Import Libraries

In [1]:
%%capture
#!pip install --upgrade fastapi ffmpeg uvicorn python-multipart tensorflow-gpu scikit-image imutils wandb tensorflow_hub Pillow pyyaml

from fastapi import FastAPI, File, UploadFile
from skimage.metrics import structural_similarity as compare_ssim

from collections import Counter

import tensorflow_hub as hub
import numpy as np
import tensorflow as tf
from PIL import Image
import ffmpeg, shutil, glob

import multiprocessing as mp

from google.cloud import storage
import nest_asyncio, uvicorn, os, pathlib
import yaml
import cv2, wandb

# Set Up Google Cloud Parameters


## Set Up Google Cloud Project and Model Location

In [2]:
location = 'james-mlsys' # Model Storage Bucket

## Create Storage Bucket

In [3]:
storage_client = storage.Client()

bucket = storage_client.bucket(location)

## Double Check Cloud Bucket (Development Code Only)

In [4]:
%%capture
blobs = storage_client.list_blobs(location)
for blob in blobs:
    print(blob.name)

# WandB Functions

In [5]:
def init_wandb(project_name):
   global wandb_project
   wandb_project = str(project_name)
   wandb.init(project=wandb_project, sync_tensorboard=True)
   return True

# Set Bucket

In [6]:
def set_bucket(in_location):
    global location 
    location = str(in_location)
    global bucket
    bucket = storage_client.bucket(location)
    return True

# Split Video to Frames and Upload

## Download Video to Local Instance

In [40]:
def download_video(video_name, extension, origin_folder, dest_folder):
    print("Checking if video already downloaded", end="\r")
    if os.path.exists(dest_folder + '/' + video_name + '.' + extension):
        text_output = f'Video: {video_name}.{extension} already downloaded. Skipping Download.'
        print(text_output)
        return
    try:
        os.mkdir(str(dest_folder))
    except:
        text_output = f'Folder: {dest_folder} already exists.          '
        print(text_output)
    def_location = f'{origin_folder}/{video_name}.{extension}'
    def_text = f'{origin_folder}_{video_name}.{extension}'
    text_output = f'Downloading: {def_location}'
    print(text_output)
    blob = bucket.blob(def_location)
    blob.download_to_filename(def_text)
    def_destination = f'{dest_folder}/{video_name}.{extension}'
    shutil.move(def_text, def_destination)  

## Break down video to frames

In [42]:
def split_video_frames(video_name, extension, source_folder, dest_folder):
    print("Checking if video already split", end="\r")
    path = f'{dest_folder}/frame00001.png'
    if os.path.exists(path):
        text_output = f'Video: {video_name} already downloaded. Skipping Download.'
        print(text_output)
        return
    try:
        os.mkdir(str(dest_folder))
    except:
        text_output = f'Folder: {dest_folder} already exists.'
        print(text_output)
    video_location = f'{source_folder}/{video_name}.{extension}'
    video_capture = cv2.VideoCapture(video_location)
    saved_frame_name = 1

    while True:
        print("Frame: " + format(saved_frame_name, '05d'), end="\r")
        success, frame = video_capture.read()

        if success:
            cv2.imwrite(f"{str(dest_folder)}/frame{format(saved_frame_name, '05d')}.png", frame)
            saved_frame_name += 1
        else:
            break
    print("Done                       ")

## Upload Video Frames

In [9]:
def upload_frames(folder_name, extension):
    files=sorted(glob.glob(f'{folder_name}/*.{extension}'))
    #files=files[1:]
    
    print("Uploading Frames")
    for i in range(len(files)):
        print(files[i] + "             ", end="\r")
        blob = bucket.blob(folder_name + "/" + files[i])
        blob.upload_from_filename(folder_name + "/" + files[i])
        
    print("Done Uploading               ", end="\r")

# SSIM Compare Video Frames for Novel Frames

## Remove Blurry Images from Set

### Calculate Blurriness using Laplacian

In [10]:
def variance_of_laplacian(image):
    # compute the Laplacian of the image and then return the focus
    # measure, which is simply the variance of the Laplacian
    return cv2.Laplacian(image, cv2.CV_64F).var()

### Remove Blurry Images

In [11]:
def parallelized_remove_blurry(file):
    print(file + "             ", end="\r")
    img=cv2.imread(file)
    img_gray=cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blurriness=variance_of_laplacian(img_gray)
    return blurriness

def remove_blurry_images(folder_name, extension):
    files=sorted(glob.glob(f'{folder_name}/*.{extension}'))
    
    #blurriness = np.zeros(len(files))
    
    print("Calculating Average Blurriness")
    
    # Parallelize
    pool = mp.Pool(mp.cpu_count())
    
    blurriness = [pool.apply(parallelized_remove_blurry, args=(file,)) for file in files]

    pool.close()    
        
    #for i in range(len(files)):
    #    print(files[i] + "             ", end="\r")
    #    img=cv2.imread(files[i])
    #    img_gray=cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    #    blurriness[i]=variance_of_laplacian(img_gray)
    #    wandb.log({'Individual Laplacian': blurriness[i]})
    
    median_blur = np.median(blurriness)
    min_blur = np.min(blurriness)
    max_blur = np.max(blurriness)
    wandb.log({'Individual Laplacian': blurriness, 'Batch Median Laplacian': median_blur})
    print("Median Blur (Laplacian Variance): " + str(median_blur))
    blur_cutoff = median_blur*1.05 #+ ((1-average_blur)*0.1)
    print("Blur Cutoff (Laplacian Variance): " + str(blur_cutoff))
    
    print("Removing Noisy Images")
    
    count = 0
    for i in range(len(files)):
        if blurriness[i] > blur_cutoff:
            #print("Deleting " + files[i] + " - Laplacian Noisiness: " + str(blurriness[i]))
            os.remove(files[i])
            count += 1
    blur_ratio = count/len(files)
    wandb.log({'Noisy Frame Ratio': blur_ratio})
    print(f"Done Checking Frames, {count} frames removed.                 ")
    return {'Total Original Frames': len(files), 'Removed Blurry Frames': count, 'Median Laplacian Variance': median_blur, 'Minimum Laplacian Variance': min_blur, 'Maximum Laplacian Variance': max_blur, 'Noisy Frame Ratio': blur_ratio}

# Deduplicate Similar Frames

## Calculate Similarity Between Images

In [12]:
def compare_images(image1, image2):
    image_gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
    image_gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)
    try:
        diff, _ = compare_ssim(image_gray1, image_gray2, full=True)
    except:
        image_gray2 = cv2.resize(image_gray2, image_gray1.shape, interpolation = cv2.INTER_AREA)
        diff, _ = compare_ssim(image_gray1, image_gray2, full=True)
    return diff

## Remove Duplicates

In [13]:
def parallel_remove_duplicates(i, files):
    image1 = cv2.imread(files[i])
    image2 = cv2.imread(files[i+1])
    try:
        diff = compare_images(image1, image2)
    except:
        image2 = cv2.resize(image2, (image1.shape[1], image1.shape[0]), interpolation = cv2.INTER_AREA)
        diff = compare_images(image1, image2)
    print(str(diff), end="\r")
    return diff
    

def remove_duplicates(folder_name, extension):
    files=sorted(glob.glob(f'{folder_name}/*.{extension}'))
    print("Removing Duplicate and Highly Similar Frames\nCalculating Frame Similarities")
    
    #diff = np.zeros(len(files)-1)    
    
    # Parallelize
    pool = mp.Pool(mp.cpu_count())

    diff = [pool.apply(parallel_remove_duplicates, args=(i, files)) for i in range(len(files)-1)]

    pool.close()   
    
    #for i in range(len(files)-1):
    #    image1 = cv2.imread(files[i])
    #    image2 = cv2.imread(files[i+1])
    #    try:
    #        diff[i] = compare_images(image1, image2)
    #    except:
    #        image2 = cv2.resize(image2, (image1.shape[1], image1.shape[0]), interpolation = cv2.INTER_AREA)
    #        diff[i] = compare_images(image1, image2)
    #    wandb.log({'Individual Frame Similarities': diff[i]})
    #    print(str(diff[i]), end="\r")
    
    median_diff = np.median(diff)
    wandb.log({'Individual Frame Similarities': diff, 'Batch Median Frame Similarity': median_diff})
    
    diff_cutoff = median_diff*1.05
    
    if diff_cutoff < 0.95:
        diff_cutoff = 0.95
        
    print("Similarity Cutoff (OpenCV Compare Images): " + str(diff_cutoff))
    print("Removing Duplicate Images")
    
    count = 0
    for i in range(len(diff)):
        if diff[i] > 0.99:
            #print("Deleting " + files[i] + " - Similarity: " + str(diff[i]), end="\r")
            os.remove(files[i])
            wandb.log({'Duplicates Similarity': diff})
            count += 1
        
    duplicate_ratio = count/len(files)
    wandb.log({'Batch Duplicate Remove Ratio': duplicate_ratio})
    print("Done Checking Frames, " + str(count) + " frames removed.")
    return {'Removed Duplicate Frames': count, 'Median Frame Similarity': median_diff, 'Duplicate Frame Ratio': duplicate_ratio}

# Get Insight into the Dataset using Faster RCNN Resnet50 COC0 2018/01/28

In [14]:
model_url = 'http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_coco_2018_01_28.tar.gz'
base_url = os.path.dirname(model_url)+"/"
model_file = os.path.basename(model_url)
model_name = os.path.splitext(os.path.splitext(model_file)[0])[0]
model_dir = tf.keras.utils.get_file(fname=model_name, origin=base_url + model_file, untar=True)
model_dir = pathlib.Path(model_dir)/"saved_model"
model = tf.saved_model.load(str(model_dir))
model = model.signatures['serving_default']

2021-10-01 03:24:08.874208: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-01 03:24:09.018450: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-01 03:24:09.019374: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-01 03:24:09.021536: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [15]:
CLASS_LABELS = {1: "person", 2: "bicycle", 3: "car", 4: "motorcycle", 5: "airplane", 6: "bus", 7: "train", 8: "truck", 9: "boat", 10: "traffic light", 11: "fire hydrant", 13: "stop sign", 14: "parking meter", 15: "bench", 16: "bird", 17: "cat", 18: "dog", 19: "horse", 20: "sheep", 21: "cow", 22: "elephant", 23: "bear", 24: "zebra", 25: "giraffe", 27: "backpack", 28: "umbrella", 31: "handbag", 32: "tie", 33: "suitcase", 34: "frisbee", 35: "skis", 36: "snowboard", 37: "sports ball", 38: "kite", 39: "baseball bat", 40: "baseball glove", 41: "skateboard", 42: "surfboard", 43: "tennis racket", 44: "bottle", 46: "wine glass", 47: "cup", 48: "fork", 49: "knife", 50: "spoon", 51: "bowl", 52: "banana", 53: "apple", 54: "sandwich", 55: "orange", 56: "broccoli", 57: "carrot", 58: "hot dog", 59: "pizza", 60: "donut", 61: "cake", 62: "chair", 63: "couch", 64: "potted plant", 65: "bed", 67: "dining table", 70: "toilet", 72: "tv", 73: "laptop", 74: "mouse", 75: "remote", 76: "keyboard", 77: "cell phone", 78: "microwave", 79: "oven", 80: "toaster", 81: "sink", 82: "refrigerator", 84: "book", 85: "clock", 86: "vase", 87: "scissors", 88: "teddy bear", 89: "hair drier", 90: "toothbrush"}

In [16]:
def detect_objects(image):
    img = Image.open(image)
    input_tensor = tf.convert_to_tensor(img)
    input_tensor = input_tensor[tf.newaxis,...]
    output_dict = model(input_tensor)
    num_detections = int(output_dict.pop('num_detections'))
    output_dict = {key:value[0, :num_detections].numpy() for key,value in output_dict.items()}
    classes = output_dict['detection_classes'].astype(np.int64)
    class_names = [None] * len(classes)
    for i in range(len(classes)):
        class_names[i]=CLASS_LABELS[classes[i]]
    output = Counter(class_names)
    wandb.log({'Detections per Image': num_detections})
    return {'Objects Detected':dict(output), 'Number of Detections': num_detections}

In [17]:
def detect_file(folder_name, extension):
    detect_list = []
    for file in sorted(glob.glob(f'{folder_name}/*.{extension}')):
        print(f'Detecting on Image: {file}', end = '\r')
        output = {'Frame': file}
        output.update(detect_objects(file))
        detect_list.append(output)
    return {'Classification Information': detect_list}

# Full Video Analysis and Upload

In [46]:
def clean_video(video_name, bucket_location, extension, upload_frames = True):
    folder_name = video_name
    yaml_val = {'Video': f'{video_name}.{extension}'}
    download_video(video_name, extension, bucket_location, folder_name)
    frames_folder = f'{video_name}_frames'
    split_video_frames(video_name, extension, folder_name, frames_folder)
    yaml_val.update(remove_blurry_images(frames_folder, 'png'))
    yaml_val.update(remove_duplicates(frames_folder, 'png'))
    yaml_val.update(detect_file(frames_folder, 'png'))
    f = open(f'{video_name}.yaml', "w")
    yaml.dump(yaml_val, f, default_flow_style=False)
    f.close()
    # Upload yaml
    blob = bucket.blob(f'{video_name}.yaml')
    blob.upload_from_filename(f'{video_name}.yaml')
    if upload_frames == True:
        upload_frames(frame_folder, 'png')
    shutil.rmtree(folder_name)
    shutil.rmtree(frames_folder)

In [47]:
clean_video("video_0001","JAAD_clips","mp4",upload_frames = False)

Video: video_0001.mp4 already downloaded. Skipping Download.
Video: video_0001 already downloaded. Skipping Download.
Calculating Average Blurriness
Median Blur (Laplacian Variance): 77.59506591796044
Blur Cutoff (Laplacian Variance): 81.47481921385847
Removing Noisy Images
Done Checking Frames, 1 frames removed.                 
Removing Duplicate and Highly Similar Frames
Calculating Frame Similarities
Similarity Cutoff (OpenCV Compare Images): 1.0421769452679859
Removing Duplicate Images
Done Checking Frames, 338 frames removed.
Detecting on Image: video_0001_frames/frame00600.png

# Image Sequence Analysis

## Download Images

In [20]:
def parallized_download(blob, folder_name):
    filename = blob.name.replace('/', '_') 
    print(f'Downloading: {filename}', end = '\r')
    blob.download_to_filename(filename) 
    new_filename = blob.name[len(folder_name):]
    shutil.move(filename, folder_name + new_filename)
    
def download_images(folder_name):
    print("Checking if folder already downloaded", end="\r")
    if os.path.exists(folder_name):
        print("Folder " + folder_name + " already downloaded                                                     ")
        return
    print("Downloading: " + str(folder_name))
    os.mkdir(str(folder_name))
    os.mkdir(str(folder_name)+'/images')
    os.mkdir(str(folder_name)+'/labels')
    os.mkdir(str(folder_name)+'/labels/car')
    os.mkdir(str(folder_name)+'/labels/cyclist')
    os.mkdir(str(folder_name)+'/labels/pedestrian')
    blobs = storage_client.list_blobs(location, prefix = folder_name)
    
    # Parallelize
    pool = mp.Pool(mp.cpu_count())

    [pool.apply(parallelized_download, args=(blob, folder_name)) for blob in blobs]

    pool.close()    
    
    #for blob in blobs:
    #    filename = blob.name.replace('/', '_') 
    #    print(filename, end = '\r')
    #    blob.download_to_filename(filename) 
    #    new_filename = blob.name[len(folder_name):]
    #    shutil.move(filename, "kitti" + new_filename)
    #    #print(blob.name, end="\r")
    #    #blob.download_to_filename(folder_name)

## Images Workflow

In [24]:
def clean_images(folder_name, extension, upload_frames=False):
    folder_name = str(folder_name)
    yaml_val = {'Folder': folder_name}
    download_images(folder_name)
    image_folder = f'{folder_name}/images'
    yaml_val.update(remove_blurry_images(image_folder, extension))
    yaml_val.update(remove_duplicates(image_folder, extension))
    yaml_val.update(detect_file(image_folder, extension))
    f = open(f'{folder_name}.yaml', "w")
    yaml.dump(yaml_val, f, default_flow_style=False)
    f.close()
    print(f'YAML File: {folder_name}.yaml created')
    # Upload yaml
    blob = bucket.blob(f'{folder_name}.yaml')
    blob.upload_from_filename(f'{folder_name}.yaml')
    if upload_frames == True:
        upload_frames(folder_name, 'png')
    shutil.rmtree(folder_name)

In [23]:
clean_images('kitti', 'png', upload_frames = False)

Folder kitti already downloaded                                                     
Calculating Average Blurriness
Median Blur (Laplacian Variance): 445.3854210688824
Blur Cutoff (Laplacian Variance): 467.65469212232654
Removing Noisy Images
Done Checking Frames, 937 frames removed.                 
Removing Duplicate and Highly Similar Frames
Calculating Frame Similarities
Similarity Cutoff (OpenCV Compare Images): 0.95
Removing Duplicate Images
Done Checking Frames, 0 frames removed.
Detecting on Image: kitti/images/000001.png

2021-10-01 03:27:25.874250: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Detecting on Image: kitti/images/007514.png

In [22]:
init_wandb(location)

[34m[1mwandb[0m: Currently logged in as: [33mjamesysato[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


True

# Analyze Entire Bucket

In [None]:
#def clean_entire_bucket_videos():
#    blobs = storage_client.list_blobs(location)
#    for blob in blobs:
#        clean_video(blob.name)

# FastAPI Deployment

In [50]:
app = FastAPI()

@app.on_event("startup")
def start_wandb():
    init_wandb(location)
    return {'message': ('Weights and Balances Started as project: ' + wandb_project)}

@app.get('/')
def index():
    return {'message': 'This is the homepage of the model, add \'/docs\' to the end of the URL to access FastAPI to make predictions with the model'}

@app.get('/set_gcp_bucket')
def set_gcp_bucket(string_input):
    set_bucket(str(string_input))
    return {'message': ('GCP Location Set to: ' + location)}

@app.get('/clean_single_video')
async def single_clean(video_name, bucket_location, extension, upload_frames):
    clean_video(video_name, bucket_location, extension, upload_frames)
    #clean_video(str(string_input))
    if upload_frames == True:
        return {'message': (f'Video: {video_name}.{extension} cleaned and yaml and frames uploaded to gs://{location}/')}
    else:
        return {'message': (f'Video: {video_name}.{extension} cleaned and yaml uploaded to gs://{location}/')}

@app.get('/clean_folder_image_sequence')
async def single_folder_imageseq(folder_name, extension, upload_frames):
    clean_images(folder_name, extension, upload_frames)
    return {'message': (f'Images in folder: {folder_name} cleaned and yaml uploaded to gs://{location}')}

#@app.get('/clean_bucket_video')
#async def full_clean_video():
#    clean_entire_bucket_video()
#    return {'message': ('Bucket: ' + location + ' cleaned and uploaded to gs://' + location)}

# Main Function

In [51]:
nest_asyncio.apply()
wandb.login(relogin=True)
uvicorn.run(app, host='0.0.0.0', port=8000)

INFO:     Started server process [1360]
INFO:     Waiting for application startup.


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Batch Duplicate Remove Ratio,0.0
Batch Median Frame Similarity,0.88907
Batch Median Laplacian,128.8135
Detections per Image,6.0
Noisy Frame Ratio,0.37619


0,1
Batch Duplicate Remove Ratio,▁
Batch Median Frame Similarity,▁
Batch Median Laplacian,▁
Detections per Image,▂▄▄▄▃▅▄▄▄▄▄▇▆▃▄▃▄▃▁▂▇▃▄▃▅▅▄▆▄▅▇▇▆▃▅█▆▇▃▃
Noisy Frame Ratio,▁


[34m[1mwandb[0m: wandb version 0.12.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     99.147.232.13:57151 - "GET /docs HTTP/1.1" 200 OK
INFO:     99.147.232.13:57151 - "GET /openapi.json HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [1360]
