<a href="https://colab.research.google.com/github/MatthewYancey/16-9GAN/blob/master/src/process_frames.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Processing

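This notebook reads the video files in a folder, samples frames from each one at a fixed interval, drops duplicate frames, resizes the rest, and saves them as JPEG images packed into train and test zip archives.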

## Imports and Parameters

In [1]:
import os
import glob
import cv2
import shutil
from google.colab import drive
import zipfile

In [2]:
# parameters
# Mount Google Drive at /content/gdrive; the Drive paths below live under it.
drive.mount('/content/gdrive')
# Folder whose contents are globbed as the input videos.
VIDEO_PATH = '/content/gdrive/My Drive/16:9GAN/data_raw/FMA - 16:9/'
# Scratch folder for extracted frames; save_frames deletes and recreates it
# on every call.
FRAME_PATH = '/content/frames/'
# Prefix prepended to the archive names handed to save_frames.
ZIP_FOLDER = '/content/gdrive/My Drive/16:9GAN/data_out/frames_16_9/'

# Seconds between two sampled frames.
FRAME_SECONDS_SKIP = 0.5
# Saved frames are resized to IMAGE_SIZE x IMAGE_SIZE pixels.
IMAGE_SIZE = 256
# Seconds ignored at the start of every video.
SKIP_SECONDS_BEGINNING = 120
# Seconds ignored at the end of every video.
SKIP_SECONDS_END = 120
# Once this many frames are saved, no further videos are processed
# (checked after each video finishes). None disables the limit.
FRAME_LIMIT = None

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Image hash function for checking if we have duplicate images

In [3]:
def dhash(image, hashSize=8):
    """Return the difference hash of a BGR image as a Python int.

    The image is converted to grayscale and resized to ``hashSize`` rows by
    ``hashSize + 1`` columns. Every pixel is then compared with its left
    neighbour, giving ``hashSize * hashSize`` booleans. Bit ``i`` of the
    result is set when the i-th comparison (row-major order) found the right
    pixel brighter than the left one.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # cv2.resize takes (width, height); the extra column yields exactly
    # hashSize neighbour pairs per row
    shrunk = cv2.resize(grayscale, (hashSize + 1, hashSize))
    # horizontal gradient: is each pixel brighter than the one to its left?
    brighter_than_left = shrunk[:, 1:] > shrunk[:, :-1]
    # pack the booleans into one integer, first comparison = lowest bit
    digest = 0
    for position, is_set in enumerate(brighter_than_left.flatten()):
        if is_set:
            digest += 2 ** position
    return digest

## Video loop and frame saving

In [None]:
def save_frames(video_list, zip_path, fps=24):
    """Sample frames from each video, save them as JPEGs and zip the folder.

    FRAME_PATH is emptied first. For every video, frames inside the window
    left after skipping SKIP_SECONDS_BEGINNING and SKIP_SECONDS_END are
    sampled every FRAME_SECONDS_SKIP seconds. A sampled frame is dropped when
    its dhash was already seen in the same video; otherwise it is resized to
    IMAGE_SIZE x IMAGE_SIZE and written as ``<n>.jpg``. The counter ``n``
    keeps increasing across videos, so file names never collide.

    Args:
        video_list: iterable of video file paths opened with cv2.VideoCapture.
        zip_path: base name handed to shutil.make_archive. make_archive
            appends '.zip' itself, so passing 'train.zip' writes
            'train.zip.zip'; this naming is kept unchanged.
        fps: frame rate used to turn the seconds-based settings into frame
            numbers. Defaults to 24, the rate that used to be hard-coded.
            Pass None to use the rate each file reports, falling back to 24
            when the file reports no usable rate.
    """
    # start from an empty frame folder; a failure to delete the old one is
    # raised instead of being swallowed, so stale frames are never zipped
    if os.path.isdir(FRAME_PATH):
        shutil.rmtree(FRAME_PATH)
    os.makedirs(FRAME_PATH)

    frame_count = 0
    for f in video_list:
        print(f)
        vidcap = cv2.VideoCapture(f)
        try:
            video_fps = fps
            if video_fps is None:
                video_fps = vidcap.get(cv2.CAP_PROP_FPS)
                # also catches 0 and NaN
                if not video_fps > 0:
                    video_fps = 24

            # whole seconds of video, then the frame window to keep
            video_length = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT) / video_fps)
            first_frame = SKIP_SECONDS_BEGINNING * video_fps
            last_frame = (video_length - SKIP_SECONDS_END) * video_fps
            # integer step so the modulo test below can actually hit zero
            # even when FRAME_SECONDS_SKIP * fps is not a whole number
            frame_step = max(1, round(FRAME_SECONDS_SKIP * video_fps))

            # hashes of the frames already saved from this video
            image_hashes = set()
            success, image = vidcap.read()
            while success:
                current_frame = vidcap.get(cv2.CAP_PROP_POS_FRAMES)
                # skips the intro, the outro, and all but every
                # frame_step-th frame
                if (first_frame <= current_frame <= last_frame
                        and current_frame % frame_step == 0):

                    # hash a small copy to see if we've had this image before
                    image_small = cv2.resize(image, (100, 100))
                    image_hash = dhash(image_small)

                    # only new images are resized and written to disk
                    if image_hash not in image_hashes:
                        image_hashes.add(image_hash)
                        image = cv2.resize(image, (IMAGE_SIZE, IMAGE_SIZE))
                        cv2.imwrite(f'{FRAME_PATH}{frame_count}.jpg', image)
                        frame_count += 1

                # move on to the next frame
                success, image = vidcap.read()
        finally:
            # free the decoder/file handle before opening the next video
            vidcap.release()

        # stops taking new videos once we have enough images
        print(f'Number of images saved: {frame_count}')
        if FRAME_LIMIT is not None and frame_count >= FRAME_LIMIT:
            break

    # make_archive returns the path it really wrote (base name + '.zip')
    archive_path = shutil.make_archive(zip_path, 'zip', FRAME_PATH)
    print(f'Saved zip {archive_path}')

# keeps one video out for testing
# glob returns paths in arbitrary order, so sort them to hold out the same
# video on every run
train_files = sorted(glob.glob(VIDEO_PATH + '*'))
if not train_files:
    raise FileNotFoundError(f'No video files found in {VIDEO_PATH}')
# test_files is a single path (the last one after sorting), not a list
test_files = train_files.pop()

# saves a train and a test zip file
save_frames(train_files, ZIP_FOLDER + 'train.zip')
save_frames([test_files], ZIP_FOLDER + 'test.zip')


/content/gdrive/My Drive/16:9GAN/data_raw/FMA - 16:9/Full Metal Alchemist Brotherhood.E28.Father.flv
Number of images saved: 2058
/content/gdrive/My Drive/16:9GAN/data_raw/FMA - 16:9/Full Metal Alchemist Brotherhood.E47.Emissary of Darkness.mp4
