## Pre-Processing Notebook

### Goals: 
- Downsample the test and train images and convert them to grayscale before transferring them to the GPU nodes, to save space
- Copy over the label files for the frames

#### Author: Megan Tabbutt

In [None]:
# Root data directory; the trailing slash matters because the paths below are
# built by plain string concatenation.
base_path = (
    "/Users/megantabbutt/Desktop/Computer Science Classes/"
    "762_AdvancedDeepLearning/762_Project/Data/"
)

# Full-resolution input folders and the matching output folders, both in
# (test, train) order so they can be paired by index.
images_paths = ["random_frames_v1_fullRes/" + split + "/" for split in ("test", "train")]
output_paths = ["random_frames_v1_256/" + split + "/" for split in ("test", "train")]

# Target size of the downsampled frames, in pixels.
img_height = img_width = 256

In [None]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import random
import shutil

# Image processing
import cv2

___ 

## Image pre-processing: grayscale conversion and 256×256 resizing

In [None]:
# Set to True to display every frame before and after resizing (slow; meant
# for spot checks on a handful of images only).
show_plots = False

# Target size in OpenCV's (width, height) order; identical for every frame.
down_points = (img_width, img_height)

for path_idx, path in enumerate(images_paths):
    base = base_path + path
    out_dir = base_path + output_paths[path_idx]

    # cv2.imwrite returns False instead of raising when the target folder is
    # missing, so make sure the folder exists up front.
    os.makedirs(out_dir, exist_ok=True)

    for frame_path in os.listdir(base):

        # cv2.imread returns None (no exception) for anything it cannot
        # decode, e.g. hidden files such as .DS_Store; skip those entries.
        image = cv2.imread(base + frame_path)
        if image is None:
            print("Skipping unreadable file: " + base + frame_path)
            continue

        # Convert to greyscale. cv2.imread yields channels in BGR order, so
        # COLOR_BGR2GRAY is the conversion that weights the channels correctly.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # plot the greyscale image at its original resolution
        if show_plots:
            plt.imshow(image, cmap="gray")
            plt.show()

        # Resize
        image_down = cv2.resize(image, down_points, interpolation=cv2.INTER_CUBIC)

        if show_plots:
            plt.imshow(image_down, cmap="gray")
            plt.show()

        # write the image out to the folder, keeping the original file name;
        # fail loudly because imwrite only signals errors via its return value.
        filename = out_dir + frame_path
        if not cv2.imwrite(filename, image_down):
            raise IOError("Could not write " + filename)

___

## Build the frame labels from the paper's label files

In [None]:
# Course directory that contains the project folder.
base_path_labels = (
    "/Users/megantabbutt/Desktop/Computer Science Classes/"
    "762_AdvancedDeepLearning/"
)

# Folder holding one label JSON file per video, relative to base_path_labels.
paper_labels_full_path = "762_Project/Codes/Paper_codes/labels/"

# Output folder for the combined label file, relative to base_path.
labels_out_path = "random_frames_v1_256/labels_paper/"

In [None]:
# Get all the video/frame pairs: maps each video id to the list of frame
# numbers (kept as strings) found in the full-resolution folders.
videos_frames_dict = dict()

for path in images_paths:
    # sorted() makes the resulting frame order reproducible; os.listdir
    # returns entries in arbitrary order.
    for frame_path in sorted(os.listdir(base_path + path)):
        # File names are expected to look like "<video>_<frame>.<ext>".
        name_parts = frame_path.split("_")
        if len(name_parts) < 2:
            continue
        video = name_parts[0]
        frame = name_parts[1].split(".")[0]
        # Skip entries whose frame part is not a number (for example the
        # macOS ".DS_Store" file); the frame is converted with int() when the
        # labels are looked up.
        if not frame.isdigit():
            continue
        videos_frames_dict.setdefault(video, []).append(frame)

videos_frames_dict

In [None]:
# Maps "<video>_<frame>" to the label stored for that frame in the video's
# label file.
vidframes_labels_dict = dict()

for video in videos_frames_dict:
    labels_file_path = base_path_labels + paper_labels_full_path + str(video) + ".json"
    # The context manager closes each label file as soon as it has been
    # parsed, so no handle stays open per video.
    with open(labels_file_path) as f:
        data = json.load(f)
    for frame in videos_frames_dict[video]:
        vidframe = str(video) + "_" + str(frame)
        # The frame number (as an int) indexes into the loaded label data.
        vidframes_labels_dict[vidframe] = data[int(frame)]

vidframes_labels_dict

In [None]:
# Write the combined frame -> label mapping to labels_paper.json.
labels_json_path = base_path + labels_out_path + 'labels_paper.json'
with open(labels_json_path, mode='w') as labels_json:
    json.dump(vidframes_labels_dict, labels_json)

___

## Making a more balanced dataset out of the random frames:

In [None]:
# Combined label file written for the 256-px random frames.
path_labels = (
    "/Users/megantabbutt/Desktop/Computer Science Classes/762_AdvancedDeepLearning/"
    "762_Project/Data/random_frames_v1_256/labels_paper/labels_paper.json"
)

In [None]:
# Load the frame -> label mapping; the context manager closes the file once it
# has been parsed (label_file stays bound, but closed).
with open(path_labels) as label_file:
    label_data = json.load(label_file)

# Class distribution of the labels. The dict view is materialised with list()
# so matplotlib receives a plain sequence of values.
plt.hist(list(label_data.values()), bins=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

In [None]:
# Distinct label values that occur in the loaded label data.
np.unique([label for label in label_data.values()])

## Make a Balanced Training and Test set from the Paper's original videos 

In [None]:
# Data root (trailing slash required: paths are joined by concatenation).
base_path = (
    "/Users/megantabbutt/Desktop/Computer Science Classes/"
    "762_AdvancedDeepLearning/762_Project/Data/"
)

# Folders the 256-px frames are read from and folders the balanced subset is
# copied to, both in (test, train) order.
source_paths = ["random_frames_v1_256/" + split + "/" for split in ("test", "train")]
destination_paths = ["random_frames_256_balanced/" + split + "/" for split in ("test", "train")]

In [None]:
# One empty bucket of frame names per class label that will be balanced.
balance_dict_inv = {class_label: [] for class_label in (1, 2, 4, 5, 6, 7, 9)}

# Maximum number of frames copied per class into the train and test splits.
balance_n_train, balance_n_test = 224, 50

In [None]:
# Number of entries in the loaded label data.
len(label_data)

In [None]:
# Fill each class bucket with the first (balance_n_train + balance_n_test)
# frame names carrying that label, in label_data's iteration order.
# Every bucket is rebuilt from scratch, so re-running this cell yields the same
# result instead of appending duplicates to already-filled lists.
frames_per_class = balance_n_train + balance_n_test

for class_num in balance_dict_inv:
    matching_keys = [key for key in label_data if label_data[key] == class_num]
    balance_dict_inv[class_num] = matching_keys[:frames_per_class]


In [None]:
# Number of frame names collected in the bucket for class 9.
len(balance_dict_inv[9])

In [None]:
def _copy_class_frames(source_dir, destination_dir, wanted_names, limit, class_num, labels):
    """Copy up to `limit` .jpg frames of one class and record their labels.

    Parameters:
        source_dir: folder to scan for frames (with trailing slash).
        destination_dir: folder the selected frames are copied into (with
            trailing slash); created if it does not exist.
        wanted_names: set of frame names (file name without ".jpg") that
            belong to the class.
        limit: maximum number of frames to copy.
        class_num: label stored in `labels` for every copied frame.
        labels: dict updated in place with frame name -> class_num.

    Returns:
        The number of frames that were copied.
    """
    os.makedirs(destination_dir, exist_ok=True)
    n_copied = 0
    # sorted() makes the selection reproducible; os.listdir order is arbitrary.
    for vid in sorted(os.listdir(source_dir)):
        if n_copied == limit:
            break
        if not vid.endswith(".jpg"):
            continue
        vid_name = vid[:-4]
        if vid_name in wanted_names:
            labels[vid_name] = class_num
            n_copied += 1
            shutil.copy(source_dir + vid, destination_dir + vid)
    return n_copied


# Frame name -> class label for every frame copied into the balanced set.
frame_labels_balanced = dict()
# NOTE(review): test_labels is never filled in this cell; the name is kept in
# case other cells expect it to exist.
test_labels = dict()

for class_num in balance_dict_inv:
    print(class_num)

    # The per-class buckets are lists; a set gives constant-time membership
    # checks while scanning the folders.
    wanted_names = set(balance_dict_inv[class_num])

    # Do testing:
    n_test = _copy_class_frames(
        base_path + source_paths[0],
        base_path + destination_paths[0],
        wanted_names,
        balance_n_test,
        class_num,
        frame_labels_balanced,
    )

    # Do training:
    n_train = _copy_class_frames(
        base_path + source_paths[1],
        base_path + destination_paths[1],
        wanted_names,
        balance_n_train,
        class_num,
        frame_labels_balanced,
    )

    # Make a shortfall visible: the set is only balanced if every class
    # reaches the requested counts in both splits.
    if n_test < balance_n_test or n_train < balance_n_train:
        print("  class " + str(class_num) + " is short: "
              + str(n_test) + "/" + str(balance_n_test) + " test, "
              + str(n_train) + "/" + str(balance_n_train) + " train")

# Write the labels of the balanced set, creating the folder if necessary.
balanced_labels_dir = base_path + "random_frames_256_balanced/labels_paper/"
os.makedirs(balanced_labels_dir, exist_ok=True)
with open(balanced_labels_dir + 'labels_paper.json', 'w') as fp:
    json.dump(frame_labels_balanced, fp)

In [None]:
# Data root (trailing slash required: paths are joined by concatenation).
base_path = (
    "/Users/megantabbutt/Desktop/Computer Science Classes/"
    "762_AdvancedDeepLearning/762_Project/Data/"
)

# Full-resolution input folders and 256-px output folders, in (test, train)
# order.
images_paths = ["random_frames_v1_fullRes/" + split + "/" for split in ("test", "train")]
output_paths = ["random_frames_v1_256/" + split + "/" for split in ("test", "train")]

In [None]:
# Label file of the balanced subset.
path_labels = "/Users/megantabbutt/Desktop/Computer Science Classes/762_AdvancedDeepLearning/762_Project/Data/random_frames_256_balanced/labels_paper/labels_paper.json"

# The context manager closes the file once it has been parsed (label_file
# stays bound, but closed).
with open(path_labels) as label_file:
    label_data = json.load(label_file)

# Class distribution after balancing; list() hands matplotlib a plain sequence
# rather than a dict view.
plt.hist(list(label_data.values()), bins=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

In [None]:
# Display the file object the label data was last read from.
label_file

In [None]:
# NOTE(review): the file name suggests a shuffled pointer table for the test
# split — confirm what the entries contain.
path_labels = "/Users/megantabbutt/Desktop/Computer Science Classes/762_AdvancedDeepLearning/762_Project/Data/random_frames_256_balanced/labels_paper/pointer_table_shuffled_test.json"

# The context manager closes the file once it has been parsed (label_file
# stays bound, but closed).
with open(path_labels) as label_file:
    label_data = json.load(label_file)
# The histogram for this file is drawn in the following cell from the second
# field of each entry.

In [None]:
# Collect the second field of every entry (presumably the class label —
# confirm) and plot its distribution.
vals = [entry[1] for entry in label_data]

plt.hist(vals, bins=list(range(1, 11)))