In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Conv3D, ReLU, BatchNormalization, MaxPool3D, AveragePooling3D, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.regularizers import L1, L2
import os
import random

In [2]:
from easydict import EasyDict as edict
# Root folder that holds every dataset / checkpoint directory used below.
# Set the I3D_DATA_ROOT environment variable to relocate the whole tree without
# editing the notebook. os.path.join(root, '') guarantees a trailing separator,
# which the `DIR + file_name` concatenations in this notebook rely on.
_DATA_ROOT = os.path.join(
    os.environ.get('I3D_DATA_ROOT', '/Users/vijay/Downloads/Code_Data/I3D_scratch/'), '')

config = edict()

# Training hyper-parameters.
config.BATCH_SIZE = 1
config.NUM_CLASSES = 5
config.NUM_EPOCHS = 2
config.NUM_BATCHES_PER_EPOCH = 3
config.NUM_FRAMES = 105

# NOTE(review): VIDEOS_PATH is read by Data_processing but had no value in this
# notebook; 'videos/' is a placeholder -- point it at the folder that contains
# one sub-directory of raw videos per class.
config.VIDEOS_PATH = _DATA_ROOT + 'videos/'

# Pre-processed clips, stored as .npy arrays.
config.RGB_ARRAY_DIR = _DATA_ROOT + 'rgb_data/'
config.FLOW_ARRAY_DIR = _DATA_ROOT + 'flow_data/'

# Checkpoints and exported weights.
config.RGB_CHECKPOINT_DIR   = _DATA_ROOT + 'checkpoints/rgb/'
config.FLOW_CHECKPOINT_DIR   = _DATA_ROOT + 'checkpoints/flow/'
config.FINAL_WEIGHTS_DIR = _DATA_ROOT + 'checkpoints/final/'
config.CHECKPOINT_DIR   = _DATA_ROOT + 'checkpoints/'

In [3]:
import numpy as np
import os
import glob
import cv2
import math
import random
from PIL import Image
def get_scaled_video(video_path):
    '''
    Decode a video and resize every frame so that its shorter side is 256 pixels.

    As per the paper, the smaller video side is resized to 256 pixels first.
    A video whose shorter side is already 256 is decoded with a scale of 1.0.

    Args:
        video_path: path of the video file, handed to cv2.VideoCapture.

    Returns:
        Tuple (scaled_video, frame_count, fps, scaled_width, scaled_height):
        scaled_video is np.asarray of the resized frames in decode order;
        frame_count and fps are the values reported by the capture properties
        (not the number of frames actually decoded); scaled_width and
        scaled_height are the integer frame dimensions after resizing.

    Raises:
        IOError: if the capture reports a non-positive frame size, e.g. because
            the file could not be opened.
    '''
    video_capture = cv2.VideoCapture(video_path)
    try:
        fps = video_capture.get(cv2.CAP_PROP_FPS)
        frame_count = video_capture.get(cv2.CAP_PROP_FRAME_COUNT)
        frame_width = video_capture.get(cv2.CAP_PROP_FRAME_WIDTH)
        frame_height = video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT)

        shorter_side = min(frame_width, frame_height)
        if shorter_side <= 0:
            raise IOError('Could not read video: ' + str(video_path))

        # Always defined, including the case where the shorter side is already 256.
        scale = 256.0 / shorter_side
        scaled_width = int(frame_width * scale)
        scaled_height = int(frame_height * scale)

        scaled_video = []
        while video_capture.isOpened():
            ret, frame = video_capture.read()
            if not ret:
                break
            scaled_video.append(cv2.resize(frame, (scaled_width, scaled_height)))
    finally:
        # Release the decoder even when reading or resizing fails.
        video_capture.release()

    return np.asarray(scaled_video), frame_count, fps, scaled_width, scaled_height





def get_video_with_frames_every_second(video_array, fps, num_of_frames, duration_of_video):
    '''
    Keep roughly one frame per second from the tail of a video.

    Frames before `(duration_of_video - num_of_frames) * fps` are skipped; from
    there on, every frame whose index is a multiple of the frame rate is kept.

    Args:
        video_array: iterable of frames in temporal order.
        fps: frames per second; may be fractional (e.g. 29.97).
        num_of_frames: number of seconds / frames wanted from the end.
        duration_of_video: duration of the video in seconds.

    Returns:
        np.ndarray holding the selected frames.
    '''
    # A fractional fps would make `frame_num % fps == 0` true almost never, and
    # fps == 0 would divide by zero, so sample on a whole, positive frame step.
    frames_per_second = max(int(round(fps)), 1)
    start_frame_num = (duration_of_video - num_of_frames) * fps

    return np.asarray([
        frame
        for frame_num, frame in enumerate(video_array)
        if frame_num >= start_frame_num and frame_num % frames_per_second == 0
    ])




def get_req_num_of_frames_from_scaled_video(scaled_video, fps, frame_count, s_width, s_height, num_of_frames = None):
    '''
    Return exactly `num_of_frames` frames taken from `scaled_video`.

    * More frames than required: a random subset is drawn without replacement
      and returned in temporal order.
    * Exactly the required number: the video is returned as is.
    * Fewer frames than required: as per the paper, the video is repeated from
      its start until the frame count equals `num_of_frames`.

    Args:
        scaled_video: sequence / array of frames, indexed along its first axis.
        fps, frame_count, s_width, s_height: kept for interface compatibility
            and not used. The number of frames actually present in
            `scaled_video` is used instead of `frame_count`, which in this
            notebook comes from the capture metadata and need not match the
            decoded frames.
        num_of_frames: required number of frames; None returns the video
            unchanged.

    Returns:
        np.ndarray whose first axis has length `num_of_frames` (or the input
        length when `num_of_frames` is None).

    Raises:
        ValueError: if `scaled_video` contains no frames.
    '''
    video = np.asarray(scaled_video)
    if num_of_frames is None:
        return video

    available = len(video)
    if available == 0:
        raise ValueError('scaled_video contains no frames')

    if available == num_of_frames:
        return video

    if available > num_of_frames:
        # Plain Python ints: casting the indices to uint8 would wrap every
        # frame number above 255 back into the first 256 frames.
        frame_numbers = sorted(random.sample(range(available), num_of_frames))
    else:
        # Loop the clip: 0, 1, ..., available - 1, 0, 1, ...
        frame_numbers = np.arange(num_of_frames) % available

    return video[frame_numbers]
        
        


def crop_centre_of_each_frame(video_data, crop_size = 224):
    '''
    Resize each frame so its shorter side is `crop_size`, then cut the central
    `crop_size` x `crop_size` window out of it.

    Args:
        video_data: sequence of frames; each frame is cast to uint8 first.
        crop_size: side length of the square crop.

    Returns:
        List of float32 arrays, one cropped frame per input frame.
    '''
    cropped_frames = []
    for raw_frame in video_data:
        pil_frame = Image.fromarray(raw_frame.astype(np.uint8))
        width, height = pil_frame.width, pil_frame.height
        pixels = np.array(pil_frame)

        # cv2.resize takes (width, height); the longer side gets one extra
        # pixel on top of the scaled length.
        if width > height:
            ratio = float(crop_size) / float(height)
            target_size = (int(width * ratio + 1), crop_size)
        else:
            ratio = float(crop_size) / float(width)
            target_size = (crop_size, int(height * ratio + 1))

        resized = np.array(cv2.resize(pixels, target_size)).astype(np.float32)

        # Axis 0 is rows, axis 1 is columns.
        row_start = int((resized.shape[0] - crop_size) / 2)
        col_start = int((resized.shape[1] - crop_size) / 2)
        window = resized[row_start:row_start + crop_size,
                         col_start:col_start + crop_size,
                         :]
        cropped_frames.append(window)
    return cropped_frames




def perform_random_cropping(video_data, crop_size = 224):
    '''
    Cut the same randomly placed `crop_size` x `crop_size` window out of every
    frame of a video.

    Args:
        video_data: array indexed as [frame, row, column, channel].
        crop_size: side length of the square window.

    Returns:
        The sliced array; the frame and channel axes are left untouched.
    '''
    dims = np.shape(video_data)
    spare_rows = dims[1] - crop_size
    spare_cols = dims[2] - crop_size

    # The column offset is drawn before the row offset.
    left = random.randint(0, spare_cols)
    top = random.randint(0, spare_rows)

    row_window = slice(top, top + crop_size)
    col_window = slice(left, left + crop_size)
    return video_data[:, row_window, col_window, :]
    
    
    

def get_optical_flow(video_data):
    '''
    Compute dense TV-L1 optical flow between consecutive frames and render each
    flow field as a colour image.

    Every flow field is encoded as HSV (hue = flow direction, value = min-max
    normalised magnitude, saturation = 255) and converted with
    cv2.COLOR_HSV2BGR. The last flow image is appended a second time so the
    result has as many entries as `video_data`.

    Args:
        video_data: sequence of frames; each frame is cast to uint8 and
            converted to grayscale with cv2.COLOR_RGB2GRAY.

    Returns:
        np.ndarray of flow images with the same dtype as the first input frame.

    Raises:
        ValueError: if fewer than two frames are supplied (no flow can be
            computed).
    '''
    num_frames = np.shape(video_data)[0]
    if num_frames < 2:
        raise ValueError('get_optical_flow needs at least two frames, got ' + str(num_frames))

    first_frame = np.asarray(video_data[0])
    prev_frame = cv2.cvtColor(first_frame.astype('uint8'), cv2.COLOR_RGB2GRAY)

    # The hue/saturation/value numbers written below follow OpenCV's 8-bit HSV
    # convention (H stored as degrees / 2, S and V in 0..255), so the buffer
    # must be uint8 even when the incoming frames are float32.
    hsv = np.zeros_like(first_frame, dtype=np.uint8)
    hsv[..., 1] = 255

    # One solver instance is enough for the whole clip.
    optical_flow = cv2.optflow.DualTVL1OpticalFlow_create()

    return_flow_data = []
    for index in range(1, num_frames):
        curr_frame = cv2.cvtColor(video_data[index].astype('uint8'), cv2.COLOR_RGB2GRAY)

        flow = optical_flow.calc(prev_frame, curr_frame, None)
        mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1])
        hsv[..., 0] = ang * 180 / np.pi / 2
        hsv[..., 2] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX)

        return_flow_data.append(cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR))
        prev_frame = curr_frame

    # N frames give N - 1 flow fields; repeat the last one to get N entries.
    return_flow_data.append(return_flow_data[-1])

    # Hand the result back in the dtype of the input frames.
    return np.asarray(return_flow_data).astype(first_frame.dtype)




def get_rgb_video_data(video_data):
    '''
    Rescale pixel values linearly so that 0 maps to -1 and 255 maps to 1.
    '''
    unit_range = video_data / 255.0
    return (unit_range * 2) - 1



def third_dim_to_flow_data(flow_data):
    '''
    Append a constant third channel (0.5) to every two-channel flow frame.

    Args:
        flow_data: iterable of frames shaped (height, width, 2).

    Returns:
        np.ndarray shaped (num_frames, height, width, 3). Channels 0 and 1 are
        the input channels at the same pixel positions; channel 2 is 0.5
        everywhere.

    Raises:
        ValueError: if a frame is not a (height, width, 2) array.
    '''
    ret_array = []
    for frame in flow_data:
        frame = np.asarray(frame)
        if frame.ndim != 3 or frame.shape[-1] != 2:
            raise ValueError('expected flow frames shaped (H, W, 2), got ' + str(frame.shape))

        # Size the filler from the frame itself instead of assuming 224 x 224,
        # and stack along the channel axis so rows and columns keep their
        # places (a full transpose would also swap the two spatial axes).
        filler = np.full(frame.shape[:2] + (1,), 0.5)
        ret_array.append(np.concatenate((frame, filler), axis = -1))
    return np.asarray(ret_array)

In [None]:
class Data_processing:
    '''
    Turn a directory-per-class video dataset into RGB and optical-flow arrays.

    The class directories live under config.VIDEOS_PATH; the resulting arrays
    are written to config.RGB_ARRAY_DIR and config.FLOW_ARRAY_DIR.
    '''
    def __init__(self):
        # Maps video file name -> one-hot label vector.
        self.names_labels_dict = {}
        self.get_labels_names()

    def get_labels_names(self):
        '''
        Collect the class names (entries of config.VIDEOS_PATH) into self.labels.

        The names are sorted so that a class keeps the same label index on every
        run; os.listdir returns entries in arbitrary order.
        '''
        self.labels = sorted(label for label in os.listdir(config.VIDEOS_PATH)
                             if label != '.DS_Store')

    def rename_video_files_names(self, dir_name, path_to_dir):
        '''
        Rename every video in a class directory to '<label index>_<counter><ext>'.

        Args:
            dir_name: class name; its position in self.labels is the label index.
            path_to_dir: directory containing the videos of that class.

        The rename happens in two passes (source -> temporary name -> final
        name) so that a file which already carries one of the target names,
        e.g. after an earlier run, is never overwritten.
        '''
        label_number = self.labels.index(dir_name) # for HMD51, dir name is label
        video_names = sorted(name for name in os.listdir(path_to_dir)
                             if name != '.DS_Store')

        staged = []
        for count, video_name in enumerate(video_names):
            ext = os.path.splitext(video_name)[1]
            temp_path = os.path.join(path_to_dir, '.renaming_' + str(count) + ext)
            final_path = os.path.join(path_to_dir, str(label_number) + '_' + str(count) + ext)
            os.rename(os.path.join(path_to_dir, video_name), temp_path)
            staged.append((temp_path, final_path))

        for temp_path, final_path in staged:
            os.rename(temp_path, final_path)

    def get_the_rgb_and_flow_data_of_all_videos(self, class_names = ('climb', 'turn'), max_videos_per_class = 10):
        '''
        Pre-process videos and save their RGB and optical-flow arrays as .npy.

        Each video is scaled, brought to config.NUM_FRAMES frames, centre
        cropped and saved; its optical flow is computed from the cropped frames
        and saved under the same base name. The one-hot label of every
        processed video is recorded in self.names_labels_dict.

        Args:
            class_names: class directories to process; None processes all.
            max_videos_per_class: number of videos taken per class; None takes
                all of them.
        '''
        num_of_labels = len(self.labels)
        for output_dir in (config.RGB_ARRAY_DIR, config.FLOW_ARRAY_DIR):
            os.makedirs(output_dir, exist_ok = True)

        for dir_name in self.labels:
            if class_names is not None and dir_name not in class_names:
                continue

            one_hot_label = np.zeros(num_of_labels)
            one_hot_label[self.labels.index(dir_name)] = 1

            video_dir = os.path.join(config.VIDEOS_PATH, dir_name)
            video_names = sorted(name for name in os.listdir(video_dir)
                                 if name != '.DS_Store')
            if max_videos_per_class is not None:
                video_names = video_names[:max_videos_per_class]

            for video_name in video_names:
                # Copy so that entries of the dict never share one array.
                self.names_labels_dict[video_name] = one_hot_label.copy()

                scaled_video, frame_count, fps, s_width, s_height = get_scaled_video(
                    os.path.join(video_dir, video_name))
                video_arr_with_req_num_frames = get_req_num_of_frames_from_scaled_video(
                    scaled_video, fps, frame_count, s_width, s_height, config.NUM_FRAMES)
                center_cropped_video = crop_centre_of_each_frame(video_arr_with_req_num_frames)

                base_name = os.path.splitext(video_name)[0]
                np.save(os.path.join(config.RGB_ARRAY_DIR, base_name + '.npy'), center_cropped_video)

                flow_data = get_optical_flow(center_cropped_video)
                np.save(os.path.join(config.FLOW_ARRAY_DIR, base_name + '.npy'), flow_data)
                

In [4]:

class I3D:
    '''
    Inflated-3D Inception-style network assembled from Keras layers.

    Call build() to create the model; it is stored on `self.i3d_model`.
    '''
    def __init__(self):
        self.VALID_ENDPOINTS = (
                                  'Logits',
                                  'Predictions',
                              )


    def conv3D_block(self, data, op_channels,
                     kernel_shape = [1, 1, 1],
                     stride = [1, 1, 1],
                     use_activation_fn = True,
                     use_batch_norm = True,
                     use_bias = False):
        '''
        Conv3D ('SAME' padding, L1(0.01) kernel regulariser) optionally followed
        by batch normalisation and ReLU.

        Args:
            data: input tensor.
            op_channels: number of output filters.
            kernel_shape: convolution kernel size.
            stride: convolution strides.
            use_activation_fn: append ReLU when True.
            use_batch_norm: append BatchNormalization when True.
            use_bias: whether the convolution has a bias term.

        Returns:
            The output tensor of the block.
        '''
        # NOTE: Keras collects regulariser penalties in `model.losses`; a custom
        # training loop has to add them to its loss for them to take effect.
        data = Conv3D(filters = op_channels,
                      kernel_size = kernel_shape,
                      strides = stride,
                      padding = 'SAME',
                      use_bias = use_bias,
                      kernel_regularizer = L1(0.01))(data)

        if use_batch_norm:
            data = BatchNormalization()(data)
        if use_activation_fn:
            data = ReLU()(data)

        return data


    def inception_module(self, data, filters = [64, 96, 128, 16, 32, 32]):
        '''
        Four parallel branches concatenated along the channel axis (axis 4).

        Args:
            data: input tensor.
            filters: six filter counts, in order:
                [0] 1x1x1 branch,
                [1] 1x1x1 reduction and [2] 3x3x3 conv of the second branch,
                [3] 1x1x1 reduction and [4] 3x3x3 conv of the third branch,
                [5] 1x1x1 conv applied after the 3x3x3 max-pool branch.

        Returns:
            The concatenated tensor.
        '''
        branch_0 = self.conv3D_block(data, op_channels = filters[0], kernel_shape = [1, 1, 1])

        branch_1 = self.conv3D_block(data, op_channels = filters[1], kernel_shape = [1, 1, 1])
        branch_1 = self.conv3D_block(branch_1, op_channels = filters[2], kernel_shape=[3, 3, 3])

        branch_2 = self.conv3D_block(data, op_channels = filters[3], kernel_shape = [1, 1, 1])
        branch_2 = self.conv3D_block(branch_2, op_channels = filters[4], kernel_shape = [3, 3, 3])

        branch_3 = MaxPool3D(pool_size = [3, 3, 3], strides=[1, 1, 1], padding = 'SAME')(data)
        branch_3 = self.conv3D_block(branch_3, op_channels = filters[5], kernel_shape = [1, 1, 1])

        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 4)

        return net


    def build(self, end_point, num_of_classes, spatial_squeeze):
        '''
        Build the network and store it on `self.i3d_model`.

        Args:
            end_point: 'Logits' (time-averaged logits) or 'Predictions'
                (softmax of the time-averaged logits).
            num_of_classes: number of output classes.
            spatial_squeeze: when True, axes 2 and 3 of the logits are squeezed
                away before averaging over axis 1.

        Raises:
            ValueError: if `end_point` is not one of self.VALID_ENDPOINTS.
        '''
        if end_point not in self.VALID_ENDPOINTS:
            raise ValueError(str(end_point) + ' is not a valid final endpoint')

        input_data = Input(shape = (config.NUM_FRAMES, 224, 224, 3))

        data = self.conv3D_block(input_data, op_channels = 64, kernel_shape=[7, 7, 7], stride = [2, 2, 2])
        data = MaxPool3D(pool_size = [1, 3, 3], strides = [1, 2, 2], padding = 'SAME')(data)
        data = self.conv3D_block(data, op_channels = 64, kernel_shape = [1, 1, 1])
        data = self.conv3D_block(data, op_channels = 192, kernel_shape = [3, 3, 3])
        data = MaxPool3D(pool_size = [1, 3, 3], strides = [1, 2, 2], padding = 'SAME')(data)

        data = self.inception_module(data)
        # NOTE(review): the next configuration is applied twice in a row; the
        # reference I3D appears to have only two modules before this pooling
        # layer -- confirm the extra module is intentional.
        data = self.inception_module(data, [128, 128, 192, 32, 96, 64])
        data = self.inception_module(data, [128, 128, 192, 32, 96, 64])
        data = MaxPool3D(pool_size = [3, 3, 3], strides = [2, 2, 2], padding = 'SAME')(data)
        data = self.inception_module(data, [192, 96, 208, 16, 48, 64])
        data = self.inception_module(data, [160, 112, 224, 24, 64, 64])
        data = self.inception_module(data, [128, 128, 256, 24, 64, 64])
        data = self.inception_module(data, [112, 144, 288, 32, 64, 64])
        data = self.inception_module(data, [256, 160, 320, 32, 128, 128])
        data = MaxPool3D(pool_size = [2, 2, 2], strides = [2, 2, 2], padding = 'SAME')(data)
        data = self.inception_module(data, [256, 160, 320, 32, 128, 128])
        # The 3x3x3 branch of the last module has 384 filters in the
        # Inception-v1 / I3D layout (it was mistyped as 284). Weights saved
        # from the 284-filter variant will not load into this model.
        data = self.inception_module(data, [384, 192, 384, 48, 128, 128])
        data = AveragePooling3D(pool_size = [2, 7, 7], strides = [1, 1, 1], padding = 'VALID')(data)
        logits = self.conv3D_block(data, op_channels = num_of_classes, kernel_shape = [1, 1, 1],
                                  use_activation_fn = False, use_batch_norm = False, use_bias = True)

        if spatial_squeeze:
            logits = tf.squeeze(logits, [2, 3], name = 'SpatialSqueeze')

        # Average the per-time-step logits over axis 1.
        averaged_logits = tf.reduce_mean(logits, axis=1)

        if end_point == 'Logits':
            output = averaged_logits
        else:  # 'Predictions' -- end_point was validated above
            output = tf.nn.softmax(averaged_logits)

        self.i3d_model = Model(input_data, output)
                                     
        

In [5]:
class Train:
    '''
    Trains two I3D networks -- one on RGB clips, one on optical-flow clips --
    with a custom loop, checkpointing both models, both optimizers and the
    epoch counter together.
    '''
    # A checkpoint is written whenever the 0-based epoch index is a positive
    # multiple of this value (and always after the final epoch).
    CHECKPOINT_EVERY = 9

    RGB_WEIGHTS_FILE = 'final_i3d_rgb_weights.h5'
    FLOW_WEIGHTS_FILE = 'final_i3d_flow_weights.h5'

    def __init__(self):
        self.create_dirs()

        i3d_rgb = I3D()
        i3d_flow = I3D()

        i3d_rgb.build('Logits', config.NUM_CLASSES, True)
        i3d_flow.build('Logits', config.NUM_CLASSES, True)

        self.rgb_model = i3d_rgb.i3d_model
        self.flow_model = i3d_flow.i3d_model

        rgb_optimizer = SGD(learning_rate = 0.0001, momentum = 0.9)
        flow_optimizer = SGD(learning_rate = 0.0001, momentum = 0.9)

        # curr_epoch counts the epochs that have been completed.
        self.i3d_checkpoint = tf.train.Checkpoint(curr_epoch = tf.Variable(0),
                                                  rgb_optimizer = rgb_optimizer,
                                                  flow_optimizer = flow_optimizer,
                                                  rgb_model = self.rgb_model,
                                                  flow_model = self.flow_model)
        self.i3d_checkpoint_manager = tf.train.CheckpointManager(self.i3d_checkpoint,
                                                                directory = config.CHECKPOINT_DIR,
                                                                max_to_keep = 3)


    def create_dirs(self):
        '''Create the checkpoint and weight directories if they are missing.'''
        for directory in (config.CHECKPOINT_DIR,
                          config.RGB_CHECKPOINT_DIR,
                          config.FLOW_CHECKPOINT_DIR,
                          config.FINAL_WEIGHTS_DIR):
            os.makedirs(directory, exist_ok = True)


    def restore_recent_checkpoint(self):
        '''
        Resume from the most recent checkpoint, if there is one.

        The checkpoint restores the epoch counter, both optimizers and both
        models. The exported .h5 weights are then loaded on top when the files
        exist; a missing file is skipped.
        '''
        latest_checkpoint = self.i3d_checkpoint_manager.latest_checkpoint
        if not latest_checkpoint:
            return

        self.i3d_checkpoint.restore(latest_checkpoint)

        for model, file_name in ((self.rgb_model, self.RGB_WEIGHTS_FILE),
                                 (self.flow_model, self.FLOW_WEIGHTS_FILE)):
            weights_path = config.FINAL_WEIGHTS_DIR + file_name
            if os.path.exists(weights_path):
                model.load_weights(weights_path)


    def in_each_train_step(self, rgb_batch, flow_batch, labels):
        '''
        Run one optimisation step on both streams.

        Args:
            rgb_batch: batch of RGB clips fed to the RGB model.
            flow_batch: batch of optical-flow clips fed to the flow model.
            labels: one-hot labels, shared by both streams.

        Returns:
            Tuple (rgb_loss, flow_loss): mean softmax cross-entropy per stream.
        '''
        # NOTE(review): the kernel regularisation penalties (model.losses) are
        # not part of these losses -- confirm whether they should be added.
        # Each stream gets its own non-persistent tape, so a tape only records
        # the model it differentiates and is released after gradient().
        with tf.GradientTape() as rgb_tape:
            rgb_logits = self.rgb_model(rgb_batch, training = True)
            rgb_loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels = labels, logits = rgb_logits))
        rgb_variables = self.rgb_model.trainable_variables
        rgb_gradients = rgb_tape.gradient(rgb_loss, rgb_variables)
        self.i3d_checkpoint.rgb_optimizer.apply_gradients(zip(rgb_gradients, rgb_variables))

        with tf.GradientTape() as flow_tape:
            flow_logits = self.flow_model(flow_batch, training = True)
            flow_loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels = labels, logits = flow_logits))
        flow_variables = self.flow_model.trainable_variables
        flow_gradients = flow_tape.gradient(flow_loss, flow_variables)
        self.i3d_checkpoint.flow_optimizer.apply_gradients(zip(flow_gradients, flow_variables))

        return rgb_loss, flow_loss


    def _load_batch(self, video_names):
        '''Load the RGB clips, flow clips and one-hot labels for the given files.'''
        rgb_data, flow_data, labels = [], [], []
        for video_name in video_names:
            # File names look like '<label index>_<counter>.npy'.
            int_label = int(video_name.split('_')[0])
            lb = np.zeros(config.NUM_CLASSES)
            lb[int_label] = 1
            labels.append(lb)

            # allow_pickle is kept for arrays saved as object arrays; only load
            # files produced by this notebook, pickled data can execute code.
            rgb_data.append(np.load(config.RGB_ARRAY_DIR + video_name, allow_pickle = True))

            flow_frames = list(np.load(config.FLOW_ARRAY_DIR + video_name, allow_pickle = True))
            # Bring the flow clip to exactly NUM_FRAMES: repeat the last frame
            # only while the clip is short, so clips that were already padded
            # when saved do not end up one frame too long for the model.
            while flow_frames and len(flow_frames) < config.NUM_FRAMES:
                flow_frames.append(flow_frames[-1])
            flow_data.append(flow_frames[:config.NUM_FRAMES])

        return (np.asarray(rgb_data, dtype = np.float32),
                np.asarray(flow_data, dtype = np.float32),
                np.asarray(labels, dtype = np.float32))


    def _save_progress(self):
        '''Write a checkpoint and export both models' weights as .h5 files.'''
        self.i3d_checkpoint_manager.save()
        self.rgb_model.save_weights(config.FINAL_WEIGHTS_DIR + self.RGB_WEIGHTS_FILE)
        self.flow_model.save_weights(config.FINAL_WEIGHTS_DIR + self.FLOW_WEIGHTS_FILE)
        print('saved')


    def training(self):
        '''
        Train both streams for the epochs that remain up to config.NUM_EPOCHS.

        Every epoch runs config.NUM_BATCHES_PER_EPOCH batches of
        config.BATCH_SIZE randomly drawn clips and reports the mean loss of
        each stream for that epoch.

        Raises:
            ValueError: if config.RGB_ARRAY_DIR holds fewer .npy clips than
                config.BATCH_SIZE.
        '''
        self.restore_recent_checkpoint()
        start_epoch = int(self.i3d_checkpoint.curr_epoch.numpy())

        rgb_loss_log = tf.keras.metrics.Mean('rgb_loss', dtype = tf.float32)
        flow_loss_log = tf.keras.metrics.Mean('flow_loss', dtype = tf.float32)

        # Only the saved clips; stray entries such as '.DS_Store' would break
        # the label parsing.
        all_videos_names = sorted(name for name in os.listdir(config.RGB_ARRAY_DIR)
                                  if name.endswith('.npy'))
        if len(all_videos_names) < config.BATCH_SIZE:
            raise ValueError('Need at least ' + str(config.BATCH_SIZE) + ' clips in '
                             + config.RGB_ARRAY_DIR + ', found ' + str(len(all_videos_names)))

        # Runs epochs start_epoch .. NUM_EPOCHS - 1, i.e. every remaining epoch.
        for actual_epoch in range(start_epoch, config.NUM_EPOCHS):
            print('epoch ' + str(actual_epoch))

            for _ in range(config.NUM_BATCHES_PER_EPOCH):
                batch_video_names = random.sample(all_videos_names, config.BATCH_SIZE)
                rgb_batch, flow_batch, labels = self._load_batch(batch_video_names)

                rgb_loss, flow_loss = self.in_each_train_step(rgb_batch, flow_batch, labels)
                rgb_loss_log.update_state(rgb_loss)
                flow_loss_log.update_state(flow_loss)

            print('At epoch ' + str(actual_epoch)
                  + ' rgb loss is ' + str(float(rgb_loss_log.result().numpy()))
                  + ' and flow loss is ' + str(float(flow_loss_log.result().numpy())))
            rgb_loss_log.reset_states()
            flow_loss_log.reset_states()

            # Count the epoch as finished before saving, so that resuming from
            # the checkpoint continues with the next epoch.
            self.i3d_checkpoint.curr_epoch.assign_add(1)

            is_last_epoch = actual_epoch == config.NUM_EPOCHS - 1
            is_checkpoint_epoch = actual_epoch > 0 and actual_epoch % self.CHECKPOINT_EVERY == 0
            if is_last_epoch or is_checkpoint_epoch:
                self._save_progress()

        print('training finished')
                
                

In [None]:
# Build both I3D streams (the Train constructor also sets up the checkpoint
# objects and directories), then run the training loop.
train_i3d = Train()
train_i3d.training()