## Imports

In [1]:
%matplotlib inline
import cv2
import os
import numpy as np
import keras
import matplotlib.pyplot as plt
# import download
from random import shuffle
from tensorflow.keras.applications import VGG16
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense, Activation
import sys
import h5py

In [2]:
# Display the version string of the imported keras package.
keras.__version__

## Helper Functions

We will use the function ```print_progress``` to print the percentage of videos processed so far.

In [3]:
def print_progress(count, max_count):
    """Write an in-place progress percentage to standard output.

    Args:
        count: Number of items processed so far.
        max_count: Total number of items to process.
    """
    fraction_done = count / max_count

    # The leading carriage return sends the cursor back to the start of the
    # line, so each call overwrites the previously printed status.
    status = "\r- Progress: {0:.1%}".format(fraction_done)

    stream = sys.stdout
    stream.write(status)
    # Flush so the update is visible immediately (no newline is written).
    stream.flush()

## Load Data

Firstly, we define the directory to place the video dataset

In [4]:
# Root directory that is walked (recursively) to find the video files.
in_dir = "../input/hockey-fight-vidoes/data"

Copy some of the data-dimensions for convenience.

In [5]:
# Frame size: every frame is resized to img_size x img_size pixels
img_size = 256

# Square frame size as a tuple, used when building frame-batch shapes
img_size_touple = (img_size, img_size)

# Number of channels (RGB)
num_channels = 3

# Number of values in one flattened frame
img_size_flat = img_size * img_size * num_channels

# Number of classes for classification (Violence-No Violence)
num_classes = 2

# Number of files to train
_num_files_train = 1

# Number of frames read from each video
_images_per_file = 20

# Number of frames per training set
_num_images_train = _num_files_train * _images_per_file

# Video extension
# NOTE(review): not referenced by the visible cells — files are selected by
# name prefix only; confirm whether an extension filter was intended.
video_exts = ".avi"

In [6]:
def get_frames(current_dir, file_name):
    """Read the first ``_images_per_file`` frames of a video as RGB arrays.

    Args:
        current_dir: Directory that contains the video.
        file_name: Name (or relative path) of the video inside ``current_dir``.

    Returns:
        A float16 array of shape (_images_per_file, img_size, img_size, 3)
        with pixel values scaled to [0, 1].

    Raises:
        IOError: If the video cannot be read or holds fewer than
            ``_images_per_file`` frames.
    """
    in_file = os.path.join(current_dir, file_name)

    images = []

    vidcap = cv2.VideoCapture(in_file)

    try:
        while len(images) < _images_per_file:

            success, image = vidcap.read()

            # read() returns (False, None) once the stream is exhausted or
            # unreadable; fail with a clear message instead of letting
            # cvtColor crash on a None frame.
            if not success:
                raise IOError(
                    "Could only read {0} of {1} frames from '{2}'".format(
                        len(images), _images_per_file, in_file))

            # OpenCV decodes frames as BGR; convert to RGB.
            RGB_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            res = cv2.resize(RGB_img, dsize=(img_size, img_size),
                             interpolation=cv2.INTER_CUBIC)

            images.append(res)
    finally:
        # Always free the decoder/file handle, even on failure.
        vidcap.release()

    resul = np.array(images)

    # Scale to [0, 1] and store as 16-bit floats to save memory.
    resul = (resul / 255.).astype(np.float16)

    return resul

### Helper function to get the names of the data downloaded and label it

In [7]:
def label_video_names(in_dir):
    """Collect the videos under ``in_dir`` and label them from their names.

    Files whose name starts with 'fi' are labelled [1, 0] (violence) and files
    starting with 'no' are labelled [0, 1] (no violence); every other file is
    ignored. The (name, label) pairs are shuffled together.

    Args:
        in_dir: Root directory, searched recursively.

    Returns:
        A tuple ``(names, labels)`` of two equally long tuples. Each name is a
        path relative to ``in_dir`` (just the file name for files located
        directly in ``in_dir``). Both tuples are empty when nothing matches.
    """
    # list containing video names
    names = []
    # list containing video labels [1, 0] if it has violence and [0, 1] if not
    labels = []

    for current_dir, dir_names, file_names in os.walk(in_dir):

        for file_name in file_names:

            if file_name.startswith('fi'):
                label = [1, 0]
            elif file_name.startswith('no'):
                label = [0, 1]
            else:
                continue

            # os.walk descends into subdirectories, so keep the path relative
            # to in_dir; the bare file name alone could not be re-opened with
            # os.path.join(in_dir, name) for nested files.
            full_path = os.path.join(current_dir, file_name)
            names.append(os.path.relpath(full_path, in_dir))
            labels.append(label)

    # zip(*[]) cannot be unpacked into two values; return empty results.
    if not names:
        return (), ()

    c = list(zip(names, labels))
    # Shuffle the data (names and labels stay paired)
    shuffle(c)

    names, labels = zip(*c)

    return names, labels

### Plot a video frame to see if data is correct

In [8]:
# Collect the shuffled names and one-hot labels of every video under in_dir
names, labels = label_video_names(in_dir)

Then we are going to load 20 frames of one video, for example

In [9]:
# Name of one video picked from the shuffled list (index 12 is arbitrary)
names[12]

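Look at the name of the video: if it starts with 'fi' the video contains violence, and if it starts with 'no' it does not. Because the list is shuffled without a fixed seed, the video at this index can differ between runs.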

In [10]:
# Load the first _images_per_file frames of that video, scaled to [0, 1]
frames = get_frames(in_dir, names[12])

Convert back the frames to uint8 pixel format to plot the frame

In [11]:
# Scale the [0, 1] frames back to the 0-255 range; the uint8 cast truncates
visible_frame = (frames*255).astype('uint8')

In [12]:
# Show the frame at index 3 of the sampled video
plt.imshow(visible_frame[3])

In [13]:
# Show the frame at index 15 of the sampled video
plt.imshow(visible_frame[15])

In [14]:
!pip install tensorflow_addons

In [15]:
import tensorflow as tf

from keras.applications import imagenet_utils
from tensorflow.keras import layers
from tensorflow import keras

import tensorflow_datasets as tfds
import tensorflow_addons as tfa

import numpy as np

In [16]:
# Size of the first axis produced by the unfold Reshape in MobileVitBlock
patch_size = 4
# NOTE(review): image_size and image_size_large are not referenced by the
# visible cells — confirm whether they are still needed.
image_size = 256
image_size_large = 270
# Multiplier applied to a channel count to get InvertedRes expand_channels
expansion_ratio = 2.0


In [17]:
class InvertedRes(layers.Layer):
  """Inverted residual block: 1x1 expand -> 3x3 depthwise -> 1x1 pointwise.

  Args:
    expand_channels: Number of filters of the 1x1 expansion convolution.
      Floats (e.g. ``16 * 2.0``) are accepted and truncated to int.
    output_channels: Number of filters of the final 1x1 convolution.
    strides: Stride of the depthwise convolution.

  The input is added back to the output (skip connection) only when both
  tensors have the same shape, i.e. when ``strides == 1`` and the input
  already has ``output_channels`` channels.
  """
  def __init__(self, expand_channels, output_channels, strides=1):
    super().__init__()
    # Channel counts are usually computed with a float expansion ratio;
    # convolution filter counts must be integers.
    expand_channels = int(expand_channels)
    self.output_channels = int(output_channels)
    self.strides = strides
    self.expand = keras.models.Sequential([
                                          layers.Conv2D(expand_channels, 1, padding="same", use_bias=False),
                                          layers.BatchNormalization(),
                                          layers.Activation('swish')
                                        ], name="expand")
    self.dw_conv = keras.models.Sequential([
                                          layers.DepthwiseConv2D(3, strides=strides, padding="same", use_bias=False),
                                          layers.BatchNormalization(),
                                          layers.Activation('swish')
                                        ], name="depthwise")
    # No activation after the pointwise projection.
    self.pw_conv = keras.models.Sequential([
                                          layers.Conv2D(self.output_channels, 1, padding="same", use_bias=False),
                                          layers.BatchNormalization(),
                                        ], name='pointwise')

  def call(self, x):
    o = self.expand(x)
    o = self.dw_conv(o)
    o = self.pw_conv(o)
    # Compare the INPUT channel count: the output of pw_conv always has
    # output_channels channels, so testing `o` would make this check a no-op
    # and `o + x` would be attempted on mismatching channel counts.
    if self.strides == 1 and x.shape[-1] == self.output_channels:
      return o + x
    return o

In [18]:
class FullyConnected(layers.Layer):
  """MLP made of swish-activated Dense layers, each followed by Dropout.

  Args:
    hidden_units: Iterable with the width of every Dense layer, in order.
    dropout_rate: Rate used by the Dropout layer placed after each Dense.
  """
  def __init__(self, hidden_units, dropout_rate):
    super().__init__()
    # For every width: first the Dense layer, then its Dropout.
    stack = [
        block
        for width in hidden_units
        for block in (layers.Dense(width, activation=tf.nn.swish),
                      layers.Dropout(dropout_rate))
    ]
    self.mlp = keras.models.Sequential(stack)

  def call(self, x):
    """Run ``x`` through the Dense/Dropout stack."""
    return self.mlp(x)

In [19]:
class Transformer(layers.Layer):
  """Attention + MLP block with a skip connection around each sub-block.

  Layer normalization is applied before the self-attention and before the
  MLP; the MLP widths are derived from the input's last dimension in build().

  Args:
    projection_dim: ``key_dim`` of the multi-head attention.
    heads: Number of attention heads.
  """
  def __init__(self, projection_dim, heads=2):
    super().__init__()
    self.norm1 = layers.LayerNormalization(epsilon=1e-6)
    self.attention = layers.MultiHeadAttention(num_heads=heads, key_dim=projection_dim, dropout=0.1)
    self.norm2 = layers.LayerNormalization(epsilon=1e-6)

  def build(self, input_shape):
    # The MLP expands to twice the feature size and projects back.
    feature_dim = input_shape[-1]
    self.mlp = FullyConnected([feature_dim * 2, feature_dim], dropout_rate=0.1)

  def call(self, x):
    normed = self.norm1(x)
    # Self-attention: the normalized input is both query and value.
    attended = x + self.attention(normed, normed)
    return self.mlp(self.norm2(attended)) + attended

In [20]:
class MobileVitBlock(layers.Layer):
  """Local convolutions -> transformer stack over unfolded patches -> fusion.

  Args:
    num_blocks: Number of Transformer layers applied to the unfolded patches.
    projection_dim: Channel count of the convolutions and transformer input.
    strides: Stride passed to every convolution of the block.

  NOTE(review): the unfold/fold reshape targets are computed from the INPUT
  height and width, so they only match the output of ``conv_local`` when
  ``strides == 1`` (the only value used in this notebook) — confirm before
  using another stride.
  """
  def __init__(self, num_blocks, projection_dim, strides=1):
    super().__init__()
    self.projection_dim = projection_dim
    swish = tf.nn.swish
    local_convs = [
        layers.Conv2D(projection_dim, 3, padding="same", strides=strides, activation=swish),
        layers.Conv2D(projection_dim, 1, padding="same", strides=strides, activation=swish),
    ]
    self.conv_local = keras.models.Sequential(local_convs)
    encoder_stack = [Transformer(projection_dim, heads=2) for _ in range(num_blocks)]
    self.transformers = keras.models.Sequential(encoder_stack)
    self.conv_folded = layers.Conv2D(projection_dim, 1, padding="same", strides=strides, activation=swish)
    self.conv_local_global = layers.Conv2D(projection_dim, 3, padding="same", strides=strides, activation=swish)

  def build(self, input_shape):
    height, width = input_shape[1], input_shape[2]
    # Uses the module-level patch_size as the size of the first unfolded axis.
    num_patches = int((height * width) / patch_size)
    self.unfold = layers.Reshape((patch_size, num_patches, self.projection_dim))
    self.fold = layers.Reshape((height, width, self.projection_dim))

  def call(self, x):
    local = self.conv_local(x)
    # Unfold to patches, run the transformers, then fold back to a feature map.
    attended = self.transformers(self.unfold(local))
    refolded = self.conv_folded(self.fold(attended))
    # Fuse the block input with the transformed features along channels.
    fused = tf.concat([x, refolded], axis=-1)
    return self.conv_local_global(fused)

In [21]:
class MobileVit(keras.models.Model):
  """Reduced MobileViT-style feature extractor wrapped in a Keras Model.

  The whole network lives in ``self.features``: a strided 1x1 convolution,
  five InvertedRes blocks, one MobileVitBlock and a final single-filter 1x1
  convolution.
  """
  def __init__(self):
    super().__init__()
    stem = layers.Conv2D(16, 1, padding="same", strides=(2, 2), activation=tf.nn.swish)
    # (channels scaled by expansion_ratio, output channels, stride)
    res_specs = [
        (16, 16, 1),
        (16, 24, 2),
        (24, 24, 1),
        (24, 24, 1),
        (24, 48, 2),
    ]
    res_blocks = [
        InvertedRes(base * expansion_ratio, out_channels, strides=stride)
        for base, out_channels, stride in res_specs
    ]
    vit_block = MobileVitBlock(2, 64, strides=1)
    head = layers.Conv2D(1, 1, padding="same", strides=(1, 1), activation=tf.nn.swish)
    self.features = keras.models.Sequential(
        [stem, *res_blocks, vit_block, head], name = "features")

  def call(self, x):
    """Return the feature map computed by the ``features`` stack."""
    return self.features(x)

In [22]:
# Instantiate the subclassed model and build it for 256x256 single-channel input
model = MobileVit()
model.build((None, 256, 256, 1))

In [23]:
# Print the layer and parameter summary of the built model
model.summary()

Architecture reference (see the [original paper](https://arxiv.org/abs/2110.02178)):


![](https://i.imgur.com/mANnhI7.png)

![](https://i.ibb.co/sRbVRBN/image.png)

In [24]:
# Sequential MobileViT-style feature extractor for 256x256 RGB frames.
# NOTE: this rebinds the name `MobileVit` (previously the Model subclass) to a
# Sequential instance; the name is kept because the next cell reads it.
MobileVit = Sequential([
    layers.InputLayer(input_shape=(256, 256, 3)),
    layers.Conv2D(16, 3, padding="same", strides=(2, 2), activation=tf.nn.swish),
    # Channel counts are cast to int because expansion_ratio is a float.
    InvertedRes(int(16 * expansion_ratio), 16, strides=1),
    InvertedRes(int(16 * expansion_ratio), 24, strides=2),
    InvertedRes(int(24 * expansion_ratio), 24, strides=1),
    InvertedRes(int(24 * expansion_ratio), 24, strides=1),
    InvertedRes(int(24 * expansion_ratio), 48, strides=2),
    MobileVitBlock(2, 64, strides=1),
    InvertedRes(int(64 * expansion_ratio), 64, strides=2),
    MobileVitBlock(4, 80, strides=1),
    InvertedRes(int(80 * expansion_ratio), 80, strides=2),
    MobileVitBlock(3, 96, strides=1),
    layers.Conv2D(64, 1, padding="same", strides=(1, 1), activation=tf.nn.swish),
    # Explicit name: a later cell looks this layer up with get_layer('flatten').
    # The auto-generated name would become 'flatten_1', 'flatten_2', ... when
    # this cell is re-run in the same kernel.
    layers.Flatten(name='flatten'),
])
# Build with the same 3-channel shape that the InputLayer declares (the
# original call passed 1 channel, contradicting the InputLayer).
MobileVit.build((None, 256, 256, 3))
MobileVit.summary()

In [25]:
# Use the Sequential MobileViT network as the per-frame feature extractor
image_model =MobileVit

In [26]:
# Print the layer and parameter summary of the feature extractor
image_model.summary()


In [27]:

# The per-frame feature vector ("transfer values") is the output of the
# Flatten layer. Look the layer up by type instead of by the auto-generated
# name 'flatten', which changes to 'flatten_1', ... when the model cell is
# re-run in the same kernel.
transfer_layer = next(
    (layer for layer in reversed(image_model.layers)
     if isinstance(layer, layers.Flatten)),
    None)
if transfer_layer is None:
    raise ValueError("image_model has no Flatten layer to read transfer values from")

image_model_transfer = Model(inputs=image_model.input,
                             outputs=transfer_layer.output)

# Length of the feature vector produced for one frame.
transfer_values_size = K.int_shape(transfer_layer.output)[1]


# The messages previously named VGG16, which is not the model used here.
print("The input of the feature-extraction network has dimensions:",K.int_shape(image_model.input)[1:3])

print("The output of the selected layer of the feature-extraction network has dimensions: ", transfer_values_size)

In [28]:
def get_transfer_values(current_dir, file_name):
    """Compute the transfer values (per-frame features) of one video.

    Args:
        current_dir: Directory that contains the video.
        file_name: Name of the video file inside ``current_dir``.

    Returns:
        The array returned by ``image_model_transfer.predict`` for the frames
        read by ``get_frames``: one row of ``transfer_values_size`` features
        per frame.
    """
    # The original pre-allocated zero arrays for both the frames and the
    # transfer values and overwrote them immediately; those dead allocations
    # are dropped.
    image_batch = get_frames(current_dir, file_name)

    transfer_values = image_model_transfer.predict(image_batch)

    return transfer_values

### Generator that processes one video through the feature-extraction network on each iteration

In [29]:
def proces_transfer(vid_names, in_dir, labels):
    """Yield the transfer values and per-frame labels of each video in turn.

    Args:
        vid_names: Sequence of video names, relative to ``in_dir``.
        in_dir: Directory that contains the videos.
        labels: Sequence with one label vector per video, aligned with
            ``vid_names``.

    Yields:
        ``(transfer_values, frame_labels)`` where ``transfer_values`` is the
        output of ``image_model_transfer.predict`` for the video's frames and
        ``frame_labels`` is a float64 array that repeats the video's label
        once for every row of ``transfer_values``.
    """
    for index, video_name in enumerate(vid_names):

        image_batch = get_frames(in_dir, video_name)

        transfer_values = image_model_transfer.predict(image_batch)

        # Repeat the video-level label for every frame. The number of rows is
        # taken from the features themselves rather than a hard-coded 20x2
        # array of ones, so it follows _images_per_file and the label length.
        video_label = np.asarray(labels[index], dtype=np.float64)
        labelss = np.tile(video_label, (transfer_values.shape[0], 1))

        yield transfer_values, labelss

In [30]:
def make_files(n_files):
    """Store the transfer values of the training videos in 'prueba.h5'.

    Reads ``names_training`` / ``labels_training`` through ``proces_transfer``
    and appends each video's features to the resizable dataset 'data' and its
    per-frame labels to 'labels'.

    Args:
        n_files: Maximum number of videos to process. The first video is
            always written because its shape and dtype define the datasets.

    Raises:
        ValueError: If there is no training video to process.
    """
    from itertools import islice

    gen = proces_transfer(names_training, in_dir, labels_training)

    # Read the first chunk to get the dataset shapes and dtypes
    try:
        chunk = next(gen)
    except StopIteration:
        raise ValueError("names_training is empty: no video to process") from None

    row_count = chunk[0].shape[0]
    row_count2 = chunk[1].shape[0]

    with h5py.File('prueba.h5', 'w') as f:

        # Initialize resizable datasets (unlimited first axis) for the output
        maxshape = (None,) + chunk[0].shape[1:]
        maxshape2 = (None,) + chunk[1].shape[1:]

        dset = f.create_dataset('data', shape=chunk[0].shape, maxshape=maxshape,
                                chunks=chunk[0].shape, dtype=chunk[0].dtype)

        dset2 = f.create_dataset('labels', shape=chunk[1].shape, maxshape=maxshape2,
                                 chunks=chunk[1].shape, dtype=chunk[1].dtype)

        # Write the first chunk of rows
        dset[:] = chunk[0]
        dset2[:] = chunk[1]

        numer = 1
        if n_files > 0:
            print_progress(numer, n_files)

        # islice stops BEFORE pulling the next item, so the video after the
        # limit is never decoded and predicted just to be thrown away (the
        # original checked the limit only after the generator had produced it).
        for chunk in islice(gen, max(n_files - 1, 0)):

            # Resize the datasets to accommodate the next chunk of rows
            dset.resize(row_count + chunk[0].shape[0], axis=0)
            dset2.resize(row_count2 + chunk[1].shape[0], axis=0)

            # Write the next chunk; each dataset uses its own row offset
            dset[row_count:] = chunk[0]
            dset2[row_count2:] = chunk[1]

            # Increment the row counts
            row_count += chunk[0].shape[0]
            row_count2 += chunk[1].shape[0]

            # Report the number of videos written so far (reaches 100 %)
            numer += 1
            print_progress(numer, n_files)

In [31]:
def make_files_test(n_files):
    """Store the transfer values of the test videos in 'pruebavalidation.h5'.

    Reads ``names_test`` / ``labels_test`` through ``proces_transfer`` and
    appends each video's features to the resizable dataset 'data' and its
    per-frame labels to 'labels'.

    Args:
        n_files: Maximum number of videos to process. The first video is
            always written because its shape and dtype define the datasets.

    Raises:
        ValueError: If there is no test video to process.
    """
    from itertools import islice

    gen = proces_transfer(names_test, in_dir, labels_test)

    # Read the first chunk to get the dataset shapes and dtypes
    try:
        chunk = next(gen)
    except StopIteration:
        raise ValueError("names_test is empty: no video to process") from None

    row_count = chunk[0].shape[0]
    row_count2 = chunk[1].shape[0]

    with h5py.File('pruebavalidation.h5', 'w') as f:

        # Initialize resizable datasets (unlimited first axis) for the output
        maxshape = (None,) + chunk[0].shape[1:]
        maxshape2 = (None,) + chunk[1].shape[1:]

        dset = f.create_dataset('data', shape=chunk[0].shape, maxshape=maxshape,
                                chunks=chunk[0].shape, dtype=chunk[0].dtype)

        dset2 = f.create_dataset('labels', shape=chunk[1].shape, maxshape=maxshape2,
                                 chunks=chunk[1].shape, dtype=chunk[1].dtype)

        # Write the first chunk of rows
        dset[:] = chunk[0]
        dset2[:] = chunk[1]

        numer = 1
        if n_files > 0:
            print_progress(numer, n_files)

        # islice stops BEFORE pulling the next item, so the video after the
        # limit is never decoded and predicted just to be thrown away (the
        # original checked the limit only after the generator had produced it).
        for chunk in islice(gen, max(n_files - 1, 0)):

            # Resize the datasets to accommodate the next chunk of rows
            dset.resize(row_count + chunk[0].shape[0], axis=0)
            dset2.resize(row_count2 + chunk[1].shape[0], axis=0)

            # Write the next chunk; each dataset uses its own row offset
            dset[row_count:] = chunk[0]
            dset2[row_count2:] = chunk[1]

            # Increment the row counts
            row_count += chunk[0].shape[0]
            row_count2 += chunk[1].shape[0]

            # Report the number of videos written so far (reaches 100 %)
            numer += 1
            print_progress(numer, n_files)

### Split the dataset into training set and test set
We are going to split the dataset into training set and testing. The training set is used to train the model and the test set to check the model accuracy.

In [32]:
# The first 80 % of the (already shuffled) videos are used for training.
training_set = int(len(names)*0.8)
# The test set is everything that is left. Computing it as int(len * 0.2)
# can be one short of len(names_test) when len(names) is not a multiple of 5,
# which would silently drop a test video.
test_set = len(names) - training_set

names_training = names[0:training_set]
names_test = names[training_set:]

labels_training = labels[0:training_set]
labels_test = labels[training_set:]

Then we are going to process all video frames through the feature-extraction network and save the transfer values.

In [33]:
# Extract the training videos' transfer values and store them in 'prueba.h5'
make_files(training_set)

In [34]:
# Extract the test videos' transfer values and store them in 'pruebavalidation.h5'
make_files_test(test_set)

In order to load the saved transfer values into RAM, we are going to use these two functions:

In [35]:
def process_alldata_training():
    """Load 'prueba.h5' and regroup its rows into one entry per video.

    Returns:
        ``(data, target)``: ``data`` is a list of arrays holding 20
        consecutive rows of the 'data' dataset each; ``target`` is a list
        with the 'labels' row found at the first row of each group. Trailing
        rows that do not fill a complete group of 20 are ignored.
    """
    frames_num = 20

    # Read both datasets fully into memory before the file is closed.
    with h5py.File('prueba.h5', 'r') as f:
        X_batch = f['data'][:]
        y_batch = f['labels'][:]

    n_videos = int(len(X_batch) / frames_num)
    starts = [video * frames_num for video in range(n_videos)]

    data = [X_batch[start:start + frames_num] for start in starts]
    # Every frame of a video carries the same label, so the first is enough.
    target = [np.array(y_batch[start]) for start in starts]

    return data, target

In [36]:
def process_alldata_test():
    """Load 'pruebavalidation.h5' and regroup its rows into one entry per video.

    Returns:
        ``(data, target)``: ``data`` is a list of arrays holding 20
        consecutive rows of the 'data' dataset each; ``target`` is a list
        with the 'labels' row found at the first row of each group. Trailing
        rows that do not fill a complete group of 20 are ignored.
    """
    frames_num = 20

    # Read both datasets fully into memory before the file is closed.
    with h5py.File('pruebavalidation.h5', 'r') as f:
        X_batch = f['data'][:]
        y_batch = f['labels'][:]

    n_videos = int(len(X_batch) / frames_num)
    starts = [video * frames_num for video in range(n_videos)]

    data = [X_batch[start:start + frames_num] for start in starts]
    # Every frame of a video carries the same label, so the first is enough.
    target = [np.array(y_batch[start]) for start in starts]

    return data, target

In [37]:
# Per-video feature sequences and labels of the training set
data, target = process_alldata_training()

In [38]:
# Per-video feature sequences and labels of the test set
data_test, target_test = process_alldata_test()

### Define LSTM architecture

When defining the LSTM architecture we have to take into account the dimensions of the transfer values. For each frame, the feature-extraction network (the MobileViT model defined above) outputs a vector of 4096 transfer values. We process 20 frames from each video, so we have 20 x 4096 values per video. The classification must be done taking into account all 20 frames of the video. If any of them detects violence, the video will be classified as violent.


The first input dimension of LSTM neurons is the temporal dimension, in our case it is 20. The second is the size of the features vector (transfer values).


In [39]:
# Size of the feature vector of one frame. Taken from the feature extractor
# instead of hard-coding 4096, so the LSTM input always matches the stored
# transfer values.
chunk_size = transfer_values_size
# Number of time steps = number of frames read per video (previously a
# hard-coded 20).
n_chunks = _images_per_file
rnn_size = 512

model = Sequential()
# Input: one sequence of n_chunks feature vectors per video.
model.add(LSTM(rnn_size, input_shape=(n_chunks, chunk_size)))
model.add(Dense(1024))
model.add(Activation('relu'))
model.add(Dense(50))
model.add(Activation('sigmoid'))
# Two outputs, matching the two-element one-hot labels.
model.add(Dense(2))
model.add(Activation('softmax'))
# NOTE(review): mean squared error on a softmax output is kept as in the
# original; categorical cross-entropy is the usual choice — confirm intent.
model.compile(loss='mean_squared_error', optimizer='adam',metrics=['accuracy'])

In [40]:
# Print the layer and parameter summary of the LSTM classifier
model.summary()

## Model training


In [41]:
epoch = 500
batchS = 500

# Convert once instead of re-building the arrays for every slice.
X_all = np.array(data)
y_all = np.array(target)

# Keep the last 10 % of the training videos for validation. With 800 training
# videos this is the original split index of 720, but it no longer produces an
# empty validation set when the number of videos differs.
n_fit = len(X_all) * 9 // 10

history = model.fit(X_all[:n_fit], y_all[:n_fit], epochs=epoch,
                    validation_data=(X_all[n_fit:], y_all[n_fit:]),
                    batch_size=batchS, verbose=2)

## Test the model

We are going to test the model with 20% of the total videos. These videos have not been used to train the network.

In [42]:
# Evaluate the trained classifier on the test videos
result = model.evaluate(np.array(data_test), np.array(target_test))

## Print the model accuracy

In [43]:
# Print each metric name reported by the model next to its evaluation value
for name, value in zip(model.metrics_names, result):
    print(name, value)

In [44]:
# One figure per metric: training curve first, validation curve second.
# Each figure is saved as an EPS file before it is shown.
curve_specs = (
    ('accuracy', 'val_accuracy', 'model accuracy', 'destination_path.eps'),
    # summarize history for loss
    ('loss', 'val_loss', 'model loss', 'destination_path1.eps'),
)

for train_key, val_key, figure_title, out_path in curve_specs:
    for history_key in (train_key, val_key):
        plt.plot(history.history[history_key])
    plt.title(figure_title)
    plt.ylabel(train_key)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.savefig(out_path, format='eps', dpi=1000)
    plt.show()