# Gesture Recognition
In this group project, you are going to build a 3D Conv model that will be able to predict the 5 gestures correctly. Please import the following libraries to get started.

#### Since the image-reading functions of the scipy.misc package are deprecated, imageio is used below instead ####

In [128]:
#!pip install imageio

In [129]:
import numpy as np
import os
#from scipy.misc import imread, imresize
#from scipy.misc.pilutil import imread
import imageio
#import cv2
import datetime
import os
from PIL import Image 
from tensorflow.keras.layers import Dropout,LSTM,LeakyReLU
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam
from keras.layers import GlobalAveragePooling3D
from keras.regularizers import l2

We set the random seed so that the results don't vary drastically.

In [130]:
import random as rn

import tensorflow as tf
from keras import backend as K

# Seed each random number generator used in this notebook (Python's
# `random`, NumPy and TensorFlow) with the same fixed value.
rn.seed(30)
np.random.seed(30)
tf.random.set_seed(30)  # TF2 spelling; the TF1 call was tf.set_random_seed(30)

In this block, you read the folder names for training and validation. You also set the `batch_size` here. Note that you set the batch size in such a way that you are able to use the GPU in full capacity. You keep increasing the batch size until the machine throws an error.

#### Batch Size for each Training experiment is set to 16 ####

In [131]:
# Read the CSV index files and shuffle their lines once up front. Each line
# describes one video (the generator below splits it on ';').
# The files are opened in `with` blocks so the handles are closed as soon as
# the lines have been read instead of being leaked until garbage collection.
with open('/datasets/Project_data/train.csv') as train_csv:
    train_doc = np.random.permutation(train_csv.readlines())
with open('/datasets/Project_data/val.csv') as val_csv:
    val_doc = np.random.permutation(val_csv.readlines())
# Alternative dataset location (Google Drive), kept for reference:
#train_doc = np.random.permutation(open('/content/drive/My Drive/Final_data/Collated_training/train.csv').readlines())
#val_doc = np.random.permutation(open('/content/drive/My Drive/Final_data/Collated_training/val.csv').readlines())
batch_size = 16 #experiment with the batch size

## Generator
This is one of the most important parts of the code. The overall structure of the generator has been given. In the generator, you are going to preprocess the images as you have images of 2 different dimensions as well as create a batch of video frames. You have to experiment with `img_idx`, `y`,`z` and normalization such that you get high accuracy.

In [132]:
def _load_batch(source_path, rows, img_idx, height, width):
    """Load the videos described by ``rows`` into one (data, labels) batch.

    Args:
        source_path: Directory that contains one sub-folder of frames per video.
        rows: Lines of the CSV index. The first ``;``-separated field is used
            as the video folder name and the third as the integer class label.
        img_idx: Positions (into the sorted frame listing) of the frames to use.
        height: Height of every frame after resizing.
        width: Width of every frame after resizing.

    Returns:
        ``(batch_data, batch_labels)``: ``batch_data`` has shape
        ``(len(rows), len(img_idx), height, width, 3)`` with pixel values
        scaled to [0, 1]; ``batch_labels`` is the one-hot label matrix of
        shape ``(len(rows), 5)``.
    """
    batch_data = np.zeros((len(rows), len(img_idx), height, width, 3))
    batch_labels = np.zeros((len(rows), 5))

    for sample, row in enumerate(rows):
        fields = row.strip().split(';')
        folder_path = os.path.join(source_path, fields[0])
        # os.listdir returns entries in arbitrary order; sort them so the
        # frames are stacked in a stable, name-based order.
        frame_names = sorted(os.listdir(folder_path))

        for idx, item in enumerate(img_idx):
            image = imageio.imread(os.path.join(folder_path, frame_names[item])).astype(np.uint8)
            # The source images come in different sizes, so every frame is
            # resized to a common shape. PIL expects (width, height).
            resized = np.array(Image.fromarray(image).resize((width, height)))
            # Keep the R, G, B channels and scale them from [0, 255] to [0, 1].
            batch_data[sample, idx] = resized[:, :, :3] / 255.0

        # One-hot encode the label once per video (not once per frame).
        batch_labels[sample, int(fields[2])] = 1

    return batch_data, batch_labels


def generator(source_path, folder_list, batch_size):
    """Yield ``(batch_data, batch_labels)`` batches forever.

    On every pass ``folder_list`` is reshuffled and split into consecutive
    batches of ``batch_size`` videos. When the number of videos is not a
    multiple of ``batch_size`` the last batch of the pass is smaller, so one
    pass yields ``ceil(len(folder_list) / batch_size)`` batches.

    Args:
        source_path: Directory that contains one sub-folder of frames per video.
        folder_list: Lines of the CSV index describing the videos.
        batch_size: Maximum number of videos per yielded batch.

    Yields:
        Tuples ``(batch_data, batch_labels)`` as produced by ``_load_batch``.

    Raises:
        ValueError: If ``folder_list`` is empty or ``batch_size`` is not
            positive; either would otherwise make the generator loop forever
            without yielding anything.
    """
    print( 'Source path = ', source_path, '; batch size =', batch_size)
    img_idx = list(range(0, 30)) #create a list of image numbers you want to use for a particular video

    y = 180  # Image height after resizing
    z = 180  # Image width after resizing

    # len() is used instead of truthiness because folder_list may be a NumPy
    # array, whose truth value is ambiguous.
    if len(folder_list) == 0:
        raise ValueError('folder_list is empty: there is nothing to generate batches from')
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    while True:
        t = np.random.permutation(folder_list)
        # Stepping by batch_size covers the full batches and, as the last
        # slice, the leftover videos that do not fill a complete batch.
        for start in range(0, len(t), batch_size):
            yield _load_batch(source_path, t[start:start + batch_size], img_idx, y, z)


Note here that a video is represented above in the generator as (number of images, height, width, number of channels). Take this into consideration while creating the model architecture.

#### In the above generator function the following is implemented:
1. Images are resized to 180 x 180
2. Images are normalised and grouped into batches of 16
3. A remaining-samples variable is computed, with code to load and normalise the samples left over after the full batches
####

In [133]:
# Run configuration: start timestamp, dataset locations, dataset sizes and
# the number of training epochs.
curr_dt_time = datetime.datetime.now()
train_path, val_path = '/datasets/Project_data/train', '/datasets/Project_data/val'
num_train_sequences, num_val_sequences = len(train_doc), len(val_doc)
num_epochs = 20  # choose the number of epochs

print('# training sequences =', num_train_sequences)
print('# validation sequences =', num_val_sequences)
print ('# epochs =', num_epochs)

# training sequences = 663
# validation sequences = 100
# epochs = 20


#### The number of epochs was set to 20 for computational reasons, although a higher value would be needed for better results ####

## Model
Here you make the model using different functionalities that Keras provides. Remember to use `Conv3D` and `MaxPooling3D` and not `Conv2D` and `Maxpooling2D` for a 3D convolution model. You would want to use `TimeDistributed` while building a Conv2D + RNN model. Also remember that the last layer is the softmax. Design the network in such a way that the model is able to give good accuracy on the least number of parameters so that it can fit in the memory of the webcam.

In [134]:
from keras.models import Sequential, Model
from keras.layers import Dense, GRU, Flatten, TimeDistributed, Flatten, BatchNormalization, Activation
from keras.layers.convolutional import Conv3D, MaxPooling3D
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras import optimizers


x, y, z = 30, 180, 180  # Input dimensions: (frames, height, width)

# Final architecture: three Conv3D/MaxPooling3D blocks, a per-time-step
# flatten, an LSTM over the remaining time steps and a dense softmax head.
model = Sequential([
    # Convolution block 1: 32 filters
    Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(x, y, z, 3)),
    MaxPooling3D(pool_size=(2, 2, 2)),

    # Convolution block 2: 32 filters
    Conv3D(32, kernel_size=(3, 3, 3), activation='relu'),
    MaxPooling3D(pool_size=(2, 2, 2)),

    # Convolution block 3: 64 filters, followed by light dropout
    Conv3D(64, kernel_size=(3, 3, 3), activation='relu'),
    MaxPooling3D(pool_size=(2, 2, 2)),
    Dropout(0.2),

    # Flatten the feature maps of each remaining time step (this is not a
    # global average pooling) so the sequence can be fed to the LSTM
    TimeDistributed(Flatten()),

    # LSTM over the time axis; only the final output is returned
    LSTM(64, return_sequences=False,kernel_regularizer=l2(0.01), dropout=0.5),

    BatchNormalization(),

    # Classifier head: one softmax output per gesture class (5 classes)
    Dense(256, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.5),
    Dense(5, activation='softmax'),
])

Now that you have written the model, the next step is to `compile` the model. When you print the `summary` of the model, you'll see the total number of parameters you have to train.

#### In the above final model 
1. Three Conv3D layers with 32, 32 and 64 filters respectively, each with ReLU activation
2. A dropout of 20% was added to reduce co-adaptation between neurons and improve validation accuracy
3. Global average pooling caused reshaping issues once the LSTM was included, so TimeDistributed(Flatten()) was used instead to flatten each time step
4. An LSTM layer was added to improve accuracy, with an L2 regulariser of 0.01 and a dropout of 0.5
5. Batch Normalisation and Dense layers were added to improve accuracy.
6. A final Dense layer with softmax activation was included to classify the 5 gestures of the assignment
####


#### Adam Optimiser summary of the model are below ####

In [135]:
# Compile the model with the Adam optimiser and categorical cross-entropy,
# tracking categorical accuracy.
#optimiser = SGD(learning_rate=0.01, momentum=0.9)#write your optimizer
optimiser = Adam(learning_rate=0.001)
model.compile(optimizer=optimiser, loss='categorical_crossentropy', metrics=['categorical_accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv3d_22 (Conv3D)          (None, 28, 178, 178, 32)  2624      
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 28, 178, 178, 32)  0         
                                                                 
 max_pooling3d_21 (MaxPoolin  (None, 14, 89, 89, 32)   0         
 g3D)                                                            
                                                                 
 conv3d_23 (Conv3D)          (None, 12, 87, 87, 32)    27680     
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 12, 87, 87, 32)    0         
                                                                 
 max_pooling3d_22 (MaxPoolin  (None, 6, 43, 43, 32)    0         
 g3D)                                                 

Let us create the `train_generator` and the `val_generator` which will be used in `.fit_generator`.

In [136]:
# One endless batch generator per dataset split.
train_generator = generator(
    source_path=train_path,
    folder_list=train_doc,
    batch_size=batch_size,
)
val_generator = generator(
    source_path=val_path,
    folder_list=val_doc,
    batch_size=batch_size,
)

In [137]:
# Each run writes its checkpoints into its own directory, named after the
# timestamp taken when the run was configured.
model_name = 'model_init' + '_' + str(curr_dt_time).replace(' ','').replace(':','_') + '/'

# A single makedirs(exist_ok=True) call replaces the separate
# exists()/mkdir() pair and does not fail if the directory is already there.
os.makedirs(model_name, exist_ok=True)

filepath = model_name + 'model-{epoch:05d}-{loss:.5f}-{categorical_accuracy:.5f}-{val_loss:.5f}-{val_categorical_accuracy:.5f}.h5'

# Save the full model (not only the weights) after every epoch, whether or
# not val_loss improved. The deprecated `period=1` argument is dropped:
# saving once per epoch is already the default behaviour.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=False, save_weights_only=False, mode='auto')

LR = ReduceLROnPlateau(
    monitor='val_loss',          # Metric to monitor (can be 'val_loss', 'loss', etc.)
    factor=0.2,                  # Factor by which to reduce the learning rate
    patience=5,                  # Number of epochs with no improvement before reducing learning rate
    min_lr=1e-6,                 # Minimum learning rate (to avoid going too low)
    verbose=1                    # Display messages when the learning rate is reduced
)
callbacks_list = [checkpoint, LR]



The `steps_per_epoch` and `validation_steps` are used by `fit_generator` to decide the number of next() calls it needs to make.

In [138]:
# Number of batches that make up one pass over each dataset: the count of
# full batches, plus one more when some sequences are left over.
steps_per_epoch = num_train_sequences // batch_size + (1 if num_train_sequences % batch_size else 0)
validation_steps = num_val_sequences // batch_size + (1 if num_val_sequences % batch_size else 0)

Let us now fit the model. This will start training the model and with the help of the checkpoints, you'll be able to save the model at the end of each epoch.

In [1]:
# Train the model. `fit_generator` is deprecated in TensorFlow 2.x, where
# `fit` accepts Python generators directly, so `fit` is called instead.
# `workers=1` is omitted because it is the default.
model.fit(train_generator, steps_per_epoch=steps_per_epoch, epochs=num_epochs, verbose=1,
          callbacks=callbacks_list, validation_data=val_generator,
          validation_steps=validation_steps, class_weight=None, initial_epoch=0)

#### Final Model generated with accuracy of 93% and validation accuracy of 86% ####

#### Below are experiments done to tune the Model and improve accuracy ####

#### At first the SGD optimiser was used and the resulting accuracy was only 20% ####

In [None]:
""" with. Sgd Optimizer and below netwrok architecture we get 20% accuracy

model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(x, y, z,3)))
model.add(BatchNormalization())
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 2
model.add(Conv3D(64, kernel_size=(3, 3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 3
model.add(Conv3D(128, kernel_size=(3, 3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Flatten and Dense Layers
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(5, activation='softmax')) """

#### The Adam optimiser was used; Batch Normalisation after each layer was removed and kept only after the final Conv3D layer (128 filters).
#### Flatten and Dense layers were used with a dropout of 0.4
#### The resulting accuracy was 71% and the validation accuracy 55%
####

In [None]:
""" with adam optimiser and below. architecture categorical accuracy is 0.7128  and validation accuracy 0.5536
model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(x, y, z,3)))
#model.add(BatchNormalization())
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 2
model.add(Conv3D(64, kernel_size=(3, 3, 3), activation='relu'))
#model.add(BatchNormalization())
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 3
model.add(Conv3D(128, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

model.add(BatchNormalization())

# Flatten and Dense Layers
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(5, activation='softmax'))"""

#### GlobalAveragePooling3D was used in place of Flatten before the Dense layers; the accuracy was 80% and the validation accuracy 69% ####

In [None]:
""" with adam optimiser and below. architecture categorical accuracy is 0.8051 and validation accuracy 0.6964
model = Sequential()

x = 30  # Number of frames per video
y = 180  # Image height after resizing
z = 180
# Layer 1

model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(x, y, z,3)))
#model.add(BatchNormalization())
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 2
model.add(Conv3D(64, kernel_size=(3, 3, 3), activation='relu'))
#model.add(BatchNormalization())
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 3
model.add(Conv3D(128, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

model.add(BatchNormalization())

# Flatten and Dense Layers
model.add(GlobalAveragePooling3D())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(5, activation='softmax'))"""

#### TimeDistributed(Flatten()) was used in place of Flatten, followed by a GRU before the Dense layers, but the accuracy was 47% and the validation accuracy 43% ####

In [None]:
""" with adam optimiser and below. architecture categorical accuracy is 0.4732  and validation accuracy 0.4375
model = Sequential()

x = 30  # Number of frames per video
y = 180  # Image height after resizing
z = 180
# Layer 1

model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(x, y, z,3)))
#model.add(BatchNormalization())
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 2
model.add(Conv3D(64, kernel_size=(3, 3, 3), activation='relu'))
#model.add(BatchNormalization())
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 3
model.add(Conv3D(128, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

model.add(BatchNormalization())

# Flatten and Dense Layers
model.add(TimeDistributed(Flatten()))
model.add(GRU(64, return_sequences=False))
model.add(Dense(256, activation='relu',kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))"""

#### An LSTM layer with 128 units was incorporated into the model and resulted in overfitting ####

In [None]:
""" with adam optimiser and below. architecture categorical accuracy is 1.00  and validation accuracy 0.66
model = Sequential()

x, y, z = 30, 180, 180  # Input dimensions: (frames, height, width)

# Layer 1: Conv3D + MaxPooling3D
model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(x, y, z, 3)))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 2: Conv3D + MaxPooling3D
model.add(Conv3D(64, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 3: Conv3D + MaxPooling3D
model.add(Conv3D(128, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Global Average Pooling to Reduce Dimensions
model.add(TimeDistributed(Flatten()))

# LSTM Layer for Temporal Dynamics
model.add(LSTM(128, return_sequences=False))

# Batch Normalization
model.add(BatchNormalization())

# Dense Layers
model.add(Dense(256, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.4))
model.add(Dense(5, activation='softmax'))"""

#### The LSTM was run again with 64 units; the accuracy was 99% and the validation accuracy 70% ####

In [None]:
"""  with adam optimiser and below. architecture categorical accuracy is 0.9940  and validation accuracy 0.70
model = Sequential()

x, y, z = 30, 180, 180  # Input dimensions: (frames, height, width)

# Layer 1: Conv3D + MaxPooling3D
model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(x, y, z, 3)))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 2: Conv3D + MaxPooling3D
model.add(Conv3D(64, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 3: Conv3D + MaxPooling3D
model.add(Conv3D(128, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Global Average Pooling to Reduce Dimensions
model.add(TimeDistributed(Flatten()))

# LSTM Layer for Temporal Dynamics
model.add(LSTM(64, return_sequences=False,kernel_regularizer=l2(0.01), dropout=0.4))

# Batch Normalization
model.add(BatchNormalization())

# Dense Layers
model.add(Dense(256, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))
model.add(Dense(5, activation='softmax'))"""

#### The number of filters in the layer 3 Conv3D was reduced to 64 and the LSTM dropout was updated to 0.4 ####

In [None]:
"""with adam optimiser and below. architecture categorical accuracy is 0.9926  and validation accuracy 0.75
model = Sequential()
model = Sequential()

x, y, z = 30, 180, 180  # Input dimensions: (frames, height, width)

# Layer 1: Conv3D + MaxPooling3D
model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(x, y, z, 3)))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 2: Conv3D + MaxPooling3D
model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 3: Conv3D + MaxPooling3D
model.add(Conv3D(64, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Global Average Pooling to Reduce Dimensions
model.add(TimeDistributed(Flatten()))

# LSTM Layer for Temporal Dynamics
model.add(LSTM(64, return_sequences=False,kernel_regularizer=l2(0.01), dropout=0.4))

# Batch Normalization
model.add(BatchNormalization())

# Dense Layers
model.add(Dense(256, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))"""

#### The LSTM dropout parameter was set to 0.5 ####

In [None]:


"""with adam optimiser and below. architecture categorical accuracy is 0.9301  and validation accuracy 0.7500
model = Sequential()

x, y, z = 30, 180, 180  # Input dimensions: (frames, height, width)

# Layer 1: Conv3D + MaxPooling3D
model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(x, y, z, 3)))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))
model.add(Dropout(0.1)) 

# Layer 2: Conv3D + MaxPooling3D
model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))
model.add(Dropout(0.1)) 

# Layer 3: Conv3D + MaxPooling3D
model.add(Conv3D(64, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))
model.add(Dropout(0.2)) 

# Global Average Pooling to Reduce Dimensions
model.add(TimeDistributed(Flatten()))

# LSTM Layer for Temporal Dynamics
model.add(LSTM(64, return_sequences=False,kernel_regularizer=l2(0.01), dropout=0.5))

# Batch Normalization
model.add(BatchNormalization())

# Dense Layers
model.add(Dense(256, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))"""

In [None]:
""" with adam optimiser and below. architecture categorical accuracy is 0.9330  and validation accuracy 0.8661
model = Sequential()

x, y, z = 30, 180, 180  # Input dimensions: (frames, height, width)

# Layer 1: Conv3D + MaxPooling3D
model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(x, y, z, 3)))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Layer 2: Conv3D + MaxPooling3D
model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))
 
# Layer 3: Conv3D + MaxPooling3D
model.add(Conv3D(64, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))
model.add(Dropout(0.2)) 

# Global Average Pooling to Reduce Dimensions
model.add(TimeDistributed(Flatten()))

# LSTM Layer for Temporal Dynamics
model.add(LSTM(64, return_sequences=False,kernel_regularizer=l2(0.01), dropout=0.5))

# Batch Normalization
model.add(BatchNormalization())

# Dense Layers
model.add(Dense(256, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))"""