In [1]:
# Import the required libraries.

import os
import cv2
import pafy
import math
import random
import numpy as np
import datetime as dt
import tensorflow as tf
from collections import deque
import matplotlib.pyplot as plt

from moviepy.editor import *

%matplotlib inline

from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model
from keras.optimizers import Adam
from keras.models import Model

# 1. Loading Model

In [2]:
# Shared configuration. These values have to mirror the ones used by the
# preprocessing notebooks that produced the training data.
SEQUENCE_LENGTH = 15                 # frames per input clip
IMAGE_HEIGHT, IMAGE_WIDTH = 50, 100  # size of each cropped frame

# Vocabulary of words the model was trained on; the position in this list is
# the integer class id.
CLASSES_LIST = [
    'ہے', 'کیسے', 'چار', 'دو', 'چھ', 'وہ', 'جی', 'کب', 'پانچھ',
    'تین', 'آپ', 'نو', 'ہاں', 'میں', 'تھا', 'ہوں', 'نہیں', 'کیوں',
    'کتنے', 'ایک', 'کون', 'تھے', 'ہم', 'آٹھ', 'کونسا', 'کدھر', 'سات',
]

# Character inventory for CTC; a character's index in this string is its label id.
alphabets = "اٹتپکھودےآسرہبںنچجمی"

# Length of the longest word in the vocabulary (upper bound for a label sequence).
max_str_len = max(map(len, CLASSES_LIST))

# One output unit per distinct character, plus one for the CTC pseudo blank.
num_of_characters = 1 + len(set(alphabets))

# The network emits one prediction per input frame.
num_of_timestamps = SEQUENCE_LENGTH

In [3]:
def label_to_num(label):
    """Encode a word as a NumPy array of character indices.

    Each character is mapped to its position in the module-level ``alphabets``
    string. A character that is not in ``alphabets`` is encoded as ``-1``
    (the result of ``str.find``).
    """
    return np.array([alphabets.find(symbol) for symbol in label])

def num_to_label(num):
    """Decode a sequence of character indices back into a string.

    Indices are looked up in the module-level ``alphabets`` string. Decoding
    stops at the first ``-1``, which marks the CTC blank / padding; everything
    after it is ignored.
    """
    pieces = []
    for index in num:
        if index == -1:  # CTC blank: nothing meaningful follows
            break
        pieces.append(alphabets[index])
    return "".join(pieces)

In [4]:
from keras import backend as K
# the ctc loss function
def ctc_lambda_func(args):
    """Compute the batch CTC cost; intended to be wrapped in a ``Lambda`` layer.

    Args:
        args: sequence of four tensors, in order: the softmax predictions,
            the ground-truth labels, the per-sample input lengths and the
            per-sample label lengths.

    Returns:
        The result of ``K.ctc_batch_cost`` for the batch.
    """
    predictions, gt_labels, pred_lengths, gt_lengths = args
    # Skipping the first two time steps is critical: the first couple of
    # outputs of the RNN tend to be garbage.
    trimmed_predictions = predictions[:, 2:, :]
    return K.ctc_batch_cost(gt_labels, trimmed_predictions, pred_lengths, gt_lengths)

In [5]:
# Auxiliary inputs consumed only by the CTC loss layer: the encoded ground-truth
# word plus the per-sample prediction and label lengths.
labels = Input(shape=(max_str_len,), dtype='float32', name='gtruth_labels')
input_length = Input(shape=(1,), dtype='int64', name='input_length')
label_length = Input(shape=(1,), dtype='int64', name='label_length')

In [6]:
# Build the training graph: per-frame CNN (TimeDistributed) -> two bidirectional
# GRU layers -> per-timestep softmax over characters, wrapped by a Lambda layer
# that computes the CTC loss. The weights are loaded in a later cell, so the
# layer structure here has to match the checkpoint exactly.
from tensorflow.keras import regularizers
# Same values as in the configuration cell; repeated here.
IMAGE_HEIGHT = 50
IMAGE_WIDTH = 100

# One sample is a clip of SEQUENCE_LENGTH RGB frames.
input_data = Input(shape=(SEQUENCE_LENGTH, IMAGE_HEIGHT, IMAGE_WIDTH, 3), name='input')

# NOTE(review): the result of this first conv block is overwritten below
# (the next block starts again from `input_data`), so these three layers are
# never part of the final model.
x = TimeDistributed(Conv2D(16, (3, 3), padding='valid', activation='relu'))(input_data)
x = TimeDistributed(MaxPooling2D((2, 2)))(x)
x = TimeDistributed(Dropout(0.25))(x)

# NOTE(review): this block is fed `input_data`, not `x`. That looks like a
# wiring mistake, but changing it alters the layer shapes, so the checkpoint
# loaded later would presumably stop matching - confirm before fixing.
x = TimeDistributed(Conv2D(32, (3, 3), padding='valid', activation='relu'))(input_data)
x = TimeDistributed(MaxPooling2D((2, 2)))(x)
x = TimeDistributed(Dropout(0.25))(x)

x = TimeDistributed(Conv2D(64, (3, 3), padding='valid', activation='relu'))(x)
x = TimeDistributed(MaxPooling2D((2, 2)))(x)
x = TimeDistributed(Dropout(0.25))(x)

# Disabled experiments with deeper conv stacks.
#x = TimeDistributed(Conv2D(64, (3, 3), padding='valid', activation='relu'))(x)
#x = TimeDistributed(MaxPooling2D((1, 2)))(x)
#x = TimeDistributed(Dropout(0.25))(x)

#x = TimeDistributed(Conv2D(128, (3, 3), padding='valid', activation='relu'))(x)
#x = TimeDistributed(MaxPooling2D((1, 2)))(x)
#x = TimeDistributed(Dropout(0.25))(x)

# Flatten each frame's feature maps into one vector per time step.
x = Reshape(target_shape=((SEQUENCE_LENGTH, -1)), name='reshape')(x)
#x = Dense(64, activation='relu', kernel_initializer='he_normal', name='dense1')(x)

rnn_size = 128

# Despite the variable and layer name, this is a GRU, not an LSTM.
lstm_layer = GRU(rnn_size, return_sequences=True, name='lstm1')
# NOTE(review): the same GRU instance is handed to both Bidirectional wrappers;
# whether the wrappers copy it or share it depends on the Keras version - confirm.
x = Bidirectional(lstm_layer)(x)
x = Bidirectional(lstm_layer)(x)
#x=LSTM(rnn_size, return_sequences=True, name='lstm1')(x)

#x = Dense(64, name="dense_1")(x)
x = Dropout(rate=0.25)(x)

## OUTPUT
# One unit per character plus the CTC blank, applied at every time step.
x = Dense(num_of_characters, kernel_initializer='he_normal',name='dense2',kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4))(x)
y_pred = Activation('softmax', name='softmax')(x)

# The training model outputs the CTC loss itself; inference cells cut the graph
# at the 'softmax' layer instead.
ctc_loss = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
model_final = Model(inputs=[input_data, labels, input_length, label_length], outputs=ctc_loss)


In [7]:

# Training callbacks. Nothing in this cell trains the model; they are only
# defined here.
mc = ModelCheckpoint(
    'New_model_GRU_BEST_VALL-LOSS_checkpoint.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=50,
    verbose=1,
)

# Restore the trained weights into the graph built above.
# Alternative checkpoint: "final_model_ctc_2BILSTM128_speaker-0to10_11valid.h5"
model_final.load_weights('New_model_GRU_BEST-LOSS_batchid_19valloss_8.40.h5')

In [9]:
import keras

# NOTE(review): `features_valid` and `labels_valid` are not defined anywhere in
# this notebook; they have to be loaded before this cell can run.

# Inference sub-model: from the video input up to the per-timestep softmax,
# i.e. the training graph without the CTC loss layer.
pred_model = keras.models.Model(
    inputs=model_final.get_layer(name="input").input,
    outputs=model_final.get_layer(name="softmax").output,
)

preds = pred_model.predict(features_valid)

# Greedy CTC decoding; every sample uses the full number of time steps.
time_steps_per_sample = np.ones(preds.shape[0]) * preds.shape[1]
decoded = K.get_value(
    K.ctc_decode(preds, input_length=time_steps_per_sample, greedy=True)[0][0]
)

cnt = 0          # number of exact matches
prediction = []  # decoded word for every validation sample

for i in range(len(features_valid)):
    decoded_word = num_to_label(decoded[i])
    expected_word = CLASSES_LIST[labels_valid[i]]

    # "1" marks an exact match, 0 a miss (printed as the last column).
    lol = "1" if decoded_word == expected_word else 0
    if lol:
        cnt += 1

    print(decoded_word, " -- ", expected_word, " - ", lol)
    prediction.append(decoded_word)

NameError: name 'features_valid' is not defined

In [10]:
import re
from collections import Counter

def correct_word(word_list, target_word, n=1):
    """Return the word from ``word_list`` that shares the most n-grams with ``target_word``.

    Args:
        word_list: candidate words, checked in order.
        target_word: the (possibly misspelled) word to match.
        n: size of the character n-grams to compare (default 1).

    Returns:
        The candidate with the highest n-gram overlap, counting repeated
        n-grams. On a tie the earliest candidate wins. ``None`` is returned
        when no candidate shares any n-gram with ``target_word``.
    """
    def ngram_counts(text):
        # Multiset of all contiguous substrings of length n.
        return Counter(text[start:start + n] for start in range(len(text) - n + 1))

    wanted = ngram_counts(target_word)
    winner = None
    top_score = 0
    for candidate in word_list:
        shared = ngram_counts(candidate) & wanted  # multiset intersection
        candidate_score = sum(shared.values())
        if candidate_score > top_score:  # strict: the first best candidate is kept
            winner = candidate
            top_score = candidate_score
    return winner

# Quick check of correct_word against the full vocabulary.
print(CLASSES_LIST)
# word_list is also read by the evaluation loop in the following cell.
word_list = CLASSES_LIST
# An empty target has no n-grams, so every candidate scores 0.
target_word = ''
closest_word = correct_word(word_list, target_word)
print(closest_word)  # Output: None (no candidate scores above 0)

['ہے', 'کیسے', 'چار', 'دو', 'چھ', 'وہ', 'جی', 'کب', 'پانچھ', 'تین', 'آپ', 'نو', 'ہاں', 'میں', 'تھا', 'ہوں', 'نہیں', 'کیوں', 'کتنے', 'ایک', 'کون', 'تھے', 'ہم', 'آٹھ', 'کونسا', 'کدھر', 'سات']
None


In [11]:
# Re-score the validation set after snapping every decoded string to the
# closest vocabulary word: bigram overlap first, single characters as fallback.
# NOTE(review): this appends the raw decoded words to `prediction` a second
# time (the previous evaluation cell already filled it) - confirm that is intended.
cnt = 0
for i in range(len(features_valid)):
    target_word = num_to_label(decoded[i])
    expected_word = CLASSES_LIST[labels_valid[i]]

    closest_word = correct_word(word_list, target_word, n=2)
    if closest_word is None:
        # No bigram in common with any candidate; fall back to character overlap.
        closest_word = correct_word(word_list, target_word, n=1)

    # "1" marks a match after correction, 0 a miss (printed as the last column).
    lol = "1" if closest_word == expected_word else 0
    if lol:
        cnt += 1

    print(target_word, "  ", closest_word, " -- ", expected_word, " - ", lol)
    prediction.append(target_word)

NameError: name 'features_valid' is not defined

In [12]:
# Number of validation samples counted as correct by the most recent evaluation loop.
print(cnt)
# Total number of validation samples (requires `features_valid` to be loaded).
print(len(features_valid))

0


NameError: name 'features_valid' is not defined

# 2. Performing Lip reading on Existing Videos

In [13]:
# Word groups used as candidate lists for correct_word().
# Together they partition CLASSES_LIST, and every entry must be spelled exactly
# as in CLASSES_LIST. Otherwise correct_word() can return a word that is not a
# class label. The last group previously had 'چھے' and 'نوں', which do not
# exist in CLASSES_LIST; they are now 'چھ' and 'نو'.
# NOTE(review): the groups presumably correspond to word positions in the
# recorded sentences - confirm against the data collection protocol.
w1_u = ['میں', 'آپ', 'ہم', 'وہ']
w2_k_u = ['کیسے', 'کونسا', 'کدھر', 'کتنے']
w3_u = ['ہوں', 'تھا', 'ہے', 'تھے']
w4_k_u = ['جی', 'ہاں', 'نہیں']
w5_u = ['کیوں', 'کب', 'کون']
w6_k_u = ['ایک', 'دو', 'تین', 'چار', 'پانچھ', 'چھ', 'سات', 'آٹھ', 'نو']

In [16]:
# Run the lip-reading model over a recorded video using a sliding window of
# SEQUENCE_LENGTH frames. For every full window the raw CTC decoding and its
# closest vocabulary word are printed.
import cv2
from lips import crop_lips
import mediapipe as mp
import keras

mp_holistic = mp.solutions.holistic  # Holistic model
mp_drawing = mp.solutions.drawing_utils  # Drawing utilities

# Sliding window of the most recent pre-processed frames. Once it is full,
# appending a new frame drops the oldest one.
frames_queue = deque(maxlen=SEQUENCE_LENGTH)

# Not read anywhere in this cell; kept because other cells may rely on them.
predicted_class_name = ''
sentence = []
predictions = []
threshold = 0.0  # Result rendered only if they are above this threshold
preds = []
cnt = 0

# Inference sub-model: from the video input up to the per-timestep softmax,
# i.e. the training graph without the CTC loss layer.
pred_model = keras.models.Model(
    model_final.get_layer(name="input").input,
    model_final.get_layer(name="softmax").output,
)

# NOTE(review): every window is corrected against w3_u only, regardless of
# which word of the sentence is being spoken - confirm this is intended.
word_list = w3_u

# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
cap = cv2.VideoCapture(r"D:\FYP Workspace\Raw Data\18\mein_kidhar_thay_jee_kab_saath\_video.avi")
length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

# try/finally guarantees the capture and the preview window are released even
# if frame processing raises (for example when crop_lips fails on a frame).
try:
    if not cap.isOpened():
        print("Error opening video file")

    with mp_holistic.Holistic(min_detection_confidence=0.1, min_tracking_confidence=0.1) as holistic:
        # Read until the video is completed.
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No more frames (or a read error): stop.
                break

            # Crop the mouth region and scale pixel values by 1/255.
            # Assumes crop_lips returns an IMAGE_HEIGHT x IMAGE_WIDTH x 3 array;
            # the reshape below requires it.
            cropped_image = crop_lips(frame, holistic)
            normalized_frame = cropped_image / 255
            frames_queue.append(normalized_frame)

            # Predict only once a full window of frames is available.
            if len(frames_queue) == SEQUENCE_LENGTH:
                seq = np.asarray(frames_queue)
                print(seq.shape)

                # Add the batch axis; dimensions come from the notebook
                # constants instead of hard-coded literals.
                batch = seq.reshape(1, SEQUENCE_LENGTH, IMAGE_HEIGHT, IMAGE_WIDTH, 3)
                print("Res ", batch.shape)
                res = pred_model.predict(batch)

                # Greedy CTC decoding over all time steps of the window.
                decoded = K.get_value(
                    K.ctc_decode(res, input_length=np.ones(res.shape[0]) * res.shape[1], greedy=True)[0][0]
                )
                target_word = num_to_label(decoded[0])

                # Snap to the closest candidate: bigram overlap first,
                # single-character overlap as a fallback.
                closest_word = correct_word(word_list, target_word, n=2)
                if closest_word is None:
                    closest_word = correct_word(word_list, target_word, n=1)

                print(target_word, "----", closest_word)

            cv2.imshow("Frame", frame)

            # Press Q on keyboard to exit.
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break
finally:
    # When everything is done, release the video capture object
    # and close all the frames.
    cap.release()
    cv2.destroyAllWindows()

(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنہیے ---- ہے
(15, 50, 

کنہیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنہیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنہیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنہیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنہیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنہیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنہیے ---- ہے
(15, 50, 100, 3)
Res  (1, 15, 50, 100, 3)
کنہیے ---- ہے
