# Lip Reading Prediction without CTC

In [1]:
# Import the required libraries.
import os
import cv2
import pafy
import math
import random
import numpy as np
import datetime as dt
import tensorflow as tf
from collections import deque
import matplotlib.pyplot as plt

from moviepy.editor import *

%matplotlib inline
from keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model

# 1. Loading Model 

In [2]:
seed_constant = 27
np.random.seed(seed_constant)
random.seed(seed_constant)
tf.random.set_seed(seed_constant)

In [3]:
# Important variables DEFINED IN PREPROCESSOR FILES
SEQUENCE_LENGTH = 15
IMAGE_HEIGHT = 50 
IMAGE_WIDTH = 100
CLASSES_LIST = ['ہے', 'کیسے', 'چار', 'دو', 'چھ', 'وہ', 'جی', 'کب', 'پانچھ', 'تین', 'آپ', 'نو', 'ہاں', 'میں', 'تھا', 'ہوں', 'نہیں', 'کیوں', 'کتنے', 'ایک', 'کون', 'تھے', 'ہم', 'آٹھ', 'کونسا', 'کدھر', 'سات']


In [4]:
def create_LRCN_model():
    '''
    This function will construct the required LRCN model.
    Returns:
        model: It is the required constructed LRCN model.
    '''

   # We will use a Sequential model for model construction.
    model = Sequential()
    
    # Define the Model Architecture.
    ########################################################################################################################
    
    model.add(TimeDistributed(Conv2D(16, (3, 3), padding='valid',activation = 'relu'),
                              input_shape = (SEQUENCE_LENGTH, IMAGE_HEIGHT, IMAGE_WIDTH, 3)))
    
    model.add(TimeDistributed(MaxPooling2D((2, 2)))) 
    model.add(TimeDistributed(Dropout(0.25)))
    
    model.add(TimeDistributed(Conv2D(32, (3, 3), padding='valid',activation = 'relu')))
    model.add(TimeDistributed(MaxPooling2D((2, 2))))
    model.add(TimeDistributed(Dropout(0.25)))
    
    model.add(TimeDistributed(Conv2D(64, (3, 3), padding='valid',activation = 'relu')))
    model.add(TimeDistributed(MaxPooling2D((2, 2))))
    model.add(TimeDistributed(Dropout(0.25)))
    
    model.add(TimeDistributed(Conv2D(64, (3, 3), padding='valid',activation = 'relu')))
    model.add(TimeDistributed(MaxPooling2D((2, 2))))
    #model.add(TimeDistributed(Dropout(0.25)))
                                      
    model.add(TimeDistributed(Flatten()))
                                      
    #model.add(LSTM(32))
    gru_size = 128
    GRU_layer = GRU(gru_size, return_sequences=True, name='lstm1')
    model.add(Bidirectional(GRU(gru_size, return_sequences=True, name='lstm1')))
    model.add(Bidirectional(GRU(gru_size, name='lstm1')))
    
    model.add(Dropout(0.25))
    
    model.add(Dense(len(CLASSES_LIST), activation = 'softmax'))

    ########################################################################################################################

    # Display the models summary.
    # Return the constructed LRCN model.
    return model

In [5]:
LRCN_model = create_LRCN_model()
# Compile the model and specify loss function, optimizer and metrics to the model.
LRCN_model.compile(loss = 'categorical_crossentropy', optimizer=Adam(lr = 0.00001), metrics = ["accuracy"])

LRCN_model.load_weights("Model_without_ctc_0.lr_0-00001_batch_0-18.h5"))
#LRCN_model.load_weights("Model_without_ctc_0.lr_0-00001_batch_0-19_a26.h5")

ValueError: Layer count mismatch when loading weights from file. Model expected 7 layers, found 5 saved layers.

# 2. Making Prediction on Existing Videos

In [None]:
w1_u = ['میں', 'آپ', 'ہم', 'وہ']
w2_k_u = ['کیسے', 'کونسا', 'کدھر', 'کتنے']
w3_u = ['ہوں', 'تھا', 'ہے', 'تھے']
w4_k_u = ['جی', 'ہاں', 'نہیں']
w5_u = ['کیوں', 'کب', 'کون']
w6_k_u = ['ایک', 'دو', 'تین', 'چار', 'پانچھ', 'چھ', 'سات', 'آٹھ', 'نو']

In [None]:
word_list = [w1_u, w2_k_u, w3_u, w4_k_u, w5_u, w6_k_u]

In [None]:
# function to get most likely word in a given set of words
def get_most_likely_word(word_list, CLASSES_LIST, res):
    
    prob = float(0.0)
    pred = ''
    
    ls = []

    for i in word_list:
        index = CLASSES_LIST.index(i)
        tmp_prob = res[index]
              
        if tmp_prob >  prob:
            prob = tmp_prob
            pred = i
    
    return pred,prob

#word = get_most_likely_word(w6_k_u, CLASSES_LIST, res)
#print(word)

In [None]:
from ipywidgets import IntProgress
from IPython.display import display
import cv2
from lips import crop_lips
import mediapipe as mp

# file_path = "..\\Dataset\\Urdu\\6\\aap_kitne_tha_han_kyun_ek\\_video.avi"

def predict_word_level(file_path, word_list, CLASSES_LIST, SEQUENCE_LENGTH):
    mp_holistic = mp.solutions.holistic  # Holistic model
    mp_drawing = mp.solutions.drawing_utils  # Drawing utilities


    # Declare a queue to store video frames.
    frames_queue = deque(maxlen = SEQUENCE_LENGTH)

    # Initialize a variable to store the predicted action being performed in the video.
    predicted_class_name = ''

    sentence = []
    predictions = []
    threshold = 0.0 # Result rendered only if they are above this threshold

    # Create a VideoCapture object and read from input file
    cap = cv2.VideoCapture(file_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    win_size = int((total_frames - 12) / 6) 

    w_cnt = 0

    # Progress bar
    #f = IntProgress(min=0, max=total_frames-12) # instantiate the bar
    #display(f) # display the bar

    # Check if camera opened successfully
    if (cap.isOpened()== False):
        print("Error opening video file")


    preds = []

    w0 = []
    w1 = []
    w2 = []
    w3 = []
    w4 = []
    w5 = []


    cnt = 0
    with mp_holistic.Holistic(min_detection_confidence=0.1, min_tracking_confidence=0.1) as holistic:    
        # Read until video is completed
        while(cap.isOpened()):

            # Capture frame-by-frame
            ret, frame = cap.read()

            # If frame correctly read only then performing predictions
            if ret == True:

                # Cropping lips
                cropped_image = crop_lips(frame, holistic)

                # Normalizing the cropped frame
                normalized_frame = cropped_image / 255    

                # Appending the pre-processed frame into the frames list.
                frames_queue.append(normalized_frame)

                # Check if the number of frames in the queue are equal to the fixed sequence length.
                if len(frames_queue) == SEQUENCE_LENGTH:

                    # Pass the normalized frames to the model and get the predicted probabilities.
                    res = LRCN_model.predict(np.expand_dims(frames_queue, axis = 0), verbose=0)[0]
                    
                    print(CLASSES_LIST[np.argmax(res)])
                    
                    # Appending prediction in the Predictions List
                    predictions.append(np.argmax(res))

                    #preds.append(get_most_likely_word(word_list[0], CLASSES_LIST, res))
                    #preds.append(get_most_likely_word(word_list[1], CLASSES_LIST, res))
                    #preds.append(get_most_likely_word(word_list[2], CLASSES_LIST, res))
                    #preds.append(get_most_likely_word(word_list[3], CLASSES_LIST, res))
                    #preds.append(get_most_likely_word(word_list[4], CLASSES_LIST, res))
                    #preds.append(get_most_likely_word(word_list[5], CLASSES_LIST, res))
                    #sentence.append(pred)

                    if w_cnt < win_size:
                        p0_w, p0_acc = get_most_likely_word(word_list[0], CLASSES_LIST, res)
                        if p0_acc > threshold:
                            w0.append([p0_w, p0_acc])



                    if w_cnt > win_size and w_cnt < (2*win_size):
                        p1_w, p1_acc = get_most_likely_word(word_list[1], CLASSES_LIST, res)
                        if p1_acc > threshold:
                            w1.append([p1_w, p1_acc])

                    if w_cnt > (2*win_size) and w_cnt < (3*win_size):
                        p2_w, p2_acc = get_most_likely_word(word_list[2], CLASSES_LIST, res)
                        if p2_acc > threshold:
                            w2.append([p2_w, p2_acc])

                    if w_cnt > (3*win_size) and w_cnt < (4*win_size):
                        p3_w, p3_acc = get_most_likely_word(word_list[3], CLASSES_LIST, res)
                        if p3_acc > threshold:
                            w3.append([p3_w, p3_acc])

                    if w_cnt > (4*win_size) and w_cnt < (5*win_size):
                        p4_w, p4_acc = get_most_likely_word(word_list[4], CLASSES_LIST, res)
                        if p4_acc > threshold:
                            w4.append([p4_w,p4_acc])

                    if w_cnt > (5*win_size) :
                        p5_w, p5_acc = get_most_likely_word(word_list[5], CLASSES_LIST, res)
                        if p5_acc > threshold:
                            w5.append([p5_w,p5_acc])


                    w_cnt += 1

                    #Progress bar variables
                    #f.value += 1 # signal to increment the progress bar

                    #print(get_most_likely_word(word_list[5], CLASSES_LIST, res))
                    '''if np.unique(predictions[-5:])[0] == np.argmax(res):
                        if res[np.argmax(res)] > threshold:
                            print(CLASSES_LIST[np.argmax(res)])
                            if len(sentence) > 0:
                                if CLASSES_LIST[np.argmax(res)] != sentence[-1]:
                                    sentence.append(CLASSES_LIST[np.argmax(res)])
                            else:
                                sentence.append(CLASSES_LIST[np.argmax(res)])
                    '''

                    if len(sentence) < 8:

                        sentence.extend(np.unique(preds))


                #print(sentence)

                #cv2.putText(frame, ' '.join(sentence) , (2, 30),cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
                # Display the resulting frame
                #cv2.imshow('Frame', frame)


            # Press Q on keyboard to exit
                if cv2.waitKey(25) & 0xFF == ord('q'):
                    break

            # Break the loop
            else:
                break


     # When everything done, release the video capture object
    cap.release()

    # Closes all the frames
    cv2.destroyAllWindows()
    
    return [w0, w1, w2, w3, w4, w5]

In [None]:
file_path = r"D:\FYP Workspace\Raw Data\0\wo_kese_tha_han_kab_chhae\_video.avi"

prediction = predict_word_level(file_path, word_list, CLASSES_LIST, SEQUENCE_LENGTH)

In [None]:
# Returns word with highest probability in a window
def highest_prob_word(words):
    
    w = ''
    acc = 0
    for i in words:
        if i[1] > acc:
            w = i[0]
    
    return w

# Returns most frequently occuring word in a window
def most_freq_word(words):
    
    tmp_list = []
    
    for i in words:
        tmp_list.append(i[0])
    
    return max(set(tmp_list), key = tmp_list.count)

# General function to parse sentence [args : prediction results, function to be used for getting predicted words]
def parse_sentence(prediction, func):
    
    sen = func(prediction[0])+" "+func(prediction[1])+" "+func(prediction[2])+" "+func(prediction[3])+" "+func(prediction[4])+" "+func(prediction[5])
    
    return sen

In [None]:

#print(parse_sentence(prediction, most_freq_word))
#print(parse_sentence(prediction, highest_prob_word))

In [58]:
prediction

[[['وہ', 0.0003135272],
  ['وہ', 0.0003075806],
  ['وہ', 0.00029801187],
  ['وہ', 0.00030042834],
  ['وہ', 0.0003055214],
  ['وہ', 0.0002911442],
  ['وہ', 0.0002905727],
  ['وہ', 0.00028994022],
  ['وہ', 0.0002800187],
  ['وہ', 0.00028508995],
  ['وہ', 0.00029535138],
  ['وہ', 0.00030491402],
  ['وہ', 0.00030268246],
  ['وہ', 0.00030134275]],
 [['کونسا', 0.0021174792],
  ['کونسا', 0.00226972],
  ['کونسا', 0.0022969807],
  ['کونسا', 0.0023301952],
  ['کونسا', 0.0023471767],
  ['کونسا', 0.002334324],
  ['کونسا', 0.0024401324],
  ['کونسا', 0.0025586458],
  ['کونسا', 0.0025960158],
  ['کونسا', 0.002506506],
  ['کونسا', 0.0022066436],
  ['کونسا', 0.0019777264],
  ['کونسا', 0.0019186819]],
 [['ہوں', 0.03882281],
  ['ہوں', 0.03821545],
  ['ہوں', 0.037633277],
  ['ہوں', 0.03628597],
  ['ہوں', 0.037882246],
  ['ہوں', 0.04098809],
  ['ہوں', 0.042226944],
  ['ہوں', 0.042036325],
  ['ہوں', 0.03707954],
  ['ہوں', 0.03031575],
  ['ہوں', 0.027466035],
  ['ہوں', 0.02514671],
  ['ہوں', 0.023838667]],
 

In [60]:
prediction

[[['وہ', 0.00030752964],
  ['وہ', 0.00030346695],
  ['وہ', 0.00030106676],
  ['وہ', 0.00030189828],
  ['وہ', 0.00030023133],
  ['وہ', 0.00029923843],
  ['وہ', 0.00030286593],
  ['وہ', 0.0002879529],
  ['وہ', 0.00029021432],
  ['وہ', 0.00029006423],
  ['وہ', 0.00027735782],
  ['وہ', 0.00026087312],
  ['وہ', 0.00026042463],
  ['وہ', 0.00024922693]],
 [['کونسا', 0.0019243342],
  ['کونسا', 0.0018881434],
  ['کونسا', 0.0018806419],
  ['کونسا', 0.0018293484],
  ['کونسا', 0.0018960196],
  ['کونسا', 0.0019842228],
  ['کونسا', 0.002002617],
  ['کونسا', 0.0018724068],
  ['کونسا', 0.0018470136],
  ['کونسا', 0.0018377311],
  ['کونسا', 0.001925882],
  ['کونسا', 0.0019275012],
  ['کونسا', 0.0019942264]],
 [['ہوں', 0.08729195],
  ['ہوں', 0.09059428],
  ['ہوں', 0.088176526],
  ['ہوں', 0.081180714],
  ['ہوں', 0.06671865],
  ['ہوں', 0.056790102],
  ['ہوں', 0.05528664],
  ['ہوں', 0.052246273],
  ['ہوں', 0.05341279],
  ['ہوں', 0.053130873],
  ['ہوں', 0.051384963],
  ['ہوں', 0.050020188],
  ['ہوں', 0.04926

# 3. Record and Predict Video

### - Gathering and Defining helper Functions

In [16]:
# Importing dependencies
# import the opencv library
import cv2
import mediapipe as mp
import math
import numpy as np

In [17]:
# Defining Mediapipe model
mp_holistic = mp.solutions.holistic  # Holistic model
mp_drawing = mp.solutions.drawing_utils  # Drawing utilities

In [18]:
import cv2

# Function to record simple video of 4 seconds
# Args(camera_index; 1 = PC webcam, 1 = USB cam)
def record_vid(cam_index = 0):
    
    # Set the video capture device (webcam)
    cap = cv2.VideoCapture(cam_index)

    width= int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height= int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Define the codec and create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'DIVX')
    # Recording at 25 fps with Camera's original resolution
    out = cv2.VideoWriter('output.mp4',fourcc, 25.0, (width,height))

    # Set the duration of the video capture (in seconds)
    duration = 4

    # Capture frames for the specified duration
    start_time = cv2.getTickCount()
    frame_cnt = 0
    while(int((cv2.getTickCount() - start_time)/cv2.getTickFrequency() * 1000) < duration*1000):
        ret, frame = cap.read()
        
        if ret==True:
            # Show the frame
            cv2.imshow('frame',frame)
            # Write the frame to the output file
            out.write(frame)
            frame_cnt += 1
            # Wait for a key press to exit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
            
            if frame_cnt == 86:
                break
        else:
            break

    # Release the video capture device and the output file
    cap.release()
    out.release()
    
    # Destroy all the windows
    cv2.destroyAllWindows()

In [19]:
#Passing image and model to the function
def mediapipe_detection(image, model):
    # Converting frame from BGR to RGB because model works on RGB
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Color conversion (BGR to RGB)
    
    image.flags.writeable = False # Image not writeable anymore
    
    results = model.process(image)# Making prediction
    
    image.flags.writeable = True # Image is now writeable
    
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # Color conversion (RGB to BGR)
    return image, results

In [20]:
def crop_lips(image, results):
    image_height, image_width, c = image.shape

    #IMPORTANT VARIABLES
    # Defining width and height of resized frame
    width = 100
    height = 50

    # If no lips get detected by mediapipe then exception will be thrown
    try:        
        
        # NORAMALIZING POSITIONS OF LANDMARKS(Two lines below taken from
        x_px1 = min(math.floor(results.face_landmarks.landmark[212].x * image_width), image_width - 1)
        x_px2 = min(math.floor(results.face_landmarks.landmark[432].x * image_width), image_width - 1)
        y_px1 = min(math.floor(results.face_landmarks.landmark[94].y * image_height), image_height - 1)
        y_px2 = min(math.floor(results.face_landmarks.landmark[200].y * image_height), image_height - 1)
        
        # Padding the image
        pad = 0.05
        

        a = math.floor(x_px1 * (1 - pad))
        b = math.floor(x_px2 * (1 + pad))
        c = math.floor(y_px1 * (1 - pad))
        d = math.floor(y_px2 * (1 + pad))
        
        #print("X = ",x_px1," ",x_px2)
        #print("AB= ",a, " ",b)
        #print("Y = ",y_px1," ",y_px2)
        #print("CD= ",c," ",d)
        
        # Cropping an image
        #cropped_image = image[y_px1:y_px2, x_px1:x_px2]
        cropped_image = image[c:d, a:b]

        # Resizing the cropped image to Fixed resolution i.e. 300*150
        dim = (width, height)

        resized = cv2.resize(cropped_image, dim, interpolation=cv2.INTER_AREA)

    except:
        # If no lips detected plain black frame will be returned
        resized = np.zeros((height, width, 3), dtype=np.uint8)


    return resized

In [22]:
filepath = "temp_video.mp4"

In [25]:
prediction = predict_word_level(filepath, word_list, CLASSES_LIST, SEQUENCE_LENGTH)

In [26]:
print(parse_sentence(prediction, most_freq_word))
print(parse_sentence(prediction, highest_prob_word))

آپ کتنے تھا ہاں کون آٹھ
آپ کتنے تھا ہاں کون آٹھ


In [27]:
prediction

[[['آپ', 0.13459261],
  ['آپ', 0.13459261],
  ['آپ', 0.13459261],
  ['آپ', 0.13459261],
  ['آپ', 0.13459261],
  ['آپ', 0.13459261],
  ['آپ', 0.13459261],
  ['آپ', 0.13459261],
  ['آپ', 0.13459261],
  ['آپ', 0.13459261],
  ['آپ', 0.13459261],
  ['آپ', 0.13459261]],
 [['کتنے', 0.057291556],
  ['کتنے', 0.057291556],
  ['کتنے', 0.057291556],
  ['کتنے', 0.057291556],
  ['کتنے', 0.057291556],
  ['کتنے', 0.057291556],
  ['کتنے', 0.057291556],
  ['کتنے', 0.057291556],
  ['کتنے', 0.057291556],
  ['کتنے', 0.057291556],
  ['کتنے', 0.057291556]],
 [['تھا', 0.02471636],
  ['تھا', 0.02471636],
  ['تھا', 0.02471636],
  ['تھا', 0.02471636],
  ['تھا', 0.02471636],
  ['تھا', 0.02471636],
  ['تھا', 0.02471636],
  ['تھا', 0.02471636],
  ['تھا', 0.02471636],
  ['تھا', 0.02471636],
  ['تھا', 0.02471636]],
 [['ہاں', 0.05466577],
  ['ہاں', 0.05466577],
  ['ہاں', 0.05466577],
  ['ہاں', 0.05466577],
  ['ہاں', 0.05466577],
  ['ہاں', 0.05466577],
  ['ہاں', 0.05466577],
  ['ہاں', 0.05466577],
  ['ہاں', 0.05466577]

In [33]:
from Dataset import get_sentences

sentences = get_sentences('..\\Dictionary\\roman_urdu_sentences.txt')

speaker_id = 18

#file_path = "..\\Dataset\\Urdu\\6\\aap_konsa_thay_jee_kon_ek\\_video.avi"

x_sen = []
y_sen = []

cnt = 0

for i in sentences:
    
    print(cnt)
    
    # Appending in actual sentence list
    x_sen.append(i)
    
    sen = i.replace(' ','_')
    
    # Parsing filepath of the video file
    file_path = "D:\\FYP Workspace\\Raw Data\\" + str(speaker_id) + "\\" + sen + "\\_video.avi"

    # Making prediction on video and parsing sentencing
    prediction = predict_word_level(file_path, word_list, CLASSES_LIST, SEQUENCE_LENGTH)
    predicted_sentence = parse_sentence(prediction, highest_prob_word)
    
    # Appending in predicted sentence list
    y_sen.append(predicted_sentence)
    
    cnt += 1

    if cnt==4:
        break


0
1
2
3


In [34]:
#Function to get urdu dictionary sentences
def get_urdu_sentences(path):
    #path = '..\\Dictionary\\roman_urdu_sentences.txt'

    # Using readlines()
    file = open(path, 'r', encoding='utf-8')
    lines = file.readlines()

    for i in range(len(lines)):
        lines[i] = lines[i].replace("\n", "")

    return lines

In [35]:
# Library to calculate WER
from jiwer import wer

#print(len(x_sen))
#print(len(y_sen))
urdu_sen = get_urdu_sentences('..\\Dictionary\\urdu_sentences.txt')

total_wer = 0

#print("       Predicted       -          Actual")
for i in range(0, len(urdu_sen)):
    print("Original : ",urdu_sen[i]) 
    print("Predicted : " , y_sen[i])
    print("------------------------------------")
    
    #arg = (ref, hypothesis)
    total_wer += wer(urdu_sen[i], y_sen[i])

print("\n----------------------\n")
avg_wer = total_wer / len(urdu_sen)
    
print(avg_wer)    

Original :  وہ کیسے ہوں جی کب ایک
Predicted :  آپ کتنے تھا ہاں کون آٹھ
------------------------------------
Original :  آپ کیسے ہے جی کیوں دو
Predicted :  آپ کتنے تھا ہاں کون آٹھ
------------------------------------
Original :  وہ کیسے تھا جی کب تین
Predicted :  آپ کتنے تھا ہاں کون آٹھ
------------------------------------
Original :  میں کیسے تھے جی کیوں چار
Predicted :  آپ کتنے تھا ہاں کون آٹھ
------------------------------------
Original :  آپ کیسے تھا جی کیوں پانچھ


IndexError: list index out of range