# Lip Reading Prediction without CTC

In [49]:
# Import the required libraries.
import os
import cv2
import pafy
import math
import random
import numpy as np
import datetime as dt
import tensorflow as tf
from collections import deque
import matplotlib.pyplot as plt
from jiwer import wer
from moviepy.editor import *

%matplotlib inline
from keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model

# 1. Loading Model 

In [2]:
seed_constant = 27
np.random.seed(seed_constant)
random.seed(seed_constant)
tf.random.set_seed(seed_constant)

In [3]:
# Important variables DEFINED IN PREPROCESSOR FILES
SEQUENCE_LENGTH = 15
IMAGE_HEIGHT = 50 
IMAGE_WIDTH = 100
CLASSES_LIST = ['ہے', 'کیسے', 'چار', 'دو', 'چھے', 'وہ', 'جی', 'کب', 'پانچھ', 'تین', 'آپ', 'نوں', 'ہاں', 'میں', 'تھا', 'ہوں', 'نہیں', 'کیوں', 'کتنے', 'ایک', 'کون', 'تھے', 'ہم', 'آٹھ', 'کونسا', 'کدھر', 'سات']


In [4]:
def create_LRCN_model():
    '''
    This function will construct the required LRCN model.
    Returns:
        model: It is the required constructed LRCN model.
    '''

   # We will use a Sequential model for model construction.
    model = Sequential()
    
    # Define the Model Architecture.
    ########################################################################################################################
    
    model.add(TimeDistributed(Conv2D(16, (3, 3), padding='valid',activation = 'relu'),
                              input_shape = (SEQUENCE_LENGTH, IMAGE_HEIGHT, IMAGE_WIDTH, 3)))
    
    model.add(TimeDistributed(MaxPooling2D((2, 2)))) 
    model.add(TimeDistributed(Dropout(0.25)))
    
    model.add(TimeDistributed(Conv2D(32, (3, 3), padding='valid',activation = 'relu')))
    model.add(TimeDistributed(MaxPooling2D((2, 2))))
    model.add(TimeDistributed(Dropout(0.25)))
    
    model.add(TimeDistributed(Conv2D(64, (3, 3), padding='valid',activation = 'relu')))
    model.add(TimeDistributed(MaxPooling2D((2, 2))))
    model.add(TimeDistributed(Dropout(0.25)))
    
    model.add(TimeDistributed(Conv2D(64, (3, 3), padding='valid',activation = 'relu')))
    model.add(TimeDistributed(MaxPooling2D((2, 2))))
    #model.add(TimeDistributed(Dropout(0.25)))
                                      
    model.add(TimeDistributed(Flatten()))
                                      
    model.add(GRU(64,return_sequences=True))
    model.add(GRU(64))
    
    #model.add(Bidirectional(GRU(32)))
    
    model.add(Dense(len(CLASSES_LIST), activation = 'softmax'))


    ########################################################################################################################

    # Display the models summary.
    model.summary()
    
    # Return the constructed LRCN model.
    return model

In [5]:
LRCN_model = create_LRCN_model()
# Compile the model and specify loss function, optimizer and metrics to the model.
LRCN_model.compile(loss = 'categorical_crossentropy', optimizer=Adam(lr = 0.00001), metrics = ["accuracy"])


#LRCN_model.load_weights(r"C:\Users\ibrah\OneDrive\Desktop\Best word level predictions\Model_1-Without-CTC-LOSS_checkpoint1_tillbatch_5.h5")
LRCN_model.load_weights(r"C:\Users\ibrah\OneDrive\Desktop\Best word level predictions\Model_1-Without-CTC-LOSS_checkpoint1_tillbatch_9.h5")
#LRCN_model.load_weights('Model_1-Without-CTC-LOSS_checkpoint1.h5')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_distributed (TimeDistr  (None, 15, 48, 98, 16)   448       
 ibuted)                                                         
                                                                 
 time_distributed_1 (TimeDis  (None, 15, 24, 49, 16)   0         
 tributed)                                                       
                                                                 
 time_distributed_2 (TimeDis  (None, 15, 24, 49, 16)   0         
 tributed)                                                       
                                                                 
 time_distributed_3 (TimeDis  (None, 15, 22, 47, 32)   4640      
 tributed)                                                       
                                                                 
 time_distributed_4 (TimeDis  (None, 15, 11, 23, 32)   0

# 2. Making Prediction on Existing Videos

In [6]:
w1_u = ['میں', 'آپ', 'ہم', 'وہ']
w2_k_u = ['کیسے', 'کونسا', 'کدھر', 'کتنے']
w3_u = ['ہوں', 'تھا', 'ہے', 'تھے']
w4_k_u = ['جی', 'ہاں', 'نہیں']
w5_u = ['کیوں', 'کب', 'کون']
w6_k_u = ['ایک', 'دو', 'تین', 'چار', 'پانچھ', 'چھے', 'سات', 'آٹھ', 'نوں']

In [7]:
word_list = [w1_u, w2_k_u, w3_u, w4_k_u, w5_u, w6_k_u]

In [8]:
# function to get most likely word in a given set of words
def get_most_likely_word(word_list, CLASSES_LIST, res):
    
    prob = float(0.0)
    pred = ''
    
    ls = []

    for i in word_list:
        index = CLASSES_LIST.index(i)
        tmp_prob = res[index]
              
        if tmp_prob >  prob:
            prob = tmp_prob
            pred = i
    
    return pred,prob

#word = get_most_likely_word(w6_k_u, CLASSES_LIST, res)
#print(word)

In [9]:
from ipywidgets import IntProgress
from IPython.display import display
import cv2
from lips import crop_lips
import mediapipe as mp

# file_path = "..\\Dataset\\Urdu\\6\\aap_kitne_tha_han_kyun_ek\\_video.avi"

def predict_word_level(file_path, word_list, CLASSES_LIST, SEQUENCE_LENGTH):
    mp_holistic = mp.solutions.holistic  # Holistic model
    mp_drawing = mp.solutions.drawing_utils  # Drawing utilities


    # Declare a queue to store video frames.
    frames_queue = deque(maxlen = SEQUENCE_LENGTH)

    # Initialize a variable to store the predicted action being performed in the video.
    predicted_class_name = ''

    sentence = []
    predictions = []
    threshold = 0.0 # Result rendered only if they are above this threshold

    # Create a VideoCapture object and read from input file
    cap = cv2.VideoCapture(file_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    win_size = int((total_frames - 12) / 6) 

    w_cnt = 0

    # Progress bar
    #f = IntProgress(min=0, max=total_frames-12) # instantiate the bar
    #display(f) # display the bar

    # Check if camera opened successfully
    if (cap.isOpened()== False):
        print("Error opening video file")


    preds = []

    w0 = []
    w1 = []
    w2 = []
    w3 = []
    w4 = []
    w5 = []


    cnt = 0
    with mp_holistic.Holistic(min_detection_confidence=0.1, min_tracking_confidence=0.1) as holistic:    
        # Read until video is completed
        while(cap.isOpened()):

            # Capture frame-by-frame
            ret, frame = cap.read()

            # If frame correctly read only then performing predictions
            if ret == True:

                # Cropping lips
                cropped_image = crop_lips(frame, holistic)

                # Normalizing the cropped frame
                normalized_frame = cropped_image / 255    

                # Appending the pre-processed frame into the frames list.
                frames_queue.append(normalized_frame)

                # Check if the number of frames in the queue are equal to the fixed sequence length.
                if len(frames_queue) == SEQUENCE_LENGTH:

                    # Pass the normalized frames to the model and get the predicted probabilities.
                    res = LRCN_model.predict(np.expand_dims(frames_queue, axis = 0), verbose=0)[0]
                    
                    #print(CLASSES_LIST[np.argmax(res)])
                    
                    # Appending prediction in the Predictions List
                    predictions.append(np.argmax(res))

                    #preds.append(get_most_likely_word(word_list[0], CLASSES_LIST, res))
                    #preds.append(get_most_likely_word(word_list[1], CLASSES_LIST, res))
                    #preds.append(get_most_likely_word(word_list[2], CLASSES_LIST, res))
                    #preds.append(get_most_likely_word(word_list[3], CLASSES_LIST, res))
                    #preds.append(get_most_likely_word(word_list[4], CLASSES_LIST, res))
                    #preds.append(get_most_likely_word(word_list[5], CLASSES_LIST, res))
                    #sentence.append(pred)

                    if w_cnt < win_size:
                        p0_w, p0_acc = get_most_likely_word(word_list[0], CLASSES_LIST, res)
                        if p0_acc > threshold:
                            w0.append([p0_w, p0_acc])



                    if w_cnt > win_size and w_cnt < (2*win_size):
                        p1_w, p1_acc = get_most_likely_word(word_list[1], CLASSES_LIST, res)
                        if p1_acc > threshold:
                            w1.append([p1_w, p1_acc])

                    if w_cnt > (2*win_size) and w_cnt < (3*win_size):
                        p2_w, p2_acc = get_most_likely_word(word_list[2], CLASSES_LIST, res)
                        if p2_acc > threshold:
                            w2.append([p2_w, p2_acc])

                    if w_cnt > (3*win_size) and w_cnt < (4*win_size):
                        p3_w, p3_acc = get_most_likely_word(word_list[3], CLASSES_LIST, res)
                        if p3_acc > threshold:
                            w3.append([p3_w, p3_acc])

                    if w_cnt > (4*win_size) and w_cnt < (5*win_size):
                        p4_w, p4_acc = get_most_likely_word(word_list[4], CLASSES_LIST, res)
                        if p4_acc > threshold:
                            w4.append([p4_w,p4_acc])

                    if w_cnt > (5*win_size) :
                        p5_w, p5_acc = get_most_likely_word(word_list[5], CLASSES_LIST, res)
                        if p5_acc > threshold:
                            w5.append([p5_w,p5_acc])


                    w_cnt += 1

                    #Progress bar variables
                    #f.value += 1 # signal to increment the progress bar

                    #print(get_most_likely_word(word_list[5], CLASSES_LIST, res))
                    '''if np.unique(predictions[-5:])[0] == np.argmax(res):
                        if res[np.argmax(res)] > threshold:
                            print(CLASSES_LIST[np.argmax(res)])
                            if len(sentence) > 0:
                                if CLASSES_LIST[np.argmax(res)] != sentence[-1]:
                                    sentence.append(CLASSES_LIST[np.argmax(res)])
                            else:
                                sentence.append(CLASSES_LIST[np.argmax(res)])
                    '''

                    if len(sentence) < 8:

                        sentence.extend(np.unique(preds))


                #print(sentence)

                #cv2.putText(frame, ' '.join(sentence) , (2, 30),cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
                # Display the resulting frame
                #cv2.imshow('Frame', frame)


            # Press Q on keyboard to exit
                if cv2.waitKey(25) & 0xFF == ord('q'):
                    break

            # Break the loop
            else:
                break


     # When everything done, release the video capture object
    cap.release()

    # Closes all the frames
    cv2.destroyAllWindows()
    
    return [w0, w1, w2, w3, w4, w5]

In [10]:
# Returns word with highest probability in a window
def highest_prob_word(words):
    
    w = ''
    acc = 0
    for i in words:
        if i[1] > acc:
            w = i[0]
    
    return w

# Returns most frequently occuring word in a window
def most_freq_word(words):
    
    tmp_list = []
    
    for i in words:
        tmp_list.append(i[0])
    
    return max(set(tmp_list), key = tmp_list.count)

# General function to parse sentence [args : prediction results, function to be used for getting predicted words]
def parse_sentence(prediction, func):
    
    sen = func(prediction[0])+" "+func(prediction[1])+" "+func(prediction[2])+" "+func(prediction[3])+" "+func(prediction[4])+" "+func(prediction[5])
    
    return sen

In [39]:
#file_path = r"D:\FYP Workspace\Raw Data\8\aap_kitne_tha_han_kyun_ek\_video.avi"
file_path = r"C:\Users\ibrah\OneDrive\Desktop\Docs\7th Semester\FYP\Workspace 2\Word Level Lip Reading\Preprocessing\Unseen_videos\آپ_کتنے_تھا_نہیں_کیوں_نو.mp4"
prediction = predict_word_level(file_path, word_list, CLASSES_LIST, SEQUENCE_LENGTH)

Error opening video file


In [13]:

print(parse_sentence(prediction, most_freq_word))
print(parse_sentence(prediction, highest_prob_word))

میں کیسے تھا ہاں کون پانچھ
میں کیسے تھا ہاں کون سات


In [14]:
prediction

[[['میں', 0.025866712],
  ['میں', 0.026979527],
  ['میں', 0.02805288],
  ['میں', 0.029016558],
  ['میں', 0.029857717],
  ['میں', 0.03033493],
  ['میں', 0.03013219],
  ['میں', 0.029500723],
  ['میں', 0.02915101],
  ['میں', 0.028710244],
  ['میں', 0.028442157],
  ['میں', 0.02837359],
  ['میں', 0.0284607],
  ['میں', 0.028488124]],
 [['کتنے', 0.057769783],
  ['کتنے', 0.055372745],
  ['کتنے', 0.05331666],
  ['کتنے', 0.052140288],
  ['کتنے', 0.049522202],
  ['کدھر', 0.047862984],
  ['کدھر', 0.05155868],
  ['کدھر', 0.056878805],
  ['کدھر', 0.06279432],
  ['کدھر', 0.06671741],
  ['کیسے', 0.06756581],
  ['کیسے', 0.06873552],
  ['کیسے', 0.068394154]],
 [['ہے', 0.14933439],
  ['ہے', 0.15167901],
  ['ہے', 0.15165126],
  ['ہے', 0.15105468],
  ['ہے', 0.15043254],
  ['ہے', 0.14906704],
  ['ہے', 0.14743885],
  ['ہے', 0.14590706],
  ['ہے', 0.14322042],
  ['ہے', 0.13968848],
  ['ہے', 0.13542798],
  ['ہے', 0.13094649],
  ['ہے', 0.1265161]],
 [['نہیں', 0.15125707],
  ['نہیں', 0.15090811],
  ['نہیں', 0.150

# 3. Record and Predict Video

### - Gathering and Defining helper Functions

In [15]:
# Importing dependencies
# import the opencv library
import cv2
import mediapipe as mp
import math
import numpy as np

In [16]:
# Defining Mediapipe model
mp_holistic = mp.solutions.holistic  # Holistic model
mp_drawing = mp.solutions.drawing_utils  # Drawing utilities

In [18]:
filepath = "temp_video.mp4"

In [19]:
prediction = predict_word_level(filepath, word_list, CLASSES_LIST, SEQUENCE_LENGTH)

In [16]:
#On batch 5 weights
print(parse_sentence(prediction, most_freq_word))
print(parse_sentence(prediction, highest_prob_word))

میں کتنے تھا ہاں کب آٹھ
میں کتنے تھا نہیں کب آٹھ


In [20]:
#On batch 9 weights
print(parse_sentence(prediction, most_freq_word))
print(parse_sentence(prediction, highest_prob_word))

میں کیسے تھا ہاں کب آٹھ
میں کیسے تھا ہاں کون آٹھ


In [13]:
#On latest weights
print(parse_sentence(prediction, most_freq_word))
print(parse_sentence(prediction, highest_prob_word))

"""
میں کدھر تھا ہاں کب آٹھ
میں کدھر تھا ہاں کب آٹھ"""

میں کدھر تھا ہاں کب آٹھ
میں کدھر تھا ہاں کب آٹھ


In [29]:
len(prediction)

6

In [14]:
prediction

[[['وہ', 0.042594664],
  ['وہ', 0.040996153],
  ['وہ', 0.03893286],
  ['وہ', 0.040079854],
  ['وہ', 0.039443478],
  ['وہ', 0.03948737],
  ['وہ', 0.040604953],
  ['وہ', 0.04034933],
  ['وہ', 0.040025346],
  ['وہ', 0.040301472],
  ['وہ', 0.03747733],
  ['وہ', 0.032850724]],
 [['کونسا', 0.13111618],
  ['کونسا', 0.1165776],
  ['کونسا', 0.09973614],
  ['کونسا', 0.07442825],
  ['کونسا', 0.05353079],
  ['کیسے', 0.049553864],
  ['کتنے', 0.05043427],
  ['کتنے', 0.05059517],
  ['کتنے', 0.050401356],
  ['کتنے', 0.049986657],
  ['کتنے', 0.04930621]],
 [['تھا', 0.1235],
  ['تھا', 0.1266356],
  ['تھا', 0.12633795],
  ['تھا', 0.12282128],
  ['ہے', 0.11912547],
  ['ہے', 0.11510526],
  ['ہے', 0.10950241],
  ['ہے', 0.10403517],
  ['ہے', 0.09811872],
  ['ہے', 0.091240436],
  ['ہے', 0.08505084]],
 [['ہاں', 0.13626473],
  ['ہاں', 0.120318],
  ['نہیں', 0.09913375],
  ['نہیں', 0.08568436],
  ['نہیں', 0.07119999],
  ['نہیں', 0.062424388],
  ['نہیں', 0.06638001],
  ['نہیں', 0.07582516],
  ['نہیں', 0.08738724],

In [11]:
from Dataset import get_sentences
import random

sentences = get_sentences('..\\Dictionary\\roman_urdu_sentences.txt')


speaker_id = 0

#file_path = "..\\Dataset\\Urdu\\6\\aap_konsa_thay_jee_kon_ek\\_video.avi"

x_sen = []
y_sen_mf = []
y_sen_hp = []

cnt = 0

for i in sentences:
    
    print(cnt)
    
    # Appending in actual sentence list
    x_sen.append(i)
    
    sen = i.replace(' ','_')
    
    # Parsing filepath of the video file
    file_path = "D:\\FYP Workspace\\Raw Data\\" + str(speaker_id) + "\\" + sen + "\\_video.avi"

    # Making prediction on video and parsing sentencing
    prediction = predict_word_level(file_path, word_list, CLASSES_LIST, SEQUENCE_LENGTH)
    predicted_sentence = parse_sentence(prediction, highest_prob_word)
    
    # Appending in predicted sentence list
    y_sen_hp.append(parse_sentence(prediction, highest_prob_word))
    y_sen_mf.append(parse_sentence(prediction,most_freq_word))
    
    cnt += 1
    
    if cnt == 20:
        break



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [11]:
#Function to get urdu dictionary sentences
def get_urdu_sentences(path):
    #path = '..\\Dictionary\\roman_urdu_sentences.txt'

    # Using readlines()
    file = open(path, 'r', encoding='utf-8')
    lines = file.readlines()

    for i in range(len(lines)):
        lines[i] = lines[i].replace("\n", "")

    return lines

In [13]:
# Library to calculate WER


#print(len(x_sen))
#print(len(y_sen))
urdu_sen = get_urdu_sentences('..\\Dictionary\\urdu_sentences.txt')

total_wer_hp = 0
total_wer_mf = 0

print("------------------------------------")
for i in range(0, len(y_sen_mf)):
    
    #arg = (ref, hypothesis)
    wer_hp = wer(urdu_sen[i], y_sen_hp[i])
    wer_mf = wer(urdu_sen[i], y_sen_mf[i])
    total_wer_hp += wer_hp
    total_wer_mf += wer_mf
    
    print("Original : ",urdu_sen[i]) 
    print("Original : ",sentences[i]) 
    print("Predicted 1: " , y_sen_hp[i]," ",wer_hp)
    print("Predicted 2: " , y_sen_mf[i]," ",wer_mf)

    print("------------------------------------")
    

    
print("\n----------------------\n")
avg_wer_hp = total_wer_hp / len(y_sen_hp)
avg_wer_mf = total_wer_mf / len(y_sen_mf)
    
print("AVG WER HP : ",avg_wer_hp)    
print("AVG WER MF : ",avg_wer_mf)    

------------------------------------
Original :  وہ کیسے ہوں جی کب ایک
Original :  wo_kese_hoon_jee_kab_ek
Predicted 1:  وہ کونسا ہوں نہیں کب پانچھ   0.5
Predicted 2:  وہ کونسا ہوں نہیں کب آٹھ   0.5
------------------------------------
Original :  آپ کیسے ہے جی کیوں دو
Original :  aap_kese_hai_jee_kyun_do
Predicted 1:  میں کیسے تھا نہیں کیوں پانچھ   0.6666666666666666
Predicted 2:  وہ کیسے تھا ہاں کیوں دو   0.5
------------------------------------
Original :  وہ کیسے تھا جی کب تین
Original :  wo_kese_tha_jee_kab_theen
Predicted 1:  وہ کونسا تھا نہیں کب پانچھ   0.5
Predicted 2:  وہ کونسا تھے نہیں کب آٹھ   0.6666666666666666
------------------------------------
Original :  میں کیسے تھے جی کیوں چار
Original :  mein_kese_thay_jee_kyun_chaar
Predicted 1:  وہ کیسے تھا ہاں کون آٹھ   0.8333333333333334
Predicted 2:  وہ کونسا تھا ہاں کب آٹھ   1.0
------------------------------------
Original :  آپ کیسے تھا جی کیوں پانچھ
Original :  aap_kese_tha_jee_kyun_paanch
Predicted 1:  میں کیسے تھا جی کب پ

# Testing on Seen speaker with Unknown Sentence

In [44]:
unseen_urdu_sen = get_urdu_sentences('.\\Unseen_videos\\unseen_urdu_sentences.txt')

In [45]:

directory_path = ".\\unseen_videos\\"

# get a list of all files and directories in the directory_path
dir_ls = os.listdir(directory_path)

In [46]:
dir_ls

['unseen_urdu_sentences.txt',
 'آپ_کتنے_تھا_نہیں_کیوں_نوں.mp4',
 'آپ_کدھر_تھے_نہیں_کیوں_چھے.mp4',
 'آپ_کدھر_ہے_جی_کون_چھے.mp4',
 'آپ_کدھر_ہے_نہیں_کون_نوں.mp4',
 'میں_کتنے_تھا_ہاں_کب_نوں.mp4',
 'میں_کتنے_ہوں_جی_کون_چھے.mp4',
 'میں_کتنے_ہے_ہاں_کب_چھے.mp4',
 'میں_کدھر_تھا_جی_کیوں_نوں.mp4',
 'وہ_کونسا_تھے_جی_کون_نوں.mp4',
 'وہ_کونسا_تھے_نہیں_کیوں_نوں.mp4',
 'وہ_کونسا_ہوں_ہاں_کون_نوں.mp4',
 'وہ_کونسا_ہے_ہاں_کون_چھے.mp4',
 'وہ_کیسے_تھا_نہیں_کون_نوں.mp4',
 'وہ_کیسے_تھا_ہاں_کب_چھے.mp4',
 'وہ_کیسے_تھے_جی_کون_نوں.mp4',
 'وہ_کیسے_ہے_نہیں_کب_چھے.mp4',
 'ہم_کتنے_ہوں_نہیں_کب_چھے.mp4',
 'ہم_کتنے_ہے_جی_کیوں_دو.mp4',
 'ہم_کدھر_تھا_ہاں_کیوں_دو.mp4',
 'ہم_کدھر_تھے_ہاں_کون_چھے.mp4',
 'ہم_کونسا_تھے_جی_کب_چھے.mp4',
 'ہم_کونسا_ہوں_نہیں_کب_چھے.mp4',
 'ہم_کیسے_تھا_ہاں_کیوں_دو.mp4',
 'ہم_کیسے_ہوں_جی_کیوں_چھے.mp4']

In [51]:
#prediction on each video

avg_wer_hp = 0
avg_wer_mf = 0
for x in unseen_urdu_sen:
    filepath = ".\\Unseen_videos\\" + x.replace(" ", "_") + ".mp4"
    print(filepath)
    prediction = predict_word_level(filepath, word_list, CLASSES_LIST, SEQUENCE_LENGTH)
    hp = parse_sentence(prediction, highest_prob_word)
    mf = parse_sentence(prediction, most_freq_word)
    
    print("Original : ",x)

    wer_hp = wer(x, hp)
    wer_mf = wer(x, mf)
    print("HP       :",hp," ",wer_hp)
    print("MF       :",mf," ",wer_mf)
    
    avg_wer_hp += wer_hp
    avg_wer_mf += wer_mf

print("Avg WER HP : ", avg_wer_hp/len(unseen_urdu_sen))
print("Avg WER MF : ", avg_wer_mf/len(unseen_urdu_sen))

.\Unseen_videos\ہم_کدھر_تھا_ہاں_کیوں_دو.mp4
Original :  ہم کدھر تھا ہاں کیوں دو
HP       : وہ کیسے تھا ہاں کون دو   0.5
MF       : وہ کیسے تھا ہاں کون دو   0.5
.\Unseen_videos\وہ_کونسا_تھے_نہیں_کیوں_نوں.mp4
Original :  وہ کونسا تھے نہیں کیوں نوں
HP       : وہ کونسا تھا ہاں کیوں دو   0.5
MF       : ہم کونسا ہوں نہیں کب دو   0.6666666666666666
.\Unseen_videos\ہم_کیسے_تھا_ہاں_کیوں_دو.mp4
Original :  ہم کیسے تھا ہاں کیوں دو
HP       : میں کیسے تھا نہیں کیوں دو   0.3333333333333333
MF       : میں کیسے تھا ہاں کیوں دو   0.16666666666666666
.\Unseen_videos\ہم_کونسا_ہوں_نہیں_کب_چھے.mp4
Original :  ہم کونسا ہوں نہیں کب چھے
HP       : میں کونسا ہوں نہیں کب سات   0.3333333333333333
MF       : میں کونسا ہوں جی کب سات   0.5
.\Unseen_videos\وہ_کونسا_تھے_جی_کون_نوں.mp4
Original :  وہ کونسا تھے جی کون نوں
HP       : وہ کونسا تھے جی کون پانچھ   0.16666666666666666
MF       : ہم کونسا تھا نہیں کون پانچھ   0.6666666666666666
.\Unseen_videos\وہ_کونسا_ہے_ہاں_کون_چھے.mp4
Original :  وہ کونسا ہے ہاں کون چھے


## Test Results:
### 1. Seen speaker seen sentence: WER    = 0.61
### 2. Unseen speaker seen sentence WER = 0.65
### 2. Seen speaker unseen sentence WER = 0.63

In [4]:
import time
time.time()

1684174810.2622097