In [1]:
import os
import cv2
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
import gdown

In [2]:
# import gdown
# url = 'https://drive.google.com/uc?id=1YlvpDLix3S-U8fd-gqRwPcWXAXm8JwjL'
# output = 'data.zip'
# gdown.download(url, output, quiet=False)
# gdown.extractall('data.zip')

In [3]:
def load_video(path):

    cap = cv2.VideoCapture(path)
    frames = []
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret, frame = cap.read()
        frame = tf.image.rgb_to_grayscale(frame)
        frames.append(frame[190:236,80:220,:])
    cap.release()

    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std

In [4]:
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz "]
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

In [5]:
def load_alignments(path):
    with open(path, 'r') as f:
        lines = f.readlines()
    tokens = []
    for line in lines:
        line = line.split()
        if line[2] != 'sil':
            tokens = [*tokens,' ',line[2]]
    return char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]

In [6]:
def load_data(path: str):
    path = bytes.decode(path.numpy())
    file_name = os.path.basename(path).split('.')[0]
    # print(file_name)
    video_path = os.path.join('data','s1',f'{file_name}.mpg')
    alignment_path = os.path.join("data", "alignments", "s1", f"{file_name}.align")
    # print(video_path, alignment_path)
    frames = load_video(video_path)
    alignments = load_alignments(f"{alignment_path}")

    return frames, alignments

In [7]:
def map_function(path):
    result = tf.py_function(load_data, [path], (tf.float32, tf.int64))
    return result

In [9]:
data = tf.data.Dataset.list_files('data/s1/*.mpg')
data = data.shuffle(500, reshuffle_each_iteration=False)
data = data.map(map_function)
data = data.padded_batch(2, padded_shapes=([75,None,None,None],[40]))
data = data.prefetch(tf.data.AUTOTUNE)
# Added for split
train = data.take(450)
test = data.skip(450)

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, GRU, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, Callback

In [11]:
model = Sequential()
model.add(Conv3D(128, 3, input_shape=(75,46,140,1), padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

model.add(Conv3D(256, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

model.add(Conv3D(75, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

model.add(TimeDistributed(Flatten()))

model.add(Bidirectional(GRU(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

model.add(Bidirectional(GRU(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

model.add(Dense(char_to_num.vocabulary_size()+1, kernel_initializer='he_normal', activation='softmax'))

In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv3d (Conv3D)             (None, 75, 46, 140, 128   3584      
                             )                                   
                                                                 
 activation (Activation)     (None, 75, 46, 140, 128   0         
                             )                                   
                                                                 
 max_pooling3d (MaxPooling3  (None, 75, 23, 70, 128)   0         
 D)                                                              
                                                                 
 conv3d_1 (Conv3D)           (None, 75, 23, 70, 256)   884992    
                                                                 
 activation_1 (Activation)   (None, 75, 23, 70, 256)   0         
                                                        

In [13]:
def CTCLoss(y_true, y_pred):
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss

In [14]:
url = 'https://drive.google.com/u/0/uc?id=1ZjV3BqqMk95gZLgy126P0lw8b-93guTj&export=download'
output = 'pre_models.zip'
gdown.download(url, output, quiet=False)
gdown.extractall('pre_models.zip')

Downloading...
From (uriginal): https://drive.google.com/u/0/uc?id=1ZjV3BqqMk95gZLgy126P0lw8b-93guTj&export=download
From (redirected): https://drive.google.com/uc?id=1ZjV3BqqMk95gZLgy126P0lw8b-93guTj&export=download&confirm=t&uuid=e6ec09b4-cb4c-47db-a424-f0fa1e258f92
To: c:\Users\Rajvi Zala\streamlit-app\pre_models.zip
100%|██████████| 75.4M/75.4M [01:50<00:00, 679kB/s] 


['models/.ipynb_checkpoints/',
 'models/checkpoint',
 'models/checkpoint.data-00000-of-00001',
 'models/checkpoint.index']

In [15]:
model.compile(optimizer = Adam(learning_rate=0.0001), loss=CTCLoss)

In [16]:
model.load_weights("models/checkpoint")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1bd277abfa0>

In [17]:
sample = data.as_numpy_iterator().next()
sample

(array([[[[[1.4137809 ],
           [1.4137809 ],
           [1.4509857 ],
           ...,
           [0.33484286],
           [0.3720476 ],
           [0.3720476 ]],
 
          [[1.4881904 ],
           [1.5253952 ],
           [1.4881904 ],
           ...,
           [0.33484286],
           [0.3720476 ],
           [0.3720476 ]],
 
          [[1.4137809 ],
           [1.4137809 ],
           [1.3393714 ],
           ...,
           [0.33484286],
           [0.33484286],
           [0.33484286]],
 
          ...,
 
          [[1.0045285 ],
           [1.0045285 ],
           [0.9673238 ],
           ...,
           [0.07440952],
           [0.03720476],
           [0.03720476]],
 
          [[1.0045285 ],
           [1.0045285 ],
           [0.9673238 ],
           ...,
           [0.03720476],
           [0.        ],
           [0.        ]],
 
          [[1.0045285 ],
           [1.0045285 ],
           [0.9673238 ],
           ...,
           [0.03720476],
           [0.        

In [18]:
result = model.predict(sample[0])



In [19]:
decoded = tf.keras.backend.ctc_decode(result, [75,75], greedy=False)[0][0].numpy()
for x in range(len(result)):
    print('Original:', tf.strings.reduce_join(num_to_char(sample[1][x])).numpy().decode('utf-8'))
    print('Prediction:', tf.strings.reduce_join(num_to_char(decoded[x])).numpy().decode('utf-8'))
    print('~'*100)

Original: bin red in f four now
Prediction: bin red in four now
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Original: lay green by g zero please
Prediction: lay gren by zero please
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [20]:
import pickle
# create an iterator object with write permission - model.pkl
with open('model_pkl', 'wb') as files:
    pickle.dump(model, files)

In [21]:
model.evaluate(test)



4.4354681968688965

In [22]:
frames = load_video("bbab8n.mpg")
plt.imshow(frames[4])

InvalidArgumentError: {{function_node __wrapped__StridedSlice_device_/job:localhost/replica:0/task:0/device:CPU:0}} slice index 4 of dimension 0 out of bounds. [Op:StridedSlice] name: strided_slice/

In [None]:
result = model.predict(tf.expand_dims(frames, axis=0))



In [None]:
decoded = tf.keras.backend.ctc_decode(result, [75], greedy=False)[0][0].numpy()

In [None]:
tf.strings.reduce_join(num_to_char(decoded)).numpy().decode('utf-8')

'set blue a e sosn'