# Prototype of lipreading pipeline

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

#from torchtext.legacy.datasets import Multi30k
#from torchtext.legacy.data import Field, BucketIterator

#import spacy
import numpy as np

import random
import math
import time

# Model definitions (Encoder, Decoder and Seq2Seq)
from model import *
# utils file
from utils import *
# Landmark dataset utilities, star-imported from data/vocaset.py
from data.vocaset import *

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
# Training split of the dataset. `landmark=True` presumably selects the
# landmark representation of each sample -- confirm in the dataset class.
trainset = vocadataset("train", landmark=True)
# Batches of 8 samples assembled by the project's `collate_fn`; the training
# loop further down unpacks every batch as
# (landmarks, len_landmark, label, len_label).
dataloader = DataLoader(trainset, batch_size=8, collate_fn=collate_fn)

In [3]:
# Build the character vocabulary. As the value displayed in the next cell
# shows, it is a list of characters, so a character's index is simply its
# position in that list.
# NOTE(review): what `blank='@'` controls is not visible from this notebook --
# confirm in create_vocabulary.
vocabulary = create_vocabulary(blank='@')

In [10]:
vocabulary

['-',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '.',
 '?',
 ',',
 '!',
 "'",
 '@',
 '#',
 ' ']

In [4]:
# Number of landmarks per frame (presumably a 68-point face landmark set --
# confirm against the dataset).
LANDMARK_DIM = 68
# Per-frame feature size: the training loop flattens each frame from
# [num_landmark, 3] to num_landmark * 3 values.
INPUT_DIM = LANDMARK_DIM*3
# Passed to both Encoder and Decoder; presumably their hidden-state size.
HID_DIM = 128
# One output class per vocabulary entry.
output_dim = len(vocabulary)

enc = Encoder(INPUT_DIM, HID_DIM)
dec = Decoder(output_dim, HID_DIM)
# NOTE(review): 'cpu' is hard-coded here although `device` is computed in the
# first cell -- confirm whether that is intentional. Switching would also
# require moving the model and every batch to that device.
model = Seq2Seq(enc, dec, 'cpu')


In [5]:
# Batched training with CTC loss.

# NOTE(review): nn.CTCLoss() uses blank=0 by default, which is '-' in the
# vocabulary displayed above, while create_vocabulary was called with
# blank='@' -- confirm which symbol is meant to be the CTC blank.
ctc_loss = nn.CTCLoss()

# Adam over every model parameter.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    # Progress is printed, and a checkpoint written, on every 10th epoch only.
    report_epoch = (epoch + 1) % 10 == 0

    for landmarks, len_landmark, label, len_label in dataloader:
        # Flatten each frame:
        # [batch_size, frame_size, num_landmark, 3] -> [batch_size, frame_size, num_landmark * 3]
        landmarks = landmarks.flatten(start_dim=2)

        optimizer.zero_grad()

        # Convert the label characters to vocabulary indices.
        label = char_to_index_batch(label, vocabulary)

        # NOTE(review): the recorded run of this cell stopped with
        # "RuntimeError: For batched 3-D input, hx should also be 3-D but got
        # 4-D tensor". That presumably comes from the hidden state handed to a
        # recurrent layer inside the model, so it has to be fixed in model.py.
        output = model(landmarks, label)

        # CTCLoss expects one input length per sample in the batch, i.e. a
        # tensor of shape (batch_size,) -- not a single-element tensor. Every
        # sample is given the full output length here (output.size(0) is used
        # as the time dimension, as in the single-sample loop).
        # TODO(review): if `len_landmark` holds the true unpadded frame counts,
        # passing those instead would stop padding from counting as input.
        input_lengths = torch.full(
            (landmarks.shape[0],), output.size(0), dtype=torch.long
        )

        loss = ctc_loss(output, label, input_lengths, len_label)
        loss.backward()
        optimizer.step()

        if report_epoch:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

    # Write the checkpoint once per reported epoch instead of after every
    # batch; the saved weights are the same as after the epoch's last batch.
    if report_epoch:
        torch.save(model.state_dict(), "models/model.pt")




RuntimeError: For batched 3-D input, hx should also be 3-D but got 4-D tensor

In [None]:
# Single-sample variant of the training loop. It relies on `reshaped_landmark`
# and `target_tensor`, which are not defined in the cells shown in this notebook.

# Loss criterion (CTC)
ctc_loss = nn.CTCLoss()

# Adam over every model parameter
optimizer = optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Forward pass; the target is given a leading dimension of size 1.
    output = model(reshaped_landmark, target_tensor.unsqueeze(0))

    # One input length (the full output length) for the single sample.
    input_lengths = torch.tensor([output.size(0)], dtype=torch.long)
    # Filled with the target length; only its first element is passed on.
    target_len = target_tensor.size(0)
    target_lengths = torch.full((target_len,), target_len, dtype=torch.long)

    loss = ctc_loss(output, target_tensor, input_lengths, target_lengths[0])
    loss.backward()
    optimizer.step()

    # Every 10th epoch: report the loss and overwrite the checkpoint.
    if epoch % 10 == 9:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")
        # To inspect predictions, take torch.argmax(output, dim=2).squeeze(1)
        # and join vocabulary[index] over the resulting indices.
        torch.save(model.state_dict(), "models/model.pt")




In [None]:
# Decode the output sequence
#output_indices = torch.argmax(output, dim=2).squeeze(1)
#output_sequence = ''.join([vocabulary[index] for index in output_indices])

#print("Target Sequence:", labels.replace("@","").replace("#",""))
#print("Decoded Output:", process_string("-she had your dark suit in greasy  wwwaash  waterr  all-ll-  --yy--eeaa-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------"))