# Prototype of lipreading pipeline

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

#from torchtext.legacy.datasets import Multi30k
#from torchtext.legacy.data import Field, BucketIterator

#import spacy
import numpy as np

import random
import math
import time

# Model definitions: Encoder, Decoder and Seq2Seq
from model import *
# utils file
from utils import *
# Landmark dataset helpers from the data.vocaset module (presumably where vocadataset is defined)
from data.vocaset import *

# Use the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Preview the symbol set built with '-' as blank, '@' as start and '#' as stop.
vocab_preview = vocabulary(blank='-', start='@', stop='#')
print(vocab_preview)

In [None]:
# Build the "train" split of the vocadataset class with landmarks enabled,
# then unpack its first sample into the landmark data and its label.
trainset = vocadataset("train", landmark=True)
first_sample = trainset[0]
landmark, labels = first_sample

In [None]:
# Wrap the raw label in the start ('@') and stop ('#') characters,
# printing it before and after so the change is visible.
print("Before:", labels)
labels = ''.join(('@', labels, '#'))
print("After:", labels)

In [None]:
# Materialise the vocabulary once. The assignment rebinds the name
# `vocabulary` from the factory function (presumably provided by one of the
# star imports) to the value it returns, so re-running this cell unguarded
# would try to call that value and fail. The callable() check keeps the cell
# re-runnable: it only calls while the name still refers to the factory.
if callable(vocabulary):
    vocabulary = vocabulary(blank='-', start='@', stop='#')

In [None]:
# Pair every vocabulary symbol with its position so that label characters
# can be looked up as integer indices.
char_to_index = dict(zip(vocabulary, range(len(vocabulary))))
print(char_to_index)

In [None]:
# Encode each character of `labels` as its vocabulary index (an unknown
# character raises KeyError) and pack the indices into a tensor.
target_indices = list(map(char_to_index.__getitem__, labels))
target_tensor = torch.tensor(target_indices)

In [None]:
# Hyper-parameters. Each frame is flattened to 68 * 3 values
# (presumably 68 landmarks x 3 coordinates - confirm against the dataset).
# NOTE(review): EMB_DIM = 0 and N_LAYERS = 0 look like placeholders; confirm
# that Encoder/Decoder accept them.
INPUT_DIM = 68 * 3
EMB_DIM = 0
HID_DIM = 128
N_LAYERS = 0
DROPOUT = 0
output_dim = len(vocabulary)  # one output class per vocabulary symbol

# Assemble encoder, decoder and the wrapping Seq2Seq model. The third Seq2Seq
# argument is the literal 'cpu' (presumably the device); moving the model with
# .to(device) is intentionally left disabled.
enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
dec = Decoder(output_dim, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
model = Seq2Seq(enc, dec, 'cpu')

# Merge the last two axes of the landmark tensor so every row along axis 0
# becomes one flat vector.
reshaped_landmark = landmark.reshape(
    landmark.shape[0],
    landmark.shape[1] * landmark.shape[2],
)

# Sentinel rows: an all-zeros row placed before the sequence and an all-ones
# row placed after it.
start_landmark = torch.zeros(1, INPUT_DIM)
stop_landmark = torch.ones(1, INPUT_DIM)

# NOTE(review): the training cell feeds `reshaped_landmark` to the model, not
# `final_landmarks` - confirm which one is intended.
final_landmarks = torch.cat([start_landmark, reshaped_landmark, stop_landmark], dim=0)

In [None]:
# Fit the model to the single (landmark, label) sample with CTC loss, printing
# the loss and the raw per-step argmax string every 10 epochs.
# NOTE(review): nn.CTCLoss expects log-probabilities and treats class index 0
# as the blank by default - confirm that the model output is log-softmaxed and
# that '-' sits at index 0 of the vocabulary.
ctc_loss = nn.CTCLoss()

# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# The target length never changes, so build it once. CTCLoss takes one length
# per batch element; the batch holds a single sample, hence shape (1,), which
# matches the shape of input_lengths below.
target_lengths = torch.tensor([target_tensor.size(0)], dtype=torch.long)

# Training loop
num_epochs = 10000
for epoch in range(num_epochs):
    optimizer.zero_grad()

    # The model also receives the target with a leading batch axis added.
    output = model(reshaped_landmark, target_tensor[None, :])

    # One input length for the single batch element, taken from axis 0 of the
    # model output.
    input_lengths = torch.full((1,), output.size(0), dtype=torch.long)

    loss = ctc_loss(output, target_tensor, input_lengths, target_lengths)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")
        # Most likely class at every output step, batch axis removed.
        e = output.argmax(dim=2).squeeze(1)
        output_sequence = ''.join([vocabulary[index] for index in e])
        print(output_sequence)



In [None]:
# Greedy decoding of the last model output: take the most likely class at
# every step, drop the batch axis and map the indices back to symbols.
output_indices = output.argmax(dim=2).squeeze(1)
output_sequence = ''.join(vocabulary[index] for index in output_indices)

# Show the label without its start/stop markers next to the prediction after
# process_string has post-processed it.
print("Target Sequence:", labels.replace("@","").replace("#",""))
print("Decoded Output:", process_string(output_sequence))