# Prototype of lipreading pipeline

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

import random
import math
import time

# model file, encoder, decoder and seqtoseq
from model_temp import *
# utils file
from utils import *
# Get landmark using vocadataset.py
from data.vocaset import *

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
# Training split served as landmarks, with the mouthOnly option switched on.
trainset = vocadataset("train", landmark=True, mouthOnly=True)

# Batches of two samples, assembled by the project's collate_fn.
trainloader = DataLoader(trainset, batch_size=2, collate_fn=collate_fn)

# Draw one batch; the rest of this prototype trains on this batch only.
batch_iterator = iter(trainloader)
landmarks, len_landmark, label, len_label = next(batch_iterator)


In [3]:
# Build the output symbol vocabulary, passing "@" as the blank symbol.
vocabulary = create_vocabulary(blank="@")

In [4]:
# Lookup table: vocabulary symbol -> its position in `vocabulary`.
char_to_index = dict(
    (symbol, position) for position, symbol in enumerate(vocabulary)
)

In [5]:
# Convert the batch labels to their index form with the project helper
# char_to_index_batch; label_t[i] is later used as the CTC target for sample i.
label_t = char_to_index_batch(label, vocabulary)

In [6]:
# Width of one flattened input frame.
# NOTE(review): presumably 36 landmarks x 3 coordinates each — confirm against the dataset.
INPUT_DIM = 36*3
HID_DIM = 512
# One output class per vocabulary symbol.
output_dim = len(vocabulary)

# NOTE(review): the third positional argument (8) is defined by only_Decoder in
# model_temp; consider giving it a named constant once its meaning is confirmed.
model = only_Decoder(INPUT_DIM, HID_DIM, 8, output_dim).to(device)



    Found GPU1 NVIDIA GeForce GTX TITAN which is of cuda capability 3.5.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is 3.7.
    


In [None]:
# Inspect the shape of the landmark batch drawn from the loader.
landmarks.shape

In [None]:
# Shape check: merge the two trailing axes of the first sample and add a
# leading batch axis, i.e. the form the training loop feeds to the model.
first_sample = landmarks[0]
first_sample_dims = first_sample.shape
flattened_first = torch.reshape(
    first_sample, (first_sample_dims[0], first_sample_dims[1] * first_sample_dims[2])
)
print(flattened_first[None, :, :].shape)

# Shape of the first sample's length entry after adding two leading axes.
first_length = len_landmark[0]
print(first_length[None, None].shape)

In [12]:
# CTC loss with class index 1 treated as the blank symbol.
# NOTE(review): presumably index 1 is where create_vocabulary places "@" — confirm.
ctc_loss = nn.CTCLoss(blank=1)

# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: repeatedly fit the single batch drawn earlier, one sample per step.
num_epochs = 1000
for epoch in range(num_epochs):

    for i in range(landmarks.shape[0]):
        optimizer.zero_grad()

        # Merge the two trailing axes of the sample and move it to the training
        # device. The batch tensors themselves are left untouched: writing device
        # copies back into them only added a redundant transfer per step.
        sample = landmarks[i]
        reshaped_landmark = torch.reshape(
            sample, (sample.shape[0], sample.shape[1] * sample.shape[2])
        ).to(device)
        target_tensor = label_t[i].to(device)

        # Add a leading batch axis of size 1 for the model; the length entry is
        # passed as stored in the batch.
        output = model(reshaped_landmark[None, :, :], len_landmark[i][None])
        # CTCLoss expects (time, batch, classes).
        output = output.permute(1, 0, 2)

        # One length entry per batch element (batch size is 1 here).
        # NOTE(review): both lengths span the full tensors; if collate_fn pads,
        # len_landmark[i] / len_label[i] may be the intended values — confirm.
        input_lengths = torch.full((1,), output.size(0), dtype=torch.long)
        target_lengths = torch.full((1,), target_tensor.size(0), dtype=torch.long)

        loss = ctc_loss(
            torch.nn.functional.log_softmax(output, dim=2),
            target_tensor,
            input_lengths,
            target_lengths,
        )
        loss.backward()
        optimizer.step()

        # Every 100 epochs, append the target and the greedy (argmax) decoding
        # of this sample to a log file. Decoding is only done when it is logged.
        if (epoch + 1) % 100 == 0:
            e = torch.argmax(output, dim=2).squeeze(1)
            output_sequence = ''.join([vocabulary[index] for index in e])
            with open("prova_.txt", "a") as f:
                f.write(label[i]+"\n")
                f.write(output_sequence+"\n")

    # Progress report; the value shown is the loss of the last sample of the epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")



Epoch [10/1000], Loss: 3.078540325164795
Epoch [20/1000], Loss: 2.42348313331604
Epoch [30/1000], Loss: 2.2838854789733887
Epoch [40/1000], Loss: 2.1356260776519775
Epoch [50/1000], Loss: 1.9883315563201904
Epoch [60/1000], Loss: 1.9357656240463257
Epoch [70/1000], Loss: 1.863783597946167
Epoch [80/1000], Loss: 1.811815857887268
Epoch [90/1000], Loss: 1.7709523439407349
Epoch [100/1000], Loss: 1.7361414432525635
Epoch [110/1000], Loss: 1.7396267652511597
Epoch [120/1000], Loss: 2.271461009979248
Epoch [130/1000], Loss: 1.859091877937317
Epoch [140/1000], Loss: 1.7804906368255615
Epoch [150/1000], Loss: 1.7234482765197754
Epoch [160/1000], Loss: 1.699995756149292
Epoch [170/1000], Loss: 1.7250819206237793
Epoch [180/1000], Loss: 1.6569994688034058
Epoch [190/1000], Loss: 1.5988686084747314
Epoch [200/1000], Loss: 1.90243661403656
Epoch [210/1000], Loss: 1.666096806526184
Epoch [220/1000], Loss: 1.5807631015777588
Epoch [230/1000], Loss: 2.7744736671447754


KeyboardInterrupt: 

In [None]:
# Inspect the shape of the most recent model output (already permuted in the training loop).
output.shape

In [None]:
# Greedy decode: take the highest-scoring class at every time step of the
# most recent model output and map the indices back to vocabulary symbols.
output_indices = torch.argmax(output, dim=2).squeeze(1)
output_sequence = ''.join([vocabulary[index] for index in output_indices])

# `output` and `i` are both left over from the last training step, so label[i]
# is the target that belongs to this output. The name `labels` is not assigned
# anywhere in this notebook; the batch variable is `label`.
print("Target Sequence:", label[i].replace("@","").replace("#",""))
print("Decoded Output:", process_string(output_sequence))