In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

#from torchtext.legacy.datasets import Multi30k
#from torchtext.legacy.data import Field, BucketIterator

#import spacy
import numpy as np

import random
import math
import time

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [207]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        
        #self.embedding = nn.Embedding(input_dim, emb_dim) FOR NOW WE DON't HAVE INPUT EMBEDDINGS
        
        self.rnn = nn.RNN(input_size=68*3, hidden_size=128, bidirectional=True, batch_first=True)
        
        #self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):

        #The input to the encoder are the landmarks
        #x = [batch size, sequence_len , 68*3]
        
        #embedded = self.dropout(self.embedding(src))FOR NOW WE DON't HAVE INPUT EMBEDDINGS
        
        #embedded = [src len, batch size, emb dim]

        out, hid = self.rnn(x)
        
        #out = [batch size, src len, hid dim * n directions]
        #hidd = [n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer

        #print("ENCODER: hid.shape: ", hid.shape)
        print("ENCODER: hid:\n", hid)
        return hid

In [198]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        #self.n_layers = n_layers
        
        #self.embedding = nn.Embedding(output_dim, emb_dim)FOR NOW WE DON't HAVE INPUT EMBEDDINGS
        self.rnn = nn.RNN(input_size=1, hidden_size=128, bidirectional=True, batch_first=True)
        
        self.fc_out = nn.Linear(2*hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        #input = input.unsqueeze(0).unsqueeze(0)COMMENTED TO DEBUG

        #print("DECODER: input.shape", input.shape)
        
        #input = [batch size, 1]
        
        #embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        output, hidden = self.rnn(input.to(torch.float32), hidden)
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.fc_out(output.squeeze(0))
        
        #prediction = [batch size, output dim]

        #print("DECODER: prediction.shape", prediction.shape)
        
        return prediction, hidden

In [199]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        #assert encoder.hid_dim == decoder.hid_dim, \
        #    "Hidden dimensions of encoder and decoder must be equal!"
        #assert encoder.n_layers == decoder.n_layers, \
        #    "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[0]
        trg_len = src.shape[0]#FIXME Forse qui non ho passato i landmark con la dimensione del batch in testa
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden = self.encoder(src)
        hidden = hidden.unsqueeze(1)
        #print("SEQ2SEQ: hidden.shape: ", hidden.shape)
        
        #first input to the decoder is the <sos> tokens
        input = torch.tensor(31).unsqueeze(0).unsqueeze(0).unsqueeze(0)#trg[0,:] 31 is the index of <sos>
        #print("SEQ2SEQ: input.shape: ", input.shape)
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            #print("SEQ2SEQ-FOR: input.shape: ", input.shape)
            #print("SEQ2SEQ-FOR: hidden.shape: ", hidden.shape)
            output, hidden = self.decoder(input, hidden)
            #print("SEQ2SEQ-FOR: output.shape: ", output.shape)
            
            output = output.unsqueeze(0)#ADDED TO BE CHECKED

            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(2) 

            input = top1.unsqueeze(0)
            #print("SEQ2SEQ-FOR: top1.shape: ", top1.shape)
        
        return outputs

In [101]:
import string
vocabulary = ['-'] + list(string.ascii_lowercase) + ['.', '?', ',', '!'] + ['@', '#', ' ']

In [102]:
vocabulary

['-',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '.',
 '?',
 ',',
 '!',
 '@',
 '#',
 ' ']

In [7]:
# Get landmark using vocadataset.py
from data.vocaset import *

In [8]:
trainset = vocadataset("train", landmark=True)

In [103]:
landmark, labels = trainset[0]
print(labels)

she had your dark suit in greasy wash water all year.


In [106]:
labels

'@she had your dark suit in greasy wash water all year.#'

In [105]:
labels = '@'+labels+'#'

'<sos>she had your dark suit in greasy wash water all year.\n<eos>'

In [39]:
input_dim = 68*3
emb_dim = 0
hid_dim = 128
n_layers = 0
dropout = 0


#model = Encoder(input_dim, emb_dim, hid_dim, n_layers, dropout)

reshaped_landmark = torch.reshape(landmark, (landmark.shape[0], landmark.shape[1]*landmark.shape[2]))
start_landmark = torch.zeros(1, 68*3)
stop_landmark = torch.ones(1, 68*3)

final_landmarks = torch.cat((start_landmark, reshaped_landmark, stop_landmark), 0)

#out = model(reshaped_landmark)

In [107]:
# Create a mapping from characters to indices
char_to_index = {char: index for index, char in enumerate(vocabulary)}

In [206]:
char_to_index

{'-': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 27,
 '?': 28,
 ',': 29,
 '!': 30,
 '@': 31,
 '#': 32,
 ' ': 33}

In [109]:
# Convert the sequence and target to indices
#sequence_indices = [char_to_index[char] for char in sequence]
target_indices = [char_to_index[char] for char in labels]
target_tensor = torch.tensor(target_indices)

In [110]:
target_indices

[31,
 19,
 8,
 5,
 33,
 8,
 1,
 4,
 33,
 25,
 15,
 21,
 18,
 33,
 4,
 1,
 18,
 11,
 33,
 19,
 21,
 9,
 20,
 33,
 9,
 14,
 33,
 7,
 18,
 5,
 1,
 19,
 25,
 33,
 23,
 1,
 19,
 8,
 33,
 23,
 1,
 20,
 5,
 18,
 33,
 1,
 12,
 12,
 33,
 25,
 5,
 1,
 18,
 27,
 32]

In [208]:
output_dim = len(vocabulary)

enc = Encoder(input_dim, emb_dim, hid_dim, n_layers, dropout)
dec = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout)

model = Seq2Seq(enc, dec, 'cpu')#.to(device)

reshaped_landmark = torch.reshape(landmark, (landmark.shape[0], landmark.shape[1]*landmark.shape[2]))
start_landmark = torch.zeros(1, 68*3)
stop_landmark = torch.ones(1, 68*3)

final_landmarks = torch.cat((start_landmark, reshaped_landmark, stop_landmark), 0)



In [209]:
# Define the CTC loss function
ctc_loss = nn.CTCLoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    optimizer.zero_grad()
    output = model(reshaped_landmark, target_tensor[None,:])

    #output = output[None,:,:]
    #output = output.permute(1, 0, 2)  # Swap batch and sequence dimensions
    
    e = torch.argmax(output, dim=2).squeeze(1)
    output_sequence = ''.join([vocabulary[index] for index in e])
    print(output_sequence)

    
    input_lengths = torch.full((1,), output.size(0), dtype=torch.long)
    target_lengths = torch.full((target_tensor.size(0),), target_tensor.size(0), dtype=torch.long)
    
    loss = ctc_loss(output, target_tensor, input_lengths, target_lengths[0])
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")



ENCODER: hid:
 tensor([[-1.6063e-01, -6.9836e-02, -1.1997e-01, -2.8645e-02,  1.3352e-01,
          1.6212e-01,  1.2018e-01,  2.7975e-01,  1.2428e-01, -4.5522e-02,
         -1.6396e-01,  1.6925e-01, -3.5751e-02,  5.3424e-02,  2.1607e-03,
         -2.9854e-03, -8.8274e-02,  2.3255e-01, -6.0750e-02,  2.7966e-01,
          7.7010e-02,  2.5489e-01, -2.0631e-02,  1.4633e-01, -1.3169e-01,
          1.4802e-01, -6.8587e-02, -9.7953e-02,  1.2080e-01,  6.4370e-02,
          8.2585e-02, -1.4442e-01, -1.4885e-02,  8.3471e-02, -5.9720e-02,
          2.4837e-02, -4.1003e-02, -3.6045e-02,  9.9979e-02, -9.3806e-02,
         -8.9514e-02, -2.1441e-01, -1.6406e-01, -4.9183e-03,  4.3422e-02,
          6.6367e-02,  2.9248e-01,  6.6438e-02,  7.1486e-03, -1.7569e-01,
          9.5391e-02, -5.6145e-02, -5.5782e-03, -1.3999e-02, -3.0993e-01,
         -2.4762e-02,  4.8771e-02, -3.1594e-02, -4.1744e-02, -2.3525e-02,
         -2.3924e-02, -9.2270e-02,  2.1891e-02,  2.0566e-01,  1.1514e-01,
         -8.5894e-02, -

In [205]:
target_tensor

tensor([31, 19,  8,  5, 33,  8,  1,  4, 33, 25, 15, 21, 18, 33,  4,  1, 18, 11,
        33, 19, 21,  9, 20, 33,  9, 14, 33,  7, 18,  5,  1, 19, 25, 33, 23,  1,
        19,  8, 33, 23,  1, 20,  5, 18, 33,  1, 12, 12, 33, 25,  5,  1, 18, 27,
        32])

In [193]:
output.shape

torch.Size([244, 1, 34])

In [None]:
# Decode the output sequence
output_indices = torch.argmax(output, dim=2).squeeze(1)
output_sequence = ''.join([vocabulary[index] for index in output_indices])

print("Original Sequence:", sequence)
print("Target Sequence:", target)

def process_string(input_string):
    output_string = ""
    current_char = ""

    for char in input_string:
        if char != current_char:
            if char.isalpha() or char == '0':
                if char == '0':
                    output_string += ' '
                else:
                    output_string += char   
            current_char = char

    return output_string.strip()

print("Decoded Output:", process_string(output_sequence))