In [111]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from torchtext.vocab import GloVe
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import numpy as np
from torch.utils.data import SubsetRandomSampler,DataLoader


# --- Model hyper-parameters -------------------------------------------------
EMBED_DIM = 300  # width of the pre-trained word vectors fed to the encoder

ENC_BIDIRECTIONAL = True
ENC_BIDIRECTIONAL_FACTOR = 2 if ENC_BIDIRECTIONAL else 1
ENC_HIDDEN_DIM = 128
# The encoder projects its final state to this size; that state is then used
# as the decoder's initial hidden state.
ENC_OUTPUT_DIM = ENC_HIDDEN_DIM
DEC_HIDDEN_DIM = 128
DEC_EMBED_DIM = 128

BATCH_SIZE = 64

# --- Device selection -------------------------------------------------------
device_cpu = torch.device('cpu')
device_fast = torch.device('cpu')

# `torch.has_mps` / `torch.has_cuda` are deprecated and only report whether the
# binary was *built* with that backend.  Ask whether a device is actually
# usable at runtime instead, so a CUDA build on a GPU-less machine stays on CPU.
_mps_backend = getattr(torch.backends, 'mps', None)  # absent on older PyTorch
if _mps_backend is not None and _mps_backend.is_available():
    device_fast = torch.device('mps')
elif torch.cuda.is_available():
    device_fast = torch.device('cuda')

# --- Output vocabulary ------------------------------------------------------
# One id per digit character, then the date separator and the two sequence
# delimiter tokens.
output_vocab = {str(digit): digit for digit in range(10)}
output_vocab['-'] = 10
output_vocab['<sos>'] = 11
output_vocab['<eos>'] = 12

# Pre-trained word vectors used to embed the natural-language date tokens.
# NOTE(review): called with library defaults; presumably these are 300-d
# vectors matching EMBED_DIM = 300 (and may trigger a large download) -- confirm.
glove = GloVe()

In [107]:

def get_training_data(filename='./Assignment4aDataset.txt',glove=glove):
    """Parse the date-translation dataset into a list of training samples.

    Each non-blank line is expected to look like ``'<natural date>', '<target>'``.
    The natural-language date is tokenised on whitespace (with ``/`` split out
    as its own token) and every token is looked up in ``glove``; the target is
    encoded character by character with ``output_vocab`` and wrapped in
    ``<sos>`` / ``<eos>``.

    Args:
        filename: Path of the text file to read.
        glove: Object supporting ``glove[token]`` and returning one tensor per
            token (the tensors are stacked, so they must share a shape).

    Returns:
        A list of dicts with keys ``'in'`` (stacked token vectors),
        ``'in_length'`` (number of tokens) and ``'out'`` (list of target ids).

    Raises:
        ValueError: If a non-blank line contains no comma.
        KeyError: If the target contains a character missing from ``output_vocab``.
    """
    dataset = []
    # The context manager closes the file even if parsing raises midway.
    with open(filename, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing lines

            # The target is the last comma-separated field; splitting from the
            # right keeps any comma inside the natural-language date intact.
            nl_date, out_date = line.rsplit(',', 1)
            nl_date = nl_date.replace("\'", "").strip()
            out_date = out_date.replace("\'", "").strip()

            # Surround slashes with spaces so "1/2/2020" becomes 1 / 2 / 2020.
            nl_date = " / ".join(nl_date.split("/"))

            # split() with no argument drops the empty strings that repeated
            # spaces would otherwise produce (those would become bogus timesteps).
            tokens = nl_date.split()
            if not tokens:
                continue  # nothing to encode; a zero-length sequence cannot be packed

            embeddings = torch.stack([glove[word] for word in tokens])

            target = [output_vocab['<sos>']]
            target.extend(output_vocab[character] for character in out_date)
            target.append(output_vocab['<eos>'])

            dataset.append({'in' : embeddings,'in_length' : len(tokens),'out' : target})
    return dataset


class TranslationDataset(Dataset):
    """Thin ``Dataset`` wrapper around an in-memory, indexable collection of samples."""

    def __init__(self, data):
        """Store ``data`` as-is; no copy or validation is performed."""
        super().__init__()
        self.data = data

    def __len__(self):
        """Number of samples held by the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample stored at position ``index``."""
        sample = self.data[index]
        return sample


def collate_function(batch_data):
    """Merge a list of samples into one padded batch.

    Args:
        batch_data: Sequence of dicts with keys ``'in'`` (tensor whose first
            dimension is the sequence length), ``'in_length'`` and ``'out'``
            (list of target ids; all samples must have equally long targets
            because they are turned into a single tensor).

    Returns:
        Dict with ``'src'`` (inputs zero-padded to the longest sequence,
        batch first), ``'src_length'`` (list of original lengths) and
        ``'trg'`` (tensor of target ids).
    """
    sequences, lengths, targets = [], [], []
    for sample in batch_data:
        sequences.append(sample['in'])
        lengths.append(sample['in_length'])
        targets.append(sample['out'])

    target_tensor = torch.tensor(targets)
    padded_sequences = pad_sequence(sequences, batch_first=True)

    return {'src': padded_sequences, 'src_length': lengths, 'trg': target_tensor}




In [108]:
# Parse the whole file once; both loaders below draw from this single dataset
# and are kept disjoint through their index samplers.
train_dataset = TranslationDataset(get_training_data())

# Fixed-seed 80/20 split over sample indices so the partition is reproducible.
train_idx, valid_idx = train_test_split(
    np.arange(len(train_dataset)),
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=train_sampler,
    collate_fn=collate_function,
)
valid_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=valid_sampler,
    collate_fn=collate_function,
)

In [117]:
class Encoder(nn.Module):
    """GRU encoder over already-embedded, zero-padded input sequences.

    The final hidden state of the top GRU layer is projected (linear + tanh)
    to ``enc_output_dim`` so it can seed the decoder's hidden state.
    """

    def __init__(self,embed_dim = EMBED_DIM,enc_hidden_dim = ENC_HIDDEN_DIM,enc_output_dim = ENC_OUTPUT_DIM,NUM_LAYERS=1,enc_bidirectional=ENC_BIDIRECTIONAL,dropout=0.3):
        """
        Args:
            embed_dim: Size of each input vector.
            enc_hidden_dim: GRU hidden size per direction.
            enc_output_dim: Size of the projected final state returned by ``forward``.
            NUM_LAYERS: Number of stacked GRU layers.
            enc_bidirectional: Whether the GRU runs in both directions.
            dropout: Probability for ``self.dropout`` (created but not applied
                in ``forward``).
        """
        super().__init__()

        self.bidirectional = enc_bidirectional
        num_directions = 2 if enc_bidirectional else 1

        #self.embedding_layer = nn.Embedding(vocab_size,EMBED_DIM)
        self.rnn = nn.GRU(embed_dim,enc_hidden_dim, num_layers = NUM_LAYERS ,batch_first= True ,bidirectional=enc_bidirectional)

        # The projection input is the top layer's final state: forward and
        # backward halves when bidirectional, a single state otherwise.
        # (Previously hard-coded to 2*enc_hidden_dim, which broke the
        # unidirectional configuration.)
        self.fc = nn.Linear(num_directions*enc_hidden_dim,enc_output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self,inp,inp_len):
        """Encode a padded batch.

        Args:
            inp: Embedded inputs, ``[batch_size, input_seq_length, embed_dim]``.
            inp_len: True length of every sequence in the batch (need not be sorted).

        Returns:
            ``(outputs, hidden)`` where ``outputs`` holds the per-step GRU
            outputs re-padded to the longest sequence in the batch, and
            ``hidden`` is ``[batch_size, enc_output_dim]``.
        """
        #embedded_input = self.embedding_layer(inp)
        embedded_input = inp   # [batch_size, input_seq_length, embed_dim ]
        # Packing makes the GRU stop at each sequence's real length instead of
        # running over the padding.
        packed_embedding = nn.utils.rnn.pack_padded_sequence(embedded_input,inp_len,batch_first=True,enforce_sorted=False)
        packed_output , hidden = self.rnn(packed_embedding)  # hidden = [D*num_layers, batch_size , hidden_dim ]
        outputs, _  = nn.utils.rnn.pad_packed_sequence(packed_output,batch_first=True)  # [batch_size, inp_seq_length, D*hidden_dim]

        if self.bidirectional:
            # Last two entries are the top layer's forward and backward states.
            final_state = torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1)
        else:
            final_state = hidden[-1,:,:]
        hidden = torch.tanh(self.fc(final_state))  # [batch_size, enc_output_dim]
        return outputs,hidden


In [112]:

# enc_hidden_dim = 2* ENC_HIDDEN_DIM
# dec_hidden_dim = DEC_HIDDEN_DIM = ENCODER_OUTPUT_DIM

class Attention(nn.Module):
    """Additive attention scoring each encoder step against the decoder state."""

    def __init__(self,enc_hidden_dim, dec_hidden_dim):
        """
        Args:
            enc_hidden_dim: Feature size of every encoder output step.
            dec_hidden_dim: Size of the decoder hidden state.
        """
        super().__init__()

        self.attn = nn.Linear(enc_hidden_dim+dec_hidden_dim,dec_hidden_dim)
        self.v = nn.Linear(dec_hidden_dim,1)

    def forward(self,hidden,encoder_outputs, encoder_length_mask):
        """Return attention weights over the source positions.

        Args:
            hidden: Decoder state, ``[batch_size, dec_hidden_dim]``.
            encoder_outputs: ``[batch_size, seq_length, enc_hidden_dim]``.
            encoder_length_mask: ``[batch_size, seq_length]``; positions whose
                entry equals 0 are pushed to a very low score before the softmax.

        Returns:
            ``[batch_size, seq_length]`` weights that sum to 1 along dim 1.
        """
        num_steps = encoder_outputs.size(1)

        # Broadcast the decoder state across every source position.
        tiled_state = hidden.unsqueeze(1).expand(-1, num_steps, -1)  # [batch_size, seq_length, dec_hidden_dim]
        state_and_context = torch.cat((tiled_state, encoder_outputs), dim=2)

        energy = torch.tanh(self.attn(state_and_context))
        scores = self.v(energy).squeeze(2)  # [batch_size, seq_length]

        # Positions flagged with 0 in the mask get a score low enough to vanish
        # under the softmax.
        scores = scores.masked_fill(encoder_length_mask == 0, -1e10)
        return F.softmax(scores, dim=1)

# Quick shape check of the encoder on random data: 3 sequences of 20 steps,
# 30 features each, through a 10-layer GRU with hidden size 30.
enc = Encoder(embed_dim=30, enc_hidden_dim=30, enc_output_dim=15, NUM_LAYERS=10)

inp = torch.randn(3, 20, 30)
inp_len = [20] * 3  # every sequence uses its full length (no padding)
outputs, hidden = enc(inp, inp_len)
outputs

In [118]:
class Decoder(nn.Module):
    """Single-layer GRU decoder that emits one output step per call, using attention."""

    def __init__(self,vocab_size,enc_hidden_dim,dec_hidden_dim,dec_output_dim,emb_dim):
        """
        Args:
            vocab_size: Number of target tokens (rows of the embedding table).
            enc_hidden_dim: Feature size of each encoder output step
                (2*ENC_HIDDEN_DIM when the encoder is bidirectional).
            dec_hidden_dim: Decoder GRU hidden size.
            dec_output_dim: Size of the prediction vector.
            emb_dim: Target-token embedding size.
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.attention = Attention(enc_hidden_dim,dec_hidden_dim)
        self.embedding_layer = nn.Embedding(vocab_size,emb_dim)

        # The GRU consumes the token embedding concatenated with the attention context.
        token_plus_context_dim = enc_hidden_dim + emb_dim
        self.rnn = nn.GRU(token_plus_context_dim, dec_hidden_dim, batch_first=True)

        # The output layer sees the embedding, the GRU output and the context together.
        self.fc_out = nn.Linear(token_plus_context_dim + dec_hidden_dim, dec_output_dim)

    def forward(self,input,hidden,encoder_outputs,encoder_length_mask):
        """Run one decoding step.

        Args:
            input: Previous target token ids, ``[batch_size]``.
            hidden: Current decoder state, ``[batch_size, dec_hidden_dim]``.
            encoder_outputs: ``[batch_size, seq_len, enc_hidden_dim]``.
            encoder_length_mask: ``[batch_size, seq_len]``, forwarded to the attention.

        Returns:
            ``(prediction, hidden)``: ``prediction`` is
            ``[batch_size, dec_output_dim]`` and ``hidden`` is the updated
            ``[batch_size, dec_hidden_dim]`` state.
        """
        # Embed the token as a length-1 sequence: [batch_size, 1, emb_dim]
        token_emb = self.embedding_layer(input.unsqueeze(1))

        # Attention weights over source positions: [batch_size, seq_len]
        attn_weights = self.attention(hidden, encoder_outputs, encoder_length_mask)

        # Weighted sum of encoder outputs: [batch_size, 1, enc_hidden_dim]
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)

        step_input = torch.cat((token_emb, context), dim=2)

        # The GRU expects [num_layers, batch_size, hidden]; there is a single
        # layer, so a leading axis is added here and removed again on return.
        step_output, next_hidden = self.rnn(step_input, hidden.unsqueeze(0))

        # Drop the length-1 sequence axis from every piece before the output layer.
        features = torch.cat(
            [token_emb.squeeze(1), step_output.squeeze(1), context.squeeze(1)],
            dim=1,
        )
        prediction = self.fc_out(features)  # [batch_size, dec_output_dim]

        return prediction, next_hidden.squeeze(0)

In [119]:
class TranslationModel(nn.Module):
    """Encoder / attention-decoder model mapping embedded source tokens to target ids."""

    def __init__(self,
                input_embed_dim = EMBED_DIM,
                encoder_hidden_dim = ENC_HIDDEN_DIM,
                encoder_hidden_output = ENC_OUTPUT_DIM,
                enc_num_layers = 1,
                enc_bidirectional = ENC_BIDIRECTIONAL,

                dec_vocab_size = len(output_vocab),
                dec_embed_dim = DEC_EMBED_DIM,
                dec_hidden_dim  =DEC_HIDDEN_DIM,
                device_train = device_cpu
        ):
        """
        Args:
            input_embed_dim: Size of each source token vector.
            encoder_hidden_dim: Encoder GRU hidden size per direction.
            encoder_hidden_output: Size of the encoder's projected final state
                (used as the decoder's initial hidden state).
            enc_num_layers: Number of encoder GRU layers.
            enc_bidirectional: Whether the encoder is bidirectional.
            dec_vocab_size: Number of target tokens.
            dec_embed_dim: Target-token embedding size.
            dec_hidden_dim: Decoder GRU hidden size.
            device_train: Device on which the output buffer is allocated.
        """
        super().__init__()

        self.encoder = Encoder(input_embed_dim,encoder_hidden_dim,encoder_hidden_output,enc_num_layers,enc_bidirectional=enc_bidirectional)
        enc_bidirectional_factor = 2 if enc_bidirectional else 1
        self.decoder = Decoder(dec_vocab_size,enc_bidirectional_factor*encoder_hidden_dim,dec_hidden_dim=dec_hidden_dim,dec_output_dim=dec_vocab_size,emb_dim=dec_embed_dim)
        self.device_train = device_train


    def create_mask(self, src_lengths,max_src_length):
        """Build the source padding mask consumed by the attention layer.

        Args:
            src_lengths: True length of every source sequence in the batch.
            max_src_length: Padded length of the batch.

        Returns:
            int64 tensor ``[batch_size, max_src_length]`` holding 1 at real
            token positions and 0 at padding positions.
        """
        # Attention.forward suppresses positions where the mask equals 0, so
        # real tokens must be marked 1.  The previous version marked the padding
        # with 1 instead, which made the decoder attend only to padding.
        lengths = torch.as_tensor(src_lengths, dtype=torch.int64).unsqueeze(1)
        positions = torch.arange(max_src_length, device=lengths.device).unsqueeze(0)
        return (positions < lengths).to(torch.int64)

    def forward(self,source,source_len,target,teacher_forcing_ratio = 0.5):
        """Decode ``target.shape[1] - 1`` steps for a batch.

        Args:
            source: Embedded, padded source batch ``[batch_size, max_src_len, embed_dim]``.
            source_len: True length of each source sequence.
            target: Target ids ``[batch_size, target_length]``; column 0 is fed
                as the first decoder input.
            teacher_forcing_ratio: Probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of the
                model's own prediction.

        Returns:
            ``[batch_size, target_length, vocab_size]`` scores.  Index 0 along
            the time axis is never written and stays all zeros.
        """
        batch_size = source.shape[0]
        target_length = target.shape[1]
        target_vocab_size = self.decoder.vocab_size
        outputs= torch.zeros(batch_size,target_length,target_vocab_size).to(self.device_train)
        encoder_outputs , hidden = self.encoder(source,source_len)

        inp = target[:,0]
        # Keep the mask on the same device as the tensors it is applied to.
        enc_mask = self.create_mask(source_len,encoder_outputs.shape[1]).to(encoder_outputs.device)
        for t in range(1,target_length):
            decoder_output, hidden =  self.decoder(inp,hidden,encoder_outputs,enc_mask)

            outputs[:,t,:] = decoder_output # batch_size, vocab_size
            teacher_force = random.random() < teacher_forcing_ratio

            top1 = decoder_output.argmax(1)

            inp = target[:,t] if teacher_force else top1
        return outputs

In [120]:
# Instantiate the model with every constructor default.
t = TranslationModel()

In [122]:
# Smoke test: push one training batch through the model and display the output
# shape, which is (batch_size, target_length, vocab_size).
batch_data = next(iter(train_dataloader))
t(batch_data['src'],batch_data['src_length'],batch_data['trg']).shape

torch.Size([64, 12, 13])

In [13]:
def train_model(model,num_epochs,train_loader,valid_loader,optimizer,criterion,checkpoint_name='translation_model.pth'):
    """Train ``model`` and checkpoint the weights with the best validation loss.

    Args:
        model: Callable as ``model(source, source_length, target[, teacher_forcing_ratio])``
            returning ``[batch_size, target_length, vocab_size]`` scores.
        num_epochs: Number of passes over ``train_loader``.
        train_loader: Iterable of batches with keys ``'src'``, ``'src_length'``, ``'trg'``.
        valid_loader: Same format, used for validation after every epoch.
        optimizer: Optimizer over ``model.parameters()``.
        criterion: Loss taking ``([N, vocab_size] scores, [N] target ids)``.
        checkpoint_name: Path the best ``state_dict`` is saved to.

    Returns:
        None.  Prints the mean training and validation loss per epoch.
    """
    # Start from +inf so the first epoch always produces a checkpoint, whatever
    # the scale of the loss.
    best_validation_loss = float('inf')
    for e in range(num_epochs):

        training_loss = 0.0
        model.train()
        for batch in train_loader:

            source, source_length, target = batch['src'], batch['src_length'], batch['trg']

            optimizer.zero_grad()

            # model_output = [batch_size, output_seq_length,vocab_size]
            model_output  = model(source,source_length,target)

            # Drop time step 0 (the <sos> position, never predicted) along the
            # TIME axis.  Slicing with [1:] would drop the first sample of the
            # batch instead.  reshape() is required because the slice is not
            # contiguous.
            model_out_reshaped = model_output[:, 1:].reshape(-1,model_output.shape[-1])
            reshaped_target = target[:, 1:].reshape(-1)

            loss_value = criterion(model_out_reshaped,reshaped_target)
            loss_value.backward()
            # Clip the gradient norm before the update.
            nn.utils.clip_grad_norm_(model.parameters(),5)

            optimizer.step()
            training_loss += loss_value.item()
        print("Epoch " + str(e) + " Training Loss Value = " + str(training_loss/len(train_loader)))

        model.eval()
        validation_loss = 0.0

        with torch.no_grad():

            for batch in valid_loader:
                source, source_length, target = batch['src'], batch['src_length'], batch['trg']
                # Teacher forcing ratio 0: the model decodes from its own predictions.
                model_output  = model(source,source_length,target,0)
                model_out_reshaped = model_output[:, 1:].reshape(-1,model_output.shape[-1])
                reshaped_target = target[:, 1:].reshape(-1)
                loss_value = criterion(model_out_reshaped,reshaped_target)

                validation_loss += loss_value.item()
        averaged_validation_loss = validation_loss/len(valid_loader)
        print("Epoch " + str(e) + "Validation Loss Value = " + str(averaged_validation_loss))
        if (averaged_validation_loss <= best_validation_loss):
            best_validation_loss = averaged_validation_loss
            torch.save(model.state_dict(),checkpoint_name)
    

In [29]:
# Scratch check of GRU shapes: a 2-layer bidirectional GRU (input size 100,
# hidden size 64) run on a batch of 2 sequences with 8 steps each.
rnn = nn.GRU(100,64, num_layers= 2 ,batch_first= True ,bidirectional=True)
inp = torch.randn((2,8,100))
outputs,hidden = rnn(inp)

In [30]:
outputs.shape

torch.Size([2, 8, 128])

In [31]:
hidden.shape

torch.Size([4, 2, 64])

In [26]:
# Concatenate all four hidden-state slices along the feature axis.
# Note: -i for i = 0 is index 0, so the slices are taken in the order 0, -1, -2, -3.
inq = torch.cat([hidden[-i , : , :] for i in range(4)],dim=1)


In [27]:
inq.shape

torch.Size([2, 256])

In [20]:
rnn(inp)[1].shape

torch.Size([4, 2, 64])

In [9]:
# Demonstrates the valid index range of an embedding table: with 10 rows the
# accepted ids are 0..9, so the id 10 below triggers the IndexError shown in
# the cell output.
embedding_layer = nn.Embedding(10,30)

embedding_layer(torch.tensor([[10,2,3],[4,5,6]])).shape

IndexError: index out of range in self