In [1]:
!pip install python-levenshtein

Collecting python-levenshtein
  Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)
[?25l[K     |██████▌                         | 10 kB 17.1 MB/s eta 0:00:01[K     |█████████████                   | 20 kB 13.3 MB/s eta 0:00:01[K     |███████████████████▌            | 30 kB 9.9 MB/s eta 0:00:01[K     |██████████████████████████      | 40 kB 7.4 MB/s eta 0:00:01[K     |████████████████████████████████| 50 kB 3.4 MB/s 
Building wheels for collected packages: python-levenshtein
  Building wheel for python-levenshtein (setup.py) ... [?25l[?25hdone
  Created wheel for python-levenshtein: filename=python_Levenshtein-0.12.2-cp37-cp37m-linux_x86_64.whl size=149873 sha256=35f29eb8858302056b6b6706072bf051478cff10d9e6be2f7d59746cc04aeae0
  Stored in directory: /root/.cache/pip/wheels/05/5f/ca/7c4367734892581bb5ff896f15027a932c551080b2abd3e00d
Successfully built python-levenshtein
Installing collected packages: python-levenshtein
Successfully installed python-levenshtein-0.12.2


# Libraries and Initial Processing

In [2]:
import os
import sys
import pandas as pd
import numpy as np
import Levenshtein as lev
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.nn.utils as utils
import seaborn as sns
import matplotlib.pyplot as plt
import time
import random
import datetime
from torch.utils import data
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from tqdm.notebook import tqdm

cuda = torch.cuda.is_available()

print(cuda, sys.version)

device = torch.device("cuda" if cuda else "cpu")
num_workers = 4 if cuda else 0
print("Cuda = "+str(cuda)+" with num_workers = "+str(num_workers))
np.random.seed(11785)
torch.manual_seed(11785)

# The labels of the dataset contain letters in LETTER_LIST.
# You should use this to convert the letters to the corresponding indices
# and train your model with numerical labels.
LETTER_LIST = ['<sos>', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', \
         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', "'", ' ', '<eos>']

True 3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]
Cuda = True with num_workers = 4


In [9]:
def create_dictionaries(letter_list):
    '''
    Create dictionaries for letter2index and index2letter transformations
    based on LETTER_LIST

    Args:
        letter_list: LETTER_LIST

    Return:
        letter2index: Dictionary mapping from letters to indices
        index2letter: Dictionary mapping from indices to letters
    '''
    letter2index = dict()
    index2letter = dict()
    for idx,alph in enumerate(letter_list):
        letter2index[alph] = idx
        index2letter[idx] = alph
    return letter2index, index2letter
    

def transform_index_to_letter(batch_indices):
    '''
    Transforms numerical index input to string output by converting each index 
    to its corresponding letter from LETTER_LIST

    Args:
        batch_indices: List of indices from LETTER_LIST with the shape of (N, )
    
    Return:
        transcripts: List of converted string transcripts. This would be a list with a length of N
    '''
    transcripts = []
    for i in batch_indices:
        txt = ""
        for j in i:
            if j == letter2index['<eos>']:
                break;
            elif j == letter2index['<sos>']:
                continue;
            else:
                txt += index2letter[j]     
        transcripts.append(txt)
    return transcripts
        
# Create the letter2index and index2letter dictionary
letter2index, index2letter = create_dictionaries(LETTER_LIST)

In [4]:
index2letter

{0: '<sos>',
 1: 'A',
 2: 'B',
 3: 'C',
 4: 'D',
 5: 'E',
 6: 'F',
 7: 'G',
 8: 'H',
 9: 'I',
 10: 'J',
 11: 'K',
 12: 'L',
 13: 'M',
 14: 'N',
 15: 'O',
 16: 'P',
 17: 'Q',
 18: 'R',
 19: 'S',
 20: 'T',
 21: 'U',
 22: 'V',
 23: 'W',
 24: 'X',
 25: 'Y',
 26: 'Z',
 27: "'",
 28: ' ',
 29: '<eos>'}

In [6]:
transform_index_to_letter([1,2])

TypeError: ignored

# Kaggle (TODO)

In [7]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
!mkdir /root/.kaggle

with open("/root/.kaggle/kaggle.json", "w+") as f:
    f.write('{"username":"mangalamsahai","key":"521f66540469b3a12f7b11566d8b1c14"}') # Put your kaggle username & key here

!chmod 600 /root/.kaggle/kaggle.json

Collecting kaggle==1.5.8
  Downloading kaggle-1.5.8.tar.gz (59 kB)
[?25l[K     |█████▌                          | 10 kB 26.3 MB/s eta 0:00:01[K     |███████████                     | 20 kB 18.8 MB/s eta 0:00:01[K     |████████████████▋               | 30 kB 16.2 MB/s eta 0:00:01[K     |██████████████████████▏         | 40 kB 14.7 MB/s eta 0:00:01[K     |███████████████████████████▊    | 51 kB 9.3 MB/s eta 0:00:01[K     |████████████████████████████████| 59 kB 3.8 MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.8-py3-none-any.whl size=73275 sha256=98b0daa51c7fbf4800865458ad386de5c8ce282bff8a29e2de56d34f9b0e3e6b
  Stored in directory: /root/.cache/pip/wheels/de/f7/d8/c3902cacb7e62cb611b1ad343d7cc07f42f7eb76ae3a52f3d1
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12
 

In [8]:
!kaggle competitions download -c 11-785-s22-hw4p2

Downloading 11-785-s22-hw4p2.zip to /content
100% 1.84G/1.84G [00:09<00:00, 227MB/s]
100% 1.84G/1.84G [00:09<00:00, 211MB/s]


In [12]:
!unzip -q 11-785-s22-hw4p2.zip

# Dataset and Dataloading (TODO)

You will need to implement the Dataset class by your own. You can implement it similar to HW3P2. However, you are welcomed to do it your own way if it is more comfortable or efficient.

Note that you need to use LETTER_LIST to convert the transcript into numerical labels for the model.


Example of raw transcript:

    ['<sos>', 'N', 'O', 'R', 'T', 'H', 'A', 'N', 'G', 'E', 'R', ' ','A', 'B', 'B', 'E', 'Y', '<eos>']

Example of converted transcript ready to process for the model:

    [0, 14, 15, 18, 20, 8, 1, 14, 7, 5, 18, 28, 1, 2, 2, 5, 25, 29]


In [10]:
import pdb
class LibriSamples(torch.utils.data.Dataset):

    def __init__(self, data_path, partition= "train"):
        # TODO
        if partition=='train':
           self.X_dir = os.path.join(data_path,"mfcc")
           self.Y_dir = os.path.join(data_path,"transcript")
        elif partition=='dev':
           self.X_dir = os.path.join(data_path,"mfcc")
           self.Y_dir = os.path.join(data_path,"transcript")

        self.X_files = os.listdir(self.X_dir)
        self.Y_files = os.listdir(self.Y_dir)   
           #pdb.set_trace()

    def __len__(self):
        return len(self.X_files)

    def __getitem__(self, ind):
        # TODO
        X = np.load(os.path.join(self.X_dir,self.X_files[ind]))
        Y = np.load(os.path.join(self.Y_dir,self.Y_files[ind]))
              
        Y1 = np.zeros(Y.shape[0])

        for i in range(Y.shape[0]):
            Y1[i] = LETTER_LIST.index(Y[i])

        Yy = torch.LongTensor(Y1)

        return torch.tensor(X), Yy

    
    def collate_fn(batch):
        # TODO
        batch_x = [x for x,y in batch]
        batch_y = [y for x,y in batch]
        
        batch_x_pad = pad_sequence(batch_x,batch_first=True)
        lengths_x = [x.shape[0] for x,y in batch]
        batch_y_pad = pad_sequence(batch_y,batch_first=True)
        lengths_y = [y.shape[0] for x,y in batch]
        #pdb.set_trace()
        return batch_x_pad, batch_y_pad, torch.tensor(lengths_x), torch.tensor(lengths_y) 
        

class LibriSamplesTest(torch.utils.data.Dataset):

      def __init__(self, data_path, test_order):
          test_order_list = list(pd.read_csv(data_path + test_order).file)
          self.X = [np.load(data_path + "/mfcc/"+v) for v in test_order_list]          
    
      def __len__(self):
        # TODO
        return len(self.X) 

      def __getitem__(self, ind):
        # TODO
        return torch.tensor(self.X[ind])
    
      def collate_fn(batch):
        # TODO
        batch_x = [x for x in batch]
        batch_x_pad = pad_sequence(batch_x,batch_first=True)
        lengths_x = [x.shape[0] for x in batch]

        return batch_x_pad, torch.tensor(lengths_x) 

In [13]:
batch_size = 128

root = '/content/hw4p2_student_data/hw4p2_student_data'

train_data = LibriSamples(os.path.join(root,"train"),partition='train')
val_data = LibriSamples(os.path.join(root,"dev"),partition='dev')
test_data = LibriSamplesTest(os.path.join(root,"test"),'/test_order.csv')

train_loader = DataLoader(train_data, batch_size=128, shuffle=True, collate_fn=LibriSamples.collate_fn) # TODO
val_loader = DataLoader(val_data, batch_size=128, shuffle=True, collate_fn=LibriSamples.collate_fn) # TODO
test_loader = DataLoader(test_data, batch_size=128, shuffle=False, collate_fn=LibriSamplesTest.collate_fn)# TODO


print("Batch size: ", batch_size)
print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Val dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))

Batch size:  128
Train dataset samples = 28539, batches = 223
Val dataset samples = 2703, batches = 22
Test dataset samples = 2620, batches = 21


In [14]:
# test code for checking shapes
for data in val_loader:
    x, y, lx, ly = data
    print(x.shape, y.shape, lx.shape, len(ly))
    print(y[0]) # desired 
    break

torch.Size([128, 2054, 13]) torch.Size([128, 292]) torch.Size([128]) 128
tensor([ 0, 13,  1, 18, 20,  8,  1, 28, 18,  5, 13,  5, 13,  2,  5, 18,  5,  4,
        28, 20,  8,  5, 28,  3, 12, 15, 19,  5,  4, 28,  4, 15, 15, 18, 28, 15,
        14, 28, 20,  8,  5, 28,  6,  9, 18, 19, 20, 28, 19, 21, 18, 22,  5, 25,
        28, 20,  8,  5, 25, 28,  8,  1,  4, 14, 27, 20, 28,  1, 20, 20,  5, 13,
        16, 20,  5,  4, 28, 15, 16,  5, 14,  9, 14,  7, 28,  9, 20, 29,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,

# Model (TODO)

In [15]:
class pBLSTM(nn.Module):
    '''
    Pyramidal BiLSTM
    Read paper and understand the concepts and then write your implementation here.

    At each step,
    1. Pad your input if it is packed
    2. Truncate the input length dimension by concatenating feature dimension
        (i) How should  you deal with odd/even length input? 
        (ii) How should you deal with input length array (x_lens) after truncating the input?
    3. Pack your input
    4. Pass it into LSTM layer

    To make our implementation modular, we pass 1 layer at a time.
    '''
    def __init__(self, input_dim, hidden_dim):
        super(pBLSTM, self).__init__()
        self.blstm = nn.LSTM(input_dim*2,hidden_dim,num_layers=1,bidirectional=True,batch_first=True)
        

    def forward(self, x):
        out, lengths  = pad_packed_sequence(x,batch_first=True)
        # Need to perform further
        if out.shape[1]%2!=0:
           out = out[:,:-1,:]
        
        out = out.reshape(out.shape[0],out.shape[1]//2,out.shape[2]*2)
        lengths = lengths//2
        out4 = pack_padded_sequence(out,lengths,batch_first=True,enforce_sorted=False)
        out1,(out2,out3) = self.blstm(out4)

        return out1

In [16]:
class LD(nn.Module):
    def __init__(self, p=0.4):
        super().__init__()
        self.p = p

    def forward(self, x):
        if not self.training or not self.p:
            return x
        y, yl = pad_packed_sequence(x, batch_first=True)
        mask = y.new(y.size(0), 1, y.size(2)).bernoulli_(1 - self.p) / (1 - self.p)
        mask = mask.expand(y.size())
        mask.requires_grad = False
        z = pack_padded_sequence(y * mask, yl, batch_first=True,enforce_sorted=False)
        return z

In [17]:
class Block(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Conv1d(in_channels=dim, out_channels=dim, kernel_size=3, padding=1, groups=dim),
            nn.BatchNorm1d(dim),
            nn.Conv1d(dim, dim * 4, kernel_size=1),
            nn.GELU(),
            nn.Conv1d(dim * 4, dim, kernel_size=1)
        )
        self.layer_scale = nn.Parameter(torch.ones(1, dim, 1) * 1e-2, requires_grad=True)
  
    def forward(self, x):
        out = x
        x = self.layers(x)
        x = self.layer_scale * x
        x += out
        return x

In [18]:
import pdb
class Encoder(nn.Module):
    '''
    Encoder takes the utterances as inputs and returns the key, value and unpacked_x_len.

    '''
    def __init__(self, input_dim, encoder_hidden_dim, key_value_size=128):
        super(Encoder, self).__init__()
        # The first LSTM layer at the bottom
        self.embedding = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=encoder_hidden_dim, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm1d(encoder_hidden_dim),
            nn.GELU(),
            Block(encoder_hidden_dim),
            nn.Dropout(0.4), # Earlier was 0.2
        )
        
        
        self.lstm = nn.LSTM(input_size=encoder_hidden_dim, hidden_size=encoder_hidden_dim, num_layers=1, bidirectional=True, batch_first=True)

        # Define the blocks of pBLSTMs
        # Dimensions should be chosen carefully
        # Hint: Bidirectionality, truncation...
        self.pBLSTMs = nn.Sequential(
            pBLSTM(2*encoder_hidden_dim,encoder_hidden_dim), # New Addition #earlier in_channels= 4*encoder_hidden_dim
            LD(),
            pBLSTM(2*encoder_hidden_dim,encoder_hidden_dim), # New Addition*
            LD(),
            pBLSTM(2*encoder_hidden_dim,encoder_hidden_dim), # New Addition
            LD(),
            # ...
        )
         
        # The linear transformations for producing Key and Value for attention
        # Hint: Dimensions when bidirectional lstm? 
        self.key_network = nn.Linear(2*encoder_hidden_dim,key_value_size)
        self.value_network = nn.Linear(2*encoder_hidden_dim,key_value_size)

    def forward(self, x, x_len):
        """
        1. Pack your input and pass it through the first LSTM layer (no truncation)
        2. Pass it through the pyramidal LSTM layer
        3. Pad your input back to (B, T, *) or (T, B, *) shape
        4. Output Key, Value, and truncated input lens

        Key and value could be
            (i) Concatenated hidden vectors from all time steps (key == value).
            (ii) Linear projections of the output from the last pBLSTM network.
                If you choose this way, you can use the final output of
                your pBLSTM network.
        """
        #l = l.clamp(max=out.shape[2])   # I added
        x = torch.permute(x, (0, 2, 1))
        x = self.embedding(x)
        x = torch.permute(x,(0,2,1))
        
        
        x_len = ((x_len-1) // 2) + 1    # -3+2=-1
        #quit
        #pdb.set_trace()
        out = pack_padded_sequence(x,x_len.cpu(),batch_first=True,enforce_sorted=False)
        out1,(out2,out3) = self.lstm(out)

        out2 = self.pBLSTMs(out1)
        out3, lengths = pad_packed_sequence(out2,batch_first=True)        
        
        keys = self.key_network(out3)
        values = self.value_network(out3)

        return keys,values,lengths

In [19]:
def plot_attention(attention):
    # utility function for debugging
    plt.clf()
    sns.heatmap(attention, cmap='GnBu')
    plt.show()

class Attention(nn.Module):
    '''
    Attention is calculated using key and value from encoder and query from decoder.
    Here are different ways to compute attention and context:
    1. Dot-product attention
        energy = bmm(key, query) 
        # Optional: Scaled dot-product by normalizing with sqrt key dimension
        # Check "attention is all you need" Section 3.2.1
    * 1st way is what most TAs are comfortable with, but if you want to explore...
    2. Cosine attention
        energy = cosine(query, key) # almost the same as dot-product xD 
    3. Bi-linear attention
        W = Linear transformation (learnable parameter): d_k -> d_q
        energy = bmm(key @ W, query)
    4. Multi-layer perceptron
        # Check "Neural Machine Translation and Sequence-to-sequence Models: A Tutorial" Section 8.4
    
    After obtaining unnormalized attention weights (energy), compute and return attention and context, i.e.,
    energy = mask(energy) # mask out padded elements with big negative number (e.g. -1e9)
    attention = softmax(energy)
    context = bmm(attention, value)

    5. Multi-Head Attention
        # Check "attention is all you need" Section 3.2.2
        h = Number of heads
        W_Q, W_K, W_V: Weight matrix for Q, K, V (h of them in total)
        W_O: d_v -> d_v

        Reshape K: (B, T, d_k)
        to (B, T, h, d_k // h) and transpose to (B, h, T, d_k // h)
        Reshape V: (B, T, d_v)
        to (B, T, h, d_v // h) and transpose to (B, h, T, d_v // h)
        Reshape Q: (B, d_q)
        to (B, h, d_q // h)

        energy = Q @ K^T
        energy = mask(energy)
        attention = softmax(energy)
        multi_head = attention @ V
        multi_head = multi_head reshaped to (B, d_v)
        context = multi_head @ W_O
    '''
    def __init__(self):
        super(Attention, self).__init__()
        # Optional: dropout

    def forward(self, query, key, value, mask):
        """
        input:
            key: (batch_size, seq_len, d_k)
            value: (batch_size, seq_len, d_v)
            query: (batch_size, d_q)
        * Hint: d_k == d_v == d_q is often true if you use linear projections
        return:
            context: (batch_size, key_val_dim)
        
        """
        K1 = torch.bmm(key,query.unsqueeze(2)).squeeze(2).to(device)
        K2 = K1.masked_fill_(mask, -1e9)
        attention = F.softmax(K2,dim=1)
        context = torch.bmm(attention.unsqueeze(1),value).squeeze(1)

        return context, attention
        # we return attention weights for plotting (for debugging)

In [21]:
class Decoder(nn.Module):
    '''
    As mentioned in a previous recitation, each forward call of decoder deals with just one time step.
    Thus we use LSTMCell instead of LSTM here.
    The output from the last LSTMCell can be used as a query for calculating attention.
    Methods like Gumble noise and teacher forcing can also be incorporated for improving the performance.
    '''
    def __init__(self, vocab_size, decoder_hidden_dim, embed_dim, key_value_size=128):
        super(Decoder, self).__init__()
        # Hint: Be careful with the padding_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=letter2index['<eos>'])
            # fill this out)
        # The number of cells is defined based on the paper
        self.lstm1 = nn.LSTMCell(input_size=embed_dim + key_value_size, hidden_size= decoder_hidden_dim)         # fill this out)
        self.lstm2 = nn.LSTMCell(input_size=decoder_hidden_dim,hidden_size=key_value_size)# fill this out)
    
        self.attention = Attention()     
        self.vocab_size = vocab_size
        # Optional: Weight-tying
        self.character_prob = nn.Linear(key_value_size*2,vocab_size)# fill this out) #: d_v -> vocab_size
        self.key_value_size = key_value_size
        
        # Weight tying
        self.character_prob.weight = self.embedding.weight

    def forward(self, key, value, encoder_len, y=None, mode='train',teacher_forcing=0.7):
        '''
        Args:
            key :(B, T, d_k) - Output of the Encoder (possibly from the Key projection layer)
            value: (B, T, d_v) - Output of the Encoder (possibly from the Value projection layer)
            y: (B, text_len) - Batch input of text with text_length
            mode: Train or eval mode for teacher forcing
        Return:
            predictions: the character perdiction probability 
        '''

        B, key_seq_max_len, key_value_size = key.shape

        if mode == 'train':
            max_len =  y.shape[1]
            char_embeddings = self.embedding(y)# fill this out)
        else:
            max_len = 600

        # TODO: Create the attention mask here (outside the for loop rather than inside) to aviod repetition
        mask = torch.zeros((len(encoder_len),key_seq_max_len),dtype=torch.bool)# fill this out
        for i,lens in enumerate(encoder_len):
            mask[i,lens:] = True
        mask = mask.to(device)
        
        predictions = []
        # This is the first input to the decoder
        # What should the fill_value be?
        for i,lens in enumerate(encoder_len):
            mask[i,lens:] = True

        prediction = torch.full((B,1), fill_value=0, device=device)# fill this out, device=device)
        # The length of hidden_states vector should depend on the number of LSTM Cells defined in init
        # The paper uses 2
        hidden_states = [None, None] 
        
        # TODO: Initialize the context
        context = value[:,0,:]# fill this out

        attention_plot = [] # this is for debugging

        for i in range(max_len):
            if mode == 'train':
                # TODO: Implement Teacher Forcing
                """
                if using teacher_forcing:
                    if i == 0:
                        # This is the first time step
                        # Hint: How did you initialize "prediction" variable above?
                    else:
                        # Otherwise, feed the label of the **previous** time step
                else:
                    char_embed = embedding of the previous prediction
                """ 
                if random.random() <= teacher_forcing:
                   if i==0:
                     char_embed = self.embedding(prediction.argmax(dim=-1))

                   else:
                     char_embed = char_embeddings[:,i-1,:]   
                else:
                    char_embed = self.embedding(prediction.argmax(dim=-1))       
            else:
                char_embed = self.embedding(prediction.argmax(dim=-1)) # embedding of the previous prediction

            # what vectors should be concatenated as a context?
            y_context = torch.cat([char_embed,context], dim=1)
            # context and hidden states of lstm 1 from the previous time step should be fed
            hidden_states[0] = self.lstm1(y_context,hidden_states[0])# fill this out)

            # hidden states of lstm1 and hidden states of lstm2 from the previous time step should be fed
            hidden_states[1] = self.lstm2(hidden_states[0][0],hidden_states[1])# fill this out)
            # What then is the query?
            query = hidden_states[1][0] # fill this out
            
            # Compute attention from the output of the second LSTM Cell
            context, attention = self.attention(query, key, value, mask)
            # We store the first attention of this batch for debugging
            attention_plot.append(attention[0].detach().cpu())
            
            # What should be concatenated as the output context?
            output_context = torch.cat([query,context], dim=1)
            #output_context = self.classification(output_context)
            prediction = self.character_prob(output_context)
            # store predictions
            
            predictions.append(prediction.unsqueeze(1))# fill this out)
        
        # Concatenate the attention and predictions to return
        attentions = torch.stack(attention_plot, dim=0)
        predictions = torch.cat(predictions, dim=1)
        # print(predictions.shape)
        return predictions, attentions

In [22]:
class Seq2Seq(nn.Module):
    '''
    We train an end-to-end sequence to sequence model comprising of Encoder and Decoder.
    This is simply a wrapper "model" for your encoder and decoder.
    '''
    def __init__(self, input_dim, vocab_size, encoder_hidden_dim, decoder_hidden_dim, embed_dim, key_value_size=128):
        super(Seq2Seq,self).__init__()
        self.encoder = Encoder(input_dim=input_dim, encoder_hidden_dim=encoder_hidden_dim, key_value_size=key_value_size) # fill this out)
        self.decoder = Decoder(vocab_size=vocab_size, decoder_hidden_dim=decoder_hidden_dim,embed_dim=embed_dim,key_value_size=key_value_size)# fill this out)

    def forward(self, x, x_len, y=None, mode='train',teacher_forcing=0.7):
        key, value, encoder_len = self.encoder(x, x_len)
        predictions, attentions = self.decoder(key, value, encoder_len, y=y, mode=mode,teacher_forcing=teacher_forcing)
        return predictions,attentions

In [23]:
model = Seq2Seq(input_dim=13, vocab_size=30, encoder_hidden_dim=256, decoder_hidden_dim=512, embed_dim=256, key_value_size=128)# fill this out)

model = model.to(device)
print(model)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Sequential(
      (0): Conv1d(13, 256, kernel_size=(3,), stride=(2,), padding=(1,), bias=False)
      (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): GELU()
      (3): Block(
        (layers): Sequential(
          (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,), groups=256)
          (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): Conv1d(256, 1024, kernel_size=(1,), stride=(1,))
          (3): GELU()
          (4): Conv1d(1024, 256, kernel_size=(1,), stride=(1,))
        )
      )
      (4): Dropout(p=0.4, inplace=False)
    )
    (lstm): LSTM(256, 256, batch_first=True, bidirectional=True)
    (pBLSTMs): Sequential(
      (0): pBLSTM(
        (blstm): LSTM(1024, 256, batch_first=True, bidirectional=True)
      )
      (1): LD()
      (2): pBLSTM(
        (blstm): LSTM(1024, 256, batch_first=True, bidirectional=True)
  

# Training

In [24]:
optimizer = optim.Adam(model.parameters(),lr =1e-3, weight_decay=5e-6) # fill this out)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,factor=0.8,patience=2)# fill this out
criterion = nn.CrossEntropyLoss(ignore_index=int(-1000))     #ignore_index=int(-1000)
#criterion = nn.CrossEntropyLoss(reduction='none')
n_epochs = 150

In [25]:
def Distance(p,y):
    total_dist = 0
    pred = transform_index_to_letter(p.argmax(-1).cpu().detach().numpy())
    tar = transform_index_to_letter(y.cpu().detach().numpy())
    # print("Predictions:",pred)
    # print("Targets:",tar)
    for q,t in zip(pred,tar):
        dist = lev.distance(q,t)
        total_dist += dist

    return total_dist/len(tar)    

In [26]:
import pdb
def train(model, train_loader, criterion, optimizer, mode='train',epoch=1,teacher_forcing=0.7):
    model.train()
    running_loss = 0
    running_dist = 0
    # 0) Iterate through your data loader
    batch_bar = tqdm(total=len(train_loader),position=0,desc='Train') # dynamic_ncols=True removed leave=False,
    for i, (x,y,x_len,y_len) in enumerate(train_loader):
        
        # 1) Send the inputs to the device
        optimizer.zero_grad()
        # 2) Pass your inputs, and length of speech into the model.
        x = x.to(device)
        x_len = x_len.to(device)
        y = y.to(device)
        y_len = y_len.to(device)
        
        predictions, attentions = model(x, x_len, y, mode, teacher_forcing)
        #pdb.set_trace()
        # 3) Generate a mask based on target length. This is to mark padded elements
        # so that we can exclude them from computing loss.
        # Ensure that the mask is on the device and is the correct shape.
        mask = torch.zeros((y.size(0),y.size(1),30),dtype= torch.bool)# fill this out
        mask = mask.to(device)
        #predictions = predictions.to(device)
        for l, lengths in enumerate(y_len):
            mask[l,lengths:,:] = True
            #mask[l, lengths:, :] = True

        predictions = predictions.masked_fill_(mask,int(-1000))    
            
        # 4) Make sure you have the correct shape of predictions when putting into criterion
        #pdb.set_trace()
        #pdb.set_trace()
        loss = criterion(predictions.view(-1,predictions.size(2)),y.view(-1))# fill this out)
        # Use the mask you defined above to compute the average loss
        masked_loss = torch.sum(loss*mask.view(-1))/torch.sum(mask)# fill this out

        running_loss += masked_loss.cpu().detach().item()
        # 5) backprop
        batch_bar.set_postfix(
            loss="{:.04f}".format(float(running_loss / (i + 1))),
            lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])),
           
          )
        masked_loss.backward()
        nn.utils.clip_grad_norm(model.parameters(),2)
        optimizer.step()
        batch_bar.update()
    batch_bar.close() 

    print("Epoch {}/{}: Train Loss {:.04f}, Learning Rate {:.04f}, dist {:.04f}".format(
             epoch + 1,
             n_epochs,
             float(running_loss / len(train_loader)),
             float(optimizer.param_groups[0]['lr']),
             float(float(running_dist / len(train_loader))),
            ))        
        # Optional: Gradient clipping

        # When computing Levenshtein distance, make sure you truncate prediction/target

        # Optional: plot your attention for debugging
        # plot_attention(attentions)
        
def val(model, valid_loader,epoch):
    model.eval()
    running_loss = 0
    running_dist = 0

    with torch.no_grad():
         batch_bar = tqdm(total=len(valid_loader),position=0,desc='Validation') #dynamic_ncols=True,leave=False,
         for i, (x,y,x_len,y_len) in enumerate(valid_loader):
             x=x.to(device)
             x_len = x_len.to(device)
             y = y.to(device)
             y_len = y_len.to(device)
             predictions, attentions = model(x, x_len, y, "val")
            #  mask = torch.zeros((y.size(0),y.size(1),30),dtype=torch.bool)
            #  mask = mask.to(device)

            #  for l, lengths in enumerate(y_len):
            #      mask[l,lengths:,:] = True
             
            #  predictions = predictions.masked_fill_(mask,int(-1000)) 
             
            #  loss = criterion(predictions.view(-1,predictions.size(2)),y.view(-1))
            #  masked_loss = torch.sum(loss*mask.view(-1))/torch.sum(mask)
            #  running_loss +=  
             running_dist += float(Distance(predictions, y))
             batch_bar.set_postfix(
                loss="{:.04f}".format(float(running_loss / (i + 1))),
                lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])),
              )
             batch_bar.update()
         batch_bar.close()        
         print("Validation Loss {:.04f}, dist {:.04f}".format(
            float(running_loss / len(val_loader)),
            float(float(running_dist / len(val_loader))),
            ))     

    return float(running_dist/len(val_loader))
    #pass


In [29]:
# TODO: Define your model and put it on the device here
# ...

n_epochs = 10
optimizer = optim.Adam(model.parameters(), # fill this out)
# Make sure you understand the implication of setting reduction = 'none'
criterion = nn.CrossEntropyLoss(reduction='none')
mode='train'

for epoch in range(n_epochs):
    train(model, train_loader, criterion, optimizer, mode)
    val(model, valid_loader)

SyntaxError: ignored

In [30]:
dist = val(model, val_loader, epoch)

NameError: ignored

In [31]:
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
save_checkpoint=torch.load("/content/drive/MyDrive/HW4P2Model_0.4_DifferentMasking_(19+17)80.pth")
model.load_state_dict(save_checkpoint["model_state_dict"])
optimizer.load_state_dict(save_checkpoint["optimizer_state_dict"])
scheduler.load_state_dict(save_checkpoint["scheduler_state_dict"])


In [None]:
best_dist = 7.66
teacher_forcing =save_checkpoint['teacher_forcing'] # 0.2
for epoch in range(n_epochs):
    train(model, train_loader, criterion, optimizer, 'train', epoch, teacher_forcing)
    # if epoch > -1:
    dist = val(model, val_loader, epoch)
    scheduler.step(dist)
    if dist < best_dist:
        best_dist = dist
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': criterion,
            'teacher_forcing': teacher_forcing
            }, 'drive/MyDrive/HW4P2Model_best_0.4_DifferentMasking_.pth')
    if epoch % 30 == 28:
        teacher_forcing -= 0.1
        teacher_forcing = max(0.1, teacher_forcing)
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'loss': criterion,
        'teacher_forcing': teacher_forcing
        }, 'drive/MyDrive/HW4P2Model_0.4_DifferentMasking_(19+17+80)'+str(epoch)+'.pth')
    # del x,y,x_len,y_len
    # torch.cuda.empty_cache()
# dist = val(model, valid_loader)

Train:   0%|          | 0/223 [00:00<?, ?it/s]



Epoch 1/150: Train Loss 1.5619, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.3704


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 2/150: Train Loss 1.5526, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.3533


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 3/150: Train Loss 1.5530, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.2406


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 4/150: Train Loss 1.5547, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.2012


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 5/150: Train Loss 1.5567, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.3904


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 6/150: Train Loss 1.5534, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.5185


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 7/150: Train Loss 1.5588, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.4143


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 8/150: Train Loss 1.5580, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.2053


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 9/150: Train Loss 1.5466, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.2242


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 10/150: Train Loss 1.5543, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.2392


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 11/150: Train Loss 1.5537, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.1953


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 12/150: Train Loss 1.5488, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.2662


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 13/150: Train Loss 1.5531, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.2204


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 14/150: Train Loss 1.5565, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.3049


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 15/150: Train Loss 1.5538, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.1487


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 16/150: Train Loss 1.5542, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.3156


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 17/150: Train Loss 1.5532, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.4775


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 18/150: Train Loss 1.5532, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.5533


Train:   0%|          | 0/223 [00:00<?, ?it/s]

Epoch 19/150: Train Loss 1.5551, Learning Rate 0.0000, dist 0.0000


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Loss 0.0000, dist 10.3081


Train:   0%|          | 0/223 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [32]:
save_checkpoint=torch.load("/content/drive/MyDrive/HW4P2Model_0.4_DifferentMasking_(19+17+80)11.pth")
model.load_state_dict(save_checkpoint["model_state_dict"])

<All keys matched successfully>

In [33]:
model.eval()
submission=[]
with torch.no_grad():
     for batch_idx, (x,x_len) in enumerate(test_loader):
         x=x.to(device)
         x_len=x_len.to(device)
         predictions, attentions = model(x,x_len,mode='test') # Else it will run the default one
         pred = transform_index_to_letter(predictions.argmax(-1).cpu().detach().numpy())
         submission.extend(pred)
del x,x_len
torch.cuda.empty_cache()           



In [34]:
with open("final_submission.csv", 'w') as f:
    f.write('id,predictions\n') 
    for i in range(len(submission)):
        f.write(str(i)+ ',' + submission[i] + "\n")

In [35]:
!kaggle competitions submit -c 11-785-s22-hw4p2 -f final_submission.csv -m 'submission'

100% 288k/288k [00:01<00:00, 162kB/s]
Successfully submitted to Attention-Based Speech Recognition

In [None]:
'''
Debugging suggestions from Eason, a TA from previous semesters:

(1) Decrease your batch_size to 2 and print out the value and shape of all intermediate variables to check if they satisfy the expectation
(2) Be super careful about the LR, don't make it too high. Too large LR would lead to divergence and your attention plot will never make sense
(3) Make sure you have correctly handled the situation for time_step = 0 when teacher forcing

(1) is super important and is the most efficient way for debugging. 
'''
'''
Tips for passing A from B (from easy to hard):
** You need to implement all of these yourself without utilizing any library **
(1) Increase model capacity. E.g. increase num_layer of lstm
(2) LR and Teacher Forcing are also very important, you can tune them or their scheduler as well. Do NOT change lr or tf during the warm-up stage!
(3) Weight tying
(4) Locked Dropout - insert between the plstm layers
(5) Pre-training decoder or train an LM to help make predictions
(5) Pre-training decoder to speed up the convergence: 
    disable your encoder and only train the decoder like train a language model
(6) Better weight initialization technique
(7) Batch Norm between plstm. You definitely can try other positions as well
(8) Data Augmentation. Time-masking, frequency masking
(9) Weight smoothing (avg the last few epoch's weight)
(10) You can try CNN + Maxpooling (Avg). Some students replace the entire plstm blocks with it and some just combine them together.
(11) Beam Search
'''