<a href="https://colab.research.google.com/github/JTStephens18/AudioTranscriptor/blob/main/V6_VisionEncDec_audioProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install yt-dlp
# Install huggingface audio datasets
! pip install datasets[audio]
! pip install transformers evaluate jiwer
!pip install accelerate -U
!pip install nltk

In [None]:
from datasets import load_dataset, Audio, Dataset
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor, AutoTokenizer, AutoProcessor, ASTFeatureExtractor
import subprocess
import os
from pathlib import Path
import torch
import torch.nn as nn
import torchaudio
import soundfile as sf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import librosa
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
ds = load_dataset('google/MusicCaps', split="train")
ds

Downloading readme:   0%|          | 0.00/5.06k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['ytid', 'start_s', 'end_s', 'audioset_positive_labels', 'aspect_list', 'caption', 'author_id', 'is_balanced_subset', 'is_audioset_eval'],
    num_rows: 5521
})

In [None]:
ds = ds.remove_columns(["author_id", "is_balanced_subset", "is_audioset_eval", "audioset_positive_labels"])

In [None]:
ds_train[0]["aspect_list"]

"['classical song', 'harpsichord', 'ascending run', 'song for aristocracy', 'instrumental', 'no voices', 'no other instruments', 'moderate tempo']"

# Tokenize the dataset

In [None]:
import re
from nltk.tokenize import word_tokenize
# vocab_list = ["word 1", "word 2",  ..., "word N"]
vocab_list = ["<SOS>", "<EOS>", "<PAD>", "<UNK>"]
max_len = 0

# Need to split on punctuation
# I see some tokens such as "funk/pop" and singing.the - These should be separate

def split_word(word):
  return re.findall(r'\w+|[^\w\s]', word)


for i in range(ds.num_rows):
  caption = ds[i]["caption"]
  captionSplit = []
  # captionSplit = word_tokenize(ds[i]["caption"])
  for word in word_tokenize(caption):
    # word = word.replace(".", "")
    captionSplit.extend(split_word(word))
  if(len(captionSplit) > max_len):
    max_len = len(captionSplit)+1
  for j in range(len(captionSplit)):
    word = captionSplit[j].lower()
    if(word) not in vocab_list:
      vocab_list.append(word)

# vocab_dict = { "word": index }
vocab_dict = {}
for i in range(len(vocab_list)):
  vocab_dict[vocab_list[i]] = i

In [None]:
def testToken(input):
  output = []
  input = word_tokenize(input.lower())
  for i in range(len(input)):
    if(input[i]) in vocab_dict:
      index = vocab_dict[input[i]]
      output.append(index)
    else:
      output.append("<UNK>")
  return output

In [None]:
testVal = testToken(ds[500]["aspect_list"][2:-2])
print(testVal)
print(ds[500]["aspect_list"][2:-2])
print(max_len)
print(vocab_list[69])

[37, 177, 6, 7, '<UNK>', 73, 6, 7, '<UNK>', 69, 19, 454, 6, 7, '<UNK>', 19, 71, 192, 70, 6, 7, '<UNK>', 13, 76, 6, 7, '<UNK>', 6, 7, '<UNK>', 6, 7, '<UNK>', 6, 7, '<UNK>', 6, 7, '<UNK>', 6, 7, '<UNK>', 6, 7, '<UNK>', 6, 7, '<UNK>', 6, 7, '<UNK>', 6, 7, '<UNK>', 191, 6, 7, '<UNK>', 135, 6, 7, '<UNK>', 461, 6, 7, '<UNK>', 27, 19]
male vocalist', 'energetic drumming', 'loud electric guitar feedback', 'electric guitar lead and harmony', 'enthusiastic vocal backup', 'youthful', 'enthusiastic', 'energetic', 'vibrant', 'boisterous', 'voracious', 'intense', 'passionate', 'metal', 'hard rock', 'rock music', 'heavy metal', 'electric bass guitar
157
electric


In [None]:
ds = ds.train_test_split(test_size=0.2)

In [None]:
feature_extractor = ASTFeatureExtractor()
feature_extractor

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -4.2677393,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 4.5689974
}

In [None]:
def download_clip(
    video_id,
    output_filename,
    start_time,
    end_time,
    tmp_dir='/musiccaps',
    num_attempts=5,
    url_base='https://www.youtube.com/watch?v='
):

  status = False
  command = f"""
        yt-dlp --quiet --no-warnings -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" {url_base}{video_id} --force-keyframes-at-cuts
    """.strip()
  attempts = 0
  while True:
    try:
      # output = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
      output = os.system(command)
    except subprocess.CalledProcess.Error as err:
      attempts += 1
      if attempts == num_attempts:
        return status, err.output
    else:
      break

  # Check if video was successfully saved
  status = os.path.exists(output_filename)
  return status, 'Downloaded'


def toUppercase(aspect_list):
  a_list = aspect_list[1:-1]
  new_list = ''
  for word in a_list:
    new_list = ''.join(a_list).upper().replace(',', '')
    # new_list = new_list.replace(' ', '|')
    if(len(new_list) > 128):
      new_list = new_list[:128]
  return new_list

# Add padding to make all captions the same length
def tokenizeCaption(caption):
  output = [vocab_dict["<SOS>"]]
  input = word_tokenize(caption.lower())
  for i in range(len(input)):
    if(input[i]) in vocab_dict:
      index = vocab_dict[input[i]]
      output.append(index)
    else:
      output.append(vocab_dict['<UNK>'])
  output.append(vocab_dict["<EOS>"])
  # Max_len -2 because I'm appending additional start and end tokens
  for j in range(len(input), max_len-2):
   output.append(vocab_dict["<PAD>"])
  return output


def process(example):
  output_path = str(data_dir / f"{example['ytid']}.wav")
  status = True
  # aspect_string = toUppercase(example['aspect_list'])
  if not os.path.exists(output_path):
    status = False
    status, log = download_clip(
        example['ytid'],
        output_path,
        example['start_s'],
        example['end_s'],
    )

  example["tokenizedCaption"] = tokenizeCaption(example["caption"])
  # example['aspect_string'] = aspect_string
  example["audio"] = output_path
  example['download_status'] = status
  example["image_path"] = f'./spectrograms/{example["ytid"]}.png'
  return example

In [None]:
def stereo_to_mono(wav):
  chan_1 = wav[0][:]
  chan_2 = wav[1][:]
  mono = (chan_1 + chan_2) / 2
  return mono

In [None]:
def resample_waveform(example):
  filepath = example["audio"]
  y, sr = librosa.load(filepath, sr=16000)
  sf.write(filepath, y, sr)
  waveform, sampling_rate = torchaudio.load(filepath)
  if(waveform.shape[0] == 2):
    waveform = stereo_to_mono(waveform)

  example["waveform"] = waveform
  return example

In [None]:
def wavToInput(example):
  wav = np.asarray(example["waveform"])
  inputs = feature_extractor(wav, sampling_rate=sampling_rate, padding="max_length", return_tensors="pt").input_values
  example["input_values"] = inputs
  example["inputs_shape"] = inputs.shape
  return example

In [None]:
samples_to_load = 10
cores = 4
sampling_rate = 16000
writer_batch_size = 1000
data_dir = "./music_data"
upper_limit = 5521
lower_limit = 5000

data_dir = Path(data_dir)
data_dir.mkdir(exist_ok=True, parents=True)

# ds = ds.select(range(lower_limit, upper_limit))

# ds = ds.map(
#     process,
#     num_proc=cores,
#     writer_batch_size=writer_batch_size,
#     keep_in_memory=False
# )


# ds = ds.filter(lambda ex: ex["download_status"] == True)

ds_train = ds["train"].select(range(samples_to_load))
ds_test = ds["test"].select(range(int(samples_to_load*0.2)))

ds_train = ds_train.map(
    process,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
    # batched=True,
)

ds_train = ds_train.filter(lambda ex: ex["download_status"] == True)
# ds_train = ds_train.cast_column("audio", Audio(sampling_rate=sampling_rate))
ds_train = ds_train.map(
    resample_waveform,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
)

ds_train = ds_train.map(
    wavToInput,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
)

ds_test = ds_test.map(
    process,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
    # batched=True,
)

ds_test = ds_test.filter(lambda ex: ex["download_status"] == True)
# ds_test = ds_test.cast_column("audio", Audio(sampling_rate=sampling_rate))
ds_test = ds_test.map(resample_waveform, num_proc=cores, writer_batch_size=writer_batch_size, keep_in_memory=False)
ds_test = ds_test.map(
    wavToInput,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
)

Map (num_proc=4):   0%|          | 0/10 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10 [00:00<?, ? examples/s]

num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.


Map (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2 [00:00<?, ? examples/s]

num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.


Map (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.


Map (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

# AST Training

In [None]:
from transformers import AutoModel
model = AutoModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

Downloading (…)lve/main/config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [None]:
print(model)

In [None]:
class MultiHeadAttentionLayer(nn.Module):
  def __init__(self, hid_dim, n_heads, dropout, device):
    super().__init__()

    assert hid_dim % n_heads == 0

    self.hid_dim = hid_dim
    self.n_heads = n_heads
    self.head_dim = hid_dim // n_heads

    self.fc_q = nn.Linear(hid_dim, hid_dim)
    self.fc_k = nn.Linear(hid_dim, hid_dim)
    self.fc_v = nn.Linear(hid_dim, hid_dim)

    self.fc_o = nn.Linear(hid_dim, hid_dim)

    self.dropout = nn.Dropout(dropout)
    self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

  def forward(self, query, key, value, mask = None):

    batch_size = query.shape[0]

    # query = [batch_size, query len, hid_dim]
    # key = [batch_size, key len, hid_dim]
    # value = [batch_size, value len, hid_dim]

    Q = self.fc_q(query)
    K = self.fc_k(query)
    V = self.fc_v(query)

    # Q = [batch_size, query len, hid_dim]
    # K = [batch_size, key len, hid_dim]
    # V = [batch_size, value len, hid_dim]

    Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)
    K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)
    V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)

    # Q = [batch_size, n heads, query len, hid_dim]
    # K = [batch_size, n heads, key len, hid_dim]
    # V = [batch_size, n heads, value len, hid_dim]

    energy = torch.matmul(Q, K.permute(0,1,3,2)) / self.scale

    # energy = [batch_size, n heads, query len, key len]

    if mask is not None:
      energy = energy.masked_fill(mask == 0, -1e10)

    attention = torch.softmax(energy, dim=-1)

    # attention = [batch_size, n_heads, query len, key len]

    x = torch.matmul(self.dropout(attention), V)

    # x = [batch_size, n_heads, query len, head_dim]

    x = x.permute(0,2,1,3).contiguous()

    # x = [batch_size, query len, n_heads, head_dim]

    x = x.view(batch_size, -1, self.hid_dim)

    # x = [batch_size, query len, hid dim]

    x = self.fc_o(x)

    # x = [batch_size, query len, hid dim]

    return x, attention

# Transformer Decoder Implementation

In [None]:
# Create a starting output state
  # Pass this into an output embedding (size of 768)
  # Use positional encoding
  # Pass this into a masked multi-head attention mechanism
  # Add and normalize

# Take the output from stage 1 and combine it with the output from the encoder
  # Multi-head attention
  # Add and normalize

# Feed forward
  # Add and normalize

# Linear activation function
# Softmax



In [None]:
class TransformerBlock(nn.Module):
  def __init__(self, embed_size, heads, dropout, forward_expansion):
    super(TransformerBlock, self).__init__()
    # self.attention = nn.MultiheadAttention(embed_size, heads).to(device)
    self.attention = MultiHeadAttentionLayer(embed_size, heads, dropout, device)
    self.norm1 = nn.LayerNorm(embed_size)
    self.norm2 = nn.LayerNorm(embed_size)

    self.feed_forward = nn.Sequential(
        nn.Linear(embed_size, forward_expansion*embed_size),
        nn.ReLU(),
        nn.Linear(forward_expansion*embed_size, embed_size)
    )
    self.dropout = nn.Dropout(dropout)

  def forward(self, value, key, query, mask):
    # attention, _ = self.attention(query, key, value, mask)
    # Shouldn't need a mask to ignore padding on inputs since padding is not needed for inputs
    attention, _ = self.attention(query, key, value)

    x = self.dropout(self.norm1(attention + query))
    forward = self.feed_forward(x)
    out = self.dropout(self.norm2(forward + x))
    return out

In [None]:
class DecoderBlock(nn.Module):
  def __init__(self, embed_size, heads, forward_expansion, dropout, device, max_length):
    super(DecoderBlock, self).__init__()
    # self.attention = nn.MultiheadAttention(embed_size, heads).to(device)
    self.attention = MultiHeadAttentionLayer(embed_size, heads, dropout, device)
    self.norm = nn.LayerNorm(embed_size)
    self.transformer_block = TransformerBlock(
        embed_size, heads, dropout, forward_expansion
    )
    self.dropout = nn.Dropout(dropout)
    # self.feed_forward = nn.Sequential(
    #     # Target shape[1] to forward_expansion*embed_size
    #     nn.Linear(max_length, forward_expansion*embed_size),
    #     nn.ReLU(),
    #     nn.Linear(forward_expansion*embed_size, max_length)
    # )

  def forward(self, x, value, key, src_mask, trg_mask):
    # A feed forward connection adds other parameters so there is an additional case to learn if needed
    # x = self.feed_forward(x)
    attention, _ = self.attention(x, x, x, trg_mask)
    query = self.dropout(self.norm(attention + x))
    # query = self.norm(x + self.dropout(attention))
    out = self.transformer_block(value, key, query, src_mask)
    return out

In [None]:
class Decoder(nn.Module):
  def __init__(self, trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length):
    super(Decoder, self).__init__()
    self.device = device
    self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
    self.position_embedding = nn.Embedding(max_length, embed_size)
    # self.position_embedding = nn.Parameter(torch.randn(max_length, embed_size))

    self.layers = nn.ModuleList(
        [DecoderBlock(embed_size, heads, forward_expansion, dropout, device, max_length)
        for _  in range(num_layers)]
    )

    self.fc_out = nn.Linear(embed_size, trg_vocab_size)
    self.dropout = nn.Dropout(dropout)
    self.softmax = nn.Softmax(dim=0)
    self.scale = torch.sqrt(torch.FloatTensor([embed_size])).to(device)

  def forward(self, x, enc_out, src_mask, trg_mask):
    N, seq_length = x.shape
    pos = torch.arange(0, seq_length).unsqueeze(0).repeat(N, 1).to(self.device)
    # x = self.dropout(self.word_embedding(x) + self.position_embedding(pos))
    x = self.dropout((self.word_embedding(x) * self.scale) + self.position_embedding(pos))
    # positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
    # positions = position_embedding[:seq_length]
    # x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))
    # x = self.dropout((self.word_embedding(x) + self.position_embedding[:seq_length]))

    for layer in self.layers:
      x = layer(x, enc_out, enc_out, src_mask, trg_mask)

    out = self.fc_out(x)
    # out = self.softmax(out)
    return out


In [None]:
class Instantiate(nn.Module):
  def __init__(self):
    super(Instantiate, self).__init__()

  def forward(self, x):
    return x

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
src_vocab_size = 1024 # max_length of feature extractor - not even used
trg_vocab_size = len(vocab_list)
embed_size = 768
num_layers = 12
heads = 12
forward_expansion = 4
dropout = 0
device = device
max_length = max_len

src_pad_idx = 2
trg_pad_idx = 2

num_epochs = 10
# Number of training samples in the batch
batch_size = 2
learning_rate = 0.001

In [None]:
model.layernorm = Instantiate()
# model.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, droput, device, max_length)

In [None]:
class Transformer(nn.Module):
  def __init__(self,
               src_vocab_size,
               trg_vocab_size,
               src_pad_idx,
               trg_pad_idx,
               model,
               embed_size=768,
               num_layers=12,
               heads=12,
               forward_expansion=4,
               dropout=0,
               device=device,
               max_length=max_len,
              ):
    super(Transformer, self).__init__()

    self.encoder = model
    self.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length)
    self.src_pad_idx = src_pad_idx
    self.trg_pad_idx = trg_pad_idx
    self.device = device

# Src mask is used so the encoder does not pay attention to the padding values appended to the input
  def make_src_mask(self, src):
    src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
    # (N, 1, 1, src_length)
    return src_mask.to(self.device)

  def make_trg_mask(self, trg):
    N, trg_len= trg.shape
    # trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
    #     N, 1, trg_len, trg_len
    # )
    trg_mask = torch.tril(torch.ones(trg_len, N))
    return trg_mask.to(self.device)

    # # trg  = [batch_size, trg_len]
    # trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)

    # # trg_pad_mask = [batch_size, 1, 1, trg_len]

    # trg_len = trg.shape[1]

    # trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()

    # # trg_sub_mask = [trg_len, trg_len]

    # trg_mask = trg_pad_mask & trg_sub_mask

    # #trg_mask = [batch_size, 1, trg len, trg len]

    # return trg_mask

  def forward(self, src, trg):
    src_mask = self.make_src_mask(src)
    trg_mask = self.make_trg_mask(trg)

    enc_src = self.encoder(src)
    enc_out = enc_src["pooler_output"].unsqueeze(1).expand(-1, trg.shape[1], -1)
    out = self.decoder(trg, enc_out, src_mask, trg_mask)
    return out


In [None]:
newModel = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, model).to(device)
optimizer = torch.optim.Adam(newModel.parameters(), learning_rate)

In [None]:
# train_dataset = torch.utils.data.TensorDataset(ds_train)
train_loader = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True)
# print(len(ds_train[8]["waveform"][0]))
# val = next(iter(train_loader))
#for i, data in enumerate(train_loader, 0):
#   print(data["ytid"])

In [None]:
from torchtext.data.metrics import bleu_score
criterion = nn.CrossEntropyLoss(ignore_index=vocab_dict["<PAD>"])
optimizer = torch.optim.Adam(newModel.parameters(), lr=learning_rate)

In [None]:
# Training loop
batches_per_epoch = int(ds_train.num_rows / batch_size)
output = []
sentences = []
step = 0
for epoch in range(num_epochs):
  for i in range(batches_per_epoch):
    start = i * batch_size
    # print(start)
    xBatch = ds_train[start:start+batch_size]
    # print(xBatch)
    waveform = xBatch["waveform"]
    id = xBatch["ytid"]
    print(step, id)
    input = torch.tensor(xBatch["input_values"]).squeeze(1).to(device)
    # input = torch.tensor(xBatch["input_values"]).to(device)
    target = torch.tensor(xBatch["tokenizedCaption"]).to(device)
    print(input.shape)
    print(target.shape)
    currOutput = newModel(input, target)
    print(currOutput)
    pred = currOutput[0].argmax(1)
    # print(predIdx)
    # output.append(currOutput)
    # print(target[0])

    # loss = bleu_score(predSen, targetSen)
    # loss = criterion(currOutput[0], target[0])
    # loss = criterion(currOutput.reshape(-1, currOutput.shape[2]), target[0].reshape(-1))
    currOutput = currOutput.view(-1, currOutput.shape[2])
    # targetInput = target[:,1:].reshape(-1)
    targetInput = target.contiguous().view(-1)
    loss = criterion(currOutput, targetInput)
    print("Loss", loss)

    step += 1
    # for name, param in newModel.named_parameters():
    #   if param.grad is not None:
    #     print(name, param.grad)
    optimizer.zero_grad()
    loss.backward(loss)
    # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()



  print("Step", step)
  print("Loss", loss)
  # print("Score", score)
  # for name, param in newModel.named_parameters():
  #   if param.grad is not None:
  #     print(name, param.grad)


    # for j in range(len(currOutput)):
    #   outputSentence = []
    #   vector = currOutput[j]
    #   for k in range(len(vector)):
    #     sumInput = vector[k].detach().numpy()
    #     sum = np.sum(sumInput)
    #     outputSentence.append(vocab_list[int(sum)])
    #   sentences.append(outputSentence)

0 ['298Q_pSLflU', 'BlsbeyimUDE']
torch.Size([2, 1024, 128])
torch.Size([2, 153])
tensor([[[-0.4229, -0.7632, -0.8533,  ..., -0.5664,  0.0180,  1.5280],
         [-0.3425, -0.3313, -0.9018,  ..., -0.8380,  0.2074,  1.1715],
         [-0.3920,  0.4152, -0.6251,  ..., -0.8473,  0.2636,  1.4189],
         ...,
         [ 0.2417,  0.3681, -0.7659,  ..., -0.6805, -0.4889,  1.6627],
         [ 0.2157,  0.3115, -0.7473,  ..., -0.6633, -0.5407,  1.7192],
         [ 0.2654,  0.3764, -0.7792,  ..., -0.6662, -0.4922,  1.6873]],

        [[-0.1875, -0.8318, -0.6415,  ..., -0.4016,  0.1926,  1.4235],
         [ 0.0536,  0.1550, -0.0653,  ..., -0.4587,  0.1002,  1.5721],
         [ 0.0611, -0.5323, -0.7416,  ..., -0.7223,  0.0375,  1.3380],
         ...,
         [ 0.2024,  0.2981, -0.5165,  ..., -0.5951, -0.5278,  1.3957],
         [ 0.2049,  0.2963, -0.5071,  ..., -0.5532, -0.4863,  1.4050],
         [ 0.2065,  0.3249, -0.5204,  ..., -0.5527, -0.5163,  1.4045]]],
       grad_fn=<ViewBackward0>)
Los

KeyboardInterrupt: ignored

# Testing

In [None]:
input = torch.tensor(ds_train[0]["input_values"]).to(device)
target = torch.tensor(ds_train[0]["tokenizedCaption"]).unsqueeze(0).to(device)
print(target.shape)
modelOutput = newModel(input, target)

torch.Size([1, 153])


In [None]:
encoderOutput = model(input)

In [None]:
print(encoderOutput.keys())

odict_keys(['last_hidden_state', 'pooler_output'])


In [None]:
print(encoderOutput["last_hidden_state"].shape)
print(encoderOutput["pooler_output"].shape)
print(encoderOutput["pooler_output"].unsqueeze(1).expand(-1, target.shape[1], -1).shape)
enc_src = encoderOutput["pooler_output"].unsqueeze(1).expand(-1, target.shape[1], -1)
print(enc_src.shape)
print(enc_src[0].shape)

torch.Size([1, 1214, 768])
torch.Size([1, 768])
torch.Size([1, 153, 768])
torch.Size([1, 153, 768])
torch.Size([153, 768])


In [None]:
def make_src_mask(src):
  src_mask = (src != src_pad_idx).unsqueeze(1).unsqueeze(2)
  # (N, 1, 1, src_length)
  return src_mask.to(device)

def make_trg_mask(trg):
  N, trg_len = trg.shape
  # trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
  #     N, 1, trg_len, trg_len
  # )
  trg_mask = torch.tril(torch.ones(trg_len, N))
  return trg_mask.to(device)



src_mask = make_src_mask(input)
trg_mask = make_trg_mask(target)

decoderBlock = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length).to(device)

decoderOutput = decoderBlock(target, enc_src, src_mask, trg_mask).to(device)

In [None]:
print(target[:,:-1].shape)

torch.Size([1, 152])


In [None]:
print(decoderOutput.shape)
print(decoderOutput[0])

torch.Size([1, 153, 5475])
tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]], grad_fn=<SelectBackward0>)


In [None]:
print(modelOutput[0].shape)
print(modelOutput[0])
pred = modelOutput[0].argmax(1)
print(pred)

In [None]:
# currOutput[0]
# pred = currOutput[0].argmax(1)
# print(pred)
# print(len(currOutput[0][1]))
# print(currOutput.shape)
print(currOutput)
# print(outputSentence)
print(pred)

tensor([[0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        ...,
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000]],
       grad_fn=<ViewBackward0>)
tensor([  0, 489,  87, 185,   4,   4,  15,   0,   3, 172,  27,   4, 110,  87,
          9,   4, 489,  12,  87,   4,   9,  20,   4,  49,   4,   4,   3,  27,
          9,  87,   9, 110,   4,   4,  27,  27,   4,   9,   9,   9,   4,  27,
         27,   4,  11,  87,   4,   4,   9,   9,   9, 466,   9, 489,   4,  20,
          9,   4,  27,  27,  27,   4,   4,  27,  27,   4,  27,  27,   9,   3,
          4,   4,  27,   4,  27,  27,  27,   4,  27,   4,   4,  27,  27, 110,
         27,   4,   9,  27,   9,  27,   4,  27,  27,   0,   0,   0,   0,   0,
          0,   0,   0,  

In [None]:
print(currOutput[1][489])

tensor(0.5000, grad_fn=<SelectBackward0>)


In [None]:
from gensim.models import Word2Vec
import gensim.downloader
# word2vec = Word2Vec.load()
glove_vec = gensim.downloader.load('word2vec-google-news-300')

In [None]:
glove_vec.most_similar("guitar")

In [None]:
glove_vec.index_to_key

In [None]:
closest_word = None
max_sim = -1
vector1 = output[0][1][0]
vector2 = torch.zeros(149)
print(vector1.shape)

vector = torch.cat((vector1, vector2), dim=0)
for word in glove_vec.index_to_key:
  word_vector = torch.tensor(glove_vec[word])
  sim = torch.dot(vector, word_vector) / (torch.norm(vector) * torch.norm(word_vector)).detach.numpy()
  if sim > max_sim:
    max_sim = sim
    closest_word = word


print(closest_word)

# Debugging

In [None]:
with torch.no_grad():
  outputs = model(inputs)

In [None]:
print(outputs.keys())
outputs["last_hidden_state"].shape

odict_keys(['last_hidden_state', 'pooler_output'])


torch.Size([1, 1214, 768])

In [None]:
decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length).to(device)

In [None]:
print(decoder)

Decoder(
  (word_embedding): Embedding(6143, 768)
  (layers): ModuleList(
    (0-11): 12 x DecoderBlock(
      (attention): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (transformer_block): TransformerBlock(
        (attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): ReLU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0, inplace=False)
      )
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (fc_out): Linear(in_featur

In [None]:
def make_src_mask(src):
  # src_mask = (src != src_pad_idx).unsqueeze(1).unsqueeze(2)
  src_mask = (src != src_pad_idx).squeeze()
  # (N, 1, 1, src_length)
  return src_mask.to(device)

In [None]:
def make_trg_mask(trg):
  N, trg_len = trg.shape
  # trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
  #     N, 1, trg_len, trg_len
  # )
  trg_mask = torch.tril(torch.ones(trg_len, N))
  return trg_mask.to(device)

In [None]:
enc_src = outputs["pooler_output"].unsqueeze(1).expand(-1, 18, -1)
src_mask = make_src_mask(inputs)
trg_mask = make_trg_mask(trg)
print(trg_mask.shape)
print(src_mask.shape)
decodeOut = decoder(trg, enc_src, trg_mask, trg_mask)

torch.Size([18, 1])
torch.Size([1024, 128])


In [None]:
print(decodeOut.shape)

torch.Size([1, 18, 6143])


In [None]:
N, seq_length = trg.shape
positions = torch.arange(0, seq_length).expand(N, seq_length).to(device)
word_embedding = nn.Embedding(trg_vocab_size, embed_size).to(device)
# position_embedding = nn.Embedding(embed_size, max_length).to(device)
position_embedding = nn.Parameter(torch.randn(max_length, embed_size)).to(device)
dropoutFunc =  nn.Dropout(dropout)

In [None]:
word = word_embedding(trg)
# pos = position_embedding(positions)
pos = position_embedding[:seq_length]
print(word.shape)
print(pos.shape)
x = dropoutFunc(word + pos)
# x = dropout((word + pos))
print(x.shape)

torch.Size([1, 18, 768])
torch.Size([18, 768])
torch.Size([1, 18, 768])


In [None]:
self_attention = nn.MultiheadAttention(embed_size, heads)
self_norm = nn.LayerNorm(embed_size)
attention, _ = self_attention(x, x, x, trg_mask)
print(attention)
query = dropoutFunc(self_norm(attention + x))
print(query.shape)
# Modifies enc_src to be the same shape as query
test_enc_src = enc_src.unsqueeze(1).expand(-1, query.size(1), -1)
print(test_enc_src.shape)
print(src_mask.shape)
test_src_mask = src_mask[-1, -1, :, :]
print(test_src_mask.shape)

tensor([[[-0.7805, -0.1441, -1.3261,  ..., -0.6013, -0.0808,  0.0878],
         [ 0.7496,  1.0978,  0.0337,  ..., -0.5370, -0.1557, -0.3764],
         [-1.1219, -0.2174, -0.2250,  ...,  0.6839, -0.2006,  0.6456],
         ...,
         [ 0.2170, -0.6863, -0.5809,  ...,  0.0870,  0.3194,  0.1975],
         [-0.1514, -0.7697,  0.3599,  ...,  1.1205, -0.8512, -0.4996],
         [ 0.7957,  0.4360,  0.1964,  ..., -0.3208, -1.2579, -0.8765]]],
       grad_fn=<ViewBackward0>)
torch.Size([1, 18, 768])
torch.Size([1, 18, 768])
torch.Size([1, 1, 1024, 128])
torch.Size([1024, 128])


In [None]:
transform_attention = self_attention(query, test_enc_src, test_enc_src, trg_mask)

In [None]:
self_transformer_block = TransformerBlock(embed_size, heads, dropout, forward_expansion)
print(enc_src.shape)
trans = self_transformer_block(query, test_enc_src, test_enc_src, src_mask)

torch.Size([1, 768])


AssertionError: ignored

In [None]:
print(trg_mask.shape)

torch.Size([18, 18])


In [None]:
target = [[1,3,2,4,5], [6,7,8,9,10]]
target = np.asarray(target)
target_mask = make_trg_mask(target)

tensor([[[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]],


        [[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]]])


In [None]:
print(target_mask)
print(target_mask.shape)

tensor([[[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]],


        [[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]]])
torch.Size([2, 1, 5, 5])


# LSTM Decoder Implementation

In [None]:
print(model.encoder.layer)

ModuleList(
  (0-11): 12 x ASTLayer(
    (attention): ASTAttention(
      (attention): ASTSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (output): ASTSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (intermediate): ASTIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): ASTOutput(
      (dense): Linear(in_features=3072, out_features=768, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
)


In [None]:
class Decoder(nn.Module):
  def __init__(self, input=3072, output_size=64, embedding_size=64, hidden_size=64, num_layers=2, p=0.2):
    super(Decoder, self).__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.dropout = nn.Dropout(p)
    self.embedding = nn.Embedding(input, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
    self.fc = nn.Linear(hidden_size, output_size)

    self.h = [torch.zeros(num_layers, 4, hidden_size) for _ in range(2)]

  def forward(self, x, hidden):
    x = x.unsqueeze(0)
    # embedding = self.dropout(self.embedding(x))
    embedding = self.embedding(x.int())
    outputs, (hidden, cell) = self.rnn(embedding, (hidden, self.h))
    predictions = self.fc(outputs)
    predictions = predictions.squeeze(0)
    return predictions, hidden, cell


In [None]:
class Decoder(nn.Module):
  def __init__(self, vocab_size, n_hidden, n_layers):
    super(Decoder, self).__init__()

    self.input_hidden = nn.Embedding(vocab_size, n_hidden)
    self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
    self.hidden_out = nn.Linear(n_hidden, vocab_size)
    self.h = [torch.zeros(n_layers, 4, n_hidden) for _ in range(2)]

  def forward(self, x, bias):
    res, h = self.rnn(self.input_hidden(x.int()), self.h)
    self.h = [h_.detach() for h_ in h]
    return self.hidden_out(res)

In [None]:
# model.encoder
# model.encoder.layer.output = LSTM(3072, 64)
h = [torch.zeros(2, 4, 64) for _ in range(2)]
print(len(h))
print(h[0].shape)
input_h = nn.Embedding(3072, 64)
print(input_h)

2
torch.Size([2, 4, 64])
Embedding(3072, 64)


In [None]:
class Instantiate(nn.Module):
  def __init__(self):
    super(Instantiate, self).__init__()

  def forward(self, x, bias):
    return x

In [None]:
# print(model.encoder.layer[0].output)
for layer in model.encoder.layer:
  # layer.output = Decoder(3072, 64, 2)
  layer.output = Instantiate()

print(model.encoder.layer[0:12])

ModuleList(
  (0-11): 12 x ASTLayer(
    (attention): ASTAttention(
      (attention): ASTSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (output): ASTSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (intermediate): ASTIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): Instantiate()
    (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
)


In [None]:
with torch.no_grad():
  outputs = model(inputs)

RuntimeError: ignored

In [None]:
print(model.encoder.layer)

ModuleList(
  (0-11): 12 x ASTLayer(
    (attention): ASTAttention(
      (attention): ASTSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (output): ASTSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (intermediate): ASTIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): Decoder(
      (dropout): Dropout(p=0.2, inplace=False)
      (embedding): Embedding(3072, 64)
      (rnn): LSTM(64, 64, num_layers=2, dropout=0.2)
      (fc): Linear(in_features=64, out_features=64, bias=True)
    )
    (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)