<a href="https://colab.research.google.com/github/JTStephens18/AudioTranscriptor/blob/main/V9_VisionEncDec_audioProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Change log:
V9 aims to remove all unnecessary blocks and add comments about the overall code

# Installation / Setup

In [None]:
! pip install yt-dlp
# Install huggingface audio datasets
! pip install datasets[audio]
! pip install transformers evaluate jiwer
!pip install accelerate -U
!pip install nltk



In [None]:
from datasets import load_dataset, Audio, Dataset, concatenate_datasets
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor, AutoTokenizer, AutoProcessor, ASTFeatureExtractor
import subprocess
import os
from pathlib import Path
import torch
import torch.nn as nn
import torchaudio
import soundfile as sf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import librosa
import re
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Load the "train" split of the google/MusicCaps dataset and show its summary
# (feature names and row count) as the cell output.
ds = load_dataset('google/MusicCaps', split="train")
ds

Dataset({
    features: ['ytid', 'start_s', 'end_s', 'audioset_positive_labels', 'aspect_list', 'caption', 'author_id', 'is_balanced_subset', 'is_audioset_eval'],
    num_rows: 5521
})

Remove unnecessary columns

In [None]:
# Drop four metadata columns; ytid, start_s, end_s, aspect_list and caption remain.
ds = ds.remove_columns(["author_id", "is_balanced_subset", "is_audioset_eval", "audioset_positive_labels"])

# Tokenize the dataset

In [None]:
from nltk.tokenize import word_tokenize
# This function removes instances of ' \ () and [] as they are not necessary
# Helps to reduce number of tokens during tokenization
def process_aspect_list(example):
  """Replace the raw ``aspect_list`` entry of one example with a list of clean tokens.

  The entry is split with NLTK's ``word_tokenize``. Single quotes, brackets,
  parentheses and commas are blanked out of every piece, and what remains is
  split again into runs of word characters and single punctuation marks.

  Args:
    example: Mapping with an ``"aspect_list"`` entry accepted by ``word_tokenize``.

  Returns:
    The same mapping, with ``"aspect_list"`` replaced by the list of tokens.
  """
  tokens = []
  for word in word_tokenize(example["aspect_list"]):
    cleaned_word = re.sub(r'[\'\[\]\(\),]', ' ', word)
    # Keep every piece, not only the first one: "funk/pop" now contributes
    # "funk", "/" and "pop" instead of silently dropping everything after "funk".
    tokens.extend(re.findall(r'\w+|[^\w\s]', cleaned_word))
  example["aspect_list"] = tokens
  return example


# Run process_aspect_list over every row with 4 worker processes.
# writer_batch_size=1000 and keep_in_memory=False are passed through to
# datasets.Dataset.map unchanged.
ds = ds.map(
    process_aspect_list,
    num_proc=4,
    writer_batch_size=1000,
    keep_in_memory=False,
)

This cell runs through the process of creating the tokens for the words in our dataset

In [None]:
import re
from nltk.tokenize import word_tokenize
# vocab_list = ["word 1", "word 2",  ..., "word N"]
# The four special tokens come first, so their ids are their list positions:
# <SOS>=0, <EOS>=1, <PAD>=2, <UNK>=3.
vocab_list = ["<SOS>", "<EOS>", "<PAD>", "<UNK>"]
# Length bound derived from the longest aspect_list in the dataset.
max_len = 0
# word -> number of occurrences in the dataset.
vocab_count = {}
# Row index of the example that set max_len.
maxLenItem = 0

# Need to split on punctuation
# I see some tokens such as "funk/pop" and singing.the - These should be separate

def split_word(word):
  """Split one raw word into word-character runs and single punctuation marks.

  Single quotes, square brackets, parentheses and commas are deleted first, so
  they never appear in the result.
  """
  stripped = re.sub(r'[\'\[\]\(\),]', '', word)
  pieces = re.findall(r'\w+|[^\w\s]', stripped)
  return pieces


# Single pass over the dataset: grow the vocabulary, count word frequencies and
# record the longest token sequence the decoder will ever have to embed.
for i, aspect_list in enumerate(ds["aspect_list"]):
  # A tokenized sequence is <SOS> + words + <EOS>, hence the +2. The padded
  # length is what gets compared, so a later list that is exactly one word
  # longer than the current record still raises max_len (the previous
  # raw-vs-padded comparison missed that case).
  padded_len = len(aspect_list) + 2
  if padded_len > max_len:
    max_len = padded_len
    maxLenItem = i
  for raw_word in aspect_list:
    word = raw_word.lower()
    # vocab_count contains exactly the non-special words already appended to
    # vocab_list, so this dict lookup replaces the linear scan of the list.
    if word not in vocab_count:
      vocab_list.append(word)
      vocab_count[word] = 1
    else:
      vocab_count[word] += 1

# vocab_dict maps a word to its token id (its position in vocab_list).
vocab_dict = {word: index for index, word in enumerate(vocab_list)}

# Data Processing

In [None]:
# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# therefore every downstream result, reproducible across kernel restarts.
ds = ds.train_test_split(test_size=0.2, seed=42)

Using the feature extractor from the AST model from hugging face

https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer

In [None]:
# Build the AST feature extractor with its default settings and display them.
# The printed config below shows sampling_rate=16000, num_mel_bins=128, max_length=1024.
feature_extractor = ASTFeatureExtractor()
feature_extractor

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -4.2677393,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 4.5689974
}

In [None]:
def download_clip(
    video_id,
    output_filename,
    start_time,
    end_time,
    tmp_dir='/musiccaps',
    num_attempts=5,
    url_base='https://www.youtube.com/watch?v='
):
  """Download one audio section of a video as WAV using the ``yt-dlp`` CLI.

  Args:
    video_id: Identifier appended to ``url_base`` to form the video URL.
    output_filename: Path handed to ``yt-dlp -o``; its existence defines success.
    start_time: Start of the section, interpolated into ``--download-sections``.
    end_time: End of the section, interpolated into ``--download-sections``.
    tmp_dir: Unused; kept so existing callers keep working.
    num_attempts: Maximum number of times the command is run.
    url_base: URL prefix placed in front of ``video_id``.

  Returns:
    ``(status, log)``. ``status`` is True when ``output_filename`` exists
    afterwards. ``log`` is ``'Downloaded'`` on success and otherwise the last
    error text.
  """
  # An argument list (no shell) means ids or paths containing quotes, spaces or
  # shell metacharacters cannot alter the command.
  command = [
      "yt-dlp", "--quiet", "--no-warnings",
      "-x", "--audio-format", "wav",
      "-f", "bestaudio",
      "-o", str(output_filename),
      "--download-sections", f"*{start_time}-{end_time}",
      "--force-keyframes-at-cuts",
      f"{url_base}{video_id}",
  ]

  log = 'Download not attempted'
  for _ in range(max(1, num_attempts)):
    try:
      # check=True turns a non-zero exit code into CalledProcessError, which
      # makes the retry loop effective (os.system never raised).
      subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as err:
      log = err.stderr or err.stdout or str(err)
      continue
    except OSError as err:
      # yt-dlp is not installed or not executable; retrying cannot help.
      return False, str(err)

    if os.path.exists(output_filename):
      return True, 'Downloaded'
    log = 'yt-dlp exited successfully but the output file is missing'

  # Final check, in case the file exists despite the reported failures.
  return os.path.exists(output_filename), log


def toUppercase(aspect_list):
  """Upper-case the interior of ``aspect_list`` and cap it at 128 characters.

  The first and last elements are dropped, the rest is joined without a
  separator, upper-cased and stripped of commas.
  """
  inner = aspect_list[1:-1]
  # The joined text is the same on every pass of the former loop, so it is
  # built once; an empty interior still yields ''.
  joined = ''.join(inner).upper().replace(',', '')
  return joined[:128]

# Convert the words into tokens
def tokenizeCaption(caption):
  """Encode a caption as ``[<SOS>, word ids..., <EOS>]`` using ``vocab_dict``.

  The caption is lower-cased and split with ``word_tokenize``; words missing
  from ``vocab_dict`` are encoded as ``<UNK>``.
  """
  token_ids = [vocab_dict["<SOS>"]]
  for word in word_tokenize(caption.lower()):
    token_ids.append(vocab_dict[word] if word in vocab_dict else vocab_dict['<UNK>'])
  token_ids.append(vocab_dict["<EOS>"])
  return token_ids


def tokenizeAspectList(aspect_list, vocab=None):
  """Encode a list of aspect words as ``[<SOS>, word ids..., <EOS>]``.

  Args:
    aspect_list: Iterable of word strings.
    vocab: Optional word -> id mapping containing ``<SOS>``, ``<EOS>`` and
      ``<UNK>``. Defaults to the notebook-level ``vocab_dict``.

  Returns:
    List of integer token ids.
  """
  if vocab is None:
    vocab = vocab_dict
  unk_id = vocab["<UNK>"]
  token_ids = [vocab["<SOS>"]]
  # The vocabulary keys were lower-cased when it was built, so the lookup has
  # to lower-case too; otherwise any capitalised word would map to <UNK>.
  token_ids.extend(vocab.get(word.lower(), unk_id) for word in aspect_list)
  token_ids.append(vocab["<EOS>"])
  return token_ids


def process(example):
  """Ensure the clip for one example is on disk and attach derived fields.

  Adds ``tokenizedAspectList``, ``audio`` (path of the WAV file),
  ``download_status`` and ``image_path`` to ``example`` and returns it.
  """
  clip_id = example['ytid']
  output_path = str(data_dir / f"{clip_id}.wav")

  # A file that is already present counts as downloaded; otherwise fetch it.
  if os.path.exists(output_path):
    status = True
  else:
    status, _ = download_clip(
        clip_id,
        output_path,
        example['start_s'],
        example['end_s'],
    )

  example["tokenizedAspectList"] = tokenizeAspectList(example["aspect_list"])
  example["audio"] = output_path
  # This one clip is always flagged as not downloaded, whatever is on disk.
  example['download_status'] = False if clip_id == "qsRPTMXFGsA" else status
  example["image_path"] = f'./spectrograms/{clip_id}.png'
  return example

In [None]:
def stereo_to_mono(wav):
  """Average the first two channels of ``wav`` into a single channel."""
  left, right = wav[0][:], wav[1][:]
  return (left + right) / 2

In [None]:
def resample_waveform(example):
  """Rewrite the example's audio file at 16 kHz and attach the waveform.

  The file at ``example["audio"]`` is loaded with librosa at ``sr=16000``,
  written back to the same path, then reloaded with torchaudio and stored
  under ``example["waveform"]``.
  """
  audio_path = example["audio"]
  resampled, rate = librosa.load(audio_path, sr=16000)
  # Overwrites the source file in place with the resampled audio.
  sf.write(audio_path, resampled, rate)
  waveform, _ = torchaudio.load(audio_path)
  # NOTE(review): librosa.load downmixes to mono by default, so this
  # two-channel branch is presumably never taken; confirm.
  if waveform.shape[0] == 2:
    waveform = stereo_to_mono(waveform)

  example["waveform"] = waveform
  return example

In [None]:
def wavToInput(example):
  """Run the AST feature extractor on the example's waveform.

  Stores the extractor's ``input_values`` under ``example["input_values"]`` and
  their shape under ``example["inputs_shape"]``.
  """
  samples = np.asarray(example["waveform"])
  encoded = feature_extractor(samples, sampling_rate=sampling_rate, padding="max_length", return_tensors="pt")
  features = encoded.input_values
  example["input_values"] = features
  example["inputs_shape"] = features.shape
  return example

In [None]:
# Preprocessing configuration
samples_to_load = 4000       # training rows to prepare; the test split gets 20% of this
cores = 4                    # worker processes for Dataset.map
sampling_rate = 16000        # read by wavToInput when calling the feature extractor
writer_batch_size = 1000
upper_limit = 5521           # not referenced in this cell
lower_limit = 5000           # not referenced in this cell

# process() joins onto data_dir with "/", so it must be a Path.
data_dir = Path("./music_data")
data_dir.mkdir(exist_ok=True, parents=True)


def prepare_split(split, num_samples):
  """Download, filter, resample and featurise the first rows of one split.

  Args:
    split: A ``datasets.Dataset`` (for example ``ds["train"]``).
    num_samples: Number of leading rows to keep; clamped to the split size so
      a small split no longer makes ``select`` fail.

  Returns:
    The processed dataset, containing only rows whose clip is on disk.
  """
  map_kwargs = dict(
      num_proc=cores,
      writer_batch_size=writer_batch_size,
      keep_in_memory=False,
  )
  subset = split.select(range(min(num_samples, split.num_rows)))
  subset = subset.map(process, **map_kwargs)
  # Drop rows whose clip could not be downloaded.
  subset = subset.filter(lambda ex: ex["download_status"] == True)
  subset = subset.map(resample_waveform, **map_kwargs)
  return subset.map(wavToInput, **map_kwargs)


ds_train = prepare_split(ds["train"], samples_to_load)
ds_test = prepare_split(ds["test"], int(samples_to_load*0.2))

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3938 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3938 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/800 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/786 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/786 [00:00<?, ? examples/s]

# AST Training

In [None]:
from transformers import AutoModel
# Load the pretrained AST checkpoint; it is used as the encoder of the Transformer defined later.
model = AutoModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [None]:
class MultiHeadAttentionLayer(nn.Module):
  """Scaled dot-product multi-head attention over batch-first tensors.

  Args:
    hid_dim: Model width; must be divisible by ``n_heads``.
    n_heads: Number of attention heads.
    dropout: Dropout probability applied to the attention weights.
    device: Device on which the scaling constant is stored.
  """

  def __init__(self, hid_dim, n_heads, dropout, device):
    super().__init__()

    assert hid_dim % n_heads == 0

    self.hid_dim = hid_dim
    self.n_heads = n_heads
    self.head_dim = hid_dim // n_heads

    self.fc_q = nn.Linear(hid_dim, hid_dim)
    self.fc_k = nn.Linear(hid_dim, hid_dim)
    self.fc_v = nn.Linear(hid_dim, hid_dim)

    self.fc_o = nn.Linear(hid_dim, hid_dim)

    self.dropout = nn.Dropout(dropout)
    self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

  def forward(self, query, key, value, mask = None):
    """Attend from ``query`` over ``key``/``value``.

    Args:
      query: [batch_size, query len, hid_dim]
      key: [batch_size, key len, hid_dim]
      value: [batch_size, key len, hid_dim]
      mask: Optional tensor broadcastable to
        [batch_size, n_heads, query len, key len]; positions equal to 0 are
        excluded from attention.

    Returns:
      ``(x, attention)`` with x = [batch_size, query len, hid_dim] and
      attention = [batch_size, n_heads, query len, key len].
    """
    batch_size = query.shape[0]

    # Each projection reads its own input. Projecting K and V from `query`
    # (as before) turned every call into self-attention over the query.
    Q = self.fc_q(query)
    K = self.fc_k(key)
    V = self.fc_v(value)

    # Split the model width into heads: [batch_size, n_heads, len, head_dim]
    Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)
    K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)
    V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)

    # energy = [batch_size, n_heads, query len, key len]
    energy = torch.matmul(Q, K.permute(0,1,3,2)) / self.scale

    if mask is not None:
      # A large negative score becomes ~0 probability after the softmax.
      energy = energy.masked_fill(mask == 0, -1e10)

    attention = torch.softmax(energy, dim=-1)

    # x = [batch_size, n_heads, query len, head_dim]
    x = torch.matmul(self.dropout(attention), V)

    # Merge the heads back: [batch_size, query len, hid_dim]
    x = x.permute(0,2,1,3).contiguous()
    x = x.view(batch_size, -1, self.hid_dim)

    x = self.fc_o(x)

    return x, attention

# Transformer Decoder Implementation

In [None]:
class TransformerBlock(nn.Module):
  """Cross-attention followed by a position-wise feed-forward network.

  Both sub-layers use a residual connection, LayerNorm and dropout.

  Args:
    embed_size: Width of the token representations.
    heads: Number of attention heads.
    dropout: Dropout probability applied after each LayerNorm.
    forward_expansion: Hidden width of the feed-forward network, as a multiple
      of ``embed_size``.
  """

  def __init__(self, embed_size, heads, dropout, forward_expansion):
    super(TransformerBlock, self).__init__()
    # batch_first=True: the tensors passed to forward() are
    # [batch, seq, embed]. With the default layout the batch axis was read as
    # the sequence axis, so samples of one batch attended to each other.
    # No .to(device) here: the block no longer reads a notebook-level global
    # and follows whichever device its parent module is moved to.
    self.attention = nn.MultiheadAttention(embed_size, heads, batch_first=True)
    self.norm1 = nn.LayerNorm(embed_size)
    self.norm2 = nn.LayerNorm(embed_size)

    self.feed_forward = nn.Sequential(
        nn.Linear(embed_size, forward_expansion*embed_size),
        nn.ReLU(),
        nn.Linear(forward_expansion*embed_size, embed_size)
    )
    self.dropout = nn.Dropout(dropout)

  def forward(self, value, key, query, mask):
    """Attend from ``query`` over ``key``/``value``; all are [batch, seq, embed].

    ``mask`` is accepted for interface compatibility and is not applied.
    """
    attention, _ = self.attention(query, key, value)

    x = self.dropout(self.norm1(attention + query))
    forward = self.feed_forward(x)
    out = self.dropout(self.norm2(forward + x))
    return out

In [None]:
class DecoderBlock(nn.Module):
  """One decoder layer: causal self-attention, then a TransformerBlock that attends to the encoder output.

  Args:
    embed_size: Width of the token representations.
    heads: Number of attention heads.
    forward_expansion: Feed-forward expansion factor passed to TransformerBlock.
    dropout: Dropout probability.
    device: Device the self-attention module is moved to.
    max_length: Accepted for interface compatibility; not used by this layer.
  """

  def __init__(self, embed_size, heads, forward_expansion, dropout, device, max_length):
    super(DecoderBlock, self).__init__()
    # batch_first=True because forward() receives x as [batch, seq, embed].
    self.attention = nn.MultiheadAttention(embed_size, heads, batch_first=True).to(device)
    self.norm = nn.LayerNorm(embed_size)
    self.transformer_block = TransformerBlock(
        embed_size, heads, dropout, forward_expansion
    )
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, value, key, src_mask, trg_mask):
    """Apply the layer to ``x`` ([batch, seq, embed]).

    ``value``/``key`` are the encoder output and ``src_mask`` is forwarded to
    the TransformerBlock. ``trg_mask`` is accepted for interface compatibility
    but not used: passed positionally it was interpreted as
    ``key_padding_mask`` rather than as an attention mask, so no causal
    masking took place. The causal mask is derived from ``x`` instead.
    """
    seq_len = x.shape[1]
    # Boolean attn_mask: True marks a (query, key) pair that must NOT attend,
    # i.e. every key positioned after the query.
    causal_mask = torch.triu(
        torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
        diagonal=1,
    )
    attention, _ = self.attention(x, x, x, attn_mask=causal_mask)
    query = self.dropout(self.norm(attention + x))
    out = self.transformer_block(value, key, query, src_mask)
    return out

In [None]:
class Decoder(nn.Module):
  """Token decoder: scaled word embeddings plus learned position embeddings,
  a stack of DecoderBlocks, and a linear projection to vocabulary logits."""

  def __init__(self, trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length):
    super().__init__()
    self.device = device
    self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
    # One learned vector per position; sequences longer than max_length cannot be embedded.
    self.position_embedding = nn.Embedding(max_length, embed_size)

    blocks = []
    for _ in range(num_layers):
      blocks.append(DecoderBlock(embed_size, heads, forward_expansion, dropout, device, max_length))
    self.layers = nn.ModuleList(blocks)

    self.fc_out = nn.Linear(embed_size, trg_vocab_size)
    self.dropout = nn.Dropout(dropout)
    # Not applied in forward(); the decoder returns raw logits.
    self.softmax = nn.Softmax(dim=0)
    # sqrt(embed_size), used to scale the word embeddings.
    self.scale = torch.sqrt(torch.FloatTensor([embed_size])).to(device)

  def forward(self, x, enc_out, src_mask, trg_mask):
    """Map token ids ``x`` ([N, seq_length]) to logits [N, seq_length, trg_vocab_size]."""
    batch, seq_length = x.shape
    positions = torch.arange(0, seq_length).unsqueeze(0).repeat(batch, 1).to(self.device)
    token_part = self.word_embedding(x) * self.scale
    hidden = self.dropout(token_part + self.position_embedding(positions))

    # hidden = [N, seq_length, embed_size]; the encoder output is both key and value.
    for layer in self.layers:
      hidden = layer(hidden, enc_out, enc_out, src_mask, trg_mask)

    return self.fc_out(hidden)


In [None]:
class Instantiate(nn.Module):
  """Identity module: ``forward`` returns its input unchanged."""

  def __init__(self):
    super().__init__()

  def forward(self, x):
    # Pass-through; no parameters and no computation.
    return x

In [None]:
# Run on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Model dimensions
src_vocab_size = 1024 # max_length of feature extractor - not even used
trg_vocab_size = len(vocab_list)
embed_size, num_layers, heads = 768, 12, 12
forward_expansion = 4
dropout = 0.1
max_length = max_len

# Source and target both use id 2 for padding.
src_pad_idx = trg_pad_idx = 2

# Optimisation settings
num_epochs = 20
# Number of training samples in the batch
batch_size = 4
learning_rate = 1e-5

In [None]:
# Replace the encoder's `layernorm` attribute with the identity module defined above.
model.layernorm = Instantiate()

In [None]:
class Transformer(nn.Module):
  """Encoder-decoder wrapper: `model` encodes the input and a Decoder predicts target tokens.

  The `device` and `max_length` defaults are bound to the notebook globals
  `device` and `max_len` at the moment this class statement runs; redefine the
  class (re-run the cell) after changing them.
  """
  def __init__(self,
               src_vocab_size,
               trg_vocab_size,
               src_pad_idx,
               trg_pad_idx,
               model,
               embed_size=768,
               num_layers=12,
               heads=12,
               forward_expansion=4,
               dropout=0,
               device=device,
               max_length=max_len,
              ):
    super(Transformer, self).__init__()

    # src_vocab_size is accepted but not used anywhere in this class.
    self.encoder = model
    self.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length)
    self.src_pad_idx = src_pad_idx
    self.trg_pad_idx = trg_pad_idx
    self.device = device

  # Boolean mask that is True wherever `src` differs from src_pad_idx, with two
  # singleton axes inserted after the batch axis.
  # NOTE(review): forward() passes the encoder input here, so this compares
  # input values against the pad id; confirm that is intended.
  def make_src_mask(self, src):
    src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
    return src_mask.to(self.device)

  # Lower-triangular matrix of ones with shape (trg_len, N), N being the batch size.
  # NOTE(review): a causal mask over target positions would be
  # (trg_len, trg_len); confirm this shape is what the decoder expects.
  def make_trg_mask(self, trg):
    N, trg_len= trg.shape
    trg_mask = torch.tril(torch.ones(trg_len, N))
    return trg_mask.to(self.device)


  # Alternative target mask combining padding and causality; not called by forward().
  def make_trg_mask_custom_attention(self, trg):
    # trg  = [batch_size, trg_len]
    trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)

    # trg_pad_mask = [batch_size, 1, 1, trg_len]

    trg_len = trg.shape[1]

    trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()

    # trg_sub_mask = [trg_len, trg_len]

    trg_mask = trg_pad_mask & trg_sub_mask

    #trg_mask = [batch_size, 1, trg len, trg len]

    return trg_mask

  # Encode `src`, then decode `trg` against the encoder's pooled output.
  def forward(self, src, trg):
    src_mask = self.make_src_mask(src)
    trg_mask = self.make_trg_mask(trg)

    enc_src = self.encoder(src)
    # The pooled encoder vector gets a length axis and is expanded to
    # trg.shape[1], so every target position sees the same encoder vector.
    enc_out = enc_src["pooler_output"].unsqueeze(1).expand(-1, trg.shape[1], -1)
    out = self.decoder(trg, enc_out, src_mask, trg_mask)
    return out


In [None]:
# Build the encoder-decoder model around the pretrained AST encoder and move it to `device`.
# Only the five positional arguments are passed, so every other setting
# (including dropout) takes the default declared in the Transformer signature.
newModel = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, model).to(device)
# Adam over all parameters of newModel, encoder included.
optimizer = torch.optim.Adam(newModel.parameters(), learning_rate)

In [None]:
from torchtext.data.metrics import bleu_score
# Cross-entropy that skips positions whose target is the <PAD> id.
criterion = nn.CrossEntropyLoss(ignore_index=vocab_dict["<PAD>"])
# Rebinds `optimizer` to a fresh Adam instance with the same learning rate.
optimizer = torch.optim.Adam(newModel.parameters(), lr=learning_rate)

In [None]:
class CustomLoss(nn.Module):
  """Cross-entropy that ignores padding, plus an <EOS>-position penalty term.

  Args:
    pad_idx: Target id ignored by the cross-entropy.
    eos_idx: Target id marking the end of a sequence.
  """

  def __init__(self, pad_idx, eos_idx):
    super(CustomLoss, self).__init__()
    # Honour the constructor argument instead of reading the notebook-level
    # vocab_dict, so the class does what its signature says.
    self.cross_entropy_loss = nn.CrossEntropyLoss(ignore_index=pad_idx)
    self.pad_idx = pad_idx
    self.eos_idx = eos_idx

  def forward(self, logits, targets):
    """Return the loss for ``logits`` ([positions, vocab]) and ``targets`` ([positions])."""
    ce_loss = self.cross_entropy_loss(logits, targets)

    eos_positions = (targets == self.eos_idx).nonzero()
    if eos_positions.numel() == 0:
      return ce_loss

    # Index of the first <EOS> among the flattened targets.
    first_eos_pos = eos_positions[0, :]

    # NOTE(review): logits.size(1) is the vocabulary size here, so this
    # difference is normally negative and the penalty is 0. The term is also
    # computed from the targets only and so contributes no gradient. It is
    # kept as written; confirm what the penalty is meant to measure.
    if first_eos_pos.dim() > 0:
      extra_tokens = max(first_eos_pos - logits.size(1), 0)
    else:
      extra_tokens = 0

    penalty_weight = 0.1
    return ce_loss + penalty_weight * extra_tokens


# Ids match the positions of <PAD> and <EOS> in vocab_list.
pad_idx = 2
eos_idx = 1
custom_loss = CustomLoss(pad_idx, eos_idx)

In [None]:
# Training loop
# Rows beyond the last full batch are skipped.
batches_per_epoch = int(ds_train.num_rows / batch_size)
output = []
sentences = []
step = 0
log_every = 100  # steps between progress lines; keeps the cell output readable
pad_token_id = vocab_dict["<PAD>"]

newModel.train()
for epoch in range(num_epochs):
  for i in range(batches_per_epoch):
    start = i * batch_size
    xBatch = ds_train[start:start+batch_size]
    input = torch.tensor(xBatch["input_values"]).squeeze(1).to(device)

    # Right-pad every caption of the batch to the longest one.
    captions = xBatch["tokenizedAspectList"]
    maxCaptionLen = max(len(caption) for caption in captions)
    padded = [caption + [pad_token_id] * (maxCaptionLen - len(caption)) for caption in captions]
    target = torch.tensor(padded).to(device)

    # Teacher forcing: feed tokens [0 .. T-2], predict tokens [1 .. T-1].
    currOutput = newModel(input, target[:, :-1])
    currOutput = currOutput.reshape(-1, currOutput.shape[2])
    targetInput = target[:, 1:].reshape(-1)

    loss = custom_loss(currOutput, targetInput)

    step += 1
    optimizer.zero_grad()
    # Plain backward(): passing the loss as the `gradient` argument multiplied
    # every gradient by the current loss value.
    loss.backward()
    optimizer.step()

    if step % log_every == 0:
      print("Step", step, "Loss", loss.item())

  print("Step", step)
  print("Loss", loss)
  # One checkpoint per epoch, overwriting the previous one.
  torch.save(newModel.state_dict(), 'newModelCheckpoint.pth')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
torch.Size([4, 1024, 128])
torch.Size([4, 37])
Loss tensor(0.5138, device='cuda:0', grad_fn=<AddBackward0>)
18432 ['cY3g6N5Sokk', 'feC0L9MtghM', '-Umconw-CRE', 'A6HJBIU1rD0']
torch.Size([4, 1024, 128])
torch.Size([4, 48])
Loss tensor(0.6584, device='cuda:0', grad_fn=<AddBackward0>)
18433 ['YE2rN3xknlk', 'B_kAtTBUDIA', 'EkmHGd0U8yE', 'BVt8RgNrwbQ']
torch.Size([4, 1024, 128])
torch.Size([4, 22])
Loss tensor(0.4301, device='cuda:0', grad_fn=<AddBackward0>)
18434 ['A5fPSkTvjmY', 'PlQibWaPAcM', 'ymuRKv9iJm4', 'LAHWV6fZwUk']
torch.Size([4, 1024, 128])
torch.Size([4, 30])
Loss tensor(0.4928, device='cuda:0', grad_fn=<AddBackward0>)
18435 ['X-uVubaJ3II', '244y56-vLWE', 'nU7x170OvJ4', 'paeNnR33i5Q']
torch.Size([4, 1024, 128])
torch.Size([4, 45])
Loss tensor(0.6400, device='cuda:0', grad_fn=<AddBackward0>)
18436 ['cG1dpyC8gV4', 'n3X8RGZsGg4', 'sDoV3sMgDhE', 'lSb7Y-_3to8']
torch.Size([4, 1024, 128])
torch.Size([4, 34])
Loss tensor(0

# Testing

In [None]:
def dynamicPadding(batch, pad_token=2):
  """Right-pad every caption in ``batch["tokenizedAspectList"]`` to the longest one.

  Works for any number of captions in the batch (it no longer assumes the
  notebook-level ``batch_size``), including an empty batch.

  Args:
    batch: Mapping whose ``"tokenizedAspectList"`` entry is a list of token-id lists.
    pad_token: Id appended as padding; defaults to 2, the ``<PAD>`` id.

  Returns:
    The same ``batch`` object, with its caption lists padded in place.
  """
  captions = batch["tokenizedAspectList"]
  maxCaptionLen = max((len(caption) for caption in captions), default=0)
  for caption in captions:
    caption.extend([pad_token] * (maxCaptionLen - len(caption)))
  return batch

In [None]:
# Forward pass on the first test example. A batch of one needs no padding.
batch = ds_test[0:1]
input = torch.tensor(batch["input_values"]).squeeze(1).to(device)
target = torch.tensor(batch["tokenizedAspectList"]).to(device)
print(input.shape)
print(target.shape)

# Evaluate without dropout and without building an autograd graph, then put
# the model back into whichever mode it was in.
was_training = newModel.training
newModel.eval()
with torch.no_grad():
  modelOutput = newModel(input, target)
newModel.train(was_training)
# loadedOutput = loaded_model(input, target)

torch.Size([1, 1024, 128])
torch.Size([1, 14])


In [None]:
# Flatten the batch and sequence axes so each row holds the logits of one position.
tempOutput = modelOutput.view(-1, modelOutput.shape[2])
print(tempOutput.shape)

torch.Size([88, 5475])


In [None]:
print(modelOutput)
# Highest-scoring token id at every position of the first sequence.
# Stored under its own name: assigning it to `max` would shadow the builtin
# that CustomLoss.forward calls, breaking any later re-run of training.
predicted_ids = modelOutput[0].argmax(1)
print(predicted_ids)

tensor([[[-2.5003,  1.9018, -2.6463,  ..., -3.2833, -3.7127, -3.0394],
         [-2.3698,  3.4217, -1.9702,  ..., -3.4956, -2.8221, -2.8570],
         [-1.3004,  1.9933, -0.7634,  ..., -2.0791, -1.3730, -1.9847],
         ...,
         [-0.5852,  2.4742,  0.2530,  ..., -1.0860, -1.1115, -0.5978],
         [-2.2533,  5.3062, -1.8107,  ..., -2.6317, -3.4494, -3.4386],
         [-1.3028,  2.9477, -0.4578,  ..., -2.1496, -1.6670, -1.1011]]],
       device='cuda:0', grad_fn=<ViewBackward0>)
tensor([  4, 229, 783, 132, 571,  46, 200,  38, 229,  24, 313,  24,   1, 226],
       device='cuda:0')


In [None]:
def createSentence(input):
  """Map a sequence of token ids to the corresponding words of ``vocab_list``."""
  return [vocab_list[input[position]] for position in range(len(input))]

# Inference

In [None]:
beam_size = 2
# Start sequence: a single token id 0 (<SOS>) shaped [1, 1] on the model's device.
text = [0]
text = torch.tensor(text).unsqueeze(0).to(device)
hypotheses = [text]

# Same start sequence under a second name.
newText = torch.tensor([0]).unsqueeze(0).to(device)

currSeqPos = 0

In [None]:
import torch.nn.functional as F
class BeamSearchNode:
  """One step of a decoded hypothesis, linked to the step before it."""

  def __init__(self, prev_node, token_id, log_prob, length):
    # Parent link (None for the root) and the token chosen at this step.
    self.prev_node, self.token_id = prev_node, token_id
    # Accumulated score and number of nodes on the path so far.
    self.log_prob, self.length = log_prob, length


def top_p_sampling(logits, p):
  """Return the token ids that form the top-p (nucleus) candidate set.

  Tokens are taken in order of decreasing probability until their cumulative
  probability reaches ``p``. The token that crosses the threshold is included,
  so the set is never empty, even when the most likely token alone has
  probability >= ``p``.

  Args:
    logits: 1-D tensor of unnormalised scores.
    p: Cumulative-probability threshold.

  Returns:
    1-D tensor of token ids, most probable first.
  """
  sorted_logits, sorted_indices = torch.sort(logits, descending=True)
  sorted_probs = F.softmax(sorted_logits, dim=-1)
  cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
  # Probability mass accumulated *before* each token: a token is kept while
  # the tokens ranked above it have not yet reached p.
  mass_before = cumulative_probs - sorted_probs
  keep = mass_before < p
  keep[..., 0] = True  # the top token always qualifies
  return sorted_indices[keep]


def sample_from_subset(indices_to_sample, logits):
  """Draw one position within ``indices_to_sample``.

  The logits of the candidate ids are renormalised with a softmax and one
  position is sampled from that distribution. The return value is the index
  into ``indices_to_sample``, not the token id itself.
  """
  candidate_probs = F.softmax(logits[indices_to_sample], dim=-1)
  print("Multinomial_input shape", candidate_probs.shape)
  drawn = torch.multinomial(candidate_probs, 1)
  return drawn.item()


def top_p_sample(logits, prevText, p, filter):
  """Sample the next token from the top-p set, avoiding recently generated tokens.

  Args:
    logits: 1-D tensor of scores for the next token.
    prevText: [1, length] tensor of the token ids generated so far.
    p: Cumulative-probability threshold passed to ``top_p_sampling``.
    filter: Number of most recent tokens that should not be repeated.

  Returns:
    0-dim tensor holding the chosen token id.
  """
  candidates = top_p_sampling(logits, p)

  # Clamp at 0: with fewer than `filter` tokens so far, a negative start would
  # select only the tail of the history instead of all of it.
  window_start = max(prevText.shape[1] - filter, 0)
  token_history = prevText[0][window_start:].to(candidates.device)

  # Remove the recent tokens up front rather than resampling after a hit; a
  # resample could land on another recent token or empty the candidate set.
  if candidates.numel() > 0 and token_history.numel() > 0:
    is_recent = (candidates.unsqueeze(1) == token_history.unsqueeze(0)).any(dim=1)
    if not bool(is_recent.all()):
      candidates = candidates[~is_recent]
    # If every candidate is recent, allow a repetition rather than failing.

  if candidates.numel() == 0:
    # Nothing to sample from: fall back to the single most likely token.
    return logits.argmax()

  sampled_index = sample_from_subset(candidates, logits)
  return candidates[sampled_index]


def beam_search_node(newModel, input, newText, max_length=10, num_beams=2):
  """Generate token sequences by repeatedly sampling the next token.

  Every hypothesis is extended by one token drawn with top-p sampling from the
  temperature-scaled logits of its last position. After each round the
  hypotheses are ranked by accumulated log-probability and the best
  ``num_beams`` are kept.

  NOTE(review): each hypothesis produces exactly one child per round, so with a
  single start sequence the beam never holds more than one hypothesis.

  Args:
    newModel: Callable ``(input, tokens) -> logits`` shaped [1, length, vocab].
    input: Encoder input for one example; its device is used for the token tensors.
    newText: Start tokens (for example ``tensor([[0]])`` for <SOS>).
    max_length: Maximum number of nodes per hypothesis, start node included.
    num_beams: Number of hypotheses kept after each round.

  Returns:
    One list per surviving hypothesis holding the ``token_id`` of every node
    from the start node to the last (the first element is ``newText`` itself).
  """
  temperature = 1.25
  initial_node = BeamSearchNode(prev_node=None, token_id=newText, log_prob=0.0, length=1)
  beam = [initial_node]

  for _ in range(max_length):
    new_beam = []
    for node in beam:
      if node.length >= max_length:
        new_beam.append(node)
        continue

      # Rebuild the full prefix by walking back to the start node. Each node
      # contributes its own token(s) exactly once, so the start token is no
      # longer duplicated on the first step.
      pieces = []
      cursor = node
      while cursor is not None:
        pieces.append(torch.as_tensor(cursor.token_id, device=input.device).reshape(-1))
        cursor = cursor.prev_node
      prevText = torch.cat(pieces[::-1]).unsqueeze(0)

      with torch.no_grad():
        preds = newModel(input, prevText)
        # The next token is predicted by the LAST position of the prefix;
        # position 0 only ever sees the start token.
        scaled_logits = preds[0][-1] / temperature
        next_token = top_p_sample(scaled_logits, prevText, p=0.8, filter=3)
        log_probs = F.log_softmax(scaled_logits, dim=-1)

      new_beam.append(BeamSearchNode(
          prev_node=node,
          token_id=next_token,
          # Accumulate log-probabilities (not raw logits) so that scores of
          # different hypotheses are comparable.
          log_prob=node.log_prob + log_probs[next_token].item(),
          length=node.length + 1,
      ))

    beam = sorted(new_beam, key=lambda x: x.log_prob, reverse=True)[:num_beams]

  output_sequences = []
  for node in beam:
    output_sequence = []
    while node:
      output_sequence.append(node.token_id)
      node = node.prev_node
    output_sequence.reverse()
    output_sequences.append(output_sequence)
  return output_sequences


# Generate up to 30 tokens for the test example currently held in `input`, starting from <SOS>.
gen = beam_search_node(newModel, input, newText, max_length=30, num_beams=2)

Prev text tensor([[0, 0]], device='cuda:0')
_ 0
node length 1
node token id tensor([[0]], device='cuda:0')
New beam len 0
beam len 1
Text input shape torch.Size([1, 2])
Pred shape torch.Size([1, 2, 4308])
torch.Size([4308])
tensor([0.0831, 0.1621, 0.2254,  ..., 1.0000, 1.0000, 1.0000], device='cuda:0')
Indices_to_sample tensor([   4,   10,   74,  ..., 4101,  711, 3050], device='cuda:0') len torch.Size([1049])
Multinomial_input shape torch.Size([1049])
History tensor([0], device='cuda:0')
Log probs shape torch.Size([4308])
tensor([-2.4876, -2.5386], device='cuda:0')
Prev text tensor([[0, 4]], device='cuda:0')
_ 1
node length 2
node token id tensor(4, device='cuda:0')
New beam len 0
beam len 1
Text input shape torch.Size([1, 2])
Pred shape torch.Size([1, 2, 4308])
torch.Size([4308])
tensor([0.0831, 0.1621, 0.2254,  ..., 1.0000, 1.0000, 1.0000], device='cuda:0')
Indices_to_sample tensor([   4,   10,   74,  ..., 4101,  711, 3050], device='cuda:0') len torch.Size([1049])
Multinomial_input s

In [None]:
# Show the start tokens, the generated sequences and the length of the first sequence.
print(newText)
# print(hypotheses)
print(gen)
print(len(gen[0]))

tensor([[0]], device='cuda:0')
[[tensor([[0]], device='cuda:0'), tensor(104, device='cuda:0'), tensor(104, device='cuda:0'), tensor(506, device='cuda:0'), tensor(104, device='cuda:0'), tensor(104, device='cuda:0'), tensor(66, device='cuda:0'), tensor(66, device='cuda:0'), tensor(651, device='cuda:0'), tensor(76, device='cuda:0'), tensor(48, device='cuda:0'), tensor(76, device='cuda:0'), tensor(76, device='cuda:0'), tensor(100, device='cuda:0'), tensor(4, device='cuda:0'), tensor(104, device='cuda:0'), tensor(66, device='cuda:0'), tensor(76, device='cuda:0'), tensor(4, device='cuda:0'), tensor(4, device='cuda:0'), tensor(66, device='cuda:0'), tensor(18, device='cuda:0'), tensor(104, device='cuda:0'), tensor(76, device='cuda:0'), tensor(104, device='cuda:0'), tensor(104, device='cuda:0'), tensor(66, device='cuda:0'), tensor(507, device='cuda:0'), tensor(934, device='cuda:0'), tensor(104, device='cuda:0')]]
30


In [None]:
# Hand-entered token ids, decoded back to words and printed next to the
# reference aspect list of the first test example.
tokenizedInput = [  0,    4,  427,   28,   74,    4,   13,  270,  558,   10,   64,  199,
         2215, 1158,   41,  269,  692,  229,    4, 2174,  233, 2084, 2095,  557,
           95,  744,  520, 4229, 2037]
outputSentence = createSentence(tokenizedInput)
print(outputSentence)
print(ds_test[0]["aspect_list"])

['<SOS>', 'low', 'scuffling', 'instrumental', 'live', 'low', 'piano', 'loud', 'symphonic', 'female', 'unrelated', 'shrutibox', 'tap', 'swing', 'jazzy', 'harp', 'indian', 'classical', 'low', 'diy', 'joyful', 'climax', 'snapping', 'opera', 'kids', 'acapella', 'bad', 'benign', 'hindu']
['percussion', 'music', 'no', 'other', 'instruments', 'no', 'voices', 'instrumental', 'advertisement', 'music', 'promotional', 'music']
