<a href="https://colab.research.google.com/github/JTStephens18/AudioTranscriptor/blob/main/V8_VisionEncDec_audioProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Change log:
V8 aims to use the aspect list vs the caption in an attempt to reduce the inference stage from only predicting the most common tokens used in an English sentence such as ".", "the", and "a".

# Installation / Setup

In [None]:
! pip install yt-dlp
# Install huggingface audio datasets
! pip install datasets[audio]
! pip install transformers evaluate jiwer
!pip install accelerate -U
!pip install nltk



In [None]:
from datasets import load_dataset, Audio, Dataset, concatenate_datasets
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor, AutoTokenizer, AutoProcessor, ASTFeatureExtractor
import subprocess
import os
from pathlib import Path
import torch
import torch.nn as nn
import torchaudio
import soundfile as sf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import librosa
import re
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
ds = load_dataset('google/MusicCaps', split="train")
ds

Dataset({
    features: ['ytid', 'start_s', 'end_s', 'audioset_positive_labels', 'aspect_list', 'caption', 'author_id', 'is_balanced_subset', 'is_audioset_eval'],
    num_rows: 5521
})

In [None]:
ds = ds.remove_columns(["author_id", "is_balanced_subset", "is_audioset_eval", "audioset_positive_labels"])

# Tokenize the dataset

In [None]:
from nltk.tokenize import word_tokenize
def process_aspect_list(example):
  newList = []
  aspect_list = example["aspect_list"]
  for word in word_tokenize(aspect_list):
    cleaned_word = re.sub(r'[\'\[\]\(\),]', ' ', word)
    val = re.findall(r'\w+|[^\w\s]', cleaned_word)
    if(len(val) > 0):
      newList.append(val[0])
  example["aspect_list"] = newList
  return example


ds = ds.map(
    process_aspect_list,
    num_proc=4,
    writer_batch_size=1000,
    keep_in_memory=False,
)

In [None]:
ds[0]["aspect_list"]

['low',
 'quality',
 'sustained',
 'strings',
 'melody',
 'soft',
 'female',
 'vocal',
 'mellow',
 'piano',
 'melody',
 'sad',
 'soulful',
 'ballad']

In [None]:
import re
from nltk.tokenize import word_tokenize
# vocab_list = ["word 1", "word 2",  ..., "word N"]
vocab_list = ["<SOS>", "<EOS>", "<PAD>", "<UNK>"]
max_len = 0
vocab_count = {}
maxLenItem = 0

# Need to split on punctuation
# I see some tokens such as "funk/pop" and singing.the - These should be separate

def split_word(word):
  # Removes brackets, commas, and single quotes from the corpus
  word = re.sub(r'[\'\[\]\(\),]', '', word)
  return re.findall(r'\w+|[^\w\s]', word)



for i in range(ds.num_rows):
  aspect_list = ds[i]["aspect_list"]
  if(len(aspect_list) > max_len):
    max_len = len(aspect_list)+1
    maxLenItem = i
  for j in range(len(aspect_list)):
    word = aspect_list[j].lower()
    if(word) not in vocab_list:
      vocab_list.append(word)
      vocab_count[word] = 1
    else:
      vocab_count[word] += 1


# for i in range(ds.num_rows):
#   caption = ds[i]["aspect_list"]
#   captionSplit = []
#   # captionSplit = word_tokenize(ds[i]["caption"])
#   for word in word_tokenize(caption):
#     # word = word.replace(".", "")
#     captionSplit.extend(split_word(word))
#   if(len(captionSplit) > max_len):
#     max_len = len(captionSplit)+1
#   for j in range(len(captionSplit)):
#     word = captionSplit[j].lower()
#     if(word) not in vocab_list:
#       vocab_list.append(word)
#       vocab_count[word] = 1
#     else:
#       vocab_count[word] += 1

# vocab_dict = { "word": index }
vocab_dict = {}
for i in range(len(vocab_list)):
  vocab_dict[vocab_list[i]] = i

In [None]:
sortedDict = dict(sorted(vocab_count.items(), key=lambda item: item[1]))
print(sortedDict)

{'direct': 1, 'input': 1, 'suspended': 1, 'shuffling': 1, 'okay': 1, 'trills': 1, 'ornamentation': 1, 'thumps': 1, 'transposed': 1, 'canon': 1, 'mellody': 1, 'doorsteps': 1, 'improvement': 1, 'lip': 1, 'smack': 1, 'windpipe': 1, 'barbershop': 1, 'seconds': 1, 'zurna': 1, 'halay': 1, 'hyper': 1, 'exploring': 1, 'territory': 1, 'parts': 1, 'tireless': 1, 'antique': 1, 'knives': 1, 'hopeless': 1, 'parole': 1, 'imploring': 1, 'forgive': 1, 'enlightening': 1, 'nirvana': 1, 'endless': 1, 'catchybeat': 1, 'baraban': 1, 'distorting': 1, 'stumping': 1, 'ukrainian': 1, 'hips': 1, 'emphasisengaging': 1, 'carribean': 1, 'drumset': 1, 'hamster': 1, 'flash': 1, 'pianos': 1, 'circuit': 1, 'lightening': 1, 'electricity': 1, 'about': 1, 'olden': 1, 'squeal': 1, 'turntablism': 1, 'throat': 1, 'coordinated': 1, 'propeller': 1, 'drill': 1, 'cimbalom': 1, 'desi': 1, 'guitark': 1, 'eldery': 1, 'trimmings': 1, 'mixing': 1, 'timid': 1, 'cheerleaders': 1, 'sweeps': 1, 'provence': 1, 'repertoire': 1, 'stars': 1

In [None]:
def testToken(input):
  output = []
  input = word_tokenize(input.lower())
  for i in range(len(input)):
    if(input[i]) in vocab_dict:
      index = vocab_dict[input[i]]
      output.append(index)
    else:
      output.append("<UNK>")
  return output

In [None]:
testVal = testToken(ds[500]["aspect_list"][2:-2])
print(testVal)
print(ds[500]["aspect_list"][2:-2])
print(max_len)
print(vocab_list[69])

[94, 299, 532, 16, '<UNK>', 177, 532, 16, '<UNK>', 41, 42, 1520, 532, 16, '<UNK>', 42, 168, 20, 169, 532, 16, '<UNK>', 23, 180, 532, 16, '<UNK>', 532, 16, '<UNK>', 532, 16, '<UNK>', 532, 16, '<UNK>', 532, 16, '<UNK>', 532, 16, '<UNK>', 532, 16, '<UNK>', 532, 16, '<UNK>', 532, 16, '<UNK>', 532, 16, '<UNK>', 334, 532, 16, '<UNK>', 147, 532, 16, '<UNK>', 720, 532, 16, '<UNK>', 72, 42]
male vocalist', 'energetic drumming', 'loud electric guitar feedback', 'electric guitar lead and harmony', 'enthusiastic vocal backup', 'youthful', 'enthusiastic', 'energetic', 'vibrant', 'boisterous', 'voracious', 'intense', 'passionate', 'metal', 'hard rock', 'rock music', 'heavy metal', 'electric bass guitar
153
shots


In [None]:
def data_search(id):
  for i in range(ds.num_rows):
    if(ds[i]["ytid"] == id):
      print(i)
      return ds[i]

item = data_search("qsRPTMXFGsA")
print(item)

4829
{'ytid': 'qsRPTMXFGsA', 'start_s': 210, 'end_s': 220, 'aspect_list': ['male', 'latin', 'singer', 'bad', 'quality', 'audio', 'latin', 'dance', 'band', 'deteriorating', 'audio', 'quality', 'groovy', 'dance', 'hits', 'retro', 'latin', 'hits', 'medium', 'tempo', 'bachata', 'vocal', 'echoes', 'latin', 'percussions', 'guitar', 'lead', 'keyboard', 'tones', 'enthusiastic', 'emotional', 'passionate', 'romantic', 'retro', 'latin', 'hits', 'latin', 'pop', 'hits', 'people', 'dancing', 'passionate', 'romantic', 'groovy', 'bass', 'line', 'latin', 'love', 'hits', 'wedding', 'music', 'people', 'dancing', 'latin', 'dance', 'hits', 'latin', 'dance', 'music'], 'caption': 'A male Latin singer sings this passionate melody. The song is medium tempo with various Latin percussions, groovy bass line, keyboard harmony and a guitar playing lead. The song is romantic and a classic Latin dance groove. The song is a Latin dance hit with a deteriorating audio quality.'}


In [None]:
print(maxLenItem)
print(ds[4829])
print(len(ds))

2059
{'ytid': 'qsRPTMXFGsA', 'start_s': 210, 'end_s': 220, 'aspect_list': "['male latin singer', 'bad quality audio', 'latin dance band', 'deteriorating audio quality', 'groovy dance hits', 'retro latin hits', 'medium tempo', 'bachata', 'vocal echoes', 'latin percussions', 'guitar lead', 'keyboard tones', 'enthusiastic', 'emotional', 'passionate', 'romantic', 'retro latin hits', 'latin pop hits', 'people dancing', 'passionate', 'romantic', 'groovy bass line', 'latin love hits', 'wedding music', 'people dancing', 'latin dance hits', 'latin dance music']", 'caption': 'A male Latin singer sings this passionate melody. The song is medium tempo with various Latin percussions, groovy bass line, keyboard harmony and a guitar playing lead. The song is romantic and a classic Latin dance groove. The song is a Latin dance hit with a deteriorating audio quality.'}
5521


TypeError: ignored

In [None]:
ds = ds.train_test_split(test_size=0.2)

In [None]:
feature_extractor = ASTFeatureExtractor()
feature_extractor

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -4.2677393,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 4.5689974
}

In [None]:
def download_clip(
    video_id,
    output_filename,
    start_time,
    end_time,
    tmp_dir='/musiccaps',
    num_attempts=5,
    url_base='https://www.youtube.com/watch?v='
):

  status = False
  command = f"""
        yt-dlp --quiet --no-warnings -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" {url_base}{video_id} --force-keyframes-at-cuts
    """.strip()
  attempts = 0
  while True:
    try:
      # output = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
      output = os.system(command)
    except subprocess.CalledProcess.Error as err:
      attempts += 1
      if attempts == num_attempts:
        return status, err.output
    else:
      break

  # Check if video was successfully saved
  status = os.path.exists(output_filename)
  return status, 'Downloaded'


def toUppercase(aspect_list):
  a_list = aspect_list[1:-1]
  new_list = ''
  for word in a_list:
    new_list = ''.join(a_list).upper().replace(',', '')
    # new_list = new_list.replace(' ', '|')
    if(len(new_list) > 128):
      new_list = new_list[:128]
  return new_list

# Convert the words into tokens
def tokenizeCaption(caption):
  output = [vocab_dict["<SOS>"]]
  input = word_tokenize(caption.lower())
  for i in range(len(input)):
    if(input[i]) in vocab_dict:
      index = vocab_dict[input[i]]
      output.append(index)
    else:
      output.append(vocab_dict['<UNK>'])
  output.append(vocab_dict["<EOS>"])
  # Max_len -2 because I'm appending additional start and end tokens
  # for j in range(len(input), max_len-2):
  #  output.append(vocab_dict["<PAD>"])
  return output


def tokenizeAspectList(aspect_list):
  output = [vocab_dict["<SOS>"]]
  for i in range(len(aspect_list)):
    if(aspect_list[i] in vocab_dict):
      index = vocab_dict[aspect_list[i]]
      output.append(index)
    else:
      output.append(vocab_dict["<UNK>"])
  output.append(vocab_dict["<EOS>"])
  return output


def process(example):
  output_path = str(data_dir / f"{example['ytid']}.wav")
  status = True
  # aspect_string = toUppercase(example['aspect_list'])
  if not os.path.exists(output_path):
    status = False
    status, log = download_clip(
        example['ytid'],
        output_path,
        example['start_s'],
        example['end_s'],
    )

  # example["tokenizedCaption"] = tokenizeCaption(example["caption"])
  example["tokenizedAspectList"] = tokenizeAspectList(example["aspect_list"])
  # example['aspect_string'] = aspect_string
  example["audio"] = output_path
  example['download_status'] = status
  if(example["ytid"] == "qsRPTMXFGsA"):
    example["download_status"] = False
  example["image_path"] = f'./spectrograms/{example["ytid"]}.png'
  return example

In [None]:
def stereo_to_mono(wav):
  chan_1 = wav[0][:]
  chan_2 = wav[1][:]
  mono = (chan_1 + chan_2) / 2
  return mono

In [None]:
def resample_waveform(example):
  filepath = example["audio"]
  y, sr = librosa.load(filepath, sr=16000)
  sf.write(filepath, y, sr)
  waveform, sampling_rate = torchaudio.load(filepath)
  if(waveform.shape[0] == 2):
    waveform = stereo_to_mono(waveform)

  example["waveform"] = waveform
  return example

In [None]:
def wavToInput(example):
  wav = np.asarray(example["waveform"])
  inputs = feature_extractor(wav, sampling_rate=sampling_rate, padding="max_length", return_tensors="pt").input_values
  example["input_values"] = inputs
  example["inputs_shape"] = inputs.shape
  return example

In [None]:
samples_to_load = 4000
cores = 4
sampling_rate = 16000
writer_batch_size = 1000
data_dir = "./music_data"
upper_limit = 5521
lower_limit = 5000

data_dir = Path(data_dir)
data_dir.mkdir(exist_ok=True, parents=True)

# ds = ds.select(range(lower_limit, upper_limit))

# ds = ds.map(
#     process,
#     num_proc=cores,
#     writer_batch_size=writer_batch_size,
#     keep_in_memory=False
# )


# ds = ds.filter(lambda ex: ex["download_status"] == True)

ds_train = ds["train"].select(range(samples_to_load))
ds_test = ds["test"].select(range(int(samples_to_load*0.2)))

ds_train = ds_train.map(
    process,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
    # batched=True,
)

ds_train = ds_train.filter(lambda ex: ex["download_status"] == True)
# ds_train = ds_train.cast_column("audio", Audio(sampling_rate=sampling_rate))
ds_train = ds_train.map(
    resample_waveform,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
)

ds_train = ds_train.map(
    wavToInput,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
)

ds_test = ds_test.map(
    process,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
    # batched=True,
)

ds_test = ds_test.filter(lambda ex: ex["download_status"] == True)
# ds_test = ds_test.cast_column("audio", Audio(sampling_rate=sampling_rate))
ds_test = ds_test.map(resample_waveform, num_proc=cores, writer_batch_size=writer_batch_size, keep_in_memory=False)
ds_test = ds_test.map(
    wavToInput,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3938 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3938 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/800 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/786 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/786 [00:00<?, ? examples/s]

# Create DataLoader

In [None]:
from torch.utils.data import DataLoader

batch_size = 2

train_loader = DataLoader(ds_train, batch_size=batch_size, shuffle=True)

In [None]:
train_data = next(iter(train_loader))
print(train_data["caption"][0])

Here we have a slow piano piece played in a major key. The peace feels calm and happy.


In [None]:
  for i, data in enumerate(train_loader):
    input_values = data["input_values"]
    test = input_values[0]
    print(test[0])
    # input_tensor = torch.tensor(input_values)
    print(type(input_values))
    tensor = torch.tensor(input_values)
    [t.size() for t in data["input_values"]]
    input = torch.tensor(data["input_values"]).squeeze(1)
    print("Input shape", input.shape)
    target = torch.tensor(data["tokenzedCaption"])
    print("Target shape", target.shape)

[tensor([-0.6306, -0.9950], dtype=torch.float64), tensor([-1.0666, -1.2776], dtype=torch.float64), tensor([-0.6898, -1.1508], dtype=torch.float64), tensor([-1.2776, -1.2776], dtype=torch.float64), tensor([-0.2374, -0.9357], dtype=torch.float64), tensor([-0.5634, -1.2617], dtype=torch.float64), tensor([ 0.0054, -0.6809], dtype=torch.float64), tensor([-0.2595, -0.9754], dtype=torch.float64), tensor([ 0.0394, -0.7316], dtype=torch.float64), tensor([-0.2751, -0.8052], dtype=torch.float64), tensor([-0.1032, -0.6333], dtype=torch.float64), tensor([-0.6595, -0.7263], dtype=torch.float64), tensor([-0.6062, -0.6730], dtype=torch.float64), tensor([ 0.0416, -0.9412], dtype=torch.float64), tensor([-0.0257, -1.0084], dtype=torch.float64), tensor([ 0.2385, -0.9479], dtype=torch.float64), tensor([ 0.1090, -1.0531], dtype=torch.float64), tensor([ 0.1664, -0.9936], dtype=torch.float64), tensor([-0.0412, -0.6724], dtype=torch.float64), tensor([-0.1746, -0.4530], dtype=torch.float64), tensor([-0.4436, -0

ValueError: ignored

# AST Training

In [None]:
from transformers import AutoModel
model = AutoModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [None]:
print(model)

In [None]:
class MultiHeadAttentionLayer(nn.Module):
  def __init__(self, hid_dim, n_heads, dropout, device):
    super().__init__()

    assert hid_dim % n_heads == 0

    self.hid_dim = hid_dim
    self.n_heads = n_heads
    self.head_dim = hid_dim // n_heads

    self.fc_q = nn.Linear(hid_dim, hid_dim)
    self.fc_k = nn.Linear(hid_dim, hid_dim)
    self.fc_v = nn.Linear(hid_dim, hid_dim)

    self.fc_o = nn.Linear(hid_dim, hid_dim)

    self.dropout = nn.Dropout(dropout)
    self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

  def forward(self, query, key, value, mask = None):

    batch_size = query.shape[0]

    # query = [batch_size, query len, hid_dim]
    # key = [batch_size, key len, hid_dim]
    # value = [batch_size, value len, hid_dim]

    Q = self.fc_q(query)
    K = self.fc_k(query)
    V = self.fc_v(query)

    # Q = [batch_size, query len, hid_dim]
    # K = [batch_size, key len, hid_dim]
    # V = [batch_size, value len, hid_dim]

    Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)
    K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)
    V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)

    # Q = [batch_size, n heads, query len, hid_dim]
    # K = [batch_size, n heads, key len, hid_dim]
    # V = [batch_size, n heads, value len, hid_dim]

    energy = torch.matmul(Q, K.permute(0,1,3,2)) / self.scale

    # energy = [batch_size, n heads, query len, key len]

    if mask is not None:
      energy = energy.masked_fill(mask == 0, -1e10)

    attention = torch.softmax(energy, dim=-1)

    # attention = [batch_size, n_heads, query len, key len]

    x = torch.matmul(self.dropout(attention), V)

    # x = [batch_size, n_heads, query len, head_dim]

    x = x.permute(0,2,1,3).contiguous()

    # x = [batch_size, query len, n_heads, head_dim]

    x = x.view(batch_size, -1, self.hid_dim)

    # x = [batch_size, query len, hid dim]

    x = self.fc_o(x)

    # x = [batch_size, query len, hid dim]

    return x, attention

# Transformer Decoder Implementation

In [None]:
# Create a starting output state
  # Pass this into an output embedding (size of 768)
  # Use positional encoding
  # Pass this into a masked multi-head attention mechanism
  # Add and normalize

# Take the output from stage 1 and combine it with the output from the encoder
  # Multi-head attention
  # Add and normalize

# Feed forward
  # Add and normalize

# Linear activation function
# Softmax



In [None]:
class TransformerBlock(nn.Module):
  def __init__(self, embed_size, heads, dropout, forward_expansion):
    super(TransformerBlock, self).__init__()
    self.attention = nn.MultiheadAttention(embed_size, heads).to(device)
    # self.attention = MultiHeadAttentionLayer(embed_size, heads, dropout, device)
    self.norm1 = nn.LayerNorm(embed_size)
    self.norm2 = nn.LayerNorm(embed_size)

    self.feed_forward = nn.Sequential(
        nn.Linear(embed_size, forward_expansion*embed_size),
        nn.ReLU(),
        nn.Linear(forward_expansion*embed_size, embed_size)
    )
    self.dropout = nn.Dropout(dropout)

  def forward(self, value, key, query, mask):
    # attention, _ = self.attention(query, key, value, mask)
    # Shouldn't need a mask to ignore padding on inputs since padding is not needed for inputs
    attention, _ = self.attention(query, key, value)

    x = self.dropout(self.norm1(attention + query))
    forward = self.feed_forward(x)
    out = self.dropout(self.norm2(forward + x))
    return out

In [None]:
class DecoderBlock(nn.Module):
  def __init__(self, embed_size, heads, forward_expansion, dropout, device, max_length):
    super(DecoderBlock, self).__init__()
    self.attention = nn.MultiheadAttention(embed_size, heads).to(device)
    # self.attention = MultiHeadAttentionLayer(embed_size, heads, dropout, device)
    self.norm = nn.LayerNorm(embed_size)
    self.transformer_block = TransformerBlock(
        embed_size, heads, dropout, forward_expansion
    )
    self.dropout = nn.Dropout(dropout)
    # self.feed_forward = nn.Sequential(
    #     # Target shape[1] to forward_expansion*embed_size
    #     nn.Linear(max_length, forward_expansion*embed_size),
    #     nn.ReLU(),
    #     nn.Linear(forward_expansion*embed_size, max_length)
    # )

  def forward(self, x, value, key, src_mask, trg_mask):
    # A feed forward connection adds other parameters so there is an additional case to learn if needed
    # x = self.feed_forward(x)
    attention, _ = self.attention(x, x, x, trg_mask)
    query = self.dropout(self.norm(attention + x))
    # query = self.norm(x + self.dropout(attention))
    out = self.transformer_block(value, key, query, src_mask)
    return out

In [None]:
class Decoder(nn.Module):
  def __init__(self, trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length):
    super(Decoder, self).__init__()
    self.device = device
    self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
    self.position_embedding = nn.Embedding(max_length, embed_size)
    # self.position_embedding = nn.Parameter(torch.randn(max_length, embed_size))

    self.layers = nn.ModuleList(
        [DecoderBlock(embed_size, heads, forward_expansion, dropout, device, max_length)
        for _  in range(num_layers)]
    )

    self.fc_out = nn.Linear(embed_size, trg_vocab_size)
    self.dropout = nn.Dropout(dropout)
    self.softmax = nn.Softmax(dim=0)
    self.scale = torch.sqrt(torch.FloatTensor([embed_size])).to(device)

  def forward(self, x, enc_out, src_mask, trg_mask):
    N, seq_length = x.shape
    pos = torch.arange(0, seq_length).unsqueeze(0).repeat(N, 1).to(self.device)
    # x = self.dropout(self.word_embedding(x) + self.position_embedding(pos))
    x = self.dropout((self.word_embedding(x) * self.scale) + self.position_embedding(pos))
    # positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
    # positions = position_embedding[:seq_length]
    # x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))
    # x = self.dropout((self.word_embedding(x) + self.position_embedding[:seq_length]))

    for layer in self.layers:
      # x = [N, seq_length, embed_size]
      x = layer(x, enc_out, enc_out, src_mask, trg_mask)

    out = self.fc_out(x)
    # out = self.softmax(out)
    return out


In [None]:
class Instantiate(nn.Module):
  def __init__(self):
    super(Instantiate, self).__init__()

  def forward(self, x):
    return x

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
src_vocab_size = 1024 # max_length of feature extractor - not even used
trg_vocab_size = len(vocab_list)
embed_size = 768
num_layers = 12
heads = 12
forward_expansion = 4
dropout = 0.1
device = device
max_length = max_len

src_pad_idx = 2
trg_pad_idx = 2

num_epochs = 20
# Number of training samples in the batch
batch_size = 4
learning_rate = 0.00001

In [None]:
model.layernorm = Instantiate()
# model.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, droput, device, max_length)

In [None]:
class Transformer(nn.Module):
  def __init__(self,
               src_vocab_size,
               trg_vocab_size,
               src_pad_idx,
               trg_pad_idx,
               model,
               embed_size=768,
               num_layers=12,
               heads=12,
               forward_expansion=4,
               dropout=0,
               device=device,
               max_length=max_len,
              ):
    super(Transformer, self).__init__()

    self.encoder = model
    self.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length)
    self.src_pad_idx = src_pad_idx
    self.trg_pad_idx = trg_pad_idx
    self.device = device

# Src mask is used so the encoder does not pay attention to the padding values appended to the input
  def make_src_mask(self, src):
    src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
    # (N, 1, 1, src_length)
    return src_mask.to(self.device)

  def make_trg_mask(self, trg):
    N, trg_len= trg.shape
    # trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
    #     N, 1, trg_len, trg_len
    # )
    trg_mask = torch.tril(torch.ones(trg_len, N))
    return trg_mask.to(self.device)


  def make_trg_mask_custom_attention(self, trg):
    # trg  = [batch_size, trg_len]
    trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)

    # trg_pad_mask = [batch_size, 1, 1, trg_len]

    trg_len = trg.shape[1]

    trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()

    # trg_sub_mask = [trg_len, trg_len]

    trg_mask = trg_pad_mask & trg_sub_mask

    #trg_mask = [batch_size, 1, trg len, trg len]

    return trg_mask

  def forward(self, src, trg):
    src_mask = self.make_src_mask(src)
    trg_mask = self.make_trg_mask(trg)
    # trg_mask = nn.Transformer.generate_square_subsequent_mask(max_length)

    enc_src = self.encoder(src)
    enc_out = enc_src["pooler_output"].unsqueeze(1).expand(-1, trg.shape[1], -1)
    out = self.decoder(trg, enc_out, src_mask, trg_mask)
    return out


In [None]:
newModel = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, model).to(device)
optimizer = torch.optim.Adam(newModel.parameters(), learning_rate)

In [None]:
from torchtext.data.metrics import bleu_score
criterion = nn.CrossEntropyLoss(ignore_index=vocab_dict["<PAD>"])
optimizer = torch.optim.Adam(newModel.parameters(), lr=learning_rate)

In [None]:
class CustomLoss(nn.Module):
  def __init__(self, pad_idx, eos_idx):
    super(CustomLoss, self).__init__()
    self.cross_entropy_loss = nn.CrossEntropyLoss(ignore_index=vocab_dict["<PAD>"])
    self.pad_idx = pad_idx
    self.eos_idx = eos_idx

  def forward(self, logits, targets):
     # Calculate standard cross-entropy loss
    ce_loss = self.cross_entropy_loss(logits, targets)

    # Penalize extra tokens beyond first occurence of <EOS>
    eos_positions = (targets == self.eos_idx).nonzero()
    if(eos_positions.numel() > 0):
      # Only consider the first occurence of <EOS>
      first_eos_pos = eos_positions[0, :]

      # Count tokens beyond the first <EOS> position
      if first_eos_pos.dim() > 0:
        # extra_tokens = torch.clamp(first_eos_pos - logits.size(1), min=0)
        extra_tokens = max(first_eos_pos - logits.size(1), 0)
      else:
        extra_tokens = 0

      penalty_weight = 0.1

      # Add penalty term to loss
      extra_tokens_penalty = penalty_weight * extra_tokens
      total_loss = ce_loss + extra_tokens_penalty
    else:
      total_loss = ce_loss

    return total_loss


pad_idx = 2
eos_idx = 1
custom_loss = CustomLoss(pad_idx, eos_idx)

In [None]:
# Training loop
batches_per_epoch = int(ds_train.num_rows / batch_size)
output = []
sentences = []
step = 0
for epoch in range(num_epochs):
  for i in range(batches_per_epoch):
    start = i * batch_size
    # print(start)
    xBatch = ds_train[start:start+batch_size]
    # xBatch = ds_train[i]
    # print(xBatch)
    waveform = xBatch["waveform"]
    id = xBatch["ytid"]
    print(step, id)
    input = torch.tensor(xBatch["input_values"]).squeeze(1).to(device)
    # input = torch.tensor(xBatch["input_values"]).to(device)

    # captions = xBatch["tokenizedCaption"]
    captions = xBatch["tokenizedAspectList"]
    maxCaptionLen = 0
    # Find max length of caption in batch
    for item in range(batch_size):
      if(len(captions[item]) > maxCaptionLen):
        maxCaptionLen = len(captions[item])

  # Add padding to the captions based on max len
    for j in range(batch_size):
      if(len(captions[j]) < maxCaptionLen):
        for k in range(len(captions[j]), maxCaptionLen):
          captions[j].append(2)

    # target = torch.tensor(xBatch["tokenizedCaption"]).to(device)
    target = torch.tensor(xBatch["tokenizedAspectList"]).to(device)
    # target = target.unsqueeze(0)
    print(input.shape)
    print(target.shape)
    currOutput = newModel(input, target[:, :-1])
    # currOutput = newModel(input, target)
    # print(currOutput)
    pred = currOutput[0].argmax(1)
    pred2 = currOutput[1].argmax(1)
    # print(predIdx)
    # output.append(currOutput)
    # print(target[0])

    # for j in range(len(pred)):
    #   if(pred[j] == 1):
    #     tempOutput = currOutput[0][:j]
    #     print(tempOutput.shape)
        # for k in range(j+1, len(pred)):
        #   currOutput[0][k][2] = 100

    # for j in range(len(pred2)):
    #   if(pred2[j] == 1):
    #     # for k in range(j+1, len(pred2)):
    #     #   currOutput[1][k][2] = 100

    # newMax = lossOutput.argmax(1)
    # print(newMax)

    # loss = bleu_score(predSen, targetSen)
    # loss = criterion(currOutput[0], target[0])
    # loss = criterion(currOutput.reshape(-1, currOutput.shape[2]), target[0].reshape(-1))
    currOutput = currOutput.view(-1, currOutput.shape[2])
    targetInput = target[:,1:].reshape(-1)

    # targetInput = target.contiguous().view(-1)
    # loss = criterion(currOutput, targetInput)
    loss = custom_loss(currOutput, targetInput)
    print("Loss", loss)

    step += 1
    # for name, param in newModel.named_parameters():
    #   if param.grad is not None:
    #     print(name, param.grad)
    optimizer.zero_grad()
    loss.backward(loss)
    # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()



  print("Step", step)
  print("Loss", loss)
  torch.save(newModel.state_dict(), 'newModelCheckpoint.pth')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
torch.Size([4, 1024, 128])
torch.Size([4, 37])
Loss tensor(0.5138, device='cuda:0', grad_fn=<AddBackward0>)
18432 ['cY3g6N5Sokk', 'feC0L9MtghM', '-Umconw-CRE', 'A6HJBIU1rD0']
torch.Size([4, 1024, 128])
torch.Size([4, 48])
Loss tensor(0.6584, device='cuda:0', grad_fn=<AddBackward0>)
18433 ['YE2rN3xknlk', 'B_kAtTBUDIA', 'EkmHGd0U8yE', 'BVt8RgNrwbQ']
torch.Size([4, 1024, 128])
torch.Size([4, 22])
Loss tensor(0.4301, device='cuda:0', grad_fn=<AddBackward0>)
18434 ['A5fPSkTvjmY', 'PlQibWaPAcM', 'ymuRKv9iJm4', 'LAHWV6fZwUk']
torch.Size([4, 1024, 128])
torch.Size([4, 30])
Loss tensor(0.4928, device='cuda:0', grad_fn=<AddBackward0>)
18435 ['X-uVubaJ3II', '244y56-vLWE', 'nU7x170OvJ4', 'paeNnR33i5Q']
torch.Size([4, 1024, 128])
torch.Size([4, 45])
Loss tensor(0.6400, device='cuda:0', grad_fn=<AddBackward0>)
18436 ['cG1dpyC8gV4', 'n3X8RGZsGg4', 'sDoV3sMgDhE', 'lSb7Y-_3to8']
torch.Size([4, 1024, 128])
torch.Size([4, 34])
Loss tensor(0

In [None]:
# Training loop
batches_per_epoch = int(ds_train.num_rows / batch_size)
output = []
sentences = []
step = 0
for epoch in range(num_epochs):
  for i, data in enumerate(train_loader):
    ytid = data["ytid"]
    print(step, ytid)
    input = torch.tensor(data["input_values"]).squeeze(1).to(device)
    print("Input shape", input.shape)
    target = torch.tensor(data["tokenzedCaption"]).to(device)
    print("Target shape", target.shape)
    modelOutput = newModel(input, target[:, :-1])
    lossInput = modelOutput.view(-1, modelOutput.shape[2])
    lossTarget = target[:,1:].reshape(-1)
    loss = criterion(lossInput, lossTarget)
    print("Loss", loss)
    step += 1
    optimizer.zero_grad()
    loss.backward(loss)
    optimizer.step()

0 ['zjZV0tvur2I', 'XaUXJG0BZuk']


ValueError: ignored

# Testing

In [None]:
print("Model's state_dict:")
for param_tensor in newModel.state_dict():
    print(param_tensor, "\t", newModel.state_dict()[param_tensor].size())

In [None]:
loaded_model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, model).to(device)
loaded_model.load_state_dict(torch.load("newModelCheckpoint.pth"))
loaded_model.eval()

In [None]:
# def dynamicPadding(batch):
#   captions = batch["tokenizedCaption"]
#   maxCaptionLen = 0
#   for i in range(batch_size):
#     if(len(captions[i]) > maxCaptionLen):
#       maxCaptionLen = len(captions[i])

#   for j in range(batch_size):
#     if(len(captions[j]) < maxCaptionLen):
#       for k in range(len(captions[j]), maxCaptionLen):
#         captions[j].append(2)

#   return batch


def dynamicPadding(batch):
  captions = batch["tokenizedAspectList"]
  maxCaptionLen = 0
  for i in range(batch_size):
    if(len(captions[i]) > maxCaptionLen):
      maxCaptionLen = len(captions[i])

  for j in range(batch_size):
    if(len(captions[j]) < maxCaptionLen):
      for k in range(len(captions[j]), maxCaptionLen):
        captions[j].append(2)

  return batch

In [None]:
input = torch.tensor(ds_test[0:1]["input_values"]).squeeze(1).to(device)

batch = ds_test[0:1]
# captions = batch["tokenizedCaption"]
# maxCaptionLen = 0

# for i in range(batch_size):
#   if(len(captions[i]) > maxCaptionLen):
#     maxCaptionLen = len(captions[i])

# for j in range(batch_size):
#   if(len(captions[j]) < maxCaptionLen):
#     for k in range(len(captions[j]), maxCaptionLen):
#       captions[j].append(2)

# dynamicPadding(batch)

# target = torch.tensor(batch["tokenizedCaption"]).to(device)
target = torch.tensor(batch["tokenizedAspectList"]).to(device)
# target = target.unsqueeze(0)
print(input.shape)
print(target.shape)
modelOutput = newModel(input, target)
# loadedOutput = loaded_model(input, target)

torch.Size([1, 1024, 128])
torch.Size([1, 14])


In [None]:
tempOutput = modelOutput.view(-1, modelOutput.shape[2])
print(tempOutput.shape)

torch.Size([88, 5475])


In [None]:
print(modelOutput)
max = modelOutput[0].argmax(1)
print(max)

tensor([[[-2.5003,  1.9018, -2.6463,  ..., -3.2833, -3.7127, -3.0394],
         [-2.3698,  3.4217, -1.9702,  ..., -3.4956, -2.8221, -2.8570],
         [-1.3004,  1.9933, -0.7634,  ..., -2.0791, -1.3730, -1.9847],
         ...,
         [-0.5852,  2.4742,  0.2530,  ..., -1.0860, -1.1115, -0.5978],
         [-2.2533,  5.3062, -1.8107,  ..., -2.6317, -3.4494, -3.4386],
         [-1.3028,  2.9477, -0.4578,  ..., -2.1496, -1.6670, -1.1011]]],
       device='cuda:0', grad_fn=<ViewBackward0>)
tensor([  4, 229, 783, 132, 571,  46, 200,  38, 229,  24, 313,  24,   1, 226],
       device='cuda:0')


In [None]:
lossOutput = modelOutput[0]
for i in range(len(max)):
  if(max[i] == 1):
    for j in range(i+1, len(max)):
      lossOutput[j][2] = 100

newMax = lossOutput.argmax(1)
print(newMax)

tensor([  0,   9,  94, 299,  40,  39, 416, 334,  11,  27,   4,  11,  87, 154,
        321,  96,  82,   4,  16, 170,  72,  42,  16,  91,  41,  42,  96,  20,
        156, 190,  16,  96, 534,  16,  87,  31, 180,  27,   4,  11,  87,  11,
         16, 416,  16,  42,  16,  20,  16,  11,  20, 219,  27,  39,  11,  87,
         27,   1,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2],
       device='cuda:0')


In [None]:
def createSentence(input):
  sentence = []
  for i in range(len(input)):
    sentence.append(vocab_list[input[i]])
  return sentence

In [None]:
outputSentence = createSentence(max)
print(outputSentence)
print(ds_test[0]["aspect_list"])

['low', 'classical', 'hypnotic', 'music', 'elements', 'acoustic', 'bansuri', 'singing', 'classical', 'tempo', 'uptempo', 'tempo', '<EOS>', 'easygoing']
['oriental', 'meditative', 'zitar', 'percussive', 'sounds', 'shrutibox', 'voices', 'chanting', 'slow', 'to', 'medium', 'tempo']


In [None]:
pred1 = modelOutput[0].argmax(1)
print(pred1)
for i in range(len(pred1)):
  if(pred1[i] == 1):
    tempOutput = modelOutput[0][:i]
    print(tempOutput.shape)
    break

pred2 = modelOutput[1].argmax(1)
print(pred2)

for j in range(len(pred2)):
  if(pred2[j] == 1):
    tempOutput2 = modelOutput[1][:j]
    print(tempOutput2.shape)
    break

tensor([ 39,  87,   9,   9, 238,   9,  96,   9,  27,   9,  27,   9,  96,  27,
          4,  94,  23,  87,   9,  20,   4,  94,  27,  20,   1,  11,  40, 238,
         77,   8,   3,   4,  27,  87,   4,  11,  87,  90,  91,  49,   4,   3,
        223,  27,   1,  27], device='cuda:0')
torch.Size([24, 5475])
tensor([ 39,  11,   6,   7,   8,   9,  96,  27,  87, 285,  77,   9,  96,   6,
         27,  20,   4,  87,  20,  20,   3,  27,  27,  20,  27,  27,  20,   9,
          4,   4,  27,   4,  11,  87,   9,  27,   4,  90,  91,  27,  27,   3,
         11,  27,  20,  27], device='cuda:0')


In [None]:
newTempOutput = torch.cat((tempOutput, tempOutput2))
print(newTempOutput.shape)
# currOutput = currOutput.view(-1, currOutput.shape[2])
# targetInput = target[:,1:].reshape(-1)

NameError: ignored

In [None]:
targetTest1 = [0, 1]
targetTest2 = [0, 1]
for i in range(151):
  targetTest1.append(2)
  targetTest2.append(2)

test1 = torch.tensor(targetTest1).unsqueeze(0).to(device)
test2 = torch.tensor(targetTest2).unsqueeze(0).to(device)
targetCaption = torch.cat((test1, test2))
print(targetCaption.shape)

torch.Size([2, 153])


In [None]:
testOutput = newModel(input, targetCaption)
print(testOutput)
max = testOutput[0].argmax(1)
print(max)

## Dynamic Padding

In [None]:
batch = ds_train[0:2]
captions = batch["tokenizedCaption"]
maxCaptionLen = 0

# Find max length of caption in batch
for i in range(batch_size):
  if(len(captions[i]) > maxCaptionLen):
    maxCaptionLen = len(captions[i])

for j in range(batch_size):
  if(len(captions[j]) < maxCaptionLen):
    for k in range(len(captions[j]), maxCaptionLen):
      captions[j].append(2)


target = torch.tensor(batch["tokenizedCaption"]).to(device)
print(len(batch["tokenizedCaption"][0]))
print(len(batch["tokenizedCaption"][1]))

78
78


In [None]:
!zip -r 'audioCaptionModel.zip' 'newModelCheckpoint.pth'

  adding: newModelCheckpoint.pth (deflated 7%)


In [None]:
from google.colab import files
files.download("newModelCheckpoint.pth")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files
files.download("audioCaptionModel.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Inference

In [None]:
# [<SOS> ... <PAD> ]
infTarg = [0]
for i in range(152):
  infTarg.append(2)
print(len(infTarg))

infTarg = torch.tensor(infTarg).unsqueeze(0).to(device)
infTarg = torch.cat((infTarg, infTarg))
print(infTarg.shape)

153
torch.Size([2, 153])


In [None]:
input = torch.tensor(ds_test[0:2]["input_values"]).squeeze(1).to(device)
modelOutput = newModel(input, infTarg)

# Branch out, rank, reduce, repeat
beam_size = 3

probabilities = modelOutput.clone()
hypothesis = []
score = []
for i in range(beam_size):
  pred = probabilities[0].argmax(1)
  hypothesis.append([pred])
  probabilities[0] = probabilities[0].remove(pred)


In [None]:
beam_size = 2
text = [0]
text = torch.tensor(text).unsqueeze(0).to(device)
hypotheses = [text]

newText = torch.tensor([0]).unsqueeze(0).to(device)

currSeqPos = 0

In [None]:
output = newModel(input,text)

In [None]:
# Code to slice an index from the output tensor
# output = newModel(input,text)
seqPos = 0;
predIdx = 4
# output[0][seqPos] = torch.cat((output[0][seqPos][:4], output[0][seqPos][4:]))
# output[0][seqPos][predIdx] = torch.cat((output[0][seqPos][:predIdx], output[0][seqPos][predIdx+1:]))
# output[0][seqPos] = tempOutput

# Can't set the output to a tensor with different values - in the case of slicing, we remove 1
# So we instead set that prediction to 0
print(output[0].argmax(1))
predIdx = output[0].argmax(1)
print(output[0][0][predIdx])
output[0][seqPos][predIdx] = 0
print(output[0][0][predIdx])

tensor([39], device='cuda:0')
tensor([7.6185], device='cuda:0', grad_fn=<IndexBackward0>)
tensor([0.], device='cuda:0', grad_fn=<IndexBackward0>)


In [None]:
newInputTensor = torch.tensor([5]).to(device)
newText = newText.squeeze(0).to(device)
newText = torch.cat((newText, newInputTensor))
# newText = newText.unsqueeze(0).to(device)

In [None]:
print(newText)

tensor([0, 5, 5, 5], device='cuda:0')


In [None]:
newText[0] = tempVal

RuntimeError: ignored

In [None]:
def beam_search(input, newText, currSeqPos, max_depth=100):

  if(currSeqPos >= max_depth):
    return

  newText = newText.unsqueeze(0).to(device)
  output = newModel(input, newText)
  # del probabilities
  # probabilities = output.detach().clone()
  if(currSeqPos != 0):
    currSeqPos += 1


  for i in range(beam_size):
    print(beam_size)
    # Provides values for each token in sequence - found via largest probability
    # prediction = probabilities[0].argmax(1)
    prediction = output[0].argmax(1)
    tokenIdx = prediction[currSeqPos]
    print(tokenIdx)
    newToken = tokenIdx.view(1)
    print(newToken)
    # Appends token to newText based on the position of the sequence
    # newText.append(tokenIdx)
    # newText[0] = torch.cat((newText[0], tokenIdx))
    newText = newText.squeeze(0).to(device)
    print(newText)
    newText = torch.cat((newText, newToken))
    # Remove the highest probability from the output to find the next highest probability for the next iteration of the beam length
    print(output[0][currSeqPos][tokenIdx])
    output[0][currSeqPos][tokenIdx] = 0
    print(output[0][currSeqPos][tokenIdx])
    # Calculate score
    # Delete current probabilities tensor to free up memory
    beam_search(input, newText, currSeqPos)

  # del probabilities

beam_search(input, newText, currSeqPos)

2
tensor(4, device='cuda:0')
tensor([4], device='cuda:0')
tensor([0], device='cuda:0')
tensor(7.9543, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0., device='cuda:0', grad_fn=<SelectBackward0>)
2
tensor(4, device='cuda:0')
tensor([4], device='cuda:0')
tensor([0, 4], device='cuda:0')
tensor(7.9543, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0., device='cuda:0', grad_fn=<SelectBackward0>)
2
tensor(4, device='cuda:0')
tensor([4], device='cuda:0')
tensor([0, 4, 4], device='cuda:0')
tensor(7.9543, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0., device='cuda:0', grad_fn=<SelectBackward0>)


OutOfMemoryError: ignored

In [None]:
def beam_search_iterative(input, newText, max_depth=10):
  for currSeqPos in range(max_depth):
    newText = newText.unsqueeze(0).to(device)
    print("New Text shape", newText.shape)
    output = newModel(input, newText)
    probabilities = output.detach().clone()

    for i in range(beam_size):
      prediction = probabilities[0].argmax(1)
      tokenIdx = prediction[currSeqPos]
      # print(tokenIdx)
      # if(tokenIdx == 1):
      #   break
      newToken = tokenIdx.view(1)
      newText = newText.squeeze(0).to(device)
      newText = torch.cat((newText, newToken))
      output[0][currSeqPos][tokenIdx] = 0

    del probabilities


beam_search_iterative(input, newText)

In [None]:
import collections
# Test cell to remove a number from a tensor
arr = [ 4,   35,   31, 2004, 2126,  197]
remove = 197
tensor = torch.tensor(arr).to(device)
filter=2
# print(tensor)
# index = ((tensor == remove).nonzero().squeeze())
# print(index)
# newTen = torch.cat((tensor[:index], tensor[index+1:]))
# print(newTen)
# test = collections.deque(tensor, maxlen=filter)
# print(test)

tenLen = tensor.shape[0]
print(tenLen)

indexes = tenLen - filter

test = tensor[indexes:]
print(test)
# test = tensor[0]
for i in range(len(test)):
  if(test[i] == remove):
    found = i
  else:
    found = 0


newIdx = indexes+found
print(tensor[newIdx])

6
tensor([2126,  197], device='cuda:0')
tensor(197, device='cuda:0')


In [None]:
import torch.nn.functional as F
class BeamSearchNode:
  def __init__(self, prev_node, token_id, log_prob, length):
    self.prev_node = prev_node
    self.token_id = token_id
    self.log_prob = log_prob
    self.length = length


def top_p_sampling(logits, p):
  # Sort logits in descending order
  sorted_logits, sorted_indices = torch.sort(logits, descending=True)
  print(sorted_logits.shape)
  # Calculate cumulative probabilities
  cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
  print(cumulative_probs)
  # Determine indices to sample
  indices_to_sample = sorted_indices[cumulative_probs < p]
  print("Indices_to_sample", indices_to_sample, "len", indices_to_sample.shape)
  return indices_to_sample


def sample_from_subset(indices_to_sample, logits):
  # Sample one index from the subset
  multinomial_input = F.softmax(logits[indices_to_sample], dim=-1)
  print("Multinomial_input shape", multinomial_input.shape)
  sampled_index = torch.multinomial(multinomial_input, 1)
  return sampled_index.item()


def top_p_sample(logits, prevText, p, filter):
  # Compute indices to sample from
  indices_to_sample = top_p_sampling(logits, p)
  # Sample from the subset
  sampled_index = sample_from_subset(indices_to_sample, logits)
  token = indices_to_sample[sampled_index]
  # Token filtering here
  # If token is in token history then remove it from indices to sample and sample from subset again
  filterWindow = prevText.shape[1] - filter
  token_history = prevText[0][filterWindow:]
  print("History", token_history)
  for i in range(len(token_history)):
    if(token_history[i] == token):
      print("history", token_history[i], "token", token)
      indices_to_sample = torch.cat((indices_to_sample[:sampled_index], indices_to_sample[sampled_index+1:]))
      sampled_index = sample_from_subset(indices_to_sample, logits)
      token = indices_to_sample[sampled_index]
      continue
  # indices_to_sample = torch.cat((indices_to_sample[:sampled_index], indices_to_sample[sampled_index+1:]))
  return token



# def node_to_array(node):
#   array = [node.token_id]
#   while(node.prev):
#     node = node.prev
#     array.append(node.token_id)
#   print(array)
#   return array


def beam_search_node(newModel, input, newText, max_length=10, num_beams=2):
  initial_node = BeamSearchNode(prev_node=None, token_id=newText, log_prob=0.0, length=1)
  beam = [initial_node]

  for _ in range(max_length):
    new_beam = []
    for node in beam:
      if node.length >= max_length:
        new_beam.append(node)
        continue

      tempNodes = node
      prevText = []
      while tempNodes.prev_node:
        prevText.append(tempNodes.prev_node.token_id)
        tempNodes = tempNodes.prev_node
      if(prevText == []):
        prevText.append(0)
      # Reverses the array
      prevText = prevText[::-1]
      prevText.append(node.token_id)
      prevText = torch.tensor(prevText).unsqueeze(0).to(device)
      print("Prev text", prevText)
      # print("Prev text shape", prevText.shape)
      # print("New token shape", newToken.shape)
      text_input = prevText

      print("_", _)
      print("node length", node.length)
      print("node token id", node.token_id)
      print("New beam len", len(new_beam))
      print("beam len", len(beam))

      with torch.no_grad():
        # Log prob
        print("Text input shape", text_input.shape)
        preds = newModel(input, text_input)
        print("Pred shape", preds.shape)
        temperature = 1.25
        scaled_logits = preds[0][0] / temperature
        top_p_index = top_p_sample(scaled_logits, prevText, p=0.8, filter=3)
        log_probs = F.log_softmax((scaled_logits), dim=-1)
        # log_probs = F.log_softmax(preds[0], dim=-1)
        print("Log probs shape", log_probs.shape)

      topk_probs, topk_ids = torch.topk(log_probs, num_beams)
      print(topk_probs)
      seqIdx = log_probs.shape[0] - 1

      # for prob, token_id in zip(topk_probs[seqIdx], topk_ids[seqIdx]):
      #   print("Prob", prob)
      #   new_node = BeamSearchNode(
      #       prev_node=node,
      #       token_id=token_id.item(),
      #       log_prob=node.log_prob+prob.item(),
      #       length = node.length + 1
      #   )
      #   print("New node token", new_node.token_id)
      #   new_beam.append(new_node)
      #   print("New beam after len", len(new_beam))

      new_node = BeamSearchNode(
          prev_node=node,
          token_id=top_p_index,
          log_prob = node.log_prob+scaled_logits[top_p_index],
          length = node.length+1,
      )
      new_beam.append(new_node)


    beam = sorted(new_beam, key=lambda x: x.log_prob, reverse=True)[:num_beams]

  output_sequences = []
  for node in beam:
    output_sequence = []
    while node:
      output_sequence.append(node.token_id)
      node = node.prev_node
    output_sequence.reverse()
    output_sequences.append(output_sequence)
  return output_sequences


gen = beam_search_node(newModel, input, newText, max_length=30, num_beams=2)

Prev text tensor([[0, 0]], device='cuda:0')
_ 0
node length 1
node token id tensor([[0]], device='cuda:0')
New beam len 0
beam len 1
Text input shape torch.Size([1, 2])
Pred shape torch.Size([1, 2, 4308])
torch.Size([4308])
tensor([0.0831, 0.1621, 0.2254,  ..., 1.0000, 1.0000, 1.0000], device='cuda:0')
Indices_to_sample tensor([   4,   10,   74,  ..., 4101,  711, 3050], device='cuda:0') len torch.Size([1049])
Multinomial_input shape torch.Size([1049])
History tensor([0], device='cuda:0')
Log probs shape torch.Size([4308])
tensor([-2.4876, -2.5386], device='cuda:0')
Prev text tensor([[0, 4]], device='cuda:0')
_ 1
node length 2
node token id tensor(4, device='cuda:0')
New beam len 0
beam len 1
Text input shape torch.Size([1, 2])
Pred shape torch.Size([1, 2, 4308])
torch.Size([4308])
tensor([0.0831, 0.1621, 0.2254,  ..., 1.0000, 1.0000, 1.0000], device='cuda:0')
Indices_to_sample tensor([   4,   10,   74,  ..., 4101,  711, 3050], device='cuda:0') len torch.Size([1049])
Multinomial_input s

In [None]:
print(newText)
# print(hypotheses)
print(gen)
print(len(gen[0]))

tensor([[0]], device='cuda:0')
[[tensor([[0]], device='cuda:0'), tensor(104, device='cuda:0'), tensor(104, device='cuda:0'), tensor(506, device='cuda:0'), tensor(104, device='cuda:0'), tensor(104, device='cuda:0'), tensor(66, device='cuda:0'), tensor(66, device='cuda:0'), tensor(651, device='cuda:0'), tensor(76, device='cuda:0'), tensor(48, device='cuda:0'), tensor(76, device='cuda:0'), tensor(76, device='cuda:0'), tensor(100, device='cuda:0'), tensor(4, device='cuda:0'), tensor(104, device='cuda:0'), tensor(66, device='cuda:0'), tensor(76, device='cuda:0'), tensor(4, device='cuda:0'), tensor(4, device='cuda:0'), tensor(66, device='cuda:0'), tensor(18, device='cuda:0'), tensor(104, device='cuda:0'), tensor(76, device='cuda:0'), tensor(104, device='cuda:0'), tensor(104, device='cuda:0'), tensor(66, device='cuda:0'), tensor(507, device='cuda:0'), tensor(934, device='cuda:0'), tensor(104, device='cuda:0')]]
30


In [None]:
tokenizedInput = [  0,    4,  427,   28,   74,    4,   13,  270,  558,   10,   64,  199,
         2215, 1158,   41,  269,  692,  229,    4, 2174,  233, 2084, 2095,  557,
           95,  744,  520, 4229, 2037]
outputSentence = createSentence(tokenizedInput)
print(outputSentence)
print(ds_test[0]["aspect_list"])

['<SOS>', 'low', 'scuffling', 'instrumental', 'live', 'low', 'piano', 'loud', 'symphonic', 'female', 'unrelated', 'shrutibox', 'tap', 'swing', 'jazzy', 'harp', 'indian', 'classical', 'low', 'diy', 'joyful', 'climax', 'snapping', 'opera', 'kids', 'acapella', 'bad', 'benign', 'hindu']
['percussion', 'music', 'no', 'other', 'instruments', 'no', 'voices', 'instrumental', 'advertisement', 'music', 'promotional', 'music']


In [None]:
vocab_list[4]

'low'

In [None]:
input = torch.tensor(ds_test[0:2]["input_values"]).squeeze(1).to(device)

for i in range(152):
  modelOutput = newModel(input, infTarg)
  pred = modelOutput[0].argmax(1)
  infTarg[0][i+1] = torch.tensor(pred[i+1])
  print(pred[i+1])
  if(pred[i+1] == 1):
    break

  infTarg[0][i+1] = torch.tensor(pred[i+1])


tensor(27)
tensor(27)


KeyboardInterrupt: ignored

In [None]:
print(vocab_list[4])

the


In [None]:
print(infTarg)

tensor([[  0, 110, 110, 110, 219, 110, 219, 219, 219, 110, 110, 110, 110, 110,
         219, 219, 110, 110, 110, 219, 110, 219, 219, 110, 110, 110, 219, 219,
         110, 110, 110, 219, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110,
         219, 219, 219, 110, 219, 219, 110, 110, 219, 110, 219, 219, 110, 110,
         110, 110, 219, 219, 110, 110, 110, 110, 110, 110, 219, 219, 110, 110,
         110, 110, 110, 110, 110, 219, 219, 219, 110, 219, 110, 110, 110, 110,
         110, 219, 110, 110, 219, 110, 110, 110, 110, 219, 110, 110, 110, 110,
         110, 110, 110, 110, 110, 110, 110, 219, 219, 219, 110, 219, 219, 110,
         110, 110, 110, 110, 110, 110, 219, 110, 219, 110, 110, 110, 110, 219,
         219, 110, 219, 219, 110, 110, 219, 110, 110, 110, 219, 219, 219, 110,
         110, 219, 219, 110, 219, 110, 110, 110, 219, 219, 110, 219, 219],
        [  0,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
           2,   2,   2,   2,   2,   2,   2,   2,   2,   

In [None]:
encoderOutput = model(input)

In [None]:
print(encoderOutput.keys())

odict_keys(['last_hidden_state', 'pooler_output'])


In [None]:
print(encoderOutput["last_hidden_state"].shape)
print(encoderOutput["pooler_output"].shape)
print(encoderOutput["pooler_output"].unsqueeze(1).expand(-1, target.shape[1], -1).shape)
enc_src = encoderOutput["pooler_output"].unsqueeze(1).expand(-1, target.shape[1], -1)
print(enc_src.shape)
print(enc_src[0].shape)

torch.Size([1, 1214, 768])
torch.Size([1, 768])
torch.Size([1, 153, 768])
torch.Size([1, 153, 768])
torch.Size([153, 768])


In [None]:
def make_src_mask(src):
  src_mask = (src != src_pad_idx).unsqueeze(1).unsqueeze(2)
  # (N, 1, 1, src_length)
  return src_mask.to(device)

def make_trg_mask(trg):
  N, trg_len = trg.shape
  # trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
  #     N, 1, trg_len, trg_len
  # )
  trg_mask = torch.tril(torch.ones(trg_len, N))
  return trg_mask.to(device)



src_mask = make_src_mask(input)
trg_mask = make_trg_mask(target)

decoderBlock = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length).to(device)

decoderOutput = decoderBlock(target, enc_src, src_mask, trg_mask).to(device)

In [None]:
print(target[:,:-1].shape)

torch.Size([1, 152])


In [None]:
print(decoderOutput.shape)
print(decoderOutput[0])

torch.Size([1, 153, 5475])
tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]], grad_fn=<SelectBackward0>)


In [None]:
print(modelOutput[0].shape)
print(modelOutput[0])
pred = modelOutput[0].argmax(1)
print(pred)

In [None]:
# currOutput[0]
# pred = currOutput[0].argmax(1)
# print(pred)
# print(len(currOutput[0][1]))
# print(currOutput.shape)
print(currOutput)
# print(outputSentence)
print(pred)

tensor([[0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.7229, 0.5077, 0.4864,  ..., 0.3799, 0.6367, 0.4397],
        [0.1653, 0.7800, 0.5162,  ..., 0.6711, 0.3135, 0.3795],
        ...,
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000]],
       device='cuda:0', grad_fn=<ViewBackward0>)
tensor([   0,   39,  196,   13,    9,  918,   91,   55,  210,  132,  716,   43,
         669,  110,    9,   19,   49,    4,   61,  394,  112,   96,    9,  730,
         461,   20,    9,   72,  461,   27,   39,   11,  109,   90,  110,   36,
           9, 1967, 5278,   39,  918,   27,    1,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,

In [None]:
print(vocab_list[310])

used


In [None]:
outputSentence = createSentence(pred)
print(outputSentence)

['<SOS>', 'this', 'audio', 'contains', 'a', 'composition', 'played', 'by', 'brass', 'instruments', 'such', 'as', 'trumpets', 'playing', 'a', 'melody', 'in', 'the', 'higher', 'range', 'along', 'with', 'a', 'deeper', 'horn', 'and', 'a', 'bass', 'horn', '.', 'this', 'song', 'may', 'be', 'playing', 'at', 'a', 'theater', 'presenting', 'this', 'composition', '.', '<EOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '<SOS>', '

In [None]:
for i in range(len(ds_train)):
  if(ds_train[i]["ytid"] == "r_KdRKquXsM"):
    print(ds_train[i]["caption"])

This audio contains a composition played by brass instruments such as trumpets playing a melody in the higher range along with a deeper horn and a bass horn. This song may be playing at a theater presenting this composition.


In [None]:
input = torch.tensor(ds_test[0]["input_values"]).to(device)
target = torch.tensor(ds_test[0]["tokenizedCaption"]).to(device)
testOutput = newModel(input.squeeze(1), target.unsqueeze(0))

In [None]:
print(testOutput[0])

tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]], device='cuda:0',
       grad_fn=<SelectBackward0>)


In [None]:
print(currOutput[1][489])

tensor(0.5000, grad_fn=<SelectBackward0>)


In [None]:
from gensim.models import Word2Vec
import gensim.downloader
# word2vec = Word2Vec.load()
glove_vec = gensim.downloader.load('word2vec-google-news-300')

In [None]:
glove_vec.most_similar("guitar")

In [None]:
glove_vec.index_to_key

In [None]:
closest_word = None
max_sim = -1
vector1 = output[0][1][0]
vector2 = torch.zeros(149)
print(vector1.shape)

vector = torch.cat((vector1, vector2), dim=0)
for word in glove_vec.index_to_key:
  word_vector = torch.tensor(glove_vec[word])
  sim = torch.dot(vector, word_vector) / (torch.norm(vector) * torch.norm(word_vector)).detach.numpy()
  if sim > max_sim:
    max_sim = sim
    closest_word = word


print(closest_word)

# Debugging

In [None]:
with torch.no_grad():
  outputs = model(inputs)

In [None]:
print(outputs.keys())
outputs["last_hidden_state"].shape

odict_keys(['last_hidden_state', 'pooler_output'])


torch.Size([1, 1214, 768])

In [None]:
decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length).to(device)

In [None]:
print(decoder)

Decoder(
  (word_embedding): Embedding(6143, 768)
  (layers): ModuleList(
    (0-11): 12 x DecoderBlock(
      (attention): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (transformer_block): TransformerBlock(
        (attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): ReLU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0, inplace=False)
      )
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (fc_out): Linear(in_featur

In [None]:
def make_src_mask(src):
  # src_mask = (src != src_pad_idx).unsqueeze(1).unsqueeze(2)
  src_mask = (src != src_pad_idx).squeeze()
  # (N, 1, 1, src_length)
  return src_mask.to(device)

In [None]:
def make_trg_mask(trg):
  N, trg_len = trg.shape
  # trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
  #     N, 1, trg_len, trg_len
  # )
  trg_mask = torch.tril(torch.ones(trg_len, N))
  return trg_mask.to(device)

In [None]:
enc_src = outputs["pooler_output"].unsqueeze(1).expand(-1, 18, -1)
src_mask = make_src_mask(inputs)
trg_mask = make_trg_mask(trg)
print(trg_mask.shape)
print(src_mask.shape)
decodeOut = decoder(trg, enc_src, trg_mask, trg_mask)

torch.Size([18, 1])
torch.Size([1024, 128])


In [None]:
print(decodeOut.shape)

torch.Size([1, 18, 6143])


In [None]:
N, seq_length = trg.shape
positions = torch.arange(0, seq_length).expand(N, seq_length).to(device)
word_embedding = nn.Embedding(trg_vocab_size, embed_size).to(device)
# position_embedding = nn.Embedding(embed_size, max_length).to(device)
position_embedding = nn.Parameter(torch.randn(max_length, embed_size)).to(device)
dropoutFunc =  nn.Dropout(dropout)

In [None]:
word = word_embedding(trg)
# pos = position_embedding(positions)
pos = position_embedding[:seq_length]
print(word.shape)
print(pos.shape)
x = dropoutFunc(word + pos)
# x = dropout((word + pos))
print(x.shape)

torch.Size([1, 18, 768])
torch.Size([18, 768])
torch.Size([1, 18, 768])


In [None]:
self_attention = nn.MultiheadAttention(embed_size, heads)
self_norm = nn.LayerNorm(embed_size)
attention, _ = self_attention(x, x, x, trg_mask)
print(attention)
query = dropoutFunc(self_norm(attention + x))
print(query.shape)
# Modifies enc_src to be the same shape as query
test_enc_src = enc_src.unsqueeze(1).expand(-1, query.size(1), -1)
print(test_enc_src.shape)
print(src_mask.shape)
test_src_mask = src_mask[-1, -1, :, :]
print(test_src_mask.shape)

tensor([[[-0.7805, -0.1441, -1.3261,  ..., -0.6013, -0.0808,  0.0878],
         [ 0.7496,  1.0978,  0.0337,  ..., -0.5370, -0.1557, -0.3764],
         [-1.1219, -0.2174, -0.2250,  ...,  0.6839, -0.2006,  0.6456],
         ...,
         [ 0.2170, -0.6863, -0.5809,  ...,  0.0870,  0.3194,  0.1975],
         [-0.1514, -0.7697,  0.3599,  ...,  1.1205, -0.8512, -0.4996],
         [ 0.7957,  0.4360,  0.1964,  ..., -0.3208, -1.2579, -0.8765]]],
       grad_fn=<ViewBackward0>)
torch.Size([1, 18, 768])
torch.Size([1, 18, 768])
torch.Size([1, 1, 1024, 128])
torch.Size([1024, 128])


In [None]:
transform_attention = self_attention(query, test_enc_src, test_enc_src, trg_mask)

In [None]:
self_transformer_block = TransformerBlock(embed_size, heads, dropout, forward_expansion)
print(enc_src.shape)
trans = self_transformer_block(query, test_enc_src, test_enc_src, src_mask)

torch.Size([1, 768])


AssertionError: ignored

In [None]:
print(trg_mask.shape)

torch.Size([18, 18])


In [None]:
target = [[1,3,2,4,5], [6,7,8,9,10]]
target = np.asarray(target)
target_mask = make_trg_mask(target)

tensor([[[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]],


        [[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]]])


In [None]:
print(target_mask)
print(target_mask.shape)

tensor([[[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]],


        [[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]]])
torch.Size([2, 1, 5, 5])


# LSTM Decoder Implementation

In [None]:
print(model.encoder.layer)

ModuleList(
  (0-11): 12 x ASTLayer(
    (attention): ASTAttention(
      (attention): ASTSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (output): ASTSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (intermediate): ASTIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): ASTOutput(
      (dense): Linear(in_features=3072, out_features=768, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
)


In [None]:
class Decoder(nn.Module):
  def __init__(self, input=3072, output_size=64, embedding_size=64, hidden_size=64, num_layers=2, p=0.2):
    super(Decoder, self).__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.dropout = nn.Dropout(p)
    self.embedding = nn.Embedding(input, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
    self.fc = nn.Linear(hidden_size, output_size)

    self.h = [torch.zeros(num_layers, 4, hidden_size) for _ in range(2)]

  def forward(self, x, hidden):
    x = x.unsqueeze(0)
    # embedding = self.dropout(self.embedding(x))
    embedding = self.embedding(x.int())
    outputs, (hidden, cell) = self.rnn(embedding, (hidden, self.h))
    predictions = self.fc(outputs)
    predictions = predictions.squeeze(0)
    return predictions, hidden, cell


In [None]:
class Decoder(nn.Module):
  def __init__(self, vocab_size, n_hidden, n_layers):
    super(Decoder, self).__init__()

    self.input_hidden = nn.Embedding(vocab_size, n_hidden)
    self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
    self.hidden_out = nn.Linear(n_hidden, vocab_size)
    self.h = [torch.zeros(n_layers, 4, n_hidden) for _ in range(2)]

  def forward(self, x, bias):
    res, h = self.rnn(self.input_hidden(x.int()), self.h)
    self.h = [h_.detach() for h_ in h]
    return self.hidden_out(res)

In [None]:
# model.encoder
# model.encoder.layer.output = LSTM(3072, 64)
h = [torch.zeros(2, 4, 64) for _ in range(2)]
print(len(h))
print(h[0].shape)
input_h = nn.Embedding(3072, 64)
print(input_h)

2
torch.Size([2, 4, 64])
Embedding(3072, 64)


In [None]:
class Instantiate(nn.Module):
  def __init__(self):
    super(Instantiate, self).__init__()

  def forward(self, x, bias):
    return x

In [None]:
# print(model.encoder.layer[0].output)
for layer in model.encoder.layer:
  # layer.output = Decoder(3072, 64, 2)
  layer.output = Instantiate()

print(model.encoder.layer[0:12])

ModuleList(
  (0-11): 12 x ASTLayer(
    (attention): ASTAttention(
      (attention): ASTSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (output): ASTSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (intermediate): ASTIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): Instantiate()
    (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
)


In [None]:
with torch.no_grad():
  outputs = model(inputs)

RuntimeError: ignored

In [None]:
print(model.encoder.layer)

ModuleList(
  (0-11): 12 x ASTLayer(
    (attention): ASTAttention(
      (attention): ASTSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (output): ASTSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (intermediate): ASTIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): Decoder(
      (dropout): Dropout(p=0.2, inplace=False)
      (embedding): Embedding(3072, 64)
      (rnn): LSTM(64, 64, num_layers=2, dropout=0.2)
      (fc): Linear(in_features=64, out_features=64, bias=True)
    )
    (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)