<a href="https://colab.research.google.com/github/JTStephens18/AudioTranscriptor/blob/main/V5_VisionEncDec_audioProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install yt-dlp
# Install huggingface audio datasets
! pip install datasets[audio]
! pip install transformers evaluate jiwer
!pip install accelerate -U
!pip install nltk



In [2]:
from datasets import load_dataset, Audio, Dataset
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor, AutoTokenizer, AutoProcessor, ASTFeatureExtractor
import subprocess
import os
from pathlib import Path
import torch
import torch.nn as nn
import torchaudio
import soundfile as sf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import librosa
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
ds = load_dataset('google/MusicCaps', split="train")
ds

Dataset({
    features: ['ytid', 'start_s', 'end_s', 'audioset_positive_labels', 'aspect_list', 'caption', 'author_id', 'is_balanced_subset', 'is_audioset_eval'],
    num_rows: 5521
})

In [4]:
ds = ds.remove_columns(["author_id", "is_balanced_subset", "is_audioset_eval", "audioset_positive_labels"])

# Tokenize the dataset

In [5]:
import re
from nltk.tokenize import word_tokenize
# vocab_list = ["word 1", "word 2",  ..., "word N"]
vocab_list = ["<SOS>", "<EOS>", "<PAD>", "<UNK>"]
max_len = 0

for i in range(ds.num_rows):
  captionSplit = word_tokenize(ds[i]["caption"])
  if(len(captionSplit) > max_len):
    max_len = len(captionSplit)
  for j in range(len(captionSplit)):
    word = captionSplit[j].lower()
    if(word) not in vocab_list:
      vocab_list.append(word)

# vocab_dict = { "word": index }
vocab_dict = {}
for i in range(len(vocab_list)):
  vocab_dict[vocab_list[i]] = i

In [None]:
max_len

151

In [43]:
print(len(vocab_list))
print(vocab_list[1])

6147
<EOS>


In [None]:
print(vocab_dict["<EOS>"])

2


In [None]:
def testToken(input):
  output = []
  input = word_tokenize(input.lower())
  for i in range(len(input)):
    if(input[i]) in vocab_dict:
      index = vocab_dict[input[i]]
      output.append(index)
    else:
      output.append("<UNK>")
  return output

In [None]:
input = word_tokenize(ds["train"][500]["caption"])
print(len(input))
for i in range(len(input), max_len):
  input.append("<PAD>")

print(len(input))
print(input[100])

82
151
<PAD>


In [None]:
print(ds[500]["caption"])

A male vocalist sings this vigorous Rock song. The tempo is fast with enthusiastic electric guitar lead and harmonies, electric bass guitar and hard hitting drumming with cymbal rides. The song is passionate, youthful, enthusiastic, intense, compelling, boisterous and vigorous. This song is a Hard Rock/Heavy Metal song.


In [None]:
testVal = testToken(ds[500]["caption"])
print(testVal)

[5, 90, 294, 180, 35, 397, 329, 7, 23, 0, 316, 83, 149, 92, 317, 37, 38, 163, 16, 447, 12, 37, 68, 38, 16, 728, 1500, 172, 92, 318, 2050, 23, 0, 7, 83, 215, 12, 724, 12, 317, 12, 404, 12, 1240, 12, 2051, 16, 397, 23, 35, 7, 83, 5, 728, 2052, 720, 7, 23]


In [6]:
ds = ds.train_test_split(test_size=0.2)

In [7]:
feature_extractor = ASTFeatureExtractor()
feature_extractor

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -4.2677393,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 4.5689974
}

In [8]:
def download_clip(
    video_id,
    output_filename,
    start_time,
    end_time,
    tmp_dir='/musiccaps',
    num_attempts=5,
    url_base='https://www.youtube.com/watch?v='
):

  status = False
  command = f"""
        yt-dlp --quiet --no-warnings -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" {url_base}{video_id} --force-keyframes-at-cuts
    """.strip()
  attempts = 0
  while True:
    try:
      # output = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
      output = os.system(command)
    except subprocess.CalledProcess.Error as err:
      attempts += 1
      if attempts == num_attempts:
        return status, err.output
    else:
      break

  # Check if video was successfully saved
  status = os.path.exists(output_filename)
  return status, 'Downloaded'


def toUppercase(aspect_list):
  a_list = aspect_list[1:-1]
  new_list = ''
  for word in a_list:
    new_list = ''.join(a_list).upper().replace(',', '')
    # new_list = new_list.replace(' ', '|')
    if(len(new_list) > 128):
      new_list = new_list[:128]
  return new_list

# Add padding to make all captions the same length
def tokenizeCaption(caption):
  output = [vocab_dict["<SOS>"]]
  input = word_tokenize(caption.lower())
  for i in range(len(input)):
    if(input[i]) in vocab_dict:
      index = vocab_dict[input[i]]
      output.append(index)
    else:
      output.append(vocab_dict['<UNK>'])
  output.append(vocab_dict["<EOS>"])
  # Max_len -2 because I'm appending additional start and end tokens
  for j in range(len(input), max_len-2):
   output.append(vocab_dict["<PAD>"])
  return output


def process(example):
  output_path = str(data_dir / f"{example['ytid']}.wav")
  status = True
  # aspect_string = toUppercase(example['aspect_list'])
  if not os.path.exists(output_path):
    status = False
    status, log = download_clip(
        example['ytid'],
        output_path,
        example['start_s'],
        example['end_s'],
    )

  example["tokenizedCaption"] = tokenizeCaption(example["caption"])
  # example['aspect_string'] = aspect_string
  example["audio"] = output_path
  example['download_status'] = status
  example["image_path"] = f'./spectrograms/{example["ytid"]}.png'
  return example

In [9]:
def stereo_to_mono(wav):
  chan_1 = wav[0][:]
  chan_2 = wav[1][:]
  mono = (chan_1 + chan_2) / 2
  return mono

In [10]:
def resample_waveform(example):
  filepath = example["audio"]
  y, sr = librosa.load(filepath, sr=16000)
  sf.write(filepath, y, sr)
  waveform, sampling_rate = torchaudio.load(filepath)
  if(waveform.shape[0] == 2):
    waveform = stereo_to_mono(waveform)
  # Process it to add 125 more values
  # if(len(waveform[0]) < 160125):
  #   pad_len = 160125 - len(waveform[0])
  #   padding = np.zeros(pad_len)
  #   print(type(padding))
  #   print(type(waveform[0]))
  #   waveformTemp = np.concatenate((np.asarray(waveform[0]), padding), axis=0)
  #   example["waveform"] = waveformTemp
  #   return example
    # padding = torch.zeros(1,pad_len)
    # print("Wav1", type(waveform[0]))
    # print("Pad", type(padding[0]))
    # print("Pad Shape", padding.shape)
    # waveformTemp = torch.cat((waveform[0], padding[0]), dim=0)
    # print("Wav len", len(waveform))
    # print("Wav2", type(waveform))

  example["waveform"] = waveform
  return example

In [11]:
def wavToInput(example):
  wav = np.asarray(example["waveform"])
  inputs = feature_extractor(wav, sampling_rate=sampling_rate, padding="max_length", return_tensors="pt").input_values
  example["input_values"] = inputs
  example["inputs_shape"] = inputs.shape
  return example

In [12]:
samples_to_load = 100
cores = 4
sampling_rate = 16000
writer_batch_size = 1000
data_dir = "./music_data"
upper_limit = 5521
lower_limit = 5000

data_dir = Path(data_dir)
data_dir.mkdir(exist_ok=True, parents=True)

# ds = ds.select(range(lower_limit, upper_limit))

# ds = ds.map(
#     process,
#     num_proc=cores,
#     writer_batch_size=writer_batch_size,
#     keep_in_memory=False
# )


# ds = ds.filter(lambda ex: ex["download_status"] == True)

ds_train = ds["train"].select(range(samples_to_load))
ds_test = ds["test"].select(range(int(samples_to_load*0.2)))

ds_train = ds_train.map(
    process,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
    # batched=True,
)

ds_train = ds_train.filter(lambda ex: ex["download_status"] == True)
# ds_train = ds_train.cast_column("audio", Audio(sampling_rate=sampling_rate))
ds_train = ds_train.map(
    resample_waveform,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
)

ds_train = ds_train.map(
    wavToInput,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
)

ds_test = ds_test.map(
    process,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
    # batched=True,
)

ds_test = ds_test.filter(lambda ex: ex["download_status"] == True)
# ds_test = ds_test.cast_column("audio", Audio(sampling_rate=sampling_rate))
ds_test = ds_test.map(resample_waveform, num_proc=cores, writer_batch_size=writer_batch_size, keep_in_memory=False)
ds_test = ds_test.map(
    wavToInput,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
)

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/98 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/98 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/20 [00:00<?, ? examples/s]

Filter:   0%|          | 0/20 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/19 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/19 [00:00<?, ? examples/s]

In [None]:
for i in range(ds_train.num_rows):
  print(len(ds_train[i]["waveform"][0]))

160000
160000
160125
160125
160000
160125
160000
160000
160125
160000


In [None]:
# print(type(ds_train[0]["waveform"]))
# print(len(ds_train[0]["waveform"]))
print(ds_train[2])
print(ds_train[3])
print(len(ds_train[3]["tokenizedCaption"]))

{'ytid': 'TgCv5a22YUU', 'start_s': 50, 'end_s': 60, 'aspect_list': "['low quality', 'muffled music', 'noisy', 'soul', 'harmonizing vocals', 'passionate female vocal', 'crushed', 'phone recording']", 'caption': 'The low quality recording features a muffled soul song that consists of harmonizing vocals and passionate female vocals. It is too noisy, almost crushed - it is barely recognizable as a soul song. It seems it was recorded with an old phone.', 'tokenizedCaption': [0, 4, 5, 6, 7, 8, 9, 557, 1265, 11, 12, 286, 77, 625, 325, 20, 219, 22, 325, 27, 26, 87, 1468, 293, 16, 1165, 999, 342, 26, 87, 1894, 4691, 43, 9, 1265, 11, 27, 26, 449, 26, 558, 106, 96, 40, 1263, 560, 27, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], 'audi

In [None]:
filepath = ds_train[2]["audio"]
y, sr = librosa.load(filepath, sr=16000)
sf.write(filepath, y, sr)
waveform, sampling_rate = torchaudio.load(filepath)
print(waveform.shape)

if(waveform.shape[0] == 2):
  waveform = stereo_to_mono(waveform)
# Process it to add 125 more values
if(len(waveform[0]) < 160125):
  pad_len =  160125 - len(waveform[0])
  padding = torch.zeros(1,pad_len)
  waveform = torch.cat((waveform[0], padding[0]), dim=0).unsqueeze(0)
  print(waveform.shape)

torch.Size([1, 160000])
torch.Size([1, 160125])


In [79]:
ds_train = ds_train.map(
    wavToInput,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
)

Map (num_proc=4):   0%|          | 0/97 [00:00<?, ? examples/s]

TimeoutError: ignored

In [None]:
# print(ds_train[0]["input_values"][0][1][1])
# print(ds_train[0]["input_values"].shape)
# print(ds_train[0]["inputs_shape"])

print(np.asarray(ds_train[1]["waveform"]).shape)
wav = np.asarray(ds_train[1]["waveform"]).squeeze()
input = feature_extractor(wav, sampling_rate=sampling_rate, padding="max_length", return_tensors="pt").input_values
print(input.shape)

# input = torch.tensor(ds_train[0]["input_values"])
# print(input.shape)

# input2 = torch.tensor(ds_train[1]["input_values"])
# concat_input = torch.cat((input, input2), dim=0)
# print(concat_input.shape)
# output = model(concat_input)

(1, 160000)
torch.Size([1, 1024, 128])


In [None]:
single_output = model(input)

In [None]:
print(output.keys())
print(output['pooler_output'].shape)
print(single_output['pooler_output'].shape)

odict_keys(['last_hidden_state', 'pooler_output'])
torch.Size([2, 768])
torch.Size([1, 768])


In [None]:
testBatch = ds_train[0:2]
print(np.asarray(testBatch["input_values"]).shape)
testInput = torch.tensor(testBatch["input_values"]).squeeze()
print(testInput.shape)
testOutput = model(testInput)

(2, 1, 1024, 128)
torch.Size([2, 1024, 128])


In [None]:
print(testOutput["pooler_output"].shape)

torch.Size([2, 768])


In [None]:
print(torch.tensor(testBatch["tokenizedCaption"]).shape)

torch.Size([2, 151])


# AST Training

In [13]:
from transformers import AutoModel
model = AutoModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [None]:
print(model)

ASTModel(
  (embeddings): ASTEmbeddings(
    (patch_embeddings): ASTPatchEmbeddings(
      (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ASTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ASTLayer(
        (attention): ASTAttention(
          (attention): ASTSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ASTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ASTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation(

In [None]:
print(model.config)

In [None]:
ds_train[0]["caption"]

'Female singers sing in funny animated vocals. The song is medium tempo with a groovy bass line, keyboard accompaniment, arpeggiated harmony tones and steady drumming rhythm. The song is cheerful and energetic. The audio quality is average.'

In [None]:
wav = ds_train[3]["waveform"]
# wav = np.asarray(wav).squeeze()
print(len(wav))
for i in range(ds_train.num_rows):
  wav = ds_train[i]["waveform"]
  wav = np.asarray(wav).squeeze()
  if(wav.shape[0] > 160000):
    print(i, wav.shape)

160125
0 (160125,)
1 (160125,)
2 (160125,)
3 (160125,)
4 (160125,)
5 (160125,)
6 (160125,)
7 (160125,)
8 (160125,)
9 (160125,)


In [None]:
wav = ds_train[2]["waveform"]
# wav = np.asarray(wav)
# print(wav.shape)
inputWav = feature_extractor(wav, sampling_rate=sampling_rate, padding="max_length", return_tensors="pt").input_values
print(inputWav.shape)

torch.Size([1, 1024, 128])


In [None]:
inputs = feature_extractor(wav, sampling_rate=sampling_rate, padding="max_length", return_tensors="pt").input_values
print(inputs.shape)

torch.Size([1, 1024, 128])


In [None]:
# outputs
# print(outputs["pooler_output"].shape)
# print(outputs.keys())
# pred = outputs["pooler_output"].argmax(-1).item()
# print(pred)
# print(model.config.id2label[pred])

torch.Size([1, 768])


# Transformer Decoder Implementation

In [None]:
# Create a starting output state
  # Pass this into an output embedding (size of 768)
  # Use positional encoding
  # Pass this into a masked multi-head attention mechanism
  # Add and normalize

# Take the output from stage 1 and combine it with the output from the encoder
  # Multi-head attention
  # Add and normalize

# Feed forward
  # Add and normalize

# Linear activation function
# Softmax



In [14]:
class TransformerBlock(nn.Module):
  def __init__(self, embed_size, heads, dropout, forward_expansion):
    super(TransformerBlock, self).__init__()
    self.attention = nn.MultiheadAttention(embed_size, heads)
    self.norm1 = nn.LayerNorm(embed_size)
    self.norm2 = nn.LayerNorm(embed_size)

    self.feed_forward = nn.Sequential(
        nn.Linear(embed_size, forward_expansion*embed_size),
        nn.ReLU(),
        nn.Linear(forward_expansion*embed_size, embed_size)
    )
    self.dropout = nn.Dropout(dropout)

  def forward(self, value, key, query, mask):
    # attention, _ = self.attention(query, key, value, mask)
    # Shouldn't need a mask to ignore padding on inputs since padding is not needed for inputs
    attention, _ = self.attention(query, key, value)

    x = self.dropout(self.norm1(attention + query))
    forward = self.feed_forward(x)
    out = self.dropout(self.norm2(forward + x))
    return out

In [15]:
class DecoderBlock(nn.Module):
  def __init__(self, embed_size, heads, forward_expansion, dropout, device):
    super(DecoderBlock, self).__init__()
    self.attention = nn.MultiheadAttention(embed_size, heads)
    self.norm = nn.LayerNorm(embed_size)
    self.transformer_block = TransformerBlock(
        embed_size, heads, dropout, forward_expansion
    )
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, value, key, src_mask, trg_mask):
    attention, _ = self.attention(x, x, x, trg_mask)
    query = self.dropout(self.norm(attention + x))
    out = self.transformer_block(value, key, query, src_mask)
    return out

In [16]:
class Decoder(nn.Module):
  def __init__(self, trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length):
    super(Decoder, self).__init__()
    self.device = device
    self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
    # self.position_embedding = nn.Embedding(embed_size, max_length)
    self.position_embedding = nn.Parameter(torch.randn(max_length, embed_size))

    self.layers = nn.ModuleList(
        [DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
        for _  in range(num_layers)]
    )

    self.fc_out = nn.Linear(embed_size, trg_vocab_size)
    self.dropout = nn.Dropout(dropout)
    self.softmax = nn.Softmax(dim=0)

  def forward(self, x, enc_out, src_mask, trg_mask):
    N, seq_length = x.shape
    # positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
    # positions = position_embedding[:seq_length]
    # x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))
    x = self.dropout((self.word_embedding(x) + self.position_embedding[:seq_length]))

    for layer in self.layers:
      x = layer(x, enc_out, enc_out, src_mask, trg_mask)

    out = self.fc_out(x)
    out = self.softmax(out)
    return out


In [17]:
class Instantiate(nn.Module):
  def __init__(self):
    super(Instantiate, self).__init__()

  def forward(self, x):
    return x

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
src_vocab_size = 1024 # max_length of feature extractor - not even used
trg_vocab_size = len(vocab_list)
embed_size = 768
num_layers = 12
heads = 12
forward_expansion = 4
dropout = 0
device = device
max_length = max_len

src_pad_idx = 0
trg_pad_idx = '-inf'

num_epochs = 100
# Number of training samples in the batch
batch_size = 2
learning_rate = 0.001

In [19]:
model.layernorm = Instantiate()
# model.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, droput, device, max_length)

In [20]:
class Transformer(nn.Module):
  def __init__(self,
               src_vocab_size,
               trg_vocab_size,
               src_pad_idx,
               trg_pad_idx,
               model,
               embed_size=768,
               num_layers=12,
               heads=12,
               forward_expansion=4,
               dropout=0,
               device=device,
               max_length=max_len,
              ):
    super(Transformer, self).__init__()

    self.encoder = model
    self.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length)

    self.src_pad_idx = src_pad_idx
    self.trg_pad_idx = trg_pad_idx
    self.device = device

# Src mask is used so the encoder does not pay attention to the padding values appended to the input
  def make_src_mask(self, src):
    src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
    # (N, 1, 1, src_length)
    return src_mask.to(self.device)

  def make_trg_mask(self, trg):
    N, trg_len = trg.shape
    # trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
    #     N, 1, trg_len, trg_len
    # )
    trg_mask = torch.tril(torch.ones(trg_len, N))
    return trg_mask.to(self.device)

  def forward(self, src, trg):
    src_mask = self.make_src_mask(src)
    trg_mask = self.make_trg_mask(trg)

    enc_src = self.encoder(src)
    enc_out = enc_src["pooler_output"].unsqueeze(1).expand(-1, trg.shape[1], -1)
    out = self.decoder(trg, enc_out, src_mask, trg_mask)
    return out


In [21]:
newModel = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, model).to(device)
optimizer = torch.optim.Adam(newModel.parameters(), learning_rate)

In [None]:
# print(newModel)
num_batches = int(5000 / 12)
print(num_batches)
# train_loader = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True)
# real_batch = next(iter(train_loader))
# print(real_batch)
testBatch = ds_train[0:12]
print(len(testBatch))
print(testBatch["ytid"])
for i in range(len(testBatch)-1):
  print(i, testBatch['ytid'][i])

416
11
['e4abjaPD9R0', 'cGrQw46ftj0', 'RQbNC1J4Jfk', 'wt-f7K_suBc', 'ZLXW4ewrVpQ', 'w2MeQg3W7Po', 'OI7S7vaBT4I', 'I-XYm2Ck2r8', 'wMi_0eEIpcM', 'S8fE5jNVchg']
0 e4abjaPD9R0
1 cGrQw46ftj0
2 RQbNC1J4Jfk
3 wt-f7K_suBc
4 ZLXW4ewrVpQ
5 w2MeQg3W7Po
6 OI7S7vaBT4I
7 I-XYm2Ck2r8
8 wMi_0eEIpcM
9 S8fE5jNVchg


In [None]:
# train_dataset = torch.utils.data.TensorDataset(ds_train)
train_loader = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True)
# print(len(ds_train[8]["waveform"][0]))
# val = next(iter(train_loader))
#for i, data in enumerate(train_loader, 0):
#   print(data["ytid"])

In [None]:
for epoch in range(1):
  for i, data in enumerate(train_loader, 0):
    # for j in range(batch_size):
      print(data["ytid"])
      print(i)

  # Shape =  data["waveform"]["#/160125"][# in batch]
      # wav = np.asarray(data["waveform"])

      # print("Shape 1", wav.shape, "Value", wav)

      print(len(data["input_values"]))
      print(torch.tensor(data["input_values"]).shape)

      # inputs = feature_extractor(data["waveform"], sampling_rate=sampling_rate, padding="max_length", return_tensors="pt")
      # inputs = inputs.input_values
      # print(inputs.shape)
      # print(len(data["waveform"][j]))
    # waveform = np.asarray(data["waveform"]).squeeze()
    # inputs = feature_extractor(waveform)

In [None]:
print(ds_train[1]["ytid"])
print(ds_train[1]["waveform"][1])

3uz_ZrGsIaA
0.025299072265625


In [22]:
criterion = nn.CrossEntropyLoss(ignore_index=vocab_dict["<PAD>"])
optimizer = torch.optim.Adam(newModel.parameters(), lr=learning_rate)

In [27]:
# Training loop
batches_per_epoch = int(ds_train.num_rows / batch_size)
output = []
sentences = []
step = 0
for epoch in range(1):
  for i in range(batches_per_epoch):
    start = i * batch_size
    # print(start)
    xBatch = ds_train[start:start+batch_size]
    # print(xBatch)
    waveform = xBatch["waveform"]
    id = xBatch["ytid"]
    print(id)
    input = torch.tensor(xBatch["input_values"]).squeeze()
    target = torch.tensor(xBatch["tokenizedCaption"])
    currOutput = newModel(input, target)
    print(currOutput[0])
    output.append(currOutput)
    loss = criterion(currOutput.reshape(-1, currOutput.shape[2]), target[0].reshape(-1))
    print("Loss", loss)
    step += 1
    optimizer.zero_grad()
    loss.backward(loss)
    optimizer.step()


    # for j in range(len(currOutput)):
    #   outputSentence = []
    #   vector = currOutput[j]
    #   for k in range(len(vector)):
    #     sumInput = vector[k].detach().numpy()
    #     sum = np.sum(sumInput)
    #     outputSentence.append(vocab_list[int(sum)])
    #   sentences.append(outputSentence)

['6QfM3BRp-78', 'g0scnRzoo9M']
tensor([[0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        ...,
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000]],
       grad_fn=<SelectBackward0>)


ValueError: ignored

In [23]:
# Training loop
batches_per_epoch = int(ds_train.num_rows / batch_size)
output = []
sentences = []
step = 0
for epoch in range(1):
  for i in range(batches_per_epoch):
    start = i * batch_size
    # print(start)
    xBatch = ds_train[start:start+batch_size]
    # print(xBatch)
    waveform = xBatch["waveform"]
    id = xBatch["ytid"]
    print(id)
    input = torch.tensor(xBatch["input_values"]).squeeze()
    target = torch.tensor(xBatch["tokenizedCaption"])
    currOutput = newModel(input, target)
    print(currOutput[0])
    predIdx = currOutput[0].argmax(1)
    # print(predIdx)
    # output.append(currOutput.item())
    # print(target[0])

    # loss = bleu_score(predSen, targetSen)
    loss = criterion(currOutput[0], target[0])
    # loss = criterion(currOutput.reshape(-1, currOutput.shape[2]), target[0].reshape(-1))
    print("Loss", loss)
    step += 1
    optimizer.zero_grad()
    loss.backward(loss)
    optimizer.step()


    # for j in range(len(currOutput)):
    #   outputSentence = []
    #   vector = currOutput[j]
    #   for k in range(len(vector)):
    #     sumInput = vector[k].detach().numpy()
    #     sum = np.sum(sumInput)
    #     outputSentence.append(vocab_list[int(sum)])
    #   sentences.append(outputSentence)

['2U8Dvh7nwFI', 'puAclKsCbes']
tensor([[0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        ...,
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000]],
       grad_fn=<SelectBackward0>)
Loss tensor(8.7275, grad_fn=<NllLossBackward0>)
['9L6ePkWtZI4', 'hoPnrbKOEl8']
tensor([[0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        ...,
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000]],
       grad_fn=<SelectBackward0>)
Loss tensor(8.7233, grad_f

KeyboardInterrupt: ignored

In [75]:
print(ds_train[0]["ytid"])
ds_train[0]["caption"]

ODRAYQE9GXs


'Someone is strumming chords on an e-guitar while playing a melody with the harmonica in a higher register. This song may be playing gathered around a bonfire.'

In [30]:
criterion = nn.CrossEntropyLoss(ignore_index=vocab_dict["<PAD>"])
output = currOutput[0]
print(output)
pred = output.argmax(1)
max = output[2].max()
print(max)
print(output[2][3251])
print(pred)

predSen = []
for i in range(len(pred)):
  predSen.append(vocab_list[pred[i]])

print(predSen)

targetSen = []
example = ds_train[0]["tokenizedCaption"]
for j in range(len(example)):
  targetSen.append(vocab_list[example[j]])

print(targetSen)

# loss = bleu_score(predSen, targetSen)
# print(loss)

loss2 = criterion(output, torch.tensor(example))
print(loss2)

tensor([[0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        ...,
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000]],
       grad_fn=<SelectBackward0>)
tensor(0.5000, grad_fn=<MaxBackward1>)
tensor(0.5000, grad_fn=<SelectBackward0>)
tensor([   0,  707,    4, 1611, 3097,    4, 3374,   91,    4,  705,  853,  547,
        1243,    4,  108, 2342,  361, 1530, 3550,    4, 2108, 1964,    4,   64,
         142,  741,    4, 1412,    4, 1882,   87,  302,    9, 1665,    7, 1315,
        5736, 4592,  972,    9,  368,    9,   27,  872,  277, 5010,  383, 1773,
          90, 2248, 2867,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
       

In [31]:
linearOut = output[0][1]
print(len(linearOut))
pred = linearOut.argmax(1)
print(linearOut)
print(pred)
# print(pred.item())

max = linearOut.max()
print(max)
max1 = linearOut[1].max()
print(max1)
print(linearOut[1][3695])

# embedding = nn.Embedding(trg_vocab_size, embed_size)
# res = embedding(pred)
# print(res)
print(vocab_list[0])
print(vocab_list[172])

IndexError: ignored

In [None]:
print(len(output[0][1]))
print(output[0][1])
print(len(output[0]))
linearOut = output[0][1]
print(linearOut[0].shape)
sumInput = linearOut[0].detach().numpy()
# sum  = np.sum(sumInput)
# print(sum)
max = np.argmax(sumInput)
print(max)
print(max.shape)
# finalOut = nn.Softmax(linearOut)
# print(finalOut)
# print(vocab_list[int(sum)])

# outputSentence = []
# for i in range(len(linearOut)):
#   sumInput = linearOut[i].detach().numpy()
#   sum = np.sum(sumInput)
#   outputSentence.append(vocab_list[int(sum)])

151
tensor([[0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.4512, 0.4572, 0.5425,  ..., 0.5513, 0.4730, 0.4996],
        ...,
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000]],
       grad_fn=<SelectBackward0>)
2
torch.Size([6147])
0
()


In [None]:
outputCaption = []
tokenCap = np.asarray(ds_train[1]["tokenizedCaption"])
for i in range(len(ds_train[1]["tokenizedCaption"])):
  outputCaption.append(vocab_list[tokenCap[i]])

In [None]:
print(outputCaption)
print(outputSentence)

['<SOS>', 'the', 'song', 'is', 'an', 'instrumental', '.', 'the', 'tempo', 'is', 'medium', 'with', 'a', 'guitar', 'playing', 'a', 'romantic', 'lead', ',', 'steady', 'drumming', ',', 'rock', 'drumming', ',', 'percussive', 'bass', 'line', ',', 'cymbals', 'crashing', 'and', 'guitar', 'strumming', 'rhythm', '.', 'the', 'song', 'is', 'exciting', 'and', 'youthful', '.', 'the', 'song', 'has', 'a', 'bad', 'audio', 'quality', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>',

In [None]:
from torchtext.data.metrics import bleu_score
score = bleu_score(outputSentence, outputCaption)
print(score)

0.0


In [None]:
from gensim.models import Word2Vec
import gensim.downloader
# word2vec = Word2Vec.load()
glove_vec = gensim.downloader.load('word2vec-google-news-300')



In [None]:
glove_vec.most_similar("guitar")

[('guitars', 0.8395804762840271),
 ('acoustic_guitar', 0.828093409538269),
 ('electric_guitar', 0.8114222884178162),
 ('mandolin', 0.7817174196243286),
 ('harmonica', 0.7718141674995422),
 ('saxophone', 0.7682830095291138),
 ('bass_guitar', 0.7592058777809143),
 ('acoustic_guitars', 0.7538740634918213),
 ('sax', 0.7534022331237793),
 ('banjo', 0.7468352913856506)]

In [None]:
glove_vec.index_to_key

In [None]:
closest_word = None
max_sim = -1
vector1 = output[0][1][0]
vector2 = torch.zeros(149)
print(vector1.shape)

vector = torch.cat((vector1, vector2), dim=0)
for word in glove_vec.index_to_key:
  word_vector = torch.tensor(glove_vec[word])
  sim = torch.dot(vector, word_vector) / (torch.norm(vector) * torch.norm(word_vector)).detach.numpy()
  if sim > max_sim:
    max_sim = sim
    closest_word = word


print(closest_word)

torch.Size([6147])


RuntimeError: ignored

In [None]:
trg = torch.tensor([[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]]).to(device)
print(trg.shape)
inputs = inputs.to(device)
with torch.no_grad():
  outputs = newModel(inputs, trg)

torch.Size([1, 18])


KeyboardInterrupt: ignored

# Debugging

In [None]:
with torch.no_grad():
  outputs = model(inputs)

In [None]:
print(outputs.keys())
outputs["last_hidden_state"].shape

odict_keys(['last_hidden_state', 'pooler_output'])


torch.Size([1, 1214, 768])

In [None]:
decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length).to(device)

In [None]:
print(decoder)

Decoder(
  (word_embedding): Embedding(6143, 768)
  (layers): ModuleList(
    (0-11): 12 x DecoderBlock(
      (attention): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (transformer_block): TransformerBlock(
        (attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): ReLU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0, inplace=False)
      )
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (fc_out): Linear(in_featur

In [None]:
def make_src_mask(src):
  # src_mask = (src != src_pad_idx).unsqueeze(1).unsqueeze(2)
  src_mask = (src != src_pad_idx).squeeze()
  # (N, 1, 1, src_length)
  return src_mask.to(device)

In [None]:
def make_trg_mask(trg):
  N, trg_len = trg.shape
  # trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
  #     N, 1, trg_len, trg_len
  # )
  trg_mask = torch.tril(torch.ones(trg_len, N))
  return trg_mask.to(device)

In [None]:
enc_src = outputs["pooler_output"].unsqueeze(1).expand(-1, 18, -1)
src_mask = make_src_mask(inputs)
trg_mask = make_trg_mask(trg)
print(trg_mask.shape)
print(src_mask.shape)
decodeOut = decoder(trg, enc_src, trg_mask, trg_mask)

torch.Size([18, 1])
torch.Size([1024, 128])


In [None]:
print(decodeOut.shape)

torch.Size([1, 18, 6143])


In [None]:
N, seq_length = trg.shape
positions = torch.arange(0, seq_length).expand(N, seq_length).to(device)
word_embedding = nn.Embedding(trg_vocab_size, embed_size).to(device)
# position_embedding = nn.Embedding(embed_size, max_length).to(device)
position_embedding = nn.Parameter(torch.randn(max_length, embed_size)).to(device)
dropoutFunc =  nn.Dropout(dropout)

In [None]:
word = word_embedding(trg)
# pos = position_embedding(positions)
pos = position_embedding[:seq_length]
print(word.shape)
print(pos.shape)
x = dropoutFunc(word + pos)
# x = dropout((word + pos))
print(x.shape)

torch.Size([1, 18, 768])
torch.Size([18, 768])
torch.Size([1, 18, 768])


In [None]:
self_attention = nn.MultiheadAttention(embed_size, heads)
self_norm = nn.LayerNorm(embed_size)
attention, _ = self_attention(x, x, x, trg_mask)
print(attention)
query = dropoutFunc(self_norm(attention + x))
print(query.shape)
# Modifies enc_src to be the same shape as query
test_enc_src = enc_src.unsqueeze(1).expand(-1, query.size(1), -1)
print(test_enc_src.shape)
print(src_mask.shape)
test_src_mask = src_mask[-1, -1, :, :]
print(test_src_mask.shape)

tensor([[[-0.7805, -0.1441, -1.3261,  ..., -0.6013, -0.0808,  0.0878],
         [ 0.7496,  1.0978,  0.0337,  ..., -0.5370, -0.1557, -0.3764],
         [-1.1219, -0.2174, -0.2250,  ...,  0.6839, -0.2006,  0.6456],
         ...,
         [ 0.2170, -0.6863, -0.5809,  ...,  0.0870,  0.3194,  0.1975],
         [-0.1514, -0.7697,  0.3599,  ...,  1.1205, -0.8512, -0.4996],
         [ 0.7957,  0.4360,  0.1964,  ..., -0.3208, -1.2579, -0.8765]]],
       grad_fn=<ViewBackward0>)
torch.Size([1, 18, 768])
torch.Size([1, 18, 768])
torch.Size([1, 1, 1024, 128])
torch.Size([1024, 128])


In [None]:
transform_attention = self_attention(query, test_enc_src, test_enc_src, trg_mask)

In [None]:
self_transformer_block = TransformerBlock(embed_size, heads, dropout, forward_expansion)
print(enc_src.shape)
trans = self_transformer_block(query, test_enc_src, test_enc_src, src_mask)

torch.Size([1, 768])


AssertionError: ignored

In [None]:
print(trg_mask.shape)

torch.Size([18, 18])


In [None]:
target = [[1,3,2,4,5], [6,7,8,9,10]]
target = np.asarray(target)
target_mask = make_trg_mask(target)

tensor([[[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]],


        [[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]]])


In [None]:
print(target_mask)
print(target_mask.shape)

tensor([[[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]],


        [[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]]])
torch.Size([2, 1, 5, 5])


# LSTM Decoder Implementation

In [None]:
print(model.encoder.layer)

ModuleList(
  (0-11): 12 x ASTLayer(
    (attention): ASTAttention(
      (attention): ASTSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (output): ASTSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (intermediate): ASTIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): ASTOutput(
      (dense): Linear(in_features=3072, out_features=768, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
)


In [None]:
class Decoder(nn.Module):
  def __init__(self, input=3072, output_size=64, embedding_size=64, hidden_size=64, num_layers=2, p=0.2):
    super(Decoder, self).__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.dropout = nn.Dropout(p)
    self.embedding = nn.Embedding(input, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
    self.fc = nn.Linear(hidden_size, output_size)

    self.h = [torch.zeros(num_layers, 4, hidden_size) for _ in range(2)]

  def forward(self, x, hidden):
    x = x.unsqueeze(0)
    # embedding = self.dropout(self.embedding(x))
    embedding = self.embedding(x.int())
    outputs, (hidden, cell) = self.rnn(embedding, (hidden, self.h))
    predictions = self.fc(outputs)
    predictions = predictions.squeeze(0)
    return predictions, hidden, cell


In [None]:
class Decoder(nn.Module):
  def __init__(self, vocab_size, n_hidden, n_layers):
    super(Decoder, self).__init__()

    self.input_hidden = nn.Embedding(vocab_size, n_hidden)
    self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
    self.hidden_out = nn.Linear(n_hidden, vocab_size)
    self.h = [torch.zeros(n_layers, 4, n_hidden) for _ in range(2)]

  def forward(self, x, bias):
    res, h = self.rnn(self.input_hidden(x.int()), self.h)
    self.h = [h_.detach() for h_ in h]
    return self.hidden_out(res)

In [None]:
# model.encoder
# model.encoder.layer.output = LSTM(3072, 64)
h = [torch.zeros(2, 4, 64) for _ in range(2)]
print(len(h))
print(h[0].shape)
input_h = nn.Embedding(3072, 64)
print(input_h)

2
torch.Size([2, 4, 64])
Embedding(3072, 64)


In [None]:
class Instantiate(nn.Module):
  def __init__(self):
    super(Instantiate, self).__init__()

  def forward(self, x, bias):
    return x

In [None]:
# print(model.encoder.layer[0].output)
for layer in model.encoder.layer:
  # layer.output = Decoder(3072, 64, 2)
  layer.output = Instantiate()

print(model.encoder.layer[0:12])

ModuleList(
  (0-11): 12 x ASTLayer(
    (attention): ASTAttention(
      (attention): ASTSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (output): ASTSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (intermediate): ASTIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): Instantiate()
    (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
)


In [None]:
with torch.no_grad():
  outputs = model(inputs)

RuntimeError: ignored

In [None]:
print(model.encoder.layer)

ModuleList(
  (0-11): 12 x ASTLayer(
    (attention): ASTAttention(
      (attention): ASTSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (output): ASTSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (intermediate): ASTIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): Decoder(
      (dropout): Dropout(p=0.2, inplace=False)
      (embedding): Embedding(3072, 64)
      (rnn): LSTM(64, 64, num_layers=2, dropout=0.2)
      (fc): Linear(in_features=64, out_features=64, bias=True)
    )
    (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)