<a href="https://colab.research.google.com/github/JTStephens18/AudioTranscriptor/blob/main/V5_VisionEncDec_audioProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install yt-dlp
# Install huggingface audio datasets
! pip install datasets[audio]
! pip install transformers evaluate jiwer
!pip install accelerate -U
!pip install nltk

Collecting yt-dlp
  Downloading yt_dlp-2023.10.13-py2.py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mutagen (from yt-dlp)
  Downloading mutagen-1.47.0-py3-none-any.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pycryptodomex (from yt-dlp)
  Downloading pycryptodomex-3.19.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting websockets (from yt-dlp)
  Downloading websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting brotli

In [2]:
from datasets import load_dataset, Audio, Dataset
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor, AutoTokenizer, AutoProcessor, ASTFeatureExtractor
import subprocess
import os
from pathlib import Path
import torch
import torch.nn as nn
import torchaudio
import soundfile as sf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import librosa
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
ds = load_dataset('google/MusicCaps', split="train")
ds

Downloading readme:   0%|          | 0.00/5.06k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['ytid', 'start_s', 'end_s', 'audioset_positive_labels', 'aspect_list', 'caption', 'author_id', 'is_balanced_subset', 'is_audioset_eval'],
    num_rows: 5521
})

In [4]:
ds = ds.remove_columns(["author_id", "is_balanced_subset", "is_audioset_eval", "audioset_positive_labels"])

In [5]:
ds = ds.train_test_split(test_size=0.2)

In [6]:
def stereo_to_mono(wav):
  chan_1 = wav[0][:]
  chan_2 = wav[1][:]
  mono = (chan_1 + chan_2) / 2
  return mono

In [7]:
def download_clip(
    video_id,
    output_filename,
    start_time,
    end_time,
    tmp_dir='/musiccaps',
    num_attempts=5,
    url_base='https://www.youtube.com/watch?v='
):

  status = False
  command = f"""
        yt-dlp --quiet --no-warnings -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" {url_base}{video_id} --force-keyframes-at-cuts
    """.strip()
  attempts = 0
  while True:
    try:
      # output = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
      output = os.system(command)
    except subprocess.CalledProcess.Error as err:
      attempts += 1
      if attempts == num_attempts:
        return status, err.output
    else:
      break

  # Check if video was successfully saved
  status = os.path.exists(output_filename)
  return status, 'Downloaded'


def toUppercase(aspect_list):
  a_list = aspect_list[1:-1]
  new_list = ''
  for word in a_list:
    new_list = ''.join(a_list).upper().replace(',', '')
    # new_list = new_list.replace(' ', '|')
    if(len(new_list) > 128):
      new_list = new_list[:128]
  return new_list


def process(example):
  output_path = str(data_dir / f"{example['ytid']}.wav")
  status = True
  aspect_string = toUppercase(example['aspect_list'])
  if not os.path.exists(output_path):
    status = False
    status, log = download_clip(
        example['ytid'],
        output_path,
        example['start_s'],
        example['end_s'],
    )

  example['aspect_string'] = aspect_string
  example["audio"] = output_path
  example['download_status'] = status
  example["image_path"] = f'./spectrograms/{example["ytid"]}.png'
  return example

In [8]:
def resample_waveform(example):
  filepath = example["audio"]
  y, sr = librosa.load(filepath, sr=16000)
  sf.write(filepath, y, sr)
  waveform, sampling_rate = torchaudio.load(filepath)
  if(waveform.shape[0] == 2):
    waveform = stereo_to_mono(waveform)
  example["waveform"] = waveform
  return example

In [9]:
samples_to_load = 10
cores = 4
sampling_rate = 16000
writer_batch_size = 1000
data_dir = "./music_data"
upper_limit = 5521
lower_limit = 5000

data_dir = Path(data_dir)
data_dir.mkdir(exist_ok=True, parents=True)

# ds = ds.select(range(lower_limit, upper_limit))

# ds = ds.map(
#     process,
#     num_proc=cores,
#     writer_batch_size=writer_batch_size,
#     keep_in_memory=False
# )


# ds = ds.filter(lambda ex: ex["download_status"] == True)

ds_train = ds["train"].select(range(samples_to_load))
ds_test = ds["test"].select(range(int(samples_to_load*0.2)))

ds_train = ds_train.map(
    process,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
    # batched=True,
)

ds_train = ds_train.filter(lambda ex: ex["download_status"] == True)
# ds_train = ds_train.cast_column("audio", Audio(sampling_rate=sampling_rate))
ds_train = ds_train.map(
    resample_waveform,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
)

ds_test = ds_test.map(
    process,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
    # batched=True,
)

ds_test = ds_test.filter(lambda ex: ex["download_status"] == True)
# ds_test = ds_test.cast_column("audio", Audio(sampling_rate=sampling_rate))
ds_test = ds_test.map(resample_waveform, num_proc=cores, writer_batch_size=writer_batch_size, keep_in_memory=False)

Map (num_proc=4):   0%|          | 0/10 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10 [00:00<?, ? examples/s]

num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.


Map (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2 [00:00<?, ? examples/s]

num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.


Map (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

# AST Training

In [10]:
from transformers import AutoModel
model = AutoModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

Downloading (…)lve/main/config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [28]:
print(model)

ASTModel(
  (embeddings): ASTEmbeddings(
    (patch_embeddings): ASTPatchEmbeddings(
      (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ASTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ASTLayer(
        (attention): ASTAttention(
          (attention): ASTSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ASTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ASTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation(

In [None]:
print(model.config)

In [11]:
feature_extractor = ASTFeatureExtractor()
feature_extractor

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -4.2677393,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 4.5689974
}

In [12]:
ds_train[0]["caption"]

'Female singers sing in funny animated vocals. The song is medium tempo with a groovy bass line, keyboard accompaniment, arpeggiated harmony tones and steady drumming rhythm. The song is cheerful and energetic. The audio quality is average.'

In [13]:
wav = ds_train[0]["waveform"]
wav = np.asarray(wav).squeeze()
print(wav.shape)

(160000,)


In [14]:
inputs = feature_extractor(wav, sampling_rate=sampling_rate, padding="max_lenght", return_tensors="pt").input_values
print(inputs.shape)

torch.Size([1, 1024, 128])


In [None]:
# outputs
# print(outputs["pooler_output"].shape)
# print(outputs.keys())
# pred = outputs["pooler_output"].argmax(-1).item()
# print(pred)
# print(model.config.id2label[pred])

torch.Size([1, 768])


# Transformer Decoder Implementation

In [None]:
# Create a starting output state
  # Pass this into an output embedding (size of 768)
  # Use positional encoding
  # Pass this into a masked multi-head attention mechanism
  # Add and normalize

# Take the output from stage 1 and combine it with the output from the encoder
  # Multi-head attention
  # Add and normalize

# Feed forward
  # Add and normalize

# Linear activation function
# Softmax



In [15]:
class TransformerBlock(nn.Module):
  def __init__(self, embed_size, heads, dropout, forward_expansion):
    super(TransformerBlock, self).__init__()
    self.attention = nn.MultiheadAttention(embed_size, heads)
    self.norm1 = nn.LayerNorm(embed_size)
    self.norm2 = nn.LayerNorm(embed_size)

    self.feed_forward = nn.Sequential(
        nn.Linear(embed_size, forward_expansion*embed_size),
        nn.ReLU(),
        nn.Linear(forward_expansion*embed_size, embed_size)
    )
    self.dropout = nn.Dropout(dropout)

  def forward(self, value, key, query, mask):
    attention = self.attention(value, key, query, mask)

    x = self.dropout(self.norm1(attention + query))
    forward = self.feed_forward(x)
    out = self.dropout(self.norm2(forward + x))
    return out

In [65]:
class DecoderBlock(nn.Module):
  def __init__(self, embed_size, heads, forward_expansion, dropout, device):
    super(DecoderBlock, self).__init__()
    self.attention = nn.MultiheadAttention(embed_size, heads)
    self.norm = nn.LayerNorm(embed_size)
    self.transformer_block = TransformerBlock(
        embed_size, heads, dropout, forward_expansion
    )
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, value, key, src_mask, trg_mask):
    attention, _ = self.attention(x, x, x, trg_mask)
    query = self.dropout(self.norm(attention + x))
    out = self.transformer_block(value, key, query, src_mask)
    return out

In [66]:
class Decoder(nn.Module):
  def __init__(self, trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length):
    super(Decoder, self).__init__()
    self.device = device
    self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
    # self.position_embedding = nn.Embedding(embed_size, max_length)
    self.position_embedding = nn.Parameter(torch.randn(max_length, embed_size))

    self.layers = nn.ModuleList(
        [DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
        for _  in range(num_layers)]
    )

    self.fc_out = nn.Linear(embed_size, trg_vocab_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, enc_out, src_mask, trg_mask):
    N, seq_length = x.shape
    # positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
    # positions = position_embedding[:seq_length]
    # x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))
    x = self.dropout((self.word_embedding(x) + self.position_embedding[:seq_length]))

    for layer in self.layers:
      x = layer(x, enc_out, enc_out, src_mask, trg_mask)

    out = self.fc_out(x)
    return out


In [18]:
class Instantiate(nn.Module):
  def __init__(self):
    super(Instantiate, self).__init__()

  def forward(self, x):
    return x

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
src_vocab_size = 26
trg_vocab_size = 26
embed_size = 768
num_layers = 12
heads = 12
forward_expansion = 4
dropout = 0
device = device
max_length = 128

src_pad_idx = 0
trg_pad_idx = 0

In [27]:
model.layernorm = Instantiate()
# model.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, droput, device, max_length)

In [26]:
class Transformer(nn.Module):
  def __init__(self,
               src_vocab_size,
               trg_vocab_size,
               src_pad_idx,
               trg_pad_idx,
               model,
               embed_size=768,
               num_layers=12,
               heads=12,
               forward_expansion=4,
               dropout=0,
               device=device,
               max_length=128,
              ):
    super(Transformer, self).__init__()

    self.encoder = model
    self.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length)

    self.src_pad_idx = src_pad_idx
    self.trg_pad_idx = trg_pad_idx
    self.device = device

  def make_src_mask(self, src):
    src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
    # (N, 1, 1, src_length)
    return src_mask.to(self.device)

  def make_trg_mask(self, trg):
    N, trg_len = trg.shape
    trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
        N, 1, trg_len, trg_len
    )
    return trg_mask.to(self.device)

  def forward(self, src, trg):
    src_mask = self.make_src_mask(src)
    trg_mask = self.make_trg_mask(trg)

    enc_src = self.encoder(src, src_mask)
    out = self.decoder(trg, enc_src, src_mask, trg_mask)
    return out


In [28]:
newModel = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, model).to(device)

In [24]:
print(newModel)

cpu


In [29]:
trg = torch.tensor([[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]]).to(device)
inputs = inputs.to(device)
with torch.no_grad():
  outputs = newModel(inputs, trg)

RuntimeError: ignored

# Debugging

In [30]:
with torch.no_grad():
  outputs = model(inputs)

In [46]:
print(outputs.keys())
outputs["last_hidden_state"].shape

odict_keys(['last_hidden_state', 'pooler_output'])


torch.Size([1, 1214, 768])

In [67]:
decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length).to(device)

In [None]:
print(decoder)

In [99]:
def make_src_mask(src):
  src_mask = (src != src_pad_idx).unsqueeze(1).unsqueeze(2)
  src_mask = (src != src_pad_idx).unsqueeze(1)
  # (N, 1, 1, src_length)
  return src_mask.to(device)

In [41]:
def make_trg_mask(trg):
  N, trg_len = trg.shape
  # trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
  #     N, 1, trg_len, trg_len
  # )
  trg_mask = torch.tril(torch.ones(trg_len, N))
  return trg_mask.to(device)

In [100]:
enc_src = outputs["pooler_output"]
src_mask = make_src_mask(inputs)
trg_mask = make_trg_mask(trg)
decodeOut = decoder(trg, enc_src, src_mask, trg_mask)

AssertionError: ignored

In [52]:
N, seq_length = trg.shape
positions = torch.arange(0, seq_length).expand(N, seq_length).to(device)
word_embedding = nn.Embedding(trg_vocab_size, embed_size).to(device)
# position_embedding = nn.Embedding(embed_size, max_length).to(device)
position_embedding = nn.Parameter(torch.randn(max_length, embed_size)).to(device)
dropoutFunc =  nn.Dropout(dropout)

In [54]:
word = word_embedding(trg)
# pos = position_embedding(positions)
pos = position_embedding[:seq_length]
print(word.shape)
print(pos.shape)
x = dropoutFunc(word + pos)
# x = dropout((word + pos))
print(x.shape)

torch.Size([1, 18, 768])
torch.Size([18, 768])
torch.Size([1, 18, 768])


In [126]:
self_attention = nn.MultiheadAttention(embed_size, heads)
self_norm = nn.LayerNorm(embed_size)
attention, _ = self_attention(x, x, x, trg_mask)
print(attention)
query = dropoutFunc(self_norm(attention + x))
print(query.shape)
# Modifies enc_src to be the same shape as query
test_enc_src = enc_src.unsqueeze(1).expand(-1, query.size(1), -1)
print(test_enc_src.shape)
print(src_mask.shape)
test_src_mask = src_mask[-1, -1, :, :]
print(test_src_mask.shape)

tensor([[[-0.1607,  0.4814, -0.1328,  ...,  0.0072,  0.0647,  0.0122],
         [-0.4174,  0.1596, -0.2115,  ..., -0.3777,  0.1598, -0.5464],
         [-0.3788, -1.0916,  0.6458,  ..., -0.0882, -0.0187,  0.5969],
         ...,
         [-0.3935, -0.8019, -0.3426,  ...,  0.2568, -0.5754,  0.7237],
         [-0.3196, -0.0794, -0.2700,  ..., -0.2493, -0.3895,  0.7070],
         [-0.6386, -0.1491, -0.6067,  ..., -0.1334,  0.2426, -0.4819]]],
       grad_fn=<ViewBackward0>)
torch.Size([1, 18, 768])
torch.Size([1, 18, 768])
torch.Size([1, 1, 1024, 128])
torch.Size([1024, 128])


In [122]:
transform_attention = self_attention(query, test_enc_src, test_enc_src, trg_mask)

In [84]:
self_transformer_block = TransformerBlock(embed_size, heads, dropout, forward_expansion)
print(enc_src.shape)
trans = self_transformer_block(enc_src, enc_src, query, src_mask)

torch.Size([1, 768])
torch.Size([1, 768])


AssertionError: ignored

In [38]:
print(trg_mask.shape)

torch.Size([18, 18])


In [29]:
target = [[1,3,2,4,5], [6,7,8,9,10]]
target = np.asarray(target)
target_mask = make_trg_mask(target)

tensor([[[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]],


        [[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]]])


In [30]:
print(target_mask)
print(target_mask.shape)

tensor([[[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]],


        [[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]]])
torch.Size([2, 1, 5, 5])


# LSTM Decoder Implementation

In [None]:
print(model.encoder.layer)

ModuleList(
  (0-11): 12 x ASTLayer(
    (attention): ASTAttention(
      (attention): ASTSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (output): ASTSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (intermediate): ASTIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): ASTOutput(
      (dense): Linear(in_features=3072, out_features=768, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
)


In [None]:
class Decoder(nn.Module):
  def __init__(self, input=3072, output_size=64, embedding_size=64, hidden_size=64, num_layers=2, p=0.2):
    super(Decoder, self).__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.dropout = nn.Dropout(p)
    self.embedding = nn.Embedding(input, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
    self.fc = nn.Linear(hidden_size, output_size)

    self.h = [torch.zeros(num_layers, 4, hidden_size) for _ in range(2)]

  def forward(self, x, hidden):
    x = x.unsqueeze(0)
    # embedding = self.dropout(self.embedding(x))
    embedding = self.embedding(x.int())
    outputs, (hidden, cell) = self.rnn(embedding, (hidden, self.h))
    predictions = self.fc(outputs)
    predictions = predictions.squeeze(0)
    return predictions, hidden, cell


In [None]:
class Decoder(nn.Module):
  def __init__(self, vocab_size, n_hidden, n_layers):
    super(Decoder, self).__init__()

    self.input_hidden = nn.Embedding(vocab_size, n_hidden)
    self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
    self.hidden_out = nn.Linear(n_hidden, vocab_size)
    self.h = [torch.zeros(n_layers, 4, n_hidden) for _ in range(2)]

  def forward(self, x, bias):
    res, h = self.rnn(self.input_hidden(x.int()), self.h)
    self.h = [h_.detach() for h_ in h]
    return self.hidden_out(res)

In [None]:
# model.encoder
# model.encoder.layer.output = LSTM(3072, 64)
h = [torch.zeros(2, 4, 64) for _ in range(2)]
print(len(h))
print(h[0].shape)
input_h = nn.Embedding(3072, 64)
print(input_h)

2
torch.Size([2, 4, 64])
Embedding(3072, 64)


In [None]:
class Instantiate(nn.Module):
  def __init__(self):
    super(Instantiate, self).__init__()

  def forward(self, x, bias):
    return x

In [None]:
# print(model.encoder.layer[0].output)
for layer in model.encoder.layer:
  # layer.output = Decoder(3072, 64, 2)
  layer.output = Instantiate()

print(model.encoder.layer[0:12])

ModuleList(
  (0-11): 12 x ASTLayer(
    (attention): ASTAttention(
      (attention): ASTSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (output): ASTSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (intermediate): ASTIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): Instantiate()
    (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
)


In [None]:
with torch.no_grad():
  outputs = model(inputs)

RuntimeError: ignored

In [None]:
print(model.encoder.layer)

ModuleList(
  (0-11): 12 x ASTLayer(
    (attention): ASTAttention(
      (attention): ASTSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (output): ASTSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (intermediate): ASTIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): Decoder(
      (dropout): Dropout(p=0.2, inplace=False)
      (embedding): Embedding(3072, 64)
      (rnn): LSTM(64, 64, num_layers=2, dropout=0.2)
      (fc): Linear(in_features=64, out_features=64, bias=True)
    )
    (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)