# Multimodal Transformer based on GPT2

## Load the GPT2 model

In [2]:
import transformers


model_name = "distilgpt2"
model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# outputs = model(**inputs, labels=inputs["input_ids"])
# loss = outputs.loss

print(model)

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


## Encode tokens into embeddings

In [23]:
# dir(model)

input_ids = inputs['input_ids'] # 6 tokens
input_embeds = model.transformer.wte(input_ids)

# print("embed_dim:",model.base_model.embed_dim) # 768
# model(inputs_embeds=input_embeds, attention_mask=inputs['attention_mask'], labels=inputs['input_ids'])

print(inputs['input_ids'].shape) # [1,6]
print(input_embeds.shape) # [1,6,768]

torch.Size([1, 6])
torch.Size([1, 6, 768])


## Generate video embeddings

In [18]:
from transformers import AutoImageProcessor, ViTModel
import torch
from datasets import load_dataset

dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs, output_attentions=True, output_hidden_states=True)
last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

  0%|          | 0/1 [00:00<?, ?it/s]

[1, 197, 768]

In [24]:
type(image)

PIL.JpegImagePlugin.JpegImageFile

In [74]:
random_image = torch.rand(3, 224, 224)
image_processor(random_image, do_rescale=False, return_tensors="pt") # from official doc.

{'pixel_values': tensor([[[[ 0.6471,  0.8980, -0.2000,  ...,  0.6314, -0.3412, -0.7255],
          [-0.3333,  0.4667,  0.5294,  ...,  0.4196,  0.6314,  0.7020],
          [ 0.8196, -0.4039, -0.8745,  ...,  0.5922,  0.2314, -0.8275],
          ...,
          [-0.9373, -0.5451,  0.2941,  ...,  0.2314, -0.4039,  0.1922],
          [-0.9373, -0.5922, -0.4902,  ..., -0.3412,  0.6235, -0.3569],
          [-0.7882,  0.5373,  0.3882,  ...,  0.6471, -0.3333,  0.8667]],

         [[ 0.3569,  0.0196, -0.7725,  ..., -0.5137,  0.4824,  0.2000],
          [-0.8196, -0.0980,  0.1451,  ..., -0.6549, -0.6157,  0.6471],
          [ 0.2863,  0.3882, -0.9765,  ..., -0.2784,  0.1294, -0.5529],
          ...,
          [-0.6784, -0.0196, -0.5059,  ..., -0.8353, -0.7176,  0.3333],
          [-0.3098, -0.5373, -0.4667,  ..., -0.9843, -0.7333,  0.1765],
          [-0.2863, -0.0667, -0.6471,  ...,  0.8745, -0.3804, -0.8039]],

         [[ 0.3255,  0.1373, -0.3647,  ...,  0.4196,  0.1294, -0.4431],
          [-0

In [72]:
import numpy
import numpy as np
# torch_random_image = numpy.random.rand(3, 224, 224)
torch_random_image = np.random.randint(0, 256, (224, 224, 3), dtype=numpy.uint8)

# torch_random_image

# 480, 640, 3, uint8
# print(image_processor(torch_random_image,return_tensors="pt")['pixel_values'].shape) # negative values
# print(image_processor(torch_random_image,return_tensors="pt")['pixel_values'].mean())
# print(torch_random_image.mean())
# model(**image_processor(torch_random_image,return_tensors="pt"))
numpy.array(image).shape, image_processor(torch_random_image, return_tensors="pt"), torch_random_image, image_processor(image)

((480, 640, 3),
 {'pixel_values': tensor([[[[ 0.0824,  0.8196,  0.6000,  ...,  0.1843,  0.4118, -0.1216],
           [-0.2706, -0.2078,  0.4275,  ...,  0.1686,  0.0275, -0.8667],
           [-0.8588,  0.3176,  0.0902,  ...,  0.6706,  0.5529,  0.7725],
           ...,
           [ 0.3961,  0.1137,  0.0118,  ..., -0.4431, -0.5059,  0.6078],
           [-0.3176,  0.7020,  0.7255,  ..., -0.3882, -0.7725,  0.7569],
           [-0.2627, -0.9373,  0.2706,  ...,  0.7882, -0.5373, -0.4196]],
 
          [[ 0.8667,  0.5294, -0.7255,  ...,  0.8353, -0.7647,  0.4039],
           [ 0.3961, -0.2784,  0.2863,  ..., -0.6392, -0.3020,  0.8431],
           [-0.6627,  0.2941,  0.1922,  ...,  0.2471, -0.2000,  0.5765],
           ...,
           [-0.4588,  0.2314,  0.7412,  ...,  0.0588, -0.2157, -0.9294],
           [ 0.2784,  0.8745, -0.9765,  ..., -0.8902,  0.1137, -0.9373],
           [ 0.6706,  0.6627,  0.4275,  ..., -0.3725, -0.5843, -0.7569]],
 
          [[-0.0588, -0.9451,  0.6314,  ..., -0.1137,

In [2]:
print([key for key in dir(outputs) if not key.startswith('_')])

['attentions', 'clear', 'copy', 'fromkeys', 'get', 'hidden_states', 'items', 'keys', 'last_hidden_state', 'move_to_end', 'pooler_output', 'pop', 'popitem', 'setdefault', 'to_tuple', 'update', 'values']


In [8]:
# outputs.attentions # attention tensor per layer.
outputs.attentions[-1].shape # [1, 12, 197, 197]

torch.Size([1, 12, 197, 197])

In [8]:
outputs.hidden_states[-1].shape # [1, 197, 768]

TypeError: 'NoneType' object is not subscriptable

In [79]:
outputs.last_hidden_state[:,0,:].shape

torch.Size([1, 768])

In [40]:
# dir(outputs)
# [batch, channel, height, width]
# print(inputs['pixel_values'].shape) # torch.Size([1, 3, 224, 224])
# inputs['pixel_values']
print(inputs['pixel_values'].mean())

tensor(0.0373)


In [None]:
# load ViT model from huggingface.


## Generate audio embeddings

In [7]:
# now, audio tokens.

# load audio dataset, or just use your own random data.

# load audio feature extractor, "AST" model, or WhisperFeatureExtractor

# load model for audio classification or ASTModel.

from transformers import AutoProcessor, ASTModel
import torch
from datasets import load_dataset

dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate

processor = AutoProcessor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
model = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

# audio file is decoded on the fly
inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

# modify the length of the input

inputs.input_values=inputs.input_values[:,:512,:]

with torch.no_grad():
    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

# the input state will be padded, if it is too short.
list(last_hidden_states.shape),list(inputs.input_values.shape) # [1, 1214, 768]




([1, 1214, 768], [1, 512, 128])

In [10]:
# outputs.keys() # ordered dict, ['last_hidden_state', 'pooler_output']

outputs['pooler_output'].shape # [1, 768], ready for audio embedding and classification

torch.Size([1, 768])

In [9]:
outputs.last_hidden_state.shape # torch.Size([1, 1214, 768])

torch.Size([1, 1214, 768])

In [117]:
type(processor), type(model)

(transformers.models.audio_spectrogram_transformer.feature_extraction_audio_spectrogram_transformer.ASTFeatureExtractor,
 transformers.models.audio_spectrogram_transformer.modeling_audio_spectrogram_transformer.ASTModel)

In [129]:
dataset[0]['audio']['array'].shape, inputs.input_values.shape 

# 1024 is the max length of the fbank transform.


# ((93680,), torch.Size([1, 1024, 128]))
# independent of audio length? 

((93680,), torch.Size([1, 1024, 128]))

In [None]:
# accepts at most 10.24 seconds of audio.
# so we can deduce: max_length/100 = max_audio_length_in_seconds

# window_shift = int(sample_frequency * frame_shift / 1000)
# m = num_samples / window_shift (approximately)

# to ensure consistency you might want to pad or truncate fbank output 

# PS: WhisperFeatureExtractor uses chunk_length=30 to limit input to 30 seconds.
# Whisper model is bounded to 30 second inputs. Shorter inputs need to be padded.

In [133]:
# get fbank actual length

import torchaudio

waveform = torch.from_numpy(dataset[0]['audio']['array']).unsqueeze(0)
sample_frequency = processor.sampling_rate
num_mel_bins = processor.num_mel_bins

fbank = torchaudio.compliance.kaldi.fbank(waveform, sample_frequency = sample_frequency, window_type='hanning', num_mel_bins = num_mel_bins)
fbank.shape # torch.Size([584, 128])


torch.Size([584, 128])

In [None]:
# to run multilingual transcription, first slice the audio by speakers, then run whisper on each segment
# https://github.com/pyannote/pyannote-audio (speaker diarization)

# For audio segmentation and classification, check You-Only-Hear-Once
# So we can differentiate speech, music or noise