In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import torch
from transformers import Wav2Vec2Model

# Pick the compute device once; everything below follows from it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
on_gpu = device == 'cuda'

# Flash Attention 2 needs a CUDA device and half-precision weights, so the
# CPU fallback must load with the default ("eager") attention in float32 --
# otherwise the `else 'cpu'` branch above could never actually work.
model = Wav2Vec2Model.from_pretrained(
    "facebook/wav2vec2-large-960h-lv60-self",
    torch_dtype=torch.float16 if on_gpu else torch.float32,
    attn_implementation="flash_attention_2" if on_gpu else "eager",
).to(device)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Dump the module hierarchy so the sub-module names (feature_extractor,
# feature_projection, ...) can be looked up before calling them individually.
print(model)

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (1-4): 4 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=1024, bias=True)
    (dropout)

In [6]:
from torchinfo import summary


# Shape of the random probe batch used only to trace tensor sizes through the
# model. 41216 is the per-item length printed for Wav2Vec2PretrainDataset
# further down in this notebook.
BATCH_SIZE = 32
NUM_SAMPLES = 41216

# Take the dtype from the model instead of hard-coding float16, so the probe
# input stays compatible if the model is loaded in a different precision.
inp = torch.rand(size=(BATCH_SIZE, NUM_SAMPLES), dtype=model.dtype).to(device=device)

# Per-layer table of input/output shapes, parameter counts and trainability,
# with rows labelled by the attribute name of each sub-module.
summary(model=model,
        input_data=inp,
        col_names=['input_size', 'output_size', 'num_params', 'trainable'],
        col_width=20,
        row_settings=['var_names'])

Layer (type (var_name))                                           Input Shape          Output Shape         Param #              Trainable
Wav2Vec2Model (Wav2Vec2Model)                                     [32, 41216]          [32, 128, 512]       1,024                True
├─Wav2Vec2FeatureEncoder (feature_extractor)                      [32, 41216]          [32, 512, 128]       --                   True
│    └─ModuleList (conv_layers)                                   --                   --                   --                   True
│    │    └─Wav2Vec2LayerNormConvLayer (0)                        [32, 1, 41216]       [32, 512, 8242]      6,656                True
│    │    └─Wav2Vec2LayerNormConvLayer (1)                        [32, 512, 8242]      [32, 512, 4120]      787,968              True
│    │    └─Wav2Vec2LayerNormConvLayer (2)                        [32, 512, 4120]      [32, 512, 2059]      787,968              True
│    │    └─Wav2Vec2LayerNormConvLayer (3)               

In [8]:
# Stage 1: run only the convolutional feature encoder on the probe batch
# and inspect the size of what comes out.
ip1 = model.feature_extractor(inp)
ip1.size()

torch.Size([32, 512, 128])

In [12]:
# Stage 2: the projection's LayerNorm/Linear act on the last axis (512 features),
# so swap the last two axes of the encoder output before projecting.
ip2 = model.feature_projection(ip1.transpose(1, 2))
# Show the shape of every tensor the projection returns.
tuple(part.shape for part in ip2)

(torch.Size([32, 128, 1024]), torch.Size([32, 128, 512]))

In [15]:
# Stage 3: feed the first tensor returned by feature_projection (the one with
# 1024 features on the last axis) into the encoder.
projected_states = ip2[0]
ip3 = model.encoder(projected_states)
ip3.last_hidden_state.size()

torch.Size([32, 128, 1024])

In [16]:
import inspect

# Fetch the source text of the model's forward method, then show it.
forward_source = inspect.getsource(model.forward)
print(forward_source)

    @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Wav2Vec2BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if 

In [5]:
from dataset import Wav2Vec2PretrainDataset

# Raw string literal: the Windows path is full of backslashes, and sequences
# such as "\A" or "\S" are invalid escapes in a normal string (deprecated, and
# a warning on newer Python versions). The resulting path text is unchanged.
dataset = Wav2Vec2PretrainDataset(
    root=r'E:\Amrita\Subjects\Sem 5\BMSP paper work\Dataset\Italian health pretrain 1'
)
# Number of items the dataset exposes.
len(dataset)

19028

In [6]:
# Pull a single item and check the size of its 'Input' tensor.
sample = dataset[1]
sample['Input'].shape

torch.Size([41216])

In [7]:
from dataset import get_data_loaders

# Raw string literals keep the Windows backslashes literal; in a normal string
# "\A", "\S", "\B", "\D" and "\I" are invalid escape sequences (deprecated, and
# a warning on newer Python versions). The path text passed on is unchanged.
train_loader = get_data_loaders(
    root1=r'E:\Amrita\Subjects\Sem 5\BMSP paper work\Dataset\Spanish healthy pretrain 1',
    root2=r'E:\Amrita\Subjects\Sem 5\BMSP paper work\Dataset\Italian health pretrain 1',
)

In [8]:
# Take the first batch from the loader and display it as the cell result.
batch = next(iter(train_loader))
batch

{'Input': tensor([[-5.1446e-02, -1.3312e-02,  4.3040e-03,  ...,  1.3010e-01,
           1.5021e-01,  1.4316e-01],
         [ 3.1222e-02,  3.8302e-02,  1.6557e-02,  ..., -2.8636e-01,
          -4.0722e-01, -4.2896e-01],
         [-2.8291e-01, -3.1649e-01, -3.5919e-01,  ...,  2.3169e-01,
           1.2125e-01,  2.1746e-01],
         ...,
         [-1.3131e-02, -7.1301e-02, -1.5411e-01,  ...,  2.1850e+00,
           1.8551e+00,  1.9024e+00],
         [-9.5647e-01, -1.0360e+00, -1.0356e+00,  ...,  1.0123e-01,
           1.5509e-03, -1.0434e-01],
         [-3.5962e+00, -3.8776e+00, -3.3064e+00,  ...,  1.1614e-02,
          -6.3533e-03, -2.1652e-02]]),
 'Mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]])}