In [17]:
import soundfile as sf
import os
import numpy as np
import torchaudio
from IPython.display import Audio
import io

import torch


from config import Wav2Vec2Config
from model import Wav2Vec2ForPreTraining,Wav2Vec2FeatureEncoder,Wav2Vec2GumbelVectorQuantizer,_compute_mask_indices,Wav2Vec2Encoder


def resample_audio_torchaudio(file_path, original_sample_rate=44100, target_sample_rate=16000):
    waveform, sample_rate = torchaudio.load(file_path)
    if sample_rate != original_sample_rate:
        raise ValueError(f"Expected sample rate to be {original_sample_rate}, but got {sample_rate}")
    
    resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=target_sample_rate)
    waveform = resampler(waveform)
    
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    
    return waveform.squeeze(), target_sample_rate

def load_audio(path):
    waveform,sample_rate = torchaudio.load(path)

    return waveform.mean(dim=0), sample_rate

def load_dataset(file_list):
    dataset = []
    for file_path in file_list:
        if file_path.endswith('.mp3'):
            audio, sample_rate = resample_audio_torchaudio(file_path)
            dataset.append(audio)
    return torch.stack(dataset)


dataset = load_dataset([f'data/mp3_train_files/Gould/Gould - WTC_clip_{i}.mp3' for i in range(1,4)])
dataset.shape

torch.Size([3, 80000])

In [21]:
config = Wav2Vec2Config()


feature_encoder = Wav2Vec2FeatureEncoder(config)


latent_reps= feature_encoder(dataset)#.transpose(1,2)
print(latent_reps.shape)


torch.Size([3, 512, 249])


In [22]:
encoder = Wav2Vec2Encoder(config)

encoder(latent_reps)

RuntimeError: Given groups=16, weight of size [768, 48, 128], expected input[3, 249, 512] to have 768 channels, but got 249 channels instead

In [None]:
quantizer = Wav2Vec2GumbelVectorQuantizer(config)

mask_time_indices = torch.tensor(_compute_mask_indices(shape=(latent_reps.shape[0], latent_reps.shape[1]), mask_prob=0.2, mask_length=2))

codevectors, perplexity = quantizer(hidden_states = latent_reps,mask_time_indices=mask_time_indices) 

codebook = quantizer.codevectors

print(codevectors.shape)

In [50]:
config = Wav2Vec2Config()
model = Wav2Vec2ForPreTraining(config)

In [60]:
input_values = [] #  Float values of input raw speech waveform.
attention_mask = []  # bool tensor (batch_size, seq_len)
mask_time_indices = []# bool tensor (batch_size, seq_len)
sampled_negative_indices = [] # bool tensor (batch_size, sequence_length, num_negatives)
output_attentions = torch.BoolTensor(0)
output_hidden_states = torch.BoolTensor(0)
return_dict = torch.BoolTensor(1)


In [61]:
model

Wav2Vec2ForPreTraining(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (