In [29]:
import torch
import torchaudio
import torch.nn.functional as F
import os
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from IPython.display import Audio

## 0. Experimenting with TorchAudio

In [30]:
def plot_waveform(waveform, sample_rate):
    waveform = waveform.numpy()

    num_channels, num_frames = waveform.shape
    time_axis = torch.arange(0, num_frames) / sample_rate

    figure, axes = plt.subplots(num_channels, 1)
    if num_channels == 1:
        axes = [axes]
    for c in range(num_channels):
        axes[c].plot(time_axis, waveform[c], linewidth=1)
        axes[c].grid(True)
        if num_channels > 1:
            axes[c].set_ylabel(f"Channel {c+1}")
    figure.suptitle("waveform")

def plot_specgram(waveform, sample_rate, title="Spectrogram"):
    waveform = waveform.numpy()

    num_channels, num_frames = waveform.shape

    figure, axes = plt.subplots(num_channels, 1)
    if num_channels == 1:
        axes = [axes]
    for c in range(num_channels):
        axes[c].specgram(waveform[c], Fs=sample_rate)
        if num_channels > 1:
            axes[c].set_ylabel(f"Channel {c+1}")
    figure.suptitle(title)

## 1. Creating the Dataset

In [31]:
from torchaudio.transforms import Resample

In [66]:
class LibriDataset(Dataset):

    def __init__(self, audioDir, newSampleFreq, maxLenght):
        self.audioList = self.getAudioList(audioDir)
        self.resampler = Resample(new_freq=newSampleFreq)
        self.maxLenght = maxLenght

    def getAudioList(self,audioDir):
        flac_files = []
        for root, dirs, files in os.walk(audioDir):
            for file in files:
                if file.endswith(".flac"):
                    file_path = os.path.join(root, file)
                    flac_files.append(file_path)
        return flac_files


    def __len__(self):
        return len(self.audioList)

    def __getitem__(self, idx):
        waveform, sampleRate = torchaudio.load(self.audioList[idx])
        waveformResampled = self.resampler(waveform)
        waveformPadded = F.pad(waveformResampled, pad = (0,self.maxLenght - len(waveformResampled[0])))
        
        return waveformPadded, sampleRate


In [67]:
SAMPLE_FREQ = 16000
AUDIO_MAX_DURATION = 60

trainDataset = LibriDataset("data_cut", newSampleFreq = SAMPLE_FREQ, maxLenght = SAMPLE_FREQ * AUDIO_MAX_DURATION)

len(trainDataset),trainDataset[2][0].shape

(57, torch.Size([1, 960000]))

In [68]:
BATCH_SIZE = 16

dataLoader = DataLoader(trainDataset, batch_size=BATCH_SIZE, shuffle=True)

In [69]:
for idx, waveform in enumerate(dataLoader):
    print(idx, waveform)
    break

0 [tensor([[[ 0.0023,  0.0028,  0.0011,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.0007, -0.0007, -0.0005,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0001,  0.0002,  0.0007,  ...,  0.0000,  0.0000,  0.0000]],

        ...,

        [[ 0.0335,  0.0340,  0.0333,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.0011,  0.0002,  0.0010,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0272,  0.0275,  0.0277,  ...,  0.0000,  0.0000,  0.0000]]]), tensor([16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000,
        16000, 16000, 16000, 16000, 16000, 16000])]


In [70]:
print(trainDataset[0][0].shape)
print(len(trainDataset))

Audio(trainDataset[0][0], rate = SAMPLE_FREQ)

torch.Size([1, 960000])
57


## W2V-BERT TEST

In [71]:
from transformers import Wav2Vec2BertModel, AutoFeatureExtractor
import torch

In [72]:
model = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0")
processor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")

In [73]:
model

Wav2Vec2BertModel(
  (feature_projection): Wav2Vec2BertFeatureProjection(
    (layer_norm): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=160, out_features=1024, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): Wav2Vec2BertEncoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x Wav2Vec2BertEncoderLayer(
        (ffn1_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (ffn1): Wav2Vec2BertFeedForward(
          (intermediate_dropout): Dropout(p=0.0, inplace=False)
          (intermediate_dense): Linear(in_features=1024, out_features=4096, bias=True)
          (intermediate_act_fn): SiLU()
          (output_dense): Linear(in_features=4096, out_features=1024, bias=True)
          (output_dropout): Dropout(p=0.0, inplace=False)
        )
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attn_dropout): Drop

In [74]:
inputs = processor(trainDataset[0][0][:30000], sampling_rate=SAMPLE_FREQ, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs, output_hidden_states = True)

In [75]:
CHOSEN_OUTPUT_LAYER = 20

embedding = outputs.hidden_states[CHOSEN_OUTPUT_LAYER]
embedding.shape

torch.Size([1, 2999, 1024])

In [76]:
print(outputs.extract_features.shape)
print(outputs.last_hidden_state.shape)
print(outputs.last_hidden_state)
print(outputs.hidden_states[24])
print(len(outputs.hidden_states))

torch.Size([1, 2999, 160])
torch.Size([1, 2999, 1024])
tensor([[[ 0.1351, -0.0008,  0.0412,  ...,  0.0472, -0.0231,  0.0368],
         [ 0.1478,  0.0595,  0.0004,  ...,  0.0234,  0.0153,  0.1185],
         [ 0.1297,  0.0117,  0.0134,  ..., -0.0035,  0.0183,  0.1646],
         ...,
         [ 0.0045, -0.0280, -0.0077,  ..., -0.0426,  0.0236,  0.0476],
         [ 0.0068, -0.0258, -0.0111,  ..., -0.0478,  0.0212,  0.0438],
         [ 0.0058, -0.0223, -0.0214,  ..., -0.0372,  0.0138,  0.0230]]])
tensor([[[ 0.1351, -0.0008,  0.0412,  ...,  0.0472, -0.0231,  0.0368],
         [ 0.1478,  0.0595,  0.0004,  ...,  0.0234,  0.0153,  0.1185],
         [ 0.1297,  0.0117,  0.0134,  ..., -0.0035,  0.0183,  0.1646],
         ...,
         [ 0.0045, -0.0280, -0.0077,  ..., -0.0426,  0.0236,  0.0476],
         [ 0.0068, -0.0258, -0.0111,  ..., -0.0478,  0.0212,  0.0438],
         [ 0.0058, -0.0223, -0.0214,  ..., -0.0372,  0.0138,  0.0230]]])
25


In [79]:
from sklearn.cluster import MiniBatchKMeans

n_clusters = 10
kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=BATCH_SIZE)

In [85]:
for batch in dataLoader:
    waveforms, sample_rates = batch
    inputs = processor(waveforms, sampling_rate=sample_rates[0], return_tensors="pt", padding=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state
        
    ##embeddings = embeddings.mean(dim=1).cpu().numpy()
    ##kmeans.partial_fit(batch_embeddings)
    break

ValueError: Input waveform must have only one dimension, shape is (16, 960000)