In [13]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data preparation

In [14]:
!git clone https://github.com/facebookresearch/libri-light.git
!git clone https://github.com/Haislich/AudioLM.git

fatal: destination path 'libri-light' already exists and is not an empty directory.
fatal: destination path 'AudioLM' already exists and is not an empty directory.


In [15]:
#!python /content/libri-light/data_preparation/build_all_stats.py /content/drive/MyDrive/AudioLMDataset/datasets_raw/small/small /content/drive/MyDrive/AudioLMDataset/ecciu

# Imports

In [16]:
#!pip install datasets
!pip install librosa  ##Nuova dipendenza, devo aggiungerla
import os
import numpy as np
from transformers import Wav2Vec2BertModel, AutoProcessor
import torch
from torch.nn.utils.rnn import pad_sequence     ##Nuova dipendenza, devo aggiungerla
from torch.utils.data import Dataset, DataLoader ##Nuova dipendenza, devo aggiungerla
import random as rd
from tqdm import tqdm
import librosa as lb

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Classes

## Utils

In [17]:
def set_seed(seed):
    rd.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def count_flac(data_path):
  data_list = os.walk(data_path)
  cnt=0

  for dirpath, dirnames, filenames in data_list:
    for filename in filenames:
      path_to_audio = os.path.join(dirpath, filename)
      if path_to_audio.endswith(".flac"):
        cnt+=1

  return cnt

## Preprocessing

## Model

In [18]:
set_seed(42)
#from datasets import load_dataset

data_path = "/content/drive/MyDrive/AudioLMDataset/dataset_segmented"
#dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
processor = AutoProcessor.from_pretrained("hf-audio/wav2vec2-bert-CV16-en")
model = Wav2Vec2BertModel.from_pretrained("hf-audio/wav2vec2-bert-CV16-en")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Wav2Vec2BertModel(
  (feature_projection): Wav2Vec2BertFeatureProjection(
    (layer_norm): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=160, out_features=1024, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2BertEncoder(
    (dropout): Dropout(p=0.1, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x Wav2Vec2BertEncoderLayer(
        (ffn1_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (ffn1): Wav2Vec2BertFeedForward(
          (intermediate_dropout): Dropout(p=0.1, inplace=False)
          (intermediate_dense): Linear(in_features=1024, out_features=4096, bias=True)
          (intermediate_act_fn): SiLU()
          (output_dense): Linear(in_features=4096, out_features=1024, bias=True)
          (output_dropout): Dropout(p=0.1, inplace=False)
        )
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attn_dropout): Drop

In [19]:
class AudioDataset(Dataset):
  def __init__(self, path, processor, sr=16000):
    self.path = path
    self.processor = processor
    self.sr = sr
    self.audios = self.collate_audios(path)
    self.num_audio = self.__len__(),
    self.processor_dim = 2999,
    self.target_length = sr*60

  def collate_audios(self, path):
    path_audios = []
    for dirpath, _, filenames in os.walk(path):
      for filename in filenames:
        path_to_audio = os.path.join(dirpath, filename)
        if path_to_audio.endswith(".flac"):
          path_audios.append(path_to_audio)

    return path_audios

  def __len__(self):
    return len(self.audios)

  def __getitem__(self, idx):
    path = self.audios[idx]
    audio, _ = lb.load(path, sr=self.sr)
    if(len(audio)) > self.target_length:
      audio = audio[:self.target_length]

    input = processor(
        audio=audio,
        return_tensors="pt",
        padding=True,
        sampling_rate=16000
      )
    len_tens = input['input_features'].shape[1]
    if len_tens < self.processor_dim[0]:
      #print(input['input_features'].shape)
      paddin_len = self.processor_dim[0]-len_tens
      padding = (0, 0, 0, self.processor_dim[0]-len_tens, 0, 0)
      input['input_features'] = pad(input['input_features'], padding, "constant", value=0)
      padding_mask = (0, paddin_len)
      input['attention_mask'] = pad(input['attention_mask'], padding_mask, "constant", value=0)

    return input



In [28]:
def collate_fn(batch):
  input_features = [input['input_features'] for input in batch]
  attention_masks = [input['attention_mask'] for input in batch]
  #print("Output prima torch.stack: ", [input.shape for input in input_features])
  #print("Output prima torch.stack: ", [input.shape for input in attention_masks])

  input = torch.cat(input_features, dim=0)
  attention_mask = torch.cat(attention_masks, dim=0)
  #print("Output dopo torch.stack: ", input.shape)
  #print("Output dopo torch.stack: ", attention_mask.shape)

  return {'input_features': input, 'attention_mask': attention_mask}

batch_size = 1
dataset_loader = AudioDataset(data_path, processor)
dataset = DataLoader(dataset_loader, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [22]:
import gc
from time import sleep

In [48]:
def from_audio_2_embeddings(dataset, model, dataset_segmented=True, max_files=None):

    model.eval()
    total_batches = len(dataset)

    audio_embeddings = []
    torch.cuda.empty_cache()
    gc.collect()
    sleep(3)

    pbar = tqdm(total=min(total_batches, max_files) if max_files is not None else total_batches, desc="Featuring audios...")


    with torch.no_grad():
      for i, batch in enumerate(dataset):
        #print(batch['input_features'])
        #print(len(batch['attention_mask']))
        if max_files is not None and i >= max_files:
            break
        input_features = batch['input_features'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_features, attention_mask=attention_mask, output_hidden_states=True, return_dict=True)
        seventh_layer_output = output.hidden_states[6].detach().cpu().numpy()
        audio_embeddings.append(seventh_layer_output)
        pbar.update(1)
        #torch.cuda.empty_cache()
        #gc.collect()

    pbar.close()

    audio_embeddings = np.concatenate(audio_embeddings, axis=0)

    return audio_embeddings



def from_embd_to_semToken(audio_embeddings):
    scaler = StandardScaler()
    reshaped_data = audio_embeddings.reshape(-1, audio_embeddings.shape[2])

    audio_embeddings = scaler.fit_transform(reshaped_data)

    k_means = KMeans(n_clusters=1024, random_state=42)
    k_means.fit(audio_embeddings)

    return k_means.labels_




In [None]:
audio_embed = from_audio_2_embeddings(dataset, model, True, 15)
audio_embed.shape
semantic_tokens = from_embd_to_semToken(audio_embed)
#audio_embed = test_function(data_path

Featuring audios...: 100%|██████████| 15/15 [00:38<00:00,  2.55s/it]


In [27]:
torch.cuda.empty_cache()
gc.collect()

145

In [None]:
#!pip install nvitop
!nvitop

In [None]:
print(torch.cuda.memory_summary(device=None, abbreviated=False))


# Transformer class