In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Data preparation

In [3]:
!git clone https://github.com/facebookresearch/libri-light.git
!git clone https://github.com/Haislich/AudioLM.git

Cloning into 'libri-light'...
remote: Enumerating objects: 178, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 178 (delta 8), reused 17 (delta 5), pack-reused 155[K
Receiving objects: 100% (178/178), 374.49 KiB | 6.14 MiB/s, done.
Resolving deltas: 100% (62/62), done.
Cloning into 'AudioLM'...
remote: Enumerating objects: 207, done.[K
remote: Counting objects: 100% (207/207), done.[K
remote: Compressing objects: 100% (139/139), done.[K
remote: Total 207 (delta 87), reused 153 (delta 44), pack-reused 0[K
Receiving objects: 100% (207/207), 13.07 MiB | 42.48 MiB/s, done.
Resolving deltas: 100% (87/87), done.


In [4]:
#!python /content/libri-light/data_preparation/build_all_stats.py /content/drive/MyDrive/AudioLMDataset/datasets_raw/small/small /content/drive/MyDrive/AudioLMDataset/ecciu

# Imports

In [2]:
#!pip install datasets
!pip install soundfile  ##Nuova dipendenza, aggiungila
import os
import librosa
import numpy as np
from transformers import Wav2Vec2BertModel, AutoProcessor
import torch
from torch.utils.data import Dataset, DataLoader ##Nuova dipendenza, aggiungila
import random as rd
from tqdm import tqdm
import soundfile as sf

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler



# Classes

## Utils

In [3]:
def set_seed(seed):
    rd.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def count_flac(data_path):
  data_list = os.walk(data_path)
  cnt=0

  for dirpath, dirnames, filenames in data_list:
    for filename in filenames:
      path_to_audio = os.path.join(dirpath, filename)
      if path_to_audio.endswith(".flac"):
        cnt+=1

  return cnt

## Preprocessing

## Model

In [4]:
set_seed(42)
#from datasets import load_dataset

data_path = "/content/drive/MyDrive/AudioLMDataset/dataset_segmented/"
#dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
processor = AutoProcessor.from_pretrained("hf-audio/wav2vec2-bert-CV16-en")
model = Wav2Vec2BertModel.from_pretrained("hf-audio/wav2vec2-bert-CV16-en")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/299 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/369 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/32.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/544 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.42G [00:00<?, ?B/s]

Wav2Vec2BertModel(
  (feature_projection): Wav2Vec2BertFeatureProjection(
    (layer_norm): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=160, out_features=1024, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2BertEncoder(
    (dropout): Dropout(p=0.1, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x Wav2Vec2BertEncoderLayer(
        (ffn1_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (ffn1): Wav2Vec2BertFeedForward(
          (intermediate_dropout): Dropout(p=0.1, inplace=False)
          (intermediate_dense): Linear(in_features=1024, out_features=4096, bias=True)
          (intermediate_act_fn): SiLU()
          (output_dense): Linear(in_features=4096, out_features=1024, bias=True)
          (output_dropout): Dropout(p=0.1, inplace=False)
        )
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attn_dropout): Drop

In [6]:
class AudioDataset(Dataset):
  def __init__(self, path, processor, sr=16000):
    self.path = path
    self.processor = processor
    self.sr = sr
    self.audios = self.collate_audios(path)
    self.num_audio = self.__len__()

  def collate_audios(self, path):
    path_audios = []
    for dirpath, _, filenames in os.walk(path):
      for filename in filenames:
        path_to_audio = os.path.join(dirpath, filename)
        if path_to_audio.endswith(".flac"):
          path_audios.append(path_to_audio)

    return path_audios

  def __len__(self):
    return len(self.audios)

  def __getitem__(self, idx):
    path = self.audios[idx]
    audio, _ = sf.read(path)
    #input = self.processor(audio, return_tensors="pt", sampling_rate=self.sr)
    return audio



In [None]:
def collate_fn(batch):
  inputs = processor(
              [audio for audio in batch],
              return_tensors="pt",
              padding=True,
              sampling_rate=16000
          )
  return inputs

batch_size = 2
dataset_loader = AudioDataset(data_path, processor)
dataset = DataLoader(dataset_loader, batch_size=batch_size, shuffle=False, collate_fn=collate_fn).to_(device)

In [None]:
def from_audio_2_embeddings(dataset, model, dataset_segmented=True, max_files=None):

    model.eval()
    total_batches = len(dataset)

    audio_embeddings = []
    cnt=0
    if max_files != None:
      total = max_files


    pbar = tqdm(total=min(total_batches, max_files) if max_files is not None else total_batches, desc="Featuring audios...")


    with torch.no_grad():
      for i, batch in enumerate(dataset):
        #print(batch['input_features'])
        #print(len(batch['attention_mask']))
        if max_files is not None and i >= max_files:
            break
        input_features = batch['input_features'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_features, attention_mask=attention_mask, output_hidden_states=True, return_dict=True)
        model.
        seventh_layer_output = output.hidden_states[6].detach().cpu().numpy()
        #print("\n", len(seventh_layer_output))
        audio_embeddings.append(seventh_layer_output)
        torch.cuda.empty_cache()
        pbar.update(1)


    pbar.close()

    audio_embeddings = np.concatenate(audio_embeddings, axis=0)

    return audio_embeddings


def test_function(demo_dataset):
  demo_dataset = demo_dataset.sort("id")
  sr = demo_dataset.features["audio"].sampling_rate
  audio_embeddings = []

  for audios in (demo_dataset):
    input = processor(audios["audio"]["array"], return_tensors="pt", sampling_rate=sr)
    output = model(input['input_features'], output_hidden_states=True, return_dict=True)
    seventh_layer_output = output.hidden_states[6] # get the output of the 7th layer of BERT model
    audio_embeddings.append(seventh_layer_output.squeeze(0).detach().numpy())


  audio_embeddings = np.concatenate(audio_embeddings, axis=0)

  return audio_embeddings

def from_embd_to_semToken(audio_embeddings):
    scaler = StandardScaler()
    audio_embeddings = scaler.fit_transform(audio_embeddings)

    k_means = KMeans(n_clusters=1024, random_state=42)
    k_means.fit(audio_embeddings)

    return k_means.labels_

audio_embed = from_audio_2_embeddings(dataset, model)
semantic_tokens = from_embd_to_semToken(audio_embed)

semantic_tokens



In [None]:
torch.cuda.empty_cache()

In [None]:
!pip install nvitop
!nvitop

In [None]:
print(torch.cuda.memory_summary(device=None, abbreviated=False))


# Transformer class