<a href="https://colab.research.google.com/github/Haislich/AudioLM/blob/semantic_modeling/AudioLM_notebook_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Data preparation

In [4]:
!git clone https://github.com/facebookresearch/libri-light.git
!git clone https://github.com/Haislich/AudioLM.git

Cloning into 'libri-light'...
remote: Enumerating objects: 178, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 178 (delta 8), reused 17 (delta 5), pack-reused 155[K
Receiving objects: 100% (178/178), 374.49 KiB | 1.80 MiB/s, done.
Resolving deltas: 100% (62/62), done.
Cloning into 'AudioLM'...
remote: Enumerating objects: 204, done.[K
remote: Counting objects: 100% (204/204), done.[K
remote: Compressing objects: 100% (136/136), done.[K
remote: Total 204 (delta 85), reused 154 (delta 44), pack-reused 0[K
Receiving objects: 100% (204/204), 13.06 MiB | 20.71 MiB/s, done.
Resolving deltas: 100% (85/85), done.


In [5]:
#!python /content/libri-light/data_preparation/build_all_stats.py /content/drive/MyDrive/AudioLMDataset/datasets_raw/small/small /content/drive/MyDrive/AudioLMDataset/ecciu

# Imports

In [None]:
#!pip install datasets
!pip install soundfile  ##Nuova dipendenza, aggiungila
import os
import librosa
import numpy as np
from transformers import Wav2Vec2BertModel, AutoProcessor
import torch
from torch.utils.data import Dataset, DataLoader ##Nuova dipendenza, aggiungila
import random as rd
from tqdm import tqdm
import soundfile as sf

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Classes

## Utils

In [7]:
def set_seed(seed):
    rd.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def count_flac(data_path):
  data_list = os.walk(data_path)
  cnt=0

  for dirpath, dirnames, filenames in data_list:
    for filename in filenames:
      path_to_audio = os.path.join(dirpath, filename)
      if path_to_audio.endswith(".flac"):
        cnt+=1

  return cnt

## Preprocessing

## Model

In [44]:
set_seed(42)
from datasets import load_dataset

dataset = "/content/drive/MyDrive/AudioLMDataset/dataset_segmented/"
#dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
processor = AutoProcessor.from_pretrained("hf-audio/wav2vec2-bert-CV16-en")
model = Wav2Vec2BertModel.from_pretrained("hf-audio/wav2vec2-bert-CV16-en")



In [86]:
suca = "/content/drive/MyDrive/AudioLMDataset/dataset_segmented/100/2315/01_baum_sea_fairies_64kb_0000.flac"
suca2 = "/content/drive/MyDrive/AudioLMDataset/dataset_segmented/100/2315/01_baum_sea_fairies_64kb_0005.flac"
data, _ = sf.read(suca)
data1, _ = sf.read(suca2)
data, data1

(array([ 0.        ,  0.        ,  0.        , ...,  0.00436401,
        -0.00524902, -0.00210571]),
 array([-1.52587891e-04,  9.15527344e-05,  3.05175781e-04, ...,
        -4.69970703e-03, -5.18798828e-03, -6.07299805e-03]))

In [95]:
ei = processor(audio=[data, data1], return_tensors="pt", sampling_rate=16000)
ei["attention_mask"]

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)

In [104]:
class AudioDataset(Dataset):
  def __init__(self, path, processor, sr=16000):
    self.path = path
    self.processor = processor
    self.sr = sr
    self.audios = self.collate_audios(path)
    self.num_audio = self.__len__()

  def collate_audios(self, path):
    path_audios = []
    for dirpath, _, filenames in os.walk(path):
      for filename in filenames:
        path_to_audio = os.path.join(dirpath, filename)
        if path_to_audio.endswith(".flac"):
          path_audios.append(path_to_audio)

    return path_audios

  def __len__(self):
    return len(self.audios)

  def __getitem__(self, idx):
    path = self.audios[idx]
    audio, _ = sf.read(path)
    #input = self.processor(audio, return_tensors="pt", sampling_rate=self.sr)
    return audio



In [99]:
a = torch.tensor([1,2])
b = torch.tensor([1])
c = torch.tensor([1,2,3])
torch.cat([a,b,c], dim=0)

tensor([1, 2, 1, 1, 2, 3])

In [108]:
def collate_fn(batch):
  inputs = processor(
              [audio for audio in batch],
              return_tensors="pt",
              padding=True,
              sampling_rate=16000
          )
  return inputs

batch_size = 32
dataset_loader = AudioDataset(dataset, processor)
data_loader = DataLoader(dataset_loader, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
for i in data_loader:
  print(i)
  print(len(i))
  print(len(i['input_features']))
  print(len(i['attention_mask']))

In [15]:
def from_audio_2_embeddings(data_loader, model, dataset_segmented=True, max_files=None):

  total = data_loader.dataset.num_audio / data_loader.batch_size
  audio_embeddings = []
  cnt=0
  if max_files != None:
    total = max_files

  model.eval()
  pbar = tqdm(total=total, desc="Featuring audios...")

  with torch.no_grad():
    for batch, attention_mask in data_loader:
      output = model()
          #print(len(audio))
          inputs = processor(audio, return_tensors="pt", sampling_rate=16000) # extract features from audio file with processor
          with torch.no_grad():
            output = model(inputs['input_features'], output_hidden_states=True, return_dict=True)
            seventh_layer_output = output.hidden_states[6] # get the output of the 7th layer of BERT model
            audio_embeddings.append(seventh_layer_output.squeeze(0).detach().cpu().numpy())
            cnt+=1
            pbar.update(1)
          if cnt >= total:
            break
        except Exception as e:
          print(f"Error: File {path_to_audio}: {e}")


  pbar.close()

  audio_embeddings = np.concatenate(audio_embeddings, axis=0)

  return audio_embeddings


def test_function(demo_dataset):
  demo_dataset = demo_dataset.sort("id")
  sr = demo_dataset.features["audio"].sampling_rate
  audio_embeddings = []

  for audios in (demo_dataset):
    input = processor(audios["audio"]["array"], return_tensors="pt", sampling_rate=sr)
    output = model(input['input_features'], output_hidden_states=True, return_dict=True)
    seventh_layer_output = output.hidden_states[6] # get the output of the 7th layer of BERT model
    audio_embeddings.append(seventh_layer_output.squeeze(0).detach().numpy())


  audio_embeddings = np.concatenate(audio_embeddings, axis=0)

  return audio_embeddings

def from_embd_to_semToken(audio_embeddings):
    scaler = StandardScaler()
    audio_embeddings = scaler.fit_transform(audio_embeddings)

    k_means = KMeans(n_clusters=1024, random_state=42)
    k_means.fit(audio_embeddings)

    return k_means.labels_

audio_embed = from_audio_2_embeddings(dataset, True, 5)
semantic_tokens = from_embd_to_semToken(audio_embed)

semantic_tokens








Featuring audios...:   0%|          | 0/5 [00:00<?, ?it/s][A[A[A[A[A




Featuring audios...:  20%|██        | 1/5 [00:05<00:22,  5.64s/it][A[A[A[A[A




Featuring audios...:  40%|████      | 2/5 [00:35<01:00, 20.16s/it][A[A[A[A[A




Featuring audios...:  60%|██████    | 3/5 [01:05<00:48, 24.32s/it][A[A[A[A[A




Featuring audios...:  80%|████████  | 4/5 [01:05<00:14, 14.98s/it][A[A[A[A[A




Featuring audios...: 100%|██████████| 5/5 [01:29<00:00, 18.15s/it][A[A[A[A[A




Featuring audios...: 6it [01:37, 14.74s/it]                       [A[A[A[A[A




Featuring audios...: 7it [01:46, 12.84s/it][A[A[A[A[A




Featuring audios...: 8it [01:55, 11.43s/it][A[A[A[A[A




Featuring audios...: 9it [02:26, 17.53s/it][A[A[A[A[A




Featuring audios...: 10it [02:32, 14.04s/it][A[A[A[A[A




Featuring audios...: 11it [02:55, 16.69s/it][A[A[A[A[A




Featuring audios...: 12it [03:13, 17.10s/it][A[A[A[A[A




Featuring audios...:

KeyboardInterrupt: 

In [10]:
count_flac(dataset)

36229

In [None]:
audio_em = from_audio_2_embeddings(dataset)
semantic_tokens = from_embd_to_semToken(audio_em)

# Transformer class

In [None]:
class