<a href="https://colab.research.google.com/github/Haislich/AudioLM/blob/semantic_modeling/AudioLM_notebook_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data preparation

In [34]:
!git clone https://github.com/facebookresearch/libri-light.git
!git clone https://github.com/Haislich/AudioLM.git

Cloning into 'libri-light'...
remote: Enumerating objects: 178, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 178 (delta 8), reused 17 (delta 5), pack-reused 155[K
Receiving objects: 100% (178/178), 374.49 KiB | 1.49 MiB/s, done.
Resolving deltas: 100% (62/62), done.
Cloning into 'AudioLM'...
remote: Enumerating objects: 190, done.[K
remote: Counting objects: 100% (190/190), done.[K
remote: Compressing objects: 100% (124/124), done.[K
remote: Total 190 (delta 78), reused 148 (delta 42), pack-reused 0[K
Receiving objects: 100% (190/190), 13.04 MiB | 20.06 MiB/s, done.
Resolving deltas: 100% (78/78), done.


In [49]:
#!python /content/libri-light/data_preparation/build_all_stats.py /content/drive/MyDrive/AudioLMDataset/datasets_raw/small/small /content/drive/MyDrive/AudioLMDataset/ecciu

Gathering the list of metadata
No cache found at /content/drive/MyDrive/AudioLMDataset/ecciu/.cache/metadata.pkl
Saving a cache at /content/drive/MyDrive/AudioLMDataset/ecciu/.cache/metadata.pkl
2588 files found
Building the genre statistics
No cache found at /content/drive/MyDrive/AudioLMDataset/ecciu/.cache/meta_genre_stats.json
  2% (59 of 2588) |                                           | Elapsed Time: 0:01:01 ETA:   0:48:40^C


# Imports

In [6]:
!pip install datasets

import os
import librosa
import numpy as np
from transformers import Wav2Vec2BertModel, AutoProcessor
import torch
import random as rd
from tqdm import tqdm

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Classes

## Utils

In [7]:
def set_seed(seed):
    rd.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def count_flac(data_path):
  data_list = os.walk(data_path)
  cnt=0

  for dirpath, dirnames, filenames in data_list:
    for filename in filenames:
      path_to_audio = os.path.join(dirpath, filename)
      if path_to_audio.endswith(".flac"):
        cnt+=1

  return cnt

## Preprocessing

## Model

In [8]:
set_seed(42)
from datasets import load_dataset

dataset = "/content/drive/MyDrive/AudioLMDataset/dataset_segmented"
#dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
processor = AutoProcessor.from_pretrained("hf-audio/wav2vec2-bert-CV16-en")
model = Wav2Vec2BertModel.from_pretrained("hf-audio/wav2vec2-bert-CV16-en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
def from_audio_2_embeddings(data_path, max_files=None):
  data_list = os.walk(data_path)
  audio_embeddings = []
  cnt=0
  total = count_flac(data_path)

  if max_files != None:
    total = max_files


  pbar = tqdm(total=total, desc="Featuring audios...")

  for dirpath, dirnames, filenames in data_list:
    for filename in filenames:
      path_to_audio = os.path.join(dirpath, filename)
      if path_to_audio.endswith(".flac"):
        audio, sr = librosa.load(path_to_audio, sr=None)  # load audio file with librosa library and sample rate 16kHz
        #print(len(audio))
        inputs = processor(audio, return_tensors="pt", sampling_rate=sr) # extract features from audio file with processor
        print(inputs)
        with torch.no_grad():
          output = model(inputs['input_features'], output_hidden_states=True, return_dict=True)
          seventh_layer_output = output.hidden_states[6] # get the output of the 7th layer of BERT model
          audio_embeddings.append(seventh_layer_output.squeeze(0).detach().numpy())
          cnt+=1
          pbar.update(1)
          if cnt >= total:
            break


  pbar.close()

  audio_embeddings = np.concatenate(audio_embeddings, axis=0)

  return audio_embeddings


def test_function(demo_dataset):
  demo_dataset = demo_dataset.sort("id")
  sr = demo_dataset.features["audio"].sampling_rate
  audio_embeddings = []

  for audios in (demo_dataset):
    input = processor(audios["audio"]["array"], return_tensors="pt", sampling_rate=sr)
    output = model(input['input_features'], output_hidden_states=True, return_dict=True)
    seventh_layer_output = output.hidden_states[6] # get the output of the 7th layer of BERT model
    audio_embeddings.append(seventh_layer_output.squeeze(0).detach().numpy())


  audio_embeddings = np.concatenate(audio_embeddings, axis=0)

  return audio_embeddings

def from_embd_to_semToken(audio_embeddings):
    scaler = StandardScaler()
    audio_embeddings = scaler.fit_transform(audio_embeddings)

    k_means = KMeans(n_clusters=1024, random_state=42)
    k_means.fit(audio_embeddings)

    return k_means.labels_

#audio_embed = from_audio_2_embeddings(dataset, 10)
#semantic_tokens = from_embd_to_semToken(audio_embed)

#semantic_tokens



In [10]:
audio_em = from_audio_2_embeddings(dataset)
semantic_tokens = from_embd_to_semToken(audio_em)

Featuring audios...:   0%|          | 0/36229 [00:00<?, ?it/s]

{'input_features': tensor([[[-2.1301, -1.9912, -1.5485,  ..., -1.7942, -1.8913, -2.2307],
         [-1.6300, -1.3804, -1.4738,  ..., -1.4663, -1.5525, -1.8901],
         [-1.0104, -0.9717, -1.2421,  ..., -1.4721, -1.4840, -1.7839],
         ...,
         [-0.9875, -1.1265, -0.9915,  ..., -2.0437, -2.0837, -2.2086],
         [-1.4624, -1.6142, -1.3722,  ..., -1.7880, -2.0405, -2.3680],
         [-1.5119, -1.5796, -2.1517,  ..., -1.9141, -1.9322, -2.3005]]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)}


Featuring audios...:   0%|          | 1/36229 [00:20<203:10:51, 20.19s/it]

{'input_features': tensor([[[-1.1680, -1.2422, -1.6384,  ..., -1.0660, -0.8002, -0.8915],
         [ 1.0473,  0.5893, -0.1861,  ..., -1.0661, -0.8358, -0.7779],
         [ 0.8739,  0.2553, -1.0427,  ..., -1.3063, -0.9323, -0.6504],
         ...,
         [-1.9303, -1.9827, -1.5939,  ..., -0.6047, -0.6532, -0.5503],
         [-2.0955, -1.9896, -2.1080,  ..., -0.6099, -0.4032, -0.7376],
         [-0.4362, -0.7825, -1.2963,  ..., -0.7788, -0.8508, -0.8202]]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)}


Featuring audios...:   0%|          | 2/36229 [00:47<247:49:38, 24.63s/it]

{'input_features': tensor([[[-1.1603, -1.1525, -1.5938,  ...,  0.0879, -0.1084,  0.0872],
         [-1.8909, -1.5700, -1.3928,  ..., -1.2129, -1.3107, -0.6013],
         [-1.4001, -1.4755, -1.7590,  ..., -0.9588, -1.1852, -1.0900],
         ...,
         [-2.1934, -2.2943, -2.1813,  ..., -1.0207, -0.8625, -0.9266],
         [-1.4797, -1.4244, -1.5559,  ..., -1.2874, -1.0508, -1.2251],
         [-1.2681, -1.5807, -1.5856,  ..., -0.8362, -0.9099, -0.7761]]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)}


Featuring audios...:   0%|          | 3/36229 [01:17<270:49:14, 26.91s/it]

{'input_features': tensor([[[-1.4349, -1.3107, -1.4911,  ..., -1.1577, -0.7583, -0.9191],
         [-0.7582, -0.9312, -1.6653,  ..., -1.1246, -0.9544, -0.7987],
         [-0.7178, -0.9280, -1.4560,  ..., -0.8724, -1.1473, -1.1088],
         ...,
         [-1.4297, -1.3277, -1.3861,  ..., -0.8922, -1.0281, -0.9067],
         [-3.1451, -2.5281, -1.5369,  ..., -1.2487, -1.2704, -1.0643],
         [-2.1456, -1.7944, -1.8482,  ..., -0.6234, -0.5146, -0.3861]]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)}


KeyboardInterrupt: 

In [None]:
semantic_tokens

In [32]:
len(semantic_tokens)

23998

In [None]:
from transformers import Wav2Vec2BertModel, AutoFeatureExtractor
import torch
from datasets import load_dataset

# Carica il dataset
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate

# Carica il modello e l'estrattore di feature
processor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
model = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0")

inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

In [None]:
dataset[0]["audio"]["array"]

array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
       0.0010376 ])

In [None]:
outputs = model(inputs['input_features'], output_hidden_states=True, return_dict=True)
seventh_layer_output = outputs.hidden_states[6]
seventh_layer_output


In [None]:
seventh_layer_output


tensor([[[-0.1476,  0.0026, -0.2191,  ..., -0.3142, -0.1849, -0.2700],
         [-0.0255,  0.1388,  0.1583,  ..., -0.0269,  0.2433, -0.2006],
         [ 0.0260,  0.1063,  0.0102,  ..., -0.0240, -0.0181, -0.0301],
         ...,
         [ 0.2395,  0.1679,  0.1687,  ..., -0.2784,  0.0184, -0.0730],
         [ 0.1917,  0.2821,  0.2182,  ..., -0.2251, -0.0072,  0.0939],
         [ 0.2303,  0.0882, -0.2248,  ..., -0.2103,  0.0014,  0.0202]]])

In [None]:
seventh_layer_output.shape

torch.Size([1, 292, 1024])