## 0. Colab Code

## 1. Preparing the imports

In [1]:
import torch
import torchaudio
import torch.nn.functional as F

import os
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from IPython.display import Audio
import time

## 1.5. Creating the Dataset

## 2. HuBERT TEST

In [2]:
from transformers import HubertModel,AutoProcessor,AutoFeatureExtractor,Wav2Vec2Processor,HubertForCTC
import joblib
import torch.nn as nn
from hubertKM import SemanticTokenizer, visualizeEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


#### Importing the pretrained models for HuBERT and KMeans

In [3]:
# Semantic tokenizer built from two pretrained checkpoints.
# Constructor arguments: (w2vCheckpointPath, kmeansCheckpointPath).
# Per the original cell notes, using it yields (semanticTokens, normalizedEmbeddings).
# NOTE(review): the variable is called `w2vBERT`, but the checkpoint loaded here is HuBERT.
w2vBERT = SemanticTokenizer(
    "facebook/hubert-base-ls960",                 # HuBERT checkpoint identifier
    "./hubertKM/hubert_base_ls960_L9_km500.bin",  # k-means checkpoint file
)

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

#### Computing semantic tokens

In [4]:
#semanticTokens, embeddings

#### Visualizing embeddings

In [5]:
# Create some plots using PCA and t-SNE

# visualizeEmbeddings(embeddings, semanticTokens)

#### Check with another implementation (OPTIONAL)

## 3. Creating the Semantic Transformer

In [6]:
from SoundStream import soundstream_16khz, audio_to_tokens, tokens_to_audio, encode_audio, decode_audio

In [7]:
# Build the SoundStream object through the project's `soundstream_16khz` factory.
# NOTE(review): presumably the 16 kHz neural audio codec used to produce the coarse/fine tokens -- confirm in the SoundStream module.
soundStream = soundstream_16khz()

## TEST

In [8]:
from data import storeTokens,  TokensDataset, store_from_librilight

# Locations used by the token-storage helpers and by TokensDataset in the cells below.
tokenPath = "out"  # passed as the first argument to TokensDataset / store_from_librilight
tokenFile = "out.csv"  # passed as the second argument to TokensDataset / store_from_librilight
audioPath = "data_cut"  # not referenced in the visible cells; presumably the source-audio folder -- TODO confirm


In [9]:
#fileCount = store_from_librilight(tokenPath, tokenFile, w2vBERT, soundStream, fileCountCheckpoint = 10, subset = "1h") #More than 2700 files in the 10h, for a smaller dataset use 1h (10 min is too small)

In [10]:
# Dataset used for training below: only the coarse tokens are requested.
# NOTE(review): requiredDuration is presumably expressed in seconds -- confirm in data.TokensDataset.
# Alternative configurations, kept for reference:
#tokenDataset = TokensDataset(tokenPath, tokenFile, requiredDuration = 30, includeSemanticTokens = True, includeCoarseTokens = True, includeFineTokens = True)
#semanticDataset = TokensDataset(tokenPath, tokenFile, requiredDuration = 30, includeSemanticTokens = True)
#fineDataset = TokensDataset(tokenPath, tokenFile, requiredDuration = 30, includeFineTokens = True)
coarseDataset = TokensDataset(
    tokenPath,
    tokenFile,
    requiredDuration=10,
    includeCoarseTokens=True,
)

In [11]:
from torch.utils.data import DataLoader, random_split
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from TransformerModel import Decoder
import pytorch_lightning as pl

In [12]:

# 80/20 train/validation split of the coarse-token dataset.
# The split is drawn from a fixed-seed generator so that it is identical on
# every kernel restart. Training is resumed from checkpoints, and an unseeded
# split would let samples validated in one session end up in the training set
# of the next one, making val_loss unreliable.
split_generator = torch.Generator().manual_seed(42)
train_dataset, valid_dataset = random_split(coarseDataset, [0.8, 0.2], generator=split_generator)

# num_workers controls the number of CPU worker processes that load batches;
# keep it at or below the number of available CPU cores.
# persistent_workers=True keeps those processes alive between epochs.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=6, persistent_workers=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False, num_workers=6, persistent_workers=True)

In [13]:

# Instantiate the Decoder model from TransformerModel.
# vocab_size and seq_len have to be adjusted to the token data before instantiating.
model = Decoder(
    d_model=256,
    num_layers=3,
    num_heads=4,
    dim_feedforward=1024,
    dropout=0.1,
    k=64,  # should equal d_model / num_heads (256 / 4)
    vocab_size=10000,
    seq_len=1503,
)


In [18]:
# Checkpointing policy: rank checkpoints by validation loss, keep the single
# best one, and (save_last) also keep a copy of the most recent one.
# Assumes the model logs a metric named `val_loss`.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',            # metric used to rank checkpoints
    mode='min',                    # lower is better for a loss
    save_top_k=1,                  # keep only the best-scoring checkpoint
    save_last=True,                # additionally keep the most recent checkpoint
    dirpath='checkpoints',         # directory the checkpoint files are written to
    filename='latest-checkpoint',  # base file name of the monitored (top-k) checkpoint
    every_n_epochs=1,              # checkpoint once per epoch
    verbose=True,                  # report every save
)

# Trainer: one device, on GPU when CUDA is available and on CPU otherwise.
trainer = pl.Trainer(
    max_epochs=10,
    devices=1,
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    log_every_n_steps=1,
    callbacks=[checkpoint_callback],
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [19]:
# Fit the model, resuming from the saved checkpoint when one is present.
# On a fresh run there is no checkpoint on disk yet; passing ckpt_path=None
# then starts training from scratch instead of failing on the missing file.
resume_ckpt = "checkpoints/latest-checkpoint.ckpt"
trainer.fit(
    model,
    train_dataloaders=train_loader,
    val_dataloaders=valid_loader,
    ckpt_path=resume_ckpt if os.path.isfile(resume_ckpt) else None,
)

c:\Users\fabri\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:652: Checkpoint directory C:\Users\fabri\Documents\GitHub\NN_Project-AudioLM\checkpoints exists and is not empty.
Restoring states from the checkpoint path at checkpoints/manual-checkpoint.ckpt

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | embedding | Embedding        | 2.6 M  | train
1 | layers    | ModuleList       | 2.4 M  | train
2 | norm      | LayerNorm        | 512    | train
3 | linear    | Linear           | 2.6 M  | train
4 | loss_fn   | CrossEntropyLoss | 0      | train
-------------------------------------------------------
7.5 M     Trainable params
0         Non-trainable params
7.5 M     Total params
30.194    Total estimated model params size (MB)
Restored all states from the checkpoint at checkpoints/manual-checkpoint.ckpt


Epoch 5:  71%|███████▏  | 80/112 [32:59<13:11,  0.04it/s, v_num=15] 

In [16]:
# Manually write the trainer's current state to its own checkpoint file
# (a different file name from the one configured on the ModelCheckpoint callback).
trainer.save_checkpoint("checkpoints/manual-checkpoint.ckpt")

In [None]:
# Run one forward pass on the first item of the coarse-token dataset.
# The model is built with dropout=0.1, so it must be switched to eval mode
# first; otherwise dropout stays active and the predictions change from one
# call to the next.
model.eval()
x_input = coarseDataset[0]

print(x_input)
# no_grad: inference only, no autograd graph is needed.
with torch.no_grad():
    predictions = model(x_input[0])

In [None]:
# Pick, for every position, the index with the highest score along the last axis.
pred = predictions.argmax(dim=-1)
# Print the selected indices of the first entry, one per line.
for token in pred[0]:
    print(token)

In [None]:
# Same token store as the coarse dataset, this time requesting only the fine tokens.
fineDataset = TokensDataset(
    tokenPath,
    tokenFile,
    requiredDuration=10,
    includeFineTokens=True,
)

In [None]:
# First element of the first fine-token item; squeeze(0) drops axis 0 when it has size 1.
y_input = fineDataset[0][0].squeeze(0)

# Same squeeze applied to the predicted token indices.
pred_new = pred.squeeze(0)
#waveform = tokens_to_audio(pred_new, y_input, soundStream)
y_input.shape