## 0. Colab Code

## 1. Preparing the imports

In [1]:
import torch
import torchaudio
import torch.nn.functional as F

import os
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from IPython.display import Audio
import time

## 0.5. Creating the Dataset

## 2. HuBERT TEST

In [2]:
from transformers import HubertModel,AutoProcessor,AutoFeatureExtractor,Wav2Vec2Processor,HubertForCTC
import joblib
import torch.nn as nn
from hubertKM import SemanticTokenizer, visualizeEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


#### Importing the pretrained models for HuBERT and KMeans

In [3]:
# A semantic tokenizer
# Input: (w2vCheckpointPath, kmeansCheckpointPath)
# Output: (semanticTokens, normalizedEmbeddings)
w2vBERT = SemanticTokenizer("facebook/hubert-base-ls960","./hubertKM/hubert_base_ls960_L9_km500.bin")  

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

#### Computing semantic tokens

In [4]:
#semanticTokens, embeddings

#### Visualizing embeddings

In [5]:
# Create some plots using PCA and t-SNE

# visualizeEmbeddings(embeddings, semanticTokens)

#### Check with another implementation (OPTIONAL)

## 3. Creating the Semantic Transformer

In [6]:
from SoundStream import soundstream_16khz, audio_to_tokens, tokens_to_audio, encode_audio, decode_audio

In [7]:
soundStream = soundstream_16khz()

## TEST

In [8]:
from data import storeTokens,  TokensDataset, store_from_librilight

tokenPath = "out-1h"
tokenFile = "out.csv"
audioPath = "data_cut"


In [9]:
#fileCount = store_from_librilight(tokenPath, tokenFile, w2vBERT, soundStream, fileCountCheckpoint = 10, subset = "10min") #More than 2700 files in the 10h, for a smaller dataset use 1h (10 min is too small)

In [10]:
#tokenDataset = TokensDataset(tokenPath, tokenFile, requiredDuration = 30, includeSemanticTokens = True, includeCoarseTokens = True, includeFineTokens = True) 
#semanticDataset = TokensDataset(tokenPath, tokenFile, requiredDuration = 30, includeSemanticTokens = True)
coarseDataset = TokensDataset(tokenPath, tokenFile, requiredDuration = 10,includeSemanticTokens=True, includeCoarseTokens = True, maxDur=2010)
#fineDataset = TokensDataset(tokenPath, tokenFile, requiredDuration = 30, includeFineTokens = True)

In [11]:
print(coarseDataset.__getitem__(0))

(tensor([17, 17, 17,  ...,  0,  0,  0]), tensor([ 767, 1412, 3021,  ...,  284, 1988, 2453]), tensor([ True,  True,  True,  ..., False, False, False]))


In [12]:
from torch.utils.data import DataLoader, random_split
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from TransformerModel import Decoder
import pytorch_lightning as pl

In [13]:
num_workers = torch.cuda.device_count() if torch.cuda.is_available() else os.cpu_count()

In [14]:

train_dataset, valid_dataset = random_split(coarseDataset, [0.8, 0.2])
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=num_workers, persistent_workers=True) #num_workers should be a fraction of the number of cores of the CPU (or GPU if in use)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False, num_workers=num_workers, persistent_workers=True)

In [15]:

#Change vocab_size and seq_len to initiate, k should be d_model/num_heads
model = Decoder( d_model=256, num_layers = 3, num_heads=4, dim_feedforward=1024, dropout=0.1, k=64, vocab_size=3072, seq_len=2010)


In [16]:
# Trainer configuration
trainer = pl.Trainer(
    max_epochs=500, 
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    log_every_n_steps=1,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\fabri\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [17]:
# Fit the model
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=valid_loader, ckpt_path="checkpoints/prova-checkpoint.ckpt")#, ckpt_path="checkpoints/prova-checkpoint.ckpt")#, val_dataloaders=valid_loader)#, ckpt_path="checkpoints/manual-checkpoint.ckpt") #Change the checkpoint name to match the last one


  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | embedding | Embedding        | 786 K  | train
1 | layers    | ModuleList       | 2.4 M  | train
2 | norm      | LayerNorm        | 512    | train
3 | linear    | Linear           | 789 K  | train
4 | loss_fn   | CrossEntropyLoss | 0      | train
-------------------------------------------------------
4.0 M     Trainable params
0         Non-trainable params
4.0 M     Total params
15.978    Total estimated model params size (MB)


Epoch 23:   0%|          | 0/12 [00:00<?, ?it/s, v_num=51]                 

c:\Users\fabri\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [20]:
itera = iter(train_loader)
first = next(itera)

In [33]:
print(first[2].shape)
seq_len = first[2].shape[1]

torch.Size([16, 2010])


In [29]:
causal_mask = torch.tril(torch.ones((seq_len, seq_len), device=first[0].device))

In [41]:
padding_mask = first[2].to(torch.int)  # (batch_size, seq_len)

# Expand padding mask to match the shape of causal_mask
padding_mask = padding_mask.unsqueeze(1)  # (batch_size, 1, seq_len)
padding_mask = padding_mask.expand(-1, seq_len, -1)  # (batch_size, seq_len, seq_len)
padding_mask.shape
combined_mask = causal_mask * padding_mask  # (batch_size, seq_len, seq_len)
for elem in combined_mask[0]:
	print(elem)

tensor([1., 0., 0.,  ..., 0., 0., 0.])
tensor([1., 1., 0.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ..., 0., 0., 0.])
tensor([1., 1., 1.,  ...,

In [18]:
trainer.save_checkpoint("checkpoints/prova-checkpoint.ckpt")

In [None]:
x_input = coarseDataset.__getitem__(0)[0]

In [None]:
model.eval()
x_input = coarseDataset.__getitem__(0)[0].unsqueeze(0)

print(x_input.shape)
with torch.no_grad():
	predictions = model.generate_tokens(x_input, 600)

In [None]:
prova = predictions

## Trying the coarse without needing the fine tokens

In [None]:
coarse_shaped = prova.reshape((-1,3))
embedding = coarse_shaped

size = predictions.shape[0]
offsets = torch.tensor([(i % 3) * 1024 for i in range(size * 3)]).reshape((-1,3))
embedding = (embedding - offsets).reshape((1,-1,3))
waveform = decode_audio(embedding, soundStream)

In [None]:
embedding.shape

In [None]:
for element in predictions.squeeze(0):
	print(element)

In [None]:
Audio(waveform, rate = 16000)