In [1]:
import torch
import torchaudio
import torch.nn.functional as F

import os
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from IPython.display import Audio
import time

## 1. Creating the Dataset

### 2. HuBERT TEST

In [2]:
from transformers import HubertModel,AutoProcessor,AutoFeatureExtractor,Wav2Vec2Processor,HubertForCTC
import joblib
import torch.nn as nn
from hubertKM import SemanticTokenizer, visualizeEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


#### Importing the pretrained models for HuBERT and KMeans

In [3]:
# A semantic tokenizer
# Input: (w2vCheckpointPath, kmeansCheckpointPath)
# Output: (semanticTokens, normalizedEmbeddings)
w2vBERT = SemanticTokenizer("facebook/hubert-base-ls960","./hubertKM/hubert_base_ls960_L9_km500.bin")  

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

#### Computing semantic tokens

In [4]:
#semanticTokens, embeddings

#### Visualizing embeddings

In [5]:
# Create some plots using PCA and t-SNE

# visualizeEmbeddings(embeddings, semanticTokens)

#### Check with another implementation (OPTIONAL)

## 3. Creating the Semantic Transformer

In [6]:
from SoundStream import soundstream_16khz, audio_to_tokens, tokens_to_audio, encode_audio, decode_audio

In [7]:
soundStream = soundstream_16khz()

## TEST

In [8]:
from data import storeTokens,  TokensDataset

tokenPath = "out"
tokenFile = "out.csv"
audioPath = "data_cut"


In [9]:
#fileCount = storeTokens(audioPath, tokenPath, tokenFile, w2vBERT, soundStream, fileCountCheckpoint = 10)

In [10]:
#tokenDataset = TokensDataset(tokenPath, tokenFile, requiredDuration = 30, includeSemanticTokens = True, includeCoarseTokens = True, includeFineTokens = True) 
semanticDataset = TokensDataset(tokenPath, tokenFile, requiredDuration = 30, includeSemanticTokens = True)
#coarseDataset = TokensDataset(tokenPath, tokenFile, requiredDuration = 30, includeCoarseTokens = True)
#fineDataset = TokensDataset(tokenPath, tokenFile, requiredDuration = 30, includeFineTokens = True)

In [11]:
#firstItem = tokenDataset.__getitem__(0)
firstSemantic = semanticDataset.__getitem__(0)
##firstCoarse = coarseDataset.__getitem__(0)
#firstFine = fineDataset.__getitem__(0)

print("\nSEMANTIC TOKEN:")
print(firstSemantic[0].shape)

RuntimeError: Only Tensors of floating point and complex dtype can require gradients

In [None]:
from torch.utils.data import DataLoader, random_split
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from TransformerModel import DecoderOnlyTransformer
import pytorch_lightning as pl
from typing import List

In [None]:
#dataset = semanticDataset
train_dataset = semanticDataset

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

input_size = 0
for batch in train_loader:
    input, labels = batch
    print(labels.shape)
    break


torch.Size([2, 1, 1500])


In [None]:
class RelativePosition(nn.Module):

    def __init__(self, d_model: int, nhead: int):
        super().__init__()
        self.d_model = d_model
        self.nhead = nhead
        self.relative_position_bias_table = nn.Parameter(torch.zeros((2 * self.d_model - 1, self.nhead)))
        nn.init.xavier_uniform_(self.relative_position_bias_table)

    def forward(self, length_query: int, length_key: int) -> torch.Tensor:
        indices_query = torch.arange(length_query, device=self.relative_position_bias_table.device)
        indices_key = torch.arange(length_key, device=self.relative_position_bias_table.device)
        distance_matrix = indices_key.unsqueeze(0) - indices_query.unsqueeze(1)
        distance_matrix_clipped = torch.clamp(distance_matrix, -(self.d_model-1), self.d_model-1)
        final_matrix = distance_matrix_clipped + self.d_model - 1
        embeddings = self.relative_position_bias_table[final_matrix.to(torch.long)]
        return embeddings
    


class AttentionHead(nn.Module):

    def __init__(self, hidden_size, d_model, k_bias_matrix, v_bias_matrix):
        super().__init__()
        self.d_model = d_model
        self.query_weights: nn.Linear = nn.Linear(hidden_size, self.d_model)
        self.key_weights: nn.Linear = nn.Linear(hidden_size, self.d_model)
        self.value_weights: nn.Linear = nn.Linear(hidden_size, self.d_model)
        self.k_bias_matrix = k_bias_matrix
        self.v_bias_matrix = v_bias_matrix

    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        query: torch.Tensor = self.query_weights(query) # (b_s, n_t, head_dim)
        key: torch.Tensor = self.key_weights(key) # (b_s, n_t, head_dim)
        value: torch.Tensor = self.value_weights(value) # (b_s, n_t, head_dim)
        # Self-Attention scores
        attn_1: torch.Tensor = torch.matmul(query, key.transpose(1, 2)) # Q*K^T:(b_s, n_t, n_t)
        # Relative Position Attention scores
        attn_2: torch.Tensor = torch.matmul(query.permute(1, 0, 2), self.k_bias_matrix.transpose(1, 2)).transpose(0, 1) # Q*K_shifting^T:(b_s, n_t, n_t)
        # Relation-aware Self-Attention scores
        att_scores: torch.Tensor = (attn_1 + attn_2)/(self.d_model ** 0.5)
        if mask is not None:
            mask = mask.to(torch.int)
            att_scores: torch.Tensor = att_scores.masked_fill(mask.unsqueeze(1) == 0, -1e9)
        att_weights: torch.Tensor = F.softmax(att_scores, dim=-1)
        # Weighted sum of values
        values_1: torch.Tensor = torch.matmul(att_weights, value) # (b_s, n_t, head_dim)
        # Relative Position Representation for values
        values_2: torch.Tensor = torch.matmul(att_weights.permute(1, 0, 2), self.v_bias_matrix).transpose(0, 1) # (b_s, n_t, head_dim)
        # Relation-aware values
        n_value  = values_1 + values_2
        return n_value


class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size = 1024, num_heads = 16, k = 64, seq_len = 1500):
        super().__init__()
        self.hidden_size: int = hidden_size
        self.num_heads: int = num_heads
        self.head_dim: int = hidden_size // num_heads
        assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads"
        self.relative_position_k: torch.Tensor = RelativePosition(self.head_dim, k)
        self.relative_position_v: torch.Tensor = RelativePosition(self.head_dim, k)
        self.k_bias_matrix: torch.Tensor = self.relative_position_k(seq_len, seq_len)
        self.v_bias_matrix: torch.Tensor = self.relative_position_v(seq_len, seq_len)
        self.attention_heads: nn.ModuleList = nn.ModuleList([AttentionHead(self.hidden_size, self.head_dim, self.k_bias_matrix, self.v_bias_matrix) for _ in range(self.num_heads)])
        self.fc: nn.Linear = nn.Linear(hidden_size, hidden_size)

    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        attention_outputs: List[torch.Tensor] = [attention_head(query, key, value, mask=mask) for attention_head in self.attention_heads]
        hidden_state: torch.Tensor = torch.cat(attention_outputs, dim=-1)
        hidden_state: torch.Tensor = self.fc(hidden_state)
        return hidden_state


In [None]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model=1024, num_heads=16, dim_feedforward=4096, dropout=0.1, k=64, seq_len=1500):
        super(DecoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model, num_heads, k, seq_len)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.ReLU(),
            nn.Linear(dim_feedforward, d_model)
        )
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Multi-head attention
        attn_output = self.attention(x, x, x)
        x = self.layer_norm1(x + self.dropout(attn_output))
        # Feed-forward network
        ff_output = self.feed_forward(x)
        x = self.layer_norm2(x + self.dropout(ff_output))
        return x
    
class Decoder(pl.LightningModule):
    def __init__(self, d_model=1024, num_layers = 12, num_heads=16, dim_feedforward=4096, dropout=0.1, k=64, seq_len=1500, vocab_size = 1500):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, dim_feedforward, dropout, k, seq_len)
            for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(d_model)
        self.linear = nn.Linear(d_model, vocab_size)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x)
        output = self.linear(self.norm(x))    
        return torch.argmax(output, dim=-1)
    
    def training_step(self, batch, batch_idx):
        input_ids, target_ids = batch
        output = self(input_ids)
        print(output.require)
        loss = self.loss(output.float(), target_ids.float())
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-4)  # Replace with your preferred optimizer
        return optimizer

model = Decoder()


In [None]:
#with torch.no_grad():
    #predictions = model(firstSemantic[0])
    #predicted_ids = torch.argmax(predictions, dim=-1)

#predicted_ids = predicted_ids.squeeze(0).tolist()

In [None]:
# Callbacks for saving the best model and early stopping
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath='checkpoints',
    filename='best-checkpoint',
    save_top_k=1,
    mode='min'
)

early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=3,
    mode='min'
)

# Trainer configuration
trainer = pl.Trainer(
    max_epochs=10,  # Set the number of epochs
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',  # Use GPU if available
    devices=1 if torch.cuda.is_available() else 1,  # Number of devices for both GPU and CPU
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\fabri\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [None]:
trainer.fit(model, train_dataset)


  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | embedding | Embedding        | 1.5 M  | train
1 | layers    | ModuleList       | 151 M  | train
2 | norm      | LayerNorm        | 2.0 K  | train
3 | linear    | Linear           | 1.5 M  | train
4 | loss      | CrossEntropyLoss | 0      | train
-------------------------------------------------------
154 M     Trainable params
0         Non-trainable params
154 M     Total params
617.701   Total estimated model params size (MB)


Epoch 0:   0%|          | 0/51 [00:00<?, ?it/s] 

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [20]:
for batch in semanticDataset:
	input, label = batch
	print(input.requires_grad)
	break

False
