In [1]:
import argparse
import math
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Union

import datasets
import torch
from accelerate import Accelerator
from accelerate.logging import get_logger
from datasets import DatasetDict, concatenate_datasets, load_dataset
from huggingface_hub import HfApi
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm

device = 'cuda'

import transformers
from transformers import (
    AdamW,
    SchedulerType,
    get_scheduler,
    is_wandb_available,
    set_seed,
    Wav2Vec2FeatureExtractor
)
from model import (
    Wav2Vec2ForPreTraining,
    Wav2Vec2Config,
    Wav2Vec2FeatureEncoder
)

from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices
from transformers.utils import send_example_telemetry

In [2]:
@dataclass
class DataCollatorForWav2Vec2Pretraining:


    model: Wav2Vec2ForPreTraining
    feature_extractor: Wav2Vec2FeatureExtractor
    padding: Union[bool, str] = "longest"
    pad_to_multiple_of: Optional[int] = None
    mask_time_prob: Optional[float] = 0.65
    mask_time_length: Optional[int] = 10

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # reformat list to dict and set to pytorch format

        input_values = [feature['input_values'][0] for feature in features]  # Note the [0] to get the tensor from the list

        # Wrap input_values in a dictionary
        inputs_dict = {'input_values': input_values}


        batch = self.feature_extractor.pad(
            inputs_dict,
            padding=self.padding,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        device = batch["input_values"].device
        batch_size = batch["input_values"].shape[0]

        mask_indices_seq_length = self.model._get_feat_extract_output_lengths(batch["input_values"].shape[-1])
        # make sure masked sequence length is a Python scalar
        mask_indices_seq_length = int(mask_indices_seq_length)

        # make sure that no loss is computed on padded inputs
        if batch.get("attention_mask") is not None:
            # compute real output lengths according to convolution formula
            batch["sub_attention_mask"] = self.model._get_feature_vector_attention_mask(
                mask_indices_seq_length, batch["attention_mask"]
            )

        features_shape = (batch_size, mask_indices_seq_length)

        # sample randomly masked indices
        mask_time_indices = _compute_mask_indices(
            features_shape,
            self.mask_time_prob,
            self.mask_time_length,
            attention_mask=batch.get("sub_attention_mask"),
        )

        # sample negative indices
        sampled_negative_indices = _sample_negative_indices(
            features_shape,
            self.model.config.num_negatives,
            mask_time_indices=mask_time_indices,
        )
        batch["mask_time_indices"] = torch.tensor(mask_time_indices, dtype=torch.long, device=device)
        batch["sampled_negative_indices"] = torch.tensor(sampled_negative_indices, dtype=torch.long, device=device)

        return batch


def multiply_grads(params, c):
    """Multiplies grads by a constant *c*."""
    for p in params:
        if p.grad is not None:
            if torch.is_tensor(c):
                c = c.to(p.grad.device)
            p.grad.data.mul_(c)


def get_grad_norm(params, scale=1):
    """Compute grad norm given a gradient scale."""
    total_norm = 0.0
    for p in params:
        if p.grad is not False:
            print('not none')
            print(p)
            param_norm = (p.grad.detach().data / scale).norm(2)
            total_norm += param_norm.item() ** 2
    total_norm = total_norm**0.5
    return total_norm

In [3]:
#accelerator = Accelerator()

In [4]:
#set_seed(0)

In [5]:
from dataset import AudioDataset
import random

parent_dir = 'data/mp3_train_files'
file_list = [os.path.join(root, file) 
             for root, _, files in os.walk(parent_dir) 
             for file in files]

random.seed(42)
random.shuffle(file_list)

train_size = int(0.8 * len(file_list))
val_size = int(0.1 * len(file_list))
test_size = len(file_list) - train_size - val_size

train_files = file_list[:train_size]
val_files = file_list[train_size:train_size + val_size]
test_files = file_list[train_size + val_size:]

train_dataset = AudioDataset(train_files)
val_dataset = AudioDataset(val_files)
test_dataset = AudioDataset(test_files)

In [6]:
config = Wav2Vec2Config()
feature_extractor = Wav2Vec2FeatureExtractor()

In [7]:
model = Wav2Vec2ForPreTraining(config).to(device)

mask_time_prob = config.mask_time_prob
mask_time_length = config.mask_time_length 



In [8]:
for name, param in model.named_parameters():
    print('true' if param.requires_grad else 'false')
    break


true


In [9]:
data_collator = DataCollatorForWav2Vec2Pretraining(
        model=model,
        feature_extractor=feature_extractor,
        #pad_to_multiple_of=args.pad_to_multiple_of,
        mask_time_prob=mask_time_prob,
        mask_time_length=mask_time_length,
    )

train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=data_collator,
        batch_size=8,
    )

eval_dataloader = DataLoader(
        val_dataset, collate_fn=data_collator, batch_size=8
    )

optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=0.004, # 5e-5
        betas=[0.9, 0.999],
        eps=1e-6,
    )

# model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
#         model, optimizer, train_dataloader, eval_dataloader
#     )

In [10]:
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / 1)
max_train_steps = 3 * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
        name='linear',
        optimizer=optimizer,
        num_warmup_steps=10,
        num_training_steps=max_train_steps,
    )

num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)

In [11]:
total_batch_size = 4 
max_gumbel_temperature = 2.0
min_gumbel_temperature = 0.5
gumbel_temperature_decay = 0.999995
logging_steps = 10
gradient_accumulation_steps = 8
saving_steps = 500
push_to_hub = False
output_dir = 'weights'


In [12]:
from torch.utils.tensorboard import SummaryWriter

log_dir = "runs/2"  # Change this to your desired log directory
writer = SummaryWriter(log_dir=log_dir)

In [20]:
batch = next(iter(train_dataloader))
input_values = batch['input_values']
print("Input Values:", input_values)

# Check for extreme values
print(f"Min: {input_values.min()}, Max: {input_values.max()}")

Input Values: tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0910e-01,
         -9.8474e-02, -1.1041e-01],
        [ 8.9839e-10,  1.6265e-09,  1.7210e-09,  ..., -9.8750e-02,
         -7.6209e-02, -1.3102e-01],
        [ 1.0619e-01,  2.2158e-01,  2.0276e-01,  ...,  6.2053e-01,
          6.5487e-01,  8.0091e-01],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  2.4637e-01,
          2.3559e-01,  2.1247e-01],
        [ 6.6088e-11,  1.2577e-10,  9.7566e-11,  ..., -3.2955e-02,
         -4.2624e-02, -6.0856e-02],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  8.2496e-02,
          8.7817e-02,  1.0291e-01]])
Min: -1.0, Max: 1.0


In [21]:
model = Wav2Vec2ForPreTraining(config)


optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)
progress_bar = tqdm(range(max_train_steps))
completed_steps = 0
starting_epoch = 0

for epoch in range(starting_epoch, num_train_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss

        if torch.isnan(loss).any() or torch.isinf(loss).any():
            print("NaN or Inf detected in loss")
            continue

        print("Loss value:", loss.item())
        print("Requires grad:", loss.requires_grad)

        loss.backward()
        
        grads = [p.grad for p in model.parameters() if p.grad is not None]
        if not grads:
            print("No gradients computed")
            continue
        else:
            for name, param in model.named_parameters():
                if param.grad is not None:
                    print(f"{name}: grad norm = {param.grad.norm().item()}")

        optimizer.step()
        optimizer.zero_grad()

        completed_steps += 1
        progress_bar.update(1)

        if completed_steps >= max_train_steps:
            break

    # Validation code here (omitted for brevity)

progress_bar.close()


Reinitializing wav2vec2.feature_extractor.conv_layers.0.conv.weight
Reinitializing wav2vec2.feature_extractor.conv_layers.0.layer_norm.weight


ValueError: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions

In [13]:
progress_bar = tqdm(range(max_train_steps))
completed_steps = 0
starting_epoch = 0

for epoch in range(starting_epoch,num_train_epochs):
    model.train()
    for step,batch in enumerate(train_dataloader):
        batch = batch.to(device)

        num_losses = batch["mask_time_indices"].sum()
        
        sub_attention_mask = batch.pop('sub_attention_mask',None)
        sub_attention_mask = (
                sub_attention_mask if sub_attention_mask is not None else torch.ones_like(batch["mask_time_indices"])
            )
        if sub_attention_mask is None:
             print('HERE')
        percent_masked = num_losses / sub_attention_mask.sum()


        outputs = model(**batch)


       # print(outputs.loss)

      

        loss = outputs.loss

        
      #  print("Loss value:", loss.item())
      #  print("Requires grad:", loss.requires_grad)

        loss.backward()

        #print("Gradients after backward pass:", [p.grad for p in model.parameters() if p.grad is not None])

        # if all(p.grad is None for p in model.parameters()):
        #     print("No gradients computed")
        #     continue


        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)

        multiply_grads(model.parameters(), 1 / num_losses)

        if (step + 1) % 1 == 0 or step == len(train_dataloader) - 1:

            optimizer.zero_grad()
            optimizer.step()
            
         

           # print(scale)

            grad_norm = get_grad_norm(model.parameters())

           # print("Parameters:", [p for p in model.parameters() if p.grad is not False])
            

     
            # update gumbel temperature
            gumbel_temperature = max(
                        max_gumbel_temperature * gumbel_temperature_decay**completed_steps,
                        min_gumbel_temperature,
                )
            if hasattr(model, "module"):
                    model.module.set_gumbel_temperature(gumbel_temperature)
            else:
                    model.set_gumbel_temperature(gumbel_temperature)

            progress_bar.update(1)
            completed_steps += 1

        if (step + 1) % (1 * logging_steps) == 0:
            loss.detach()
            outputs.contrastive_loss.detach()
            outputs.diversity_loss.detach()

            train_logs = {
                    "loss": (loss * gradient_accumulation_steps) / num_losses,
                    "constrast_loss": outputs.contrastive_loss / num_losses,
                    "div_loss": outputs.diversity_loss / num_losses,
                    "%_mask_idx": percent_masked / 1,
                    "ppl": outputs.codevector_perplexity,
                    "lr": torch.tensor(optimizer.param_groups[0]["lr"]),
                    "temp": torch.tensor(gumbel_temperature),
                    "grad_norm": torch.tensor(grad_norm),
            }
            log_str = ""
            for k, v in train_logs.items():
                log_str += "| {}: {:.3e}".format(k, v.item())

            if True:
                progress_bar.write(log_str)
                for k, v in train_logs.items():
                    writer.add_scalar(f'train/{k}', v.item(), completed_steps)
        
    

        if completed_steps >= max_train_steps:
                break
        

    # 7. Validate!

    model.eval()

    # init logs
    val_logs = {
        "val_loss": 0,
        "val_contrastive_loss": 0,
        "val_diversity_loss": 0,
        "val_num_losses": 0,
    }
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            batch.pop("sub_attention_mask", None)
            outputs = model(**batch)

        val_logs["val_loss"] += outputs.loss
        val_logs["val_contrastive_loss"] += outputs.contrastive_loss
        val_logs["val_diversity_loss"] += outputs.diversity_loss
        val_logs["val_num_losses"] += batch["mask_time_indices"].sum()


    val_logs = {k: v / val_logs["val_num_losses"] for k, v in val_logs.items()}

    log_str = ""
    for k, v in val_logs.items():
        log_str += "| {}: {:.3e}".format(k, v.item())

    if accelerator.is_local_main_process:
        progress_bar.write(log_str)
        # Log validation metrics to TensorBoard
        for k, v in val_logs.items():
            writer.add_scalar(f'val/{k}', v.item(), epoch)
        
    if output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
        )

        

                
writer.close()


  0%|          | 0/4650 [00:00<?, ?it/s]

not none
Parameter containing:
tensor([8.4824e-02, 9.8099e-01, 4.5328e-01, 4.3967e-01, 5.0145e-01, 1.8659e-01,
        2.7512e-01, 2.2131e-02, 1.4914e-01, 1.1665e-01, 2.8491e-01, 4.4491e-01,
        9.8502e-01, 3.4413e-01, 2.9262e-01, 1.1975e-01, 4.6609e-01, 1.9719e-01,
        4.1189e-01, 3.7040e-01, 5.7756e-01, 3.1124e-01, 8.7557e-01, 7.7711e-02,
        3.9245e-01, 2.3488e-01, 8.5205e-01, 8.7894e-01, 3.0212e-01, 7.3275e-02,
        7.8919e-01, 6.7912e-01, 2.2240e-01, 2.6755e-01, 5.2048e-01, 1.6625e-01,
        7.6600e-01, 7.9738e-01, 2.7697e-01, 2.3656e-01, 7.1059e-01, 1.8312e-01,
        5.6537e-01, 7.8787e-01, 6.5091e-01, 4.5836e-01, 6.0524e-01, 8.9491e-02,
        2.2072e-01, 3.9741e-01, 6.5188e-01, 4.2748e-01, 3.5178e-01, 2.7553e-01,
        9.9817e-01, 5.9067e-01, 1.4937e-02, 9.8916e-01, 9.2044e-01, 5.5102e-01,
        8.6186e-01, 4.1171e-01, 5.8003e-01, 3.1871e-01, 6.7465e-02, 7.6799e-01,
        9.1963e-01, 9.1382e-01, 2.5272e-02, 2.3235e-01, 7.9589e-01, 1.1008e-01,
        9

AttributeError: 'NoneType' object has no attribute 'detach'

In [14]:
def get_grad_norm(params, scale=1):
    """Compute grad norm given a gradient scale."""
    total_norm = 0.0
    for p in params:
        if p.grad is not False:
            print('not none')
            print(p)
            param_norm = (p.grad.detach().data / scale).norm(2)
            total_norm += param_norm.item() ** 2
    total_norm = total_norm**0.5
    return total_norm

In [15]:
for p in model.parameters():
    if p.grad is not False:
        print(p.grad)

        

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


In [16]:
get_grad_norm(model.parameters())

not none
Parameter containing:
tensor([8.4824e-02, 9.8099e-01, 4.5328e-01, 4.3967e-01, 5.0145e-01, 1.8659e-01,
        2.7512e-01, 2.2131e-02, 1.4914e-01, 1.1665e-01, 2.8491e-01, 4.4491e-01,
        9.8502e-01, 3.4413e-01, 2.9262e-01, 1.1975e-01, 4.6609e-01, 1.9719e-01,
        4.1189e-01, 3.7040e-01, 5.7756e-01, 3.1124e-01, 8.7557e-01, 7.7711e-02,
        3.9245e-01, 2.3488e-01, 8.5205e-01, 8.7894e-01, 3.0212e-01, 7.3275e-02,
        7.8919e-01, 6.7912e-01, 2.2240e-01, 2.6755e-01, 5.2048e-01, 1.6625e-01,
        7.6600e-01, 7.9738e-01, 2.7697e-01, 2.3656e-01, 7.1059e-01, 1.8312e-01,
        5.6537e-01, 7.8787e-01, 6.5091e-01, 4.5836e-01, 6.0524e-01, 8.9491e-02,
        2.2072e-01, 3.9741e-01, 6.5188e-01, 4.2748e-01, 3.5178e-01, 2.7553e-01,
        9.9817e-01, 5.9067e-01, 1.4937e-02, 9.8916e-01, 9.2044e-01, 5.5102e-01,
        8.6186e-01, 4.1171e-01, 5.8003e-01, 3.1871e-01, 6.7465e-02, 7.6799e-01,
        9.1963e-01, 9.1382e-01, 2.5272e-02, 2.3235e-01, 7.9589e-01, 1.1008e-01,
        9

AttributeError: 'NoneType' object has no attribute 'detach'

# example pretraining 
https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py