In [7]:
import os 
import time

import pandas as pd
import math
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
from IPython.display import Audio

import torch
import torch.nn.functional as F
import torch.nn as nn
import torchaudio
from torchaudio.functional import spectrogram
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence

def check_for_nans(tensor, tensor_name):
    if torch.isnan(tensor).any():
        print(f"NaNs found in {tensor_name}")
    if torch.isinf(tensor).any():
        print(f"Infinities found in {tensor_name}")
        
def mask(sequence_lengths, max_length):
    ones = sequence_lengths.new_ones(sequence_lengths.size(0), max_length)
    range_tensor = ones.cumsum(dim=1)
    return sequence_lengths.unsqueeze(1) >= range_tensor

In [8]:
paths=[]
labels=[]
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        paths.append(os.path.join(dirname, filename))
        label = filename[::-1].split('_')[0][::-1]
        labels.append(label.lower())

data = pd.DataFrame({'speech':paths,'label':labels})

In [9]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()


data['encoded_label'] = label_encoder.fit_transform(data['label'])

In [10]:
train_df = data[0:int(len(data)*0.65)]
test_df = data[int(len(data)*0.65):-1]

In [11]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df

    def __getitem__(self, index):
        row = self.df.iloc[index]
        wav_path = row["speech"]
        label = row['encoded_label'] 
        
        
        waveform, sample_rate = torchaudio.load(wav_path) 
        
        return waveform,label

    def __len__(self):
        return len(self.df)
    
def collate_fn(batch):

    wav_lgt_max = torch.tensor([wav.shape[-1] for wav,_  in batch],dtype=torch.int32).max()
    
    wav_lgts_l = []
    wav_pads_l = []
    
    labels = []
    
    for wav, label in batch:
        
        wav_lgt = wav.shape[-1]
        
        
        wav_pad = torch.nn.functional.pad(wav,
                                          pad=[0, wav_lgt_max-wav_lgt],
                                               value=0)
        labels.append(label)    
        wav_lgts_l.append(wav_lgt)       
        wav_pads_l.append(wav_pad)
        
    labels = torch.tensor(labels, dtype=torch.int32)
    wav_lgts_l = torch.tensor(wav_lgts_l, dtype=torch.int32)
    wav_pads_l = torch.stack(wav_pads_l, 0)

    
    return wav_pads_l,wav_lgts_l,labels


In [22]:
class CnnEmbedding(nn.Module):
    def __init__(self,d_model=512):
        super(CnnEmbedding,self).__init__()
        self.conv1 = nn.Conv1d(in_channels = 1 ,out_channels = d_model,
                                kernel_size=10,stride=5)
        
        self.conv2 = nn.Conv1d(in_channels = d_model ,out_channels = d_model,
                                kernel_size=8,stride=4)
        
        self.conv3 = nn.Conv1d(in_channels = d_model ,out_channels = d_model,
                                kernel_size=4,stride=2)
        
        self.bn1 = nn.BatchNorm1d(d_model)
        self.bn2 = nn.BatchNorm1d(d_model)
        self.bn3 = nn.BatchNorm1d(d_model)
        
        self.dropout1 = nn.Dropout(0.5)
        self.dropout2 = nn.Dropout(0.5)
        self.dropout3 = nn.Dropout(0.5)
        
    def forward(self,x):
        
        x = self.conv1(x)
        x = self.bn1(x)
        x = torch.tanh(x)
        x = self.dropout1(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        x = torch.tanh(x)
        x = self.dropout2(x)
        
        x = self.conv3(x)
        x = self.bn3(x)
        x = torch.tanh(x)
        x = self.dropout3(x)
        
        return x 
    
class VectorQuantizer(nn.Module):
    def __init__(self, num_codebooks=320, codebook_dim=512):
        super(VectorQuantizer, self).__init__()
        self.codebook = nn.Parameter(torch.randn(num_codebooks, codebook_dim))

    def forward(self, x):
        # x shape: (batch_size, reduced_samples, d_model)
        # Compute L2 distance between each latent vector and codebook entries
        x_flattened = x.reshape(-1, x.shape[-1])  # Flatten to shape (batch_size * reduced_samples, d_model)
        distances = torch.cdist(x_flattened, self.codebook, p=2)  # Compute pairwise L2 distance
        
        # Get the index of the closest codebook entry for each latent vector
        closest_indices = torch.argmin(distances, dim=-1)  # Shape: (batch_size * reduced_samples)
        
        # Quantize by replacing the latent vectors with the closest codebook vectors
        quantized = self.codebook[closest_indices].view(x.shape)  # Reshape back to original shape
        
        return quantized, closest_indices
        

In [23]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model=512, max_len=5000):
        super(PositionalEncoding, self).__init__()
    
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # Apply sine to even dimensions
        pe[:, 1::2] = torch.cos(position * div_term)  # Apply cosine to odd dimensions
        
        pe = pe.unsqueeze(0).transpose(0, 1)  # Shape: (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):

        x = x + self.pe[:x.size(0), :]
        return x


In [24]:
class Encoder(nn.Module):
    def __init__(self,d_model=512):
        super(Encoder, self).__init__()
    
        self.attn = torch.nn.MultiheadAttention(embed_dim = d_model,
                                                num_heads=4,
                                                dropout=0.1,
                                                batch_first=True)
        
        self.linear1 = nn.Linear(in_features = d_model,out_features = d_model*2)
        self.linear2 = nn.Linear(in_features = d_model*2,out_features = d_model)
        
        self.norm1 = nn.LayerNorm(normalized_shape = d_model)
        self.norm2 = nn.LayerNorm(normalized_shape = d_model)
        
        self.dropout1 = torch.nn.Dropout(0.2)
        self.dropout2 = torch.nn.Dropout(0.2)
        self.dropout3 = torch.nn.Dropout(0.2)
        
    def forward(self,x,attn_mask=None,key_padding_mask=None):

        x_tmp = self.norm1(x)

        x_tmp, _ = self.attn(query=x_tmp,
                             key=x_tmp,
                             value=x_tmp,
                             attn_mask=attn_mask,
                             key_padding_mask=key_padding_mask)
    

        x_tmp = self.dropout1(x_tmp)
        x = x + x_tmp    
 
        x_tmp = self.norm2(x) 
        x_tmp = self.linear1(x_tmp)
        x_tmp = F.relu(x_tmp)

            
        x_tmp = self.dropout2(x_tmp)
        x_tmp = self.linear2(x_tmp)
        x_tmp = self.dropout3(x_tmp)
   
        x = x + x_tmp
            
        return x

In [54]:
class ContrastiveLoss(nn.Module):
    def __init__(self,device = 'cuda', temperature=0.1):
        super(ContrastiveLoss, self).__init__()
        self.temperature = temperature
        self.device = device

    def forward(self, x_pre, x_vq, negatives):
        
        #x_pre and x_vq are shape (Batch_size,reduced_samples,d_model)
        
        positive_sim = F.cosine_similarity(x_pre[:,0,:], x_vq[:,0,:], dim=-1)  #here we get positive_sim of shape (batch_size)
        

        
        negative_sim = F.cosine_similarity(x_pre[:,0,:].unsqueeze(1), negatives[:,:,0,:], dim=-1)  #here we get negative_sim of shape (batch_size,num_negative)
        

        all_similarities = torch.cat([positive_sim.unsqueeze(1), negative_sim], dim=1)  #here we get all_similarities of shape (batch_size,num_negative+1)

        #Compute loss
        logits = all_similarities / self.temperature 
        log_prob = F.log_softmax(logits, dim=1) 
        targets = torch.zeros(log_prob.size(0), dtype=torch.long,device = self.device)

        loss = F.nll_loss(log_prob, targets)
        
        
        return loss

class AudioTransformers(nn.Module):
    def __init__(self,device = 'cuda',d_model=512,num_classes = 1,pretrained = True,num_codebooks=320,codebook_dim=512):
        super(AudioTransformers,self).__init__()
        
        self.d_model = d_model
        self.pretrained = pretrained
        self.wave_embedding = CnnEmbedding(d_model=self.d_model)
        
        
        self.vq = VectorQuantizer(num_codebooks=320, codebook_dim=self.d_model)
        self.pos_encoding_pretrained =  PositionalEncoding(d_model=self.d_model)
        self.encoder_pretrained1 = Encoder(d_model=self.d_model)
        self.encoder_pretrained2 = Encoder(d_model=self.d_model)
        self.encoder_pretrained3 = Encoder(d_model=self.d_model)

        self.pos_encoding_finetune = PositionalEncoding(d_model=self.d_model)
        self.encoder_finetune1 = Encoder(d_model=self.d_model)
        self.encoder_finetune2 = Encoder(d_model=self.d_model)
        self.encoder_finetune3 = Encoder(d_model=self.d_model)
        
        self.fc = nn.Linear(in_features = self.d_model ,out_features = num_classes)
        self.device = device
        
    def forward(self,x,wav_lgts):
        ## Embedding block
        batch_size,_ ,seq_lenght = x.shape
        x = self.wave_embedding(x)
        x = x.transpose(1,2)
        ## Embedding block
        
        
        if self.pretrained:
            ## Pretained block
            sequence_length = x.shape[1]
            x_vq,_ = self.vq(x)
            x_masked,_ = self.apply_span_masking(x)
 
            x_masked = self.pos_encoding_pretrained(x_masked)
            x_pre = self.encoder_pretrained1(x_masked)
            x_pre = self.encoder_pretrained2(x_pre)
            x_pre = self.encoder_pretrained3(x_pre)
            
            return x_pre,x_vq
            ## Pretrained block
        
        else:
            ## Transformers tasks block
            x = self.encoder_finetune1(x)
            x = self.encoder_finetune2(x)
            x = self.encoder_finetune3(x)
            ## Transformers tasks block

            ## Tasks Block
            # TO-DO get in features for fc layer and apply fc layer
            # Global mean pooling
            x = x.mean(dim=1)
            x = self.fc(x)
            ## Tasks Block
            
            return x
        

    def apply_span_masking(self,x, mask_prob=0.65, mask_length=10):
        """
        Apply span masking on the latent space feature representation.

        Args:
        - features: Latent feature sequence, shape (batch_size, seq_len, feature_dim)
        - mask_prob: Probability of masking a portion of the sequence.
        - mask_length: The length of each mask span.

        Returns:
        - masked_features: Masked feature tensor.
        - mask_indices: Indices where masking was applied.
        """
        batch_size, seq_len, feature_dim = x.shape

        num_masked_spans = int(mask_prob * seq_len / mask_length)

        mask_indices = torch.zeros((batch_size, seq_len), dtype=torch.bool)

        for i in range(batch_size):
            span_starts = torch.randint(0, seq_len - mask_length, (num_masked_spans,))
            for start in span_starts:
                mask_indices[i, start:start + mask_length] = 1

        masked_features = x.clone()
        masked_features[mask_indices] = -1e-8

        return masked_features, mask_indices
    
    def calculate_test_loss(self, test_loader, criterion):
        self.to(self.device) 
        loss_mean = 0.0
        self.eval()
        dice_coeff_mean = 0.0
        with torch.no_grad():
            for i, batch in enumerate(test_loader):
                patches, target = batch[0].to(self.device), batch[1].to(self.device)
          
                output = self(patches)
                test_loss,dice_coeff = criterion(output, target)

                loss_mean += test_loss.item()
                dice_coeff_mean += dice_coeff.item()
                

        loss_mean = np.round(loss_mean/(i+1), 5)
        dice_coeff_mean = np.round(dice_coeff_mean/(i+1), 5)
        return loss_mean,dice_coeff_mean
        

In [59]:
dataset_train = CustomDataset(train_df)
dataset_test = CustomDataset(test_df)
dataloader_train = DataLoader(dataset_train , batch_size=2, shuffle=True,collate_fn=collate_fn)
dataloader_test = DataLoader(dataset_test, batch_size=8, shuffle=False,collate_fn=collate_fn)

model = AudioTransformers(device = 'cuda',d_model=512,num_classes = 6,pretrained = True,num_codebooks=320,codebook_dim=512).cuda()
criterion =  ContrastiveLoss(temperature=0.1).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scaler = torch.cuda.amp.GradScaler()  

  scaler = torch.cuda.amp.GradScaler()


In [60]:
epochs = 3
for ep in range(epochs):
    i = 0
    loss_ = 0.0
    for batch in dataloader_train:
        wav_pad,wav_lgts, label = batch[0].cuda(),batch[1].cuda(),batch[2].cuda()
        
        model.train(True)
        optimizer.zero_grad()
        
        with torch.cuda.amp.autocast():
            x_pre, x_vq = model(wav_pad,wav_lgts)
        
            batch_size, sequence_length, d_model = x_pre.shape
            num_negatives = 1
            negatives = torch.randn(batch_size, num_negatives, sequence_length, d_model).to(wav_pad.device)  # Make sure it's on the same device

            loss = criterion(x_pre, x_vq, negatives)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        
        loss_ += loss.item()
        i += 1
        
    print(f"Epoch : {ep+1} , Loss: {loss_/(i+1)}")


  with torch.cuda.amp.autocast():


OutOfMemoryError: CUDA out of memory. Tried to allocate 202.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 107.12 MiB is free. Process 2526 has 15.78 GiB memory in use. Of the allocated memory 14.94 GiB is allocated by PyTorch, and 558.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)