In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.autograd import Variable

import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
from torchmetrics.functional import retrieval_normalized_dcg

In [2]:
# Global configuration shared by the cells below.
# os.cpu_count() may return None when the count cannot be determined; fall back
# to 1 so the value is always a valid integer for DataLoader(num_workers=...).
cpu_count = os.cpu_count() or 1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
time_size = 10      # number of time steps kept per audio clip
embed_dim = 128     # embedding dimension per time step
embed_max = 256     # divisor used to normalize the embeddings
label_num = 527     # total number of distinct labels in the dataset
top_label_num = 10  # number of most frequent labels used as targets
bs = 16  # batch_size

In [65]:
def df_to_tensor(data_names, top_label_num=top_label_num, embed_dim=embed_dim, embed_max=embed_max, label_num=label_num, time_size=time_size):
    """Load parquet datasets and turn them into (embedding, label) tensors.

    Args:
        data_names (list[str]): list of dataset versions to use; each one is read from '<name>.parquet'.
        top_label_num (int, optional): Number of most frequent labels to use. Defaults to 10.
        embed_dim (int, optional): Embedding dimension per second. Defaults to 128.
        embed_max (int, optional): Maximum Possible value in the embedding vectors. Defaults to 256.
        label_num (int, optional): Total number of distinct labels in dataset. Defaults to 527.
        time_size (int, optional): Required number of embedding rows per sample; samples with a
            different length are dropped. Defaults to the notebook-level ``time_size``.

    Raises:
        ValueError: If ``data_names`` is empty, or if no sample carries any of the selected labels.

    Returns:
        tuple(normalized_embedding_tensor, label_tensor): embedding tensor of shape
            (N, time_size, embed_dim) divided by ``embed_max``, and the multi-hot label tensor of
            shape (N, top_label_num) restricted to the most frequent labels.
    """
    if len(data_names) == 0:
        raise ValueError("data_names must contain at least one dataset name")

    # Collect the per-dataset frames and concatenate once (avoids re-copying inside the loop).
    frames = []
    for data_name in data_names:
        add_df = pd.read_parquet(f'{data_name}.parquet')
        # Drop rows missing either column used below. The previous `add_df[~add_df.isnull()]`
        # only masked individual cells and never removed a row.
        add_df = add_df.dropna(subset=['labels', 'audio_embedding'])
        print(f"{data_name}_df shape: ",add_df.shape)
        # Keep only samples that have exactly `time_size` embedding rows.
        add_df = add_df[add_df['audio_embedding'].apply(len) == time_size]
        frames.append(add_df)
    df = pd.concat(frames, axis=0, ignore_index=True)

    # Build the (num_samples, label_num) multi-hot matrix from the "labels" column,
    # which holds the label indices of each sample.
    multi_hot = np.zeros((len(df), label_num), dtype=int)
    for row, sample_labels in enumerate(df['labels']):
        multi_hot[row, np.asarray(sample_labels, dtype=int)] = 1

    label_df = pd.DataFrame(multi_hot)
    top_label_idx = list(label_df.sum(axis=0).nlargest(top_label_num).index)  # Most frequent labels
    label_df = label_df[top_label_idx]  # Only keep the selected label columns

    # Samples that carry at least one of the selected labels. The same positional mask is
    # applied to labels and embeddings so both tensors stay row-aligned. (The previous code
    # filtered the embeddings through a separate `label` column that this function never creates.)
    keep_mask = (label_df.sum(axis=1) > 0).to_numpy()
    if not keep_mask.any():
        raise ValueError("No sample contains any of the selected top labels")

    label_tensor = torch.Tensor(label_df.loc[keep_mask].to_numpy())  # Obtain pytorch tensor
    print(f"Total label shape: ",label_tensor.size())

    # Convert "audio_embedding" column of the kept samples into np array and torch tensor
    kept_embeddings = df.loc[keep_mask, 'audio_embedding']
    embeddings = np.vstack([np.vstack(sample) for sample in kept_embeddings]).reshape(-1,time_size,embed_dim)
    embedding_tensor = torch.Tensor(embeddings)
    print(f"Total embedding shape: ", embedding_tensor.size())

    assert embedding_tensor.shape[0] == label_tensor.shape[0], "Feature and label dim does not coincide!"

    return embedding_tensor/embed_max, label_tensor   # normalize embedding_tensor by its maximum possible value

In [182]:
# Used bal_train and eval version of AudioSet data
embedding_all, label_all = df_to_tensor(['bal_train','eval'], top_label_num)

# Randomly splitting into train / validation / test data (8:1:1).
# A fixed seed makes the split identical on every Restart & Run All.
split_seed = 42
n_samples = embedding_all.shape[0]
permute_idx = np.random.default_rng(split_seed).permutation(n_samples)
train_end = (n_samples // 5) * 4   # first 80% -> train
val_end = (n_samples // 10) * 9    # next 10% -> validation, remainder -> test
train_idx = permute_idx[:train_end]
val_idx = permute_idx[train_end:val_end]
test_idx = permute_idx[val_end:]
print(f"Train data Size: {len(train_idx)} , Val data Size: {len(val_idx)},Test data Size : {len(test_idx)}")
assert len(train_idx) + len(val_idx) + len(test_idx) == n_samples, "Split does not cover every sample exactly once"

train_set = TensorDataset(embedding_all[train_idx], label_all[train_idx])
val_set = TensorDataset(embedding_all[val_idx], label_all[val_idx])
test_set = TensorDataset(embedding_all[test_idx], label_all[test_idx])

# Validation and test loaders yield one sample per step.
train_loader = DataLoader(dataset=train_set, batch_size=bs, shuffle=True, num_workers=cpu_count)
val_loader = DataLoader(dataset=val_set, batch_size=1, shuffle=False, num_workers=cpu_count)
test_loader = DataLoader(dataset=test_set, batch_size=1, shuffle=False, num_workers=cpu_count)

bal_train_df shape:  (21782, 5)
eval_df shape:  (19976, 5)
Total label shape:  torch.Size([22582, 10])
Total embedding shape:  torch.Size([22582, 10, 128])
Train data Size: 18064 , Val data Size: 2258,Test data Size : 2260


In [185]:
# Model Definition
class YoutubeAudioClassifier(nn.Module):
    def __init__(self, time_size=time_size, embed_dim=embed_dim, fc_dim=64, output_dim=top_label_num):
        """Consists of intra-temporal CNN blocks and inter-temporal CNN blocks, followed by 1 FC layer.

        The shape comments next to each layer assume the default sizes
        (time_size=10, embed_dim=128, fc_dim=64).

        Args:
            time_size (int, optional): Time horizon per each audio data. Defaults to 10.
            embed_dim (int, optional): Embedding dimension per second. Defaults to 128.
            fc_dim (int, optional): Dimension of final fully connected layer. Defaults to 64.
            output_dim (int, optional): Output dimension of the model. Defaults to 10.
        """

        super().__init__()
        self.time_size = time_size
        self.embed_dim = embed_dim
        self.embed_dim1, self.embed_dim2 = self.split_embed_dim()
        self.fc_dim = fc_dim
        self.output_dim = output_dim


        ### For each layers, input and output dimensions are specified in the comment.

        # Level1 intra-temporal CNN block
        self.intra_init_conv = nn.Sequential(
            nn.Conv2d(self.time_size, self.time_size//2, 3, padding=1),    # 10*16*8 -> 5*16*8
            nn.BatchNorm2d(self.time_size//2),
            nn.ReLU()
        )

        # Three Level2 intra-temporal CNN blocks
        self.intra_stride = nn.Sequential(
            nn.Conv2d(self.time_size//2, self.time_size//4, 2, stride=2),  # 5*16*8 -> 2*8*4
            nn.BatchNorm2d(self.time_size//4),
            nn.ReLU()
        )
        self.intra_dim1_dil = nn.Sequential(
            nn.Conv2d(self.time_size//2, self.time_size//2, 3, dilation=(2,1)),  # 5*16*8 -> 5*12*6
            nn.BatchNorm2d(self.time_size//2),
            nn.ReLU(),
            nn.Conv2d(self.time_size//2, self.time_size//4, 3, dilation=(2,1)),  # 5*12*6 -> 2*8*4
            nn.BatchNorm2d(self.time_size//4),
            nn.ReLU()
        )
        self.intra_dim2_dil = nn.Sequential(
            nn.Conv2d(self.time_size//2, self.time_size//4, (2,3), dilation=(1,2), stride=(2,1)),  # 5*16*8 -> 2*8*4
            nn.BatchNorm2d(self.time_size//4),
            nn.ReLU()
        )


        # Two Level1 inter-temporal CNN blocks
        self.inter_conv1 = nn.Sequential(
            nn.Conv2d(1, 4, (3,5), stride=(1,2)),  # 1*10*128 -> 4*8*62
            nn.BatchNorm2d(4),
            nn.ReLU(),
            nn.Conv2d(4, 8, (3,8), dilation=(1,3)),  # 4*8*62 -> 8*6*41
            nn.BatchNorm2d(8),
            nn.ReLU()
        )
        self.inter_max_conv = nn.Sequential(
            nn.MaxPool2d((2,6), stride=(2,3), padding=(1,0)),  # 1*10*128 -> 1*6*41
            nn.Conv2d(1, 8, 3, padding=1), # 1*6*41 -> 8*6*41
            nn.BatchNorm2d(8),
            nn.ReLU()
        )

        # Level2 inter-temporal CNN block
        self.inter_conv2 = nn.Sequential(
            nn.Conv2d(8, 4, (3,3), stride=(1,2)),  # 8*6*41 -> 4*4*20
            nn.BatchNorm2d(4),
            nn.ReLU(),
            nn.Conv2d(4, 2, (3,5), dilation=(1,3), padding=(1,0)),  # 4*4*20 -> 2*4*8
            nn.BatchNorm2d(2),
            nn.ReLU()
        )

        # Normalization applied to the element-wise sum of the intra and inter block outputs
        self.combine_norm = nn.Sequential(
            nn.BatchNorm2d(2),
            nn.ReLU()
        )

        self.fc = nn.Linear(self.fc_dim, self.output_dim)  # 64 -> 10



    # For balanced Width * Height split of input data for intra blocks
    def split_embed_dim(self):
        """Factor ``embed_dim`` into two integers that are as close to each other as possible.

        Searches downward from ceil(sqrt(embed_dim)) for the first divisor.

        Raises:
            ValueError: If ``embed_dim`` is smaller than 1.

        Returns:
            tuple(int, int): (embed_dim // divisor, divisor), e.g. (16, 8) for embed_dim=128.
        """
        if self.embed_dim < 1:
            raise ValueError("embed_dim must be a positive integer")

        largest_candidate = int(np.ceil(np.sqrt(self.embed_dim)))
        # Stops at 1 at the latest, which divides everything; 0 is never tried.
        for divisor in range(largest_candidate, 0, -1):
            if self.embed_dim % divisor == 0:
                return self.embed_dim // divisor, divisor


    def forward(self, data, return_logits=False):
        """Run the intra- and inter-temporal branches and classify.

        Args:
            data (torch.Tensor): Input that can be reshaped to (batch, time_size, embed_dim).
            return_logits (bool, optional): If True, return the raw output of the final linear
                layer instead of softmax probabilities. Losses such as nn.MultiLabelSoftMarginLoss
                and nn.CrossEntropyLoss apply their own sigmoid / log-softmax and are defined on
                raw scores. Defaults to False (softmax output, as before).

        Returns:
            torch.Tensor: (batch, output_dim) softmax probabilities, or logits if requested.
        """
        # reshape (instead of view) also accepts non-contiguous inputs
        intra_data = data.reshape(-1, self.time_size, self.embed_dim1, self.embed_dim2)  # Input Data for intra-temporal CNN block
        inter_data = data.reshape(-1, 1, self.time_size, self.embed_dim)  # Input data for inter-temporal CNN block

        intra_block1_out = self.intra_init_conv(intra_data)
        # Element-wise sum of the three intra_block2 components (all 2*8*4)
        intra_block2_out = (
            self.intra_stride(intra_block1_out)
            + self.intra_dim1_dil(intra_block1_out)
            + self.intra_dim2_dil(intra_block1_out)
        )

        # Element-wise sum of the two inter_block1 components (both 8*6*41)
        inter_block1_out = self.inter_conv1(inter_data) + self.inter_max_conv(inter_data)
        # Swap the last two axes (2*4*8 -> 2*8*4) so the shape matches the intra branch
        inter_block2_out = self.inter_conv2(inter_block1_out).transpose(-2,-1)

        # Sum the outputs of intra and inter temporal blocks, normalize, and flatten per sample
        cnn_out = self.combine_norm(intra_block2_out + inter_block2_out).reshape(-1,self.fc_dim)

        # Final fully connected layer
        fc_out = self.fc(cnn_out)

        if return_logits:
            return fc_out
        return F.softmax(fc_out,dim=1)
        

In [211]:
# Pytorch_lightning configuration 
class YoutubeAudioClassifierLight(pl.LightningModule):
    def __init__(self, model, train_loss_type, valid_metric_type, test_metric_type, precision_k=1):
        """
        Args:
            model (nn.Module): Pytorch-implemented model.
            train_loss_type (string): Type of loss function used for training. "msml" or "CE" are allowed.
            valid_metric_type (string): Type of metric used while validation. "ndcg" or "Precision@k" are allowed.
            test_metric_type (string): Type of metric used while testing. "ndcg" or "Precision@k" are allowed.
            precision_k (int, optional): k used whenever "Precision@k" is selected. Defaults to 1.
        """

        super().__init__()
        self.model = model
        self.train_lossF = self.define_loss(train_loss_type, Precision_k=precision_k)
        self.valid_lossF = self.define_loss(valid_metric_type, Precision_k=precision_k)
        self.test_lossF = self.define_loss(test_metric_type, Precision_k=precision_k)


    # Defining loss (or metric) function for given parameter
    def define_loss(self, loss_type, Precision_k=1):
        """
        Args:
            loss_type (string): "msml" or "CE" are allowed for training, and "ndcg" or "Precision@k" are allowed for validation/testing.
            Precision_k (int, optional): Number of topk used for Precision@k metric. Defaults to 1.

        Raises:
            ValueError: If loss_type parameter is not among "msml","CE","ndcg","Precision@k".
                (The MPS variant of "msml" additionally raises ValueError when called with an
                invalid ``reduction``.)

        Returns:
           function: Loss / Metric function to be used, corresponding to loss_type parameter.
        """
        if loss_type == 'msml':
            if device == torch.device("mps"):
                # In MPS device, F.logsigmoid is not supported.
                # It is well-known that softplus(beta = -1) is identical to logsigmoid.
                def multilabel_soft_margin_loss(input, target, weight = None, size_average = None, reduce = None, reduction = "mean"):
                    # size_average / reduce are accepted only for signature compatibility and are unused.
                    loss = -(target * F.softplus(input, beta = -1) + (1 - target) * F.softplus(-input, beta = -1))

                    if weight is not None:
                        loss = loss * weight

                    class_dim = input.dim() - 1
                    C = input.size(class_dim)
                    loss = loss.sum(dim=class_dim) / C  # only return N loss values

                    if reduction == "none":
                        return loss
                    if reduction == "mean":
                        return loss.mean()
                    if reduction == "sum":
                        return loss.sum()
                    raise ValueError(reduction + " is not valid")
                return multilabel_soft_margin_loss
            else:
                return nn.MultiLabelSoftMarginLoss()
        elif loss_type == 'ndcg':
            def batched_ndcg(input, target):
                # retrieval_normalized_dcg scores a single query; presumably it flattens
                # whatever it receives, so a whole batch would be ranked as one list.
                # Score each sample separately and average; for a batch of one sample
                # this equals calling the function directly.
                if input.dim() < 2:
                    return retrieval_normalized_dcg(input, target)
                per_sample = [retrieval_normalized_dcg(scores, labels) for scores, labels in zip(input, target)]
                return torch.stack(per_sample).mean()
            return batched_ndcg
        elif loss_type == 'Precision@k':
            def Precision_at_k(input, target):
                # Fraction of the k highest-scored labels that are true labels, averaged over the batch.
                topk_idx = torch.topk(input,Precision_k,dim=1).indices
                return target.gather(1, topk_idx).float().mean()
            return Precision_at_k
        elif loss_type == 'CE':
            # Since label data in AudioSet dataset is not one-hot encoded (b/c possibly multi-valued),
            # Need to properly transform CrossEntropyLoss function into specific form
            def MultiCELoss(input, target):
                loss = nn.CrossEntropyLoss()
                # One (sample index, class index) pair per positive label
                nonzeros = target.nonzero()
                idx = nonzeros[:,0]
                label = nonzeros[:,1]
                # Each sample's scores are repeated once per positive label and divided by that count.
                # NOTE(review): this scales the scores themselves rather than weighting the
                # per-sample loss — confirm this is the intended normalization.
                norm_input = input[idx] / torch.bincount(idx)[idx].view(-1,1)
                return loss(norm_input, label)
            return MultiCELoss
        else:
            raise ValueError("Unsupported Loss function")


    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.
        # NOTE(review): YoutubeAudioClassifier.forward ends with F.softmax, while
        # nn.MultiLabelSoftMarginLoss / nn.CrossEntropyLoss are defined on raw scores —
        # confirm that feeding probabilities here is intended.
        embedding, labels = batch
        train_loss = self.train_lossF(self.model(embedding), labels)
        self.log("train_loss", train_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return train_loss

    def validation_step(self, batch, batch_idx):
        # this is the validation loop
        embedding, labels = batch
        valid_loss = self.valid_lossF(self.model(embedding), labels)
        self.log("validation_metric", valid_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return valid_loss

    def test_step(self, batch, batch_idx):
        # this is the test loop
        embedding, labels = batch
        test_loss = self.test_lossF(self.model(embedding), labels)
        self.log("test_metric", test_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return test_loss

    def configure_optimizers(self):
        # Adam over all parameters of this module (i.e. the wrapped model)
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        return optimizer

In [217]:
# Build the network and wrap it in the Lightning module:
# multi-label soft-margin loss for training, NDCG for validation and testing.
YACLight = YoutubeAudioClassifierLight(
    model=YoutubeAudioClassifier(),
    train_loss_type="msml",
    valid_metric_type="ndcg",
    test_metric_type="ndcg",
)

In [218]:
tb_logger = pl_loggers.TensorBoardLogger(save_dir="./lightning_logs")
n_epoch = 1000  # upper bound on the number of training epochs

# Translate the torch device type into the Lightning accelerator name.
# Comparing device.type (rather than whole torch.device objects) also matches
# indexed devices such as "cuda:0"; anything unrecognized falls back to CPU.
accelerator_name = {"mps": "mps", "cuda": "gpu"}.get(device.type, "cpu")
trainer = pl.Trainer(accelerator=accelerator_name, logger=tb_logger, max_epochs=n_epoch)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [219]:
# Baseline metric (Before Training)
# Runs the test loop on the freshly initialized model so the post-training
# score has a reference point; the value is logged as "test_metric".
trainer.test(model=YACLight, dataloaders=test_loader)

Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    test_metric_epoch       0.41614019870758057
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_metric_epoch': 0.41614019870758057}]

In [230]:
# Train on train_loader and evaluate on val_loader; the epoch limit comes from
# the max_epochs value given to the Trainer.
trainer.fit(model=YACLight, train_dataloaders=train_loader, val_dataloaders=val_loader)


  | Name        | Type                     | Params
---------------------------------------------------------
0 | model       | YoutubeAudioClassifier   | 3.0 K 
1 | train_lossF | MultiLabelSoftMarginLoss | 0     
---------------------------------------------------------
3.0 K     Trainable params
0         Non-trainable params
3.0 K     Total params
0.012     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [227]:
# Test Result (After training)
# NOTE(review): this cell's recorded execution count (227) is lower than the
# fit cell's (230), so the stored output may stem from an earlier training
# run — re-run the notebook top to bottom to confirm the reported number.
trainer.test(model=YACLight, dataloaders=test_loader)

Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    test_metric_epoch       0.9122195839881897
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_metric_epoch': 0.9122195839881897}]