In [2]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.autograd import Variable

import pytorch_lightning as pl
from torchmetrics.functional import retrieval_normalized_dcg

In [3]:
cpu_count = os.cpu_count()
device = torch.device("cpu")
time_size = 10
embed_dim = 128
embed_max = 256
label_num = 527
top_label_num = 10

In [65]:
def df_to_tensor(data_names, top_label_num=10, top_label_idx=None):
    df = pd.DataFrame([])
    for data_name in data_names:
        add_df = pd.read_parquet(f'{data_name}.parquet')
        add_df = add_df[~add_df.isnull()]
        print(f"{data_name}_df shape: ",add_df.shape)
        add_df = add_df[add_df['audio_embedding'].apply(lambda x: len(x)) == time_size]
        df = pd.concat([df,add_df],axis=0)
        del add_df

    def label_converter(x):
        output = np.zeros(label_num,dtype=int)
        for label in x:
            output[label] = 1
        return output

    label_df = pd.DataFrame(np.vstack(df['labels'].apply(lambda x: label_converter(x))).reshape(-1,label_num))
    top_label_idx = list(label_df.sum(axis=0).nlargest(10).index)
    label_df = label_df[top_label_idx]
    label_df = label_df[label_df.sum(axis=1)>0]
    label_tensor = torch.Tensor(label_df.to_numpy())
    print(f"Total label shape: ",label_tensor.size())

    df['label'] = df['label'].apply(lambda x: x[top_label_idx])
    df = df[df['label'].apply(lambda x: sum(x))>0]

    embeddings = np.vstack(df['audio_embedding'].apply(lambda x: np.vstack(x))).reshape(-1,time_size,embed_dim)
    embedding_tensor = torch.Tensor(embeddings)
    print(f"Total embedding shape: ", embedding_tensor.size())
    
    assert embedding_tensor.shape[0] == label_tensor.shape[0], "Feature and label dim does not coincide!"
    
    return embedding_tensor/embed_max, label_tensor

In [69]:
embedding_all, label_all = df_to_tensor(['bal_train','eval'], top_label_num)

permute_idx = np.random.permutation(np.arange(embedding_all.shape[0]))
train_idx = permute_idx[:(permute_idx.shape[0]//5)*4]
test_idx = permute_idx[(permute_idx.shape[0]//5)*4:]
print(f"Train data Size: {len(train_idx)} , Test data Size : {len(test_idx)}")

train_embedding, train_label = embedding_all[train_idx], label_all[train_idx]
test_embedding, test_label = embedding_all[test_idx], label_all[test_idx]

bal_train_df shape:  (21782, 5)
eval_df shape:  (19976, 5)
Total label shape:  torch.Size([22582, 10])
Total embedding shape:  torch.Size([22582, 10, 128])
Train data Size: 18064 , Test data Size : 4518


In [70]:
train_set = TensorDataset(train_embedding, train_label)
test_set = TensorDataset(test_embedding, test_label)

In [71]:
bs = 16
train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=bs, shuffle=True, num_workers=cpu_count)
test_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=1, shuffle=False, num_workers=cpu_count)

In [72]:
class YoutubeAudioClassifier(nn.Module):
    def __init__(self, time_size, embed_dim, fc_dim, output_dim):
        super(YoutubeAudioClassifier, self).__init__()
        self.time_size = time_size
        self.embed_dim = embed_dim
        self.embed_dim1, self.embed_dim2 = self.split_embed_dim()
        self.fc_dim = fc_dim
        self.output_dim = output_dim
        
        self.intra_init_conv = nn.Sequential(
            nn.Conv2d(self.time_size, self.time_size//2, 3, padding=1),    # 10*16*8 -> 5*16*8
            nn.BatchNorm2d(self.time_size//2),
            nn.ReLU()
        )
        
        self.intra_stride = nn.Sequential(
            nn.Conv2d(self.time_size//2, self.time_size//4, 2, stride=2),  # 5*16*8 -> 2*8*4
            nn.BatchNorm2d(self.time_size//4),
            nn.ReLU()
        )
        self.intra_dim1_dil = nn.Sequential(
            nn.Conv2d(self.time_size//2, self.time_size//2, 3, dilation=(2,1)),  # 5*16*8 -> 5*12*6
            nn.BatchNorm2d(self.time_size//2),
            nn.ReLU(),
            nn.Conv2d(self.time_size//2, self.time_size//4, 3, dilation=(2,1)),  # 5*12*6 -> 2*8*4
            nn.BatchNorm2d(self.time_size//4),
            nn.ReLU()
        )
        self.intra_dim2_dil = nn.Sequential(
            nn.Conv2d(self.time_size//2, self.time_size//4, (2,3), dilation=(1,2), stride=(2,1)),  # 5*16*8 -> 2*8*4
            nn.BatchNorm2d(self.time_size//4),
            nn.ReLU()
        )
        
        
        self.inter_conv1 = nn.Sequential(
            nn.Conv2d(1, 4, (3,5), stride=(1,2)),  # 1*10*128 -> 4*8*62
            nn.BatchNorm2d(4),
            nn.ReLU(),
            nn.Conv2d(4, 8, (3,8), dilation=(1,3)),  # 4*8*62 -> 8*6*41
            nn.BatchNorm2d(8),
            nn.ReLU()
        )
        self.inter_max_conv = nn.Sequential(
            nn.MaxPool2d((2,6), stride=(2,3), padding=(1,0)),  # 1*10*128 -> 1*6*41
            nn.Conv2d(1, 8, 3, padding=1), # 1*6*41 -> 8*6*41
            nn.BatchNorm2d(8),
            nn.ReLU()
        )
        
        self.inter_conv2 = nn.Sequential(
            nn.Conv2d(8, 4, (3,3), stride=(1,2)),  # 8*6*41 -> 4*4*20
            nn.BatchNorm2d(4),
            nn.ReLU(),
            nn.Conv2d(4, 2, (3,5), dilation=(1,3), padding=(1,0)),  # 4*4*20 -> 2*4*8
            nn.BatchNorm2d(2),
            nn.ReLU()
        )
        
        self.combine_norm = nn.Sequential(
            nn.BatchNorm2d(2),
            nn.ReLU()
        )
        
        self.fc = nn.Linear(self.fc_dim, self.output_dim)  # 64 -> 10


    
    # For balanced Width * Height split of input data for intra blocks    
    def split_embed_dim(self):
        for i in reversed(np.arange(np.ceil(np.sqrt(self.embed_dim))+1)):
            if self.embed_dim % i == 0:
                return self.embed_dim // int(i) , int(i)
                break
    
    def forward(self, data):
        intra_data = data.view(-1, self.time_size, self.embed_dim1, self.embed_dim2)
        inter_data = data.view(-1, 1, self.time_size, self.embed_dim)
        
        intra_block1_out = self.intra_init_conv(intra_data)
        intra_block2_out = self.intra_stride(intra_block1_out) + self.intra_dim1_dil(intra_block1_out) + self.intra_dim2_dil(intra_block1_out)
        
        inter_block1_out = self.inter_conv1(inter_data) + self.inter_max_conv(inter_data)
        inter_block2_out = self.inter_conv2(inter_block1_out).transpose(-2,-1)
        
        cnn_out = self.combine_norm(intra_block2_out + inter_block2_out).view(-1,self.fc_dim)
        fc_out = self.fc(cnn_out)
        
        return F.softmax(fc_out,dim=1)
        

In [154]:
class YoutubeAudioClassifierLight(pl.LightningModule):
    def __init__(self, model, train_lossF, test_lossF):
        super().__init__()
        self.model = model
        self.train_lossF = train_lossF
        self.test_lossF = test_lossF

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.
        embedding, labels = batch
        train_loss = self.train_lossF(self.model(embedding), labels.bool())
        self.log("train_loss", train_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return train_loss
    
    def test_step(self, batch, batch_idx):
        # this is the test loop
        embedding, labels = batch
        test_loss = self.test_lossF(self.model(embedding), labels.bool())
        self.log("test_metric", test_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return test_loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        return optimizer

In [155]:
def define_loss(loss_type, Precision_k=1):
    if loss_type == 'msml':
        if device == torch.device("mps"):
            def multilabel_soft_margin_loss(input, target, weight = None, size_average = None, reduce = None, reduction = "mean"):
                # In MPS device, F.logsigmoid is not supported.
                # It is well-known that softplus(beta = -1) is identical to logsigmoid.
                loss = -(target * F.softplus(input, beta = -1) + (1 - target) * F.softplus(-input, beta = -1)) 

                if weight is not None:
                    loss = loss * weight

                class_dim = input.dim() - 1
                C = input.size(class_dim)
                loss = loss.sum(dim=class_dim) / C  # only return N loss values

                if reduction == "none":
                    ret = loss
                elif reduction == "mean":
                    ret = loss.mean()
                elif reduction == "sum":
                    ret = loss.sum()
                else:
                    ret = input
                    raise ValueError(reduction + " is not valid")
                return ret
            return multilabel_soft_margin_loss
        else:
            return nn.MultiLabelSoftMarginLoss()
    elif loss_type == 'ndcg':
        return retrieval_normalized_dcg
    elif loss_type == 'Precision@k':
        def Precision_at_k(input, target):
            topk_idx = torch.topk(input,Precision_k,dim=1).indices
            return torch.stack([(target[i][topk_idx[i]]).float() for i in range(target.shape[0])]).mean()
        return Precision_at_k
    elif loss_type == 'CE':
        def MultiCELoss(input, target):
            loss = nn.CrossEntropyLoss()
            nonzeros = target.nonzero()
            idx, label = nonzeros[:,0], nonzeros[:,1]
            norm_val = torch.bincount(idx)[idx]
            norm_input = input[idx] / torch.bincount(idx)[idx].view(-1,1)
            return loss(norm_input, label)
        return MultiCELoss
    else:
        raise ValueError("Unsupported Loss function")

In [160]:
YACLight = YoutubeAudioClassifierLight(YoutubeAudioClassifier(time_size, embed_dim, 64, top_label_num), define_loss("CE"),define_loss("Precision@k"))

In [161]:
if device == torch.device("mps"):
    trainer = pl.Trainer(accelerator="mps")
elif device == torch.device("cuda"):
    trainer = pl.Trainer(accelerator="gpu")
else:
    trainer = pl.Trainer(accelerator="cpu")

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [162]:
# Baseline precision (Before Training)
trainer.test(model=YACLight, dataloaders=test_loader)

Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_loss_epoch       0.060174934566020966
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss_epoch': 0.060174934566020966}]

In [None]:
trainer.fit(model=YACLight, train_dataloaders=train_loader)


  | Name  | Type                   | Params
-------------------------------------------------
0 | model | YoutubeAudioClassifier | 3.0 K 
-------------------------------------------------
3.0 K     Trainable params
0         Non-trainable params
3.0 K     Total params
0.012     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

In [159]:
trainer.test(model=YACLight, dataloaders=test_loader)

Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_loss_epoch        0.8976417183876038
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss_epoch': 0.8976417183876038}]