In [1]:
import gc
import os
import pickle
import random
import time
from collections import Counter, defaultdict
from functools import partial
from pathlib import Path
from psutil import cpu_count
from torchvision.models import resnet50, vgg19_bn, vgg11_bn
import librosa
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split,KFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from fastprogress import master_bar, progress_bar
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import transforms

In [2]:
# Sanity check: the training and prediction cells below call .cuda(), so a GPU must be visible.
torch.cuda.is_available()

True

### utils

In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Args:
      seed: integer seed applied to Python's `random`, the `PYTHONHASHSEED`
        environment variable, NumPy's global RNG and PyTorch (CPU and every
        CUDA device).
    """
    random.seed(seed)
    # Only influences hash randomisation of interpreters started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may select different kernels from run to run; keep it off
    # so the deterministic flag above is not undermined.
    torch.backends.cudnn.benchmark = False

# Single seed for the whole notebook, applied here before any data is loaded or any model is built.
SEED = 520
seed_everything(SEED)

In [4]:
# from official code https://colab.research.google.com/drive/1AgPdhSp7ttY18O3fEoHOQKlt_3HJDLi8#scrollTo=cRCaCIb9oguU
def _one_sample_positive_class_precisions(scores, truth):
    """Calculate precisions for each true class for a single sample.

    Args:
      scores: np.array of (num_classes,) giving the individual classifier scores.
      truth: np.array of (num_classes,) bools indicating which classes are true.

    Returns:
      pos_class_indices: np.array of indices of the true classes for this sample.
      pos_class_precisions: np.array of precisions corresponding to each of those
        classes.
    """
    num_classes = scores.shape[0]
    pos_class_indices = np.flatnonzero(truth > 0)
    # Only calculate precisions if there are some true classes.
    if not len(pos_class_indices):
        return pos_class_indices, np.zeros(0)
    # Retrieval list of classes for this sample.
    retrieved_classes = np.argsort(scores)[::-1]
    # class_rankings[top_scoring_class_index] == 0 etc.
    class_rankings = np.zeros(num_classes, dtype=np.int)
    class_rankings[retrieved_classes] = range(num_classes)
    # Which of these is a true label?
    retrieved_class_true = np.zeros(num_classes, dtype=np.bool)
    retrieved_class_true[class_rankings[pos_class_indices]] = True
    # Num hits for every truncated retrieval list.
    retrieved_cumulative_hits = np.cumsum(retrieved_class_true)
    # Precision of retrieval list truncated at each hit, in order of pos_labels.
    precision_at_hits = (
            retrieved_cumulative_hits[class_rankings[pos_class_indices]] /
            (1 + class_rankings[pos_class_indices].astype(np.float)))
    return pos_class_indices, precision_at_hits


def calculate_per_class_lwlrap(truth, scores):
    """Compute the per-class label-weighted label-ranking average precision.

    Arguments:
      truth: np.array of (num_samples, num_classes); entries > 0 mark the
        classes present in each sample.
      scores: np.array of (num_samples, num_classes) holding the real-valued
        classifier score of every class for every sample.

    Returns:
      per_class_lwlrap: np.array of (num_classes,) with the lwlrap of each
        class (0 for classes that have no positive label).
      weight_per_class: np.array of (num_classes,) with each class's share of
        all positive labels.  The overall lwlrap is
        np.sum(per_class_lwlrap * weight_per_class).
    """
    assert truth.shape == scores.shape
    num_samples, num_classes = scores.shape

    # One precision per (sample, class); only cells of true classes get filled.
    precision_table = np.zeros((num_samples, num_classes))
    for row in range(num_samples):
        hit_columns, hit_precisions = _one_sample_positive_class_precisions(
            scores[row, :], truth[row, :])
        precision_table[row, hit_columns] = hit_precisions

    positives_per_class = np.sum(truth > 0, axis=0)
    total_positives = float(np.sum(positives_per_class))
    weight_per_class = positives_per_class / total_positives

    # Column mean over the labelled cells only; the maximum() guards classes
    # that never occur so they yield 0 instead of dividing by zero.
    column_sums = np.sum(precision_table, axis=0)
    per_class_lwlrap = column_sums / np.maximum(1, positives_per_class)
    return per_class_lwlrap, weight_per_class

In [5]:
# Show which files ship with the extra "argument" dataset
# (presumably augmented clips and their label CSVs - confirm with the dataset author).
os.listdir('../input/argument-data')

['label_argument_clear_large.csv',
 'label_argument_clear.csv',
 'argument_multi.pickle',
 'argument_label_multi.csv',
 'argument.pickle',
 'argument_large.pickle',
 'argument_clear.pickle',
 'argument_clear_large.pickle',
 'argument_label_large.csv',
 'argument_larger.pickle',
 'argument_label_larger.csv',
 'argument_label.csv']

### dataset

In [6]:
# Root folders of the competition data and of the precomputed mel-spectrogram pickles.
dataset_dir = Path('../input/freesound-audio-tagging-2019')
preprocessed_dir = Path('../input/fat2019_prep_mels1')

In [7]:
# Label / submission CSV files. Note that 'argument' is a plain string path while
# the other entries are Path objects; both forms are passed to pd.read_csv below.
csvs = {
    'train_curated': dataset_dir / 'train_curated.csv',
    'train_noisy': dataset_dir / 'train_noisy.csv',
    'sample_submission': dataset_dir / 'sample_submission.csv',
    'argument': '../input/argument-data/label_argument_clear_large.csv'
}

# Directories under the competition root, one per data split.
dataset = {
    'train_curated': dataset_dir / 'train_curated',
    'train_noisy': dataset_dir / 'train_noisy',
    'test': dataset_dir / 'test',
}

# Precomputed mel-spectrogram pickles, one per split.
mels = {
    'train_curated': preprocessed_dir / 'mels_train_curated.pkl',
    'train_noisy': preprocessed_dir / 'mels_trn_noisy_best50s.pkl',
    'test': preprocessed_dir / 'mels_test.pkl',  # NOTE: this data doesn't work at 2nd stage
}

In [8]:
# Load the label tables. train_df stacks the curated rows first and the noisy rows
# after them with a fresh 0..n-1 index; the spectrogram list x_train built later in
# this notebook is assembled in the same curated-then-noisy order.
train_curated = pd.read_csv(csvs['train_curated'])
train_noisy   = pd.read_csv(csvs['train_noisy'])
train_argument = pd.read_csv(csvs['argument'])
train_df      = pd.concat([train_curated, train_noisy], sort=True, ignore_index=True)
train_df.head()

Unnamed: 0,fname,labels
0,0006ae4e.wav,Bark
1,0019ef41.wav,Raindrop
2,001ec0ad.wav,Finger_snapping
3,0026c7cb.wav,Run
4,0026f116.wav,Finger_snapping


In [9]:
# The sample submission is read only for its header, which provides the class list used below.
test_df = pd.read_csv(csvs['sample_submission'])

In [10]:
# Class names = every submission column after the first one
# (the first column is presumably the file name - confirm against sample_submission.csv).
labels = test_df.columns[1:].tolist()

In [11]:
# Number of target classes; sets the width of the multi-hot label matrices.
num_classes = len(labels)

In [12]:
# Multi-hot encode the comma-separated label strings: one row per clip, one
# column per class (column order follows `labels`).
y_train = np.zeros((len(train_df), num_classes)).astype(int)
y_train_argument = np.zeros((len(train_argument), num_classes)).astype(int)

for target_matrix, label_frame in ((y_train, train_df), (y_train_argument, train_argument)):
    for row_pos, tag_list in enumerate(label_frame['labels'].str.split(',')):
        for tag in tag_list:
            target_matrix[row_pos, labels.index(tag)] = 1

print(y_train.shape, y_train_argument.shape)


(24785, 80) (5695, 80)


In [13]:
# Load the precomputed spectrograms. NOTE(review): pickle.load executes arbitrary
# code embedded in the file, so these pickles must come from a trusted source.
with open('../input/all-data/train_curated.pickle', 'rb') as curated:
    x_train = pickle.load(curated)

# The noisy set is stored in five shards named train_noisy_1 .. train_noisy_5;
# they are appended after the curated clips, in shard order.
for i in range(5):
    with open('../input/all-data/train_noisy_{}.pickle'.format(i+1), 'rb') as noisy:
        x_train.extend(pickle.load(noisy))

# Extra "argument" samples; train_model appends them to the training split of each fold.
with open('../input/argument-data/argument_clear_large.pickle', 'rb') as argument:
    x_train_argument = pickle.load(argument)

# Displayed so the counts can be compared with the label matrix shapes printed above.
len(x_train), len(x_train_argument)

(24785, 5695)

In [14]:
class FATTrainDataset(Dataset):
    """Dataset yielding one random square crop of a spectrogram image plus its label.

    Args:
      mels: indexable collection of arrays that `Image.fromarray(..., mode='RGB')`
        can convert.
      labels: indexable collection of NumPy label vectors aligned with `mels`.
      transforms: callable applied to the cropped PIL image; its result must
        support the in-place `div_` call (i.e. be a tensor).
    """

    def __init__(self, mels, labels, transforms):
        super().__init__()
        self.mels, self.labels = mels, labels
        self.transforms = transforms

    def __len__(self):
        return len(self.mels)

    def __getitem__(self, idx):
        spectrogram = Image.fromarray(self.mels[idx], mode='RGB')
        # PIL reports (width, height): width is the time axis, height the crop side.
        width, side = spectrogram.size
        # Random window of `side` x `side` pixels along time (the original
        # comment describes this as a 1 second crop).
        left = random.randint(0, width - side)
        window = spectrogram.crop([left, 0, left + side, side])
        # NOTE(review): the pipelines in this notebook use ToTensor, which already
        # rescales to [0, 1]; the extra division is kept because existing
        # checkpoints were trained with it - confirm it is intended.
        pixels = self.transforms(window).div_(255)

        target = torch.from_numpy(self.labels[idx]).float()
        return pixels, target

In [15]:
class FATTestDataset(Dataset):
    """Test-time-augmentation dataset: every file is served `tta` times, each with a fresh random crop.

    Args:
      fnames: indexable collection of file names.
      mels: indexable collection of arrays aligned with `fnames`, convertible
        by `Image.fromarray(..., mode='RGB')`.
      transforms: callable applied to the cropped PIL image; its result must
        support the in-place `div_` call (i.e. be a tensor).
      tta: how many times each file is repeated in the dataset.
    """

    def __init__(self, fnames, mels, transforms, tta=5):
        super().__init__()
        self.fnames, self.mels = fnames, mels
        self.transforms = transforms
        self.tta = tta

    def __len__(self):
        return self.tta * len(self.fnames)

    def __getitem__(self, idx):
        # Indices wrap around the file list, so idx and idx + len(fnames)
        # address the same file.
        file_pos = idx % len(self.fnames)

        spectrogram = Image.fromarray(self.mels[file_pos], mode='RGB')
        # PIL reports (width, height): width is the time axis, height the crop side.
        width, side = spectrogram.size
        left = random.randint(0, width - side)
        window = spectrogram.crop([left, 0, left + side, side])
        pixels = self.transforms(window).div_(255)

        return pixels, self.fnames[file_pos]

In [16]:
# Image pipelines applied after the random crop done inside the datasets.
transforms_dict = {
    # Training: left-right flip with probability 0.5, then conversion to a tensor.
    # The datasets treat image width as the time axis, so the flip reverses time.
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(0.5),
        transforms.ToTensor(),
    ]),
    # Test: tensor conversion only.
    'test': transforms.Compose([
        transforms.ToTensor(),
    ]),
}

### model

In [17]:
class Gate_ConvBlock(nn.Module):
    """Two conv -> batch-norm stages, each self-gated as ``z * sigmoid(z)``, followed by 2x2 max-pooling.

    Args:
      in_channels: number of channels of the incoming feature map.
      out_channels: number of channels produced by both convolutions.
      drop_rate: dropout probability applied after pooling; values <= 0
        disable dropout.
    """

    def __init__(self, in_channels, out_channels, drop_rate=0.0):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay stable
        # for previously saved checkpoints to load.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.drop_rate = drop_rate
        self._init_weights()

    def _init_weights(self):
        """Kaiming-normal conv weights, zero biases, unit batch-norm scale."""
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm2d):
                nn.init.constant_(layer.weight, 1)
                nn.init.zeros_(layer.bias)
                continue
            if not isinstance(layer, nn.Conv2d):
                continue
            nn.init.kaiming_normal_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        hidden = x
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            pre_gate = norm(conv(hidden))
            # Self-gating: the normalised activation is scaled by its own sigmoid.
            hidden = pre_gate * torch.sigmoid(pre_gate)
        pooled = self.pool(hidden)

        if self.drop_rate > 0:
            return F.dropout(pooled, p=self.drop_rate, training=self.training)
        return pooled
        
class CLDNN(nn.Module):
    """Convolutional front-end, bidirectional GRU over the time axis, linear head with 80 outputs.

    The GRU input size is fixed at 512 * 4, so after the five pooling stages the
    feature map must be 4 rows high, i.e. the network expects 3-channel inputs
    with a height of 128 pixels.
    """

    def __init__(self):
        super().__init__()
        # "prepreocess" (sic) is kept as-is: the attribute name is part of the
        # state_dict keys of already saved checkpoints.
        self.prepreocess = nn.Sequential(nn.BatchNorm2d(3), nn.ReLU())
        # Each block halves height and width; the blocks use their default
        # drop_rate, so no dropout is active inside the convolutional stack.
        channel_plan = [(3, 64), (64, 128), (128, 256), (256, 512), (512, 512)]
        self.cnn = nn.Sequential(*[Gate_ConvBlock(c_in, c_out) for c_in, c_out in channel_plan])
        self.gru = nn.GRU(512*4, 512, batch_first=True, bidirectional=True)
        self.relu = nn.ReLU()
        self.fc = nn.Sequential(nn.Dropout(0.5), nn.Linear(1024, 80))
        self._init_weights()

    def _init_weights(self):
        """Re-initialise every conv, linear and batch-norm layer, including those nested in the blocks."""
        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight)
                if layer.bias is not None:
                    nn.init.zeros_(layer.bias)
            elif isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
                if layer.bias is not None:
                    nn.init.zeros_(layer.bias)
            elif isinstance(layer, nn.BatchNorm2d):
                nn.init.constant_(layer.weight, 1)
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        feats = self.cnn(self.prepreocess(x))
        # Fold channels and height into one feature axis and use width (time)
        # as the sequence axis: (N, C, H, W) -> (N, C*H, W) -> (N, W, C*H).
        merged = feats.view(feats.size(0), feats.size(1)*feats.size(2), feats.size(-1))
        sequence = merged.permute((0, 2, 1))
        encoded, _ = self.gru(sequence)  # (N, W, 1024): both GRU directions concatenated
        # Average the rectified GRU outputs over the time steps.
        pooled = torch.mean(self.relu(encoded), dim=1)
        flat = pooled.view(pooled.size(0), -1)  # (N, 1024)
        return self.fc(flat)  # (N, 80) logits

In [18]:
class SoftBootstrappingLoss(nn.Module):
    """Soft-bootstrapping loss computed on logits.

    https://arxiv.org/pdf/1901.01189.pdf
    Loss(t, p) = - (beta * t + (1 - beta) * p) * log(p)
    The idea is to pay less attention to the noisy labels in favour of the
    model's own predictions, which become more reliable as training progresses.

    In this implementation the label term is the full binary cross-entropy
    with logits, summed over classes and scaled by beta; the prediction term
    is -(1 - beta) * sum(p * log(p)) with p = sigmoid(logit).

    Args:
      beta: weight of the label term; (1 - beta) weights the prediction term.
      reduce: if True return the batch mean, otherwise one loss per sample.
    """

    def __init__(self, beta=0.95, reduce=True):
        super().__init__()
        self.beta = beta
        self.reduce = reduce

    def forward(self, input, target):
        # Label term: beta * sum over classes of BCE(logit, target).
        elementwise_bce = F.binary_cross_entropy_with_logits(input, target, reduction='none')
        label_term = self.beta * torch.sum(elementwise_bce, dim=1)

        # Prediction term: -(1 - beta) * sum over classes of p * log(p).
        p_log_p = torch.sigmoid(input) * F.logsigmoid(input)
        prediction_term = - (1.0 - self.beta) * torch.sum(p_log_p, dim=1)

        per_sample = label_term + prediction_term
        return torch.mean(per_sample) if self.reduce else per_sample
    

### train

In [19]:
def train_model(x_train, y_train, train_index, val_index, n_fold, train_transforms, enable_checkpoint_ensemble=False):
    """Train one CLDNN fold and checkpoint the epoch with the best validation lwlrap.

    The training split is extended with the module-level `x_train_argument` /
    `y_train_argument` samples; the validation split is not.

    Args:
      x_train: indexable collection of per-clip arrays accepted by `FATTrainDataset`.
      y_train: np.array of (num_samples, num_classes) multi-hot labels.
      train_index: indices into `x_train` / `y_train` used for training.
      val_index: indices into `x_train` / `y_train` used for validation.
      n_fold: fold identifier; only used in the checkpoint file name
        `weight_best_{n_fold}.pt`.
      train_transforms: transform pipeline applied to BOTH the training and the
        validation dataset, so validation scores include its random
        augmentation and the random crop.
      enable_checkpoint_ensemble: accepted for backward compatibility; unused.

    Returns:
      dict with `best_epoch` (1-based; -1 if no epoch scored above 0) and
      `best_lwlrap`.
    """
    num_epochs = 120
    batch_size = 128
    test_batch_size = 128
    lr = 1e-3
    eta_min = 1e-5
    t_max = 5

    num_classes = y_train.shape[1]

    x_trn = [x_train[idx] for idx in train_index] + x_train_argument
    x_val = [x_train[idx] for idx in val_index]
    y_trn = np.asarray([y_train[idx] for idx in train_index])
    y_trn = np.vstack((y_trn, y_train_argument))
    y_val = [y_train[idx] for idx in val_index]

    train_dataset = FATTrainDataset(x_trn, y_trn, train_transforms)
    valid_dataset = FATTrainDataset(x_val, y_val, train_transforms)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=test_batch_size, shuffle=False)

    model = CLDNN().cuda()
    criterion = SoftBootstrappingLoss(beta=0.3).cuda()

    # The small L2 penalty (weight_decay=1e-6) follows "An Effective Label Noise
    # Model for DNN Text Classification" (https://arxiv.org/abs/1903.07507), which
    # reports that under low label noise a small penalty works better than a
    # large one, while no regularisation at all hurts classification performance.
    optimizer = Adam(params=model.parameters(), lr=lr, amsgrad=False, weight_decay=1e-6)
    scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=eta_min)

    best_epoch = -1
    best_lwlrap = 0.

    for epoch in range(num_epochs):
        start_time = time.time()
        model.train()
        avg_loss = 0.

        for x_batch, y_batch in train_loader:
            preds = model(x_batch.cuda())
            loss = criterion(preds, y_batch.cuda())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            avg_loss += loss.item() / len(train_loader)

        model.eval()
        valid_preds = np.zeros((len(x_val), num_classes))
        avg_val_loss = 0.

        # No gradients are needed for validation; skipping graph construction
        # saves GPU memory and time without changing the computed values.
        with torch.no_grad():
            for i, (x_batch, y_batch) in enumerate(valid_loader):
                preds = model(x_batch.cuda())
                loss = criterion(preds, y_batch.cuda())
                preds = torch.sigmoid(preds)
                valid_preds[i * test_batch_size: (i+1) * test_batch_size] = preds.cpu().numpy()

                avg_val_loss += loss.item() / len(valid_loader)

        score, weight = calculate_per_class_lwlrap(np.asarray(y_val), valid_preds)
        lwlrap = (score * weight).sum()

        # The cosine schedule advances once per epoch.
        scheduler.step()

        elapsed = time.time() - start_time
        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  val_lwlrap: {lwlrap:.6f}  time: {elapsed:.0f}s')

        # Keep only the weights of the best-scoring epoch on disk.
        if lwlrap > best_lwlrap:
            best_epoch = epoch + 1
            best_lwlrap = lwlrap
            torch.save(model.state_dict(), 'weight_best_{}.pt'.format(n_fold))

    return {
        'best_epoch': best_epoch,
        'best_lwlrap': best_lwlrap,
    }

In [20]:
# Precomputed fold assignment: the `fold` column is used below to split train/validation rows.
train_s = pd.read_csv('../input/train-stratified/train_stratified.csv')

In [21]:
# Folds to train in this run (only fold 0 here).
# NOTE(review): the row index of train_s is used directly as an index into
# x_train / y_train, so train_s is assumed to be in the same row order - confirm.
FOLD_NO = [0]
for FOLD in FOLD_NO:
    val_index = train_s[train_s.fold == FOLD].index
    tra_index = train_s[train_s.fold != FOLD].index
    result = train_model(x_train, y_train, tra_index, val_index, FOLD, transforms_dict['train'])
    print(result)

Epoch 1 - avg_train_loss: 3.8355  avg_val_loss: 2.9226  val_lwlrap: 0.076550  time: 142s
Epoch 2 - avg_train_loss: 3.2111  avg_val_loss: 2.7788  val_lwlrap: 0.150974  time: 141s
Epoch 3 - avg_train_loss: 2.8954  avg_val_loss: 2.6036  val_lwlrap: 0.248914  time: 141s
Epoch 4 - avg_train_loss: 2.6832  avg_val_loss: 2.5086  val_lwlrap: 0.297426  time: 141s
Epoch 5 - avg_train_loss: 2.5343  avg_val_loss: 2.4348  val_lwlrap: 0.338809  time: 141s
Epoch 6 - avg_train_loss: 2.4300  avg_val_loss: 2.3568  val_lwlrap: 0.370708  time: 141s
Epoch 7 - avg_train_loss: 2.3912  avg_val_loss: 2.3462  val_lwlrap: 0.380122  time: 141s
Epoch 8 - avg_train_loss: 2.3816  avg_val_loss: 2.3415  val_lwlrap: 0.382427  time: 141s
Epoch 9 - avg_train_loss: 2.3714  avg_val_loss: 2.3510  val_lwlrap: 0.377465  time: 141s
Epoch 10 - avg_train_loss: 2.3428  avg_val_loss: 2.3604  val_lwlrap: 0.371075  time: 141s
Epoch 11 - avg_train_loss: 2.2768  avg_val_loss: 2.4301  val_lwlrap: 0.347978  time: 141s
Epoch 12 - avg_trai

### predict

In [22]:
def random_predict_model(test_fnames, x_test, test_transforms, num_classes, n_fold, tta=35):
    """Predict class probabilities for the test clips, averaged over random-crop TTA.

    Args:
      test_fnames: file names, aligned with `x_test`.
      x_test: per-clip arrays accepted by `FATTestDataset`.
      test_transforms: transform pipeline applied to each cropped image.
      num_classes: number of output columns; they are named '0' .. str(num_classes - 1).
      n_fold: selects the checkpoint file `weight_best_{n_fold}.pt`.
      tta: number of random crops scored per clip.

    Returns:
      pd.DataFrame indexed by file name (one row per distinct name, in the
      sorted order produced by groupby) holding the mean sigmoid score per class.
    """
    batch_size = 256

    test_dataset = FATTestDataset(test_fnames, x_test, test_transforms, tta=tta)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = CLDNN().cuda()
    model.load_state_dict(torch.load('../input/feature-level-attention/weight_best_{}.pt'.format(n_fold)))
    model.eval()

    all_outputs, all_fnames = [], []

    # Inference only: disable autograd so no graph is built for any batch.
    with torch.no_grad():
        for images, fnames in progress_bar(test_loader):
            preds = torch.sigmoid(model(images.cuda()))
            all_outputs.append(preds.cpu().numpy())
            all_fnames.extend(fnames)

    test_preds = pd.DataFrame(data=np.concatenate(all_outputs),
                              index=all_fnames,
                              columns=[str(col) for col in range(num_classes)])
    # Each file name occurs `tta` times in the index; average its rows.
    return test_preds.groupby(level=0).mean()