# Requirements

In [None]:
!ls

sample_data


In [None]:
!nvidia-smi

Tue Dec  1 21:01:25 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Scripts

In [None]:
%%writefile init.sh

# Python dependencies used by the notebook (audio I/O, training helpers,
# torch-based spectrogram layers).
pip install soundfile catalyst -q
pip install torchlibrosa -q

# -p: create missing parents and do not fail when the folders already exist.
mkdir -p data/ data/raw/

# NOTE(review): the three archives are expected to already be in data/raw/;
# nothing in this script puts them there.
unzip -q data/raw/audio_files.zip -d data/
unzip -q data/raw/AdditionalUtterances.zip -d data/
unzip -q data/raw/nlp_keywords_29Oct2020.zip -d data/

Writing init.sh


In [None]:
%%writefile init.py

import os, sys, gc, glob
import argparse
import pandas as pd
import numpy as np


# Command-line options of init.py. Both values are used as plain string
# prefixes by preprocessing(), so they must end with a path separator.
parser = argparse.ArgumentParser()

# Folder that receives the generated CSV files.
parser.add_argument('-prefix', type=str, default='./data/here/', help="postprocess folder")
# Folder that holds Train.csv and SampleSubmission.csv.
parser.add_argument('-data', type=str, default='data/raw/', help="where to find the zipfiles")


def preprocessing(args):
	"""Write the CSV index files used by the notebook into `args.prefix`.

	Files written:
	  * AddTrain.csv  - one row per extra .wav found under data/latest_keywords
	    and data/nlp_keywords; the target is the name of the parent folder.
	  * BaseTrain.csv - `args.data + 'Train.csv'` with `fn` prefixed by 'data/'
	    and the `label` column renamed to `target`.
	  * Train.csv     - BaseTrain rows followed by AddTrain rows.
	  * SampleSubmission.csv - the sample submission with `fn` prefixed by
	    'data/' and a `bname` column placed right after `fn`.

	`args.prefix` and `args.data` are concatenated with file names as plain
	strings, so both need a trailing slash.
	"""
	def base_name(parts, folder_pos=-3):
		# "<folder>_<file name without its last 4 characters>", lower-cased.
		folder, stem = parts[folder_pos], parts[-1][:-4]
		return '_'.join([folder, stem]).lower()

	extra_files = glob.glob("data/latest_keywords/*/*.wav")
	extra_files += glob.glob("data/nlp_keywords/*/*.wav")

	os.makedirs(args.prefix, exist_ok=True)

	# Additional utterances.
	add = pd.DataFrame({'fn': extra_files})
	add['bname'] = add['fn'].apply(lambda p: base_name(p.split('/')))
	add['type'] = 'add'
	add['target'] = add['fn'].apply(lambda p: p.split('/')[-2])
	add.to_csv(args.prefix + 'AddTrain.csv', index=False)

	# Base training set.
	train = pd.read_csv(args.data + 'Train.csv')
	train['bname'] = train['fn'].apply(lambda p: base_name(p.split('/'), -2))
	train['type'] = 'base'
	train['fn'] = 'data/' + train['fn']
	train = train.rename(columns={'label': 'target'})
	train.to_csv(args.prefix + 'BaseTrain.csv', index=False)

	# Base + additional rows.
	merged = pd.concat([train, add], axis=0)
	merged.to_csv(args.prefix + 'Train.csv', index=False)

	# Sample submission: move the freshly added last column (bname) next to fn.
	subs = pd.read_csv(args.data + 'SampleSubmission.csv')
	subs['fn'] = 'data/' + subs['fn']
	subs['bname'] = subs['fn'].apply(lambda p: base_name(p.split('/'), -2))
	names = subs.columns.tolist()
	ordered = [names[0], names[-1]] + names[1:-1]
	subs[ordered].to_csv(args.prefix + 'SampleSubmission.csv', index=False)


# Script entry point: parse the command-line options and build the CSV files.
if __name__ == '__main__':
	args = parser.parse_args()
	preprocessing(args)

Writing init.py


## Data Download

In [None]:
!chmod +x init.sh
!./init.sh

In [None]:
!python init.py

# Imports

In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
import os
import gc
import sys
import h5py
import cv2
import glob
import math
import random
import librosa
import zipfile
import numpy as np
import pandas as pd
from librosa import display as libdisplay
from tqdm.notebook import tqdm

In [None]:
import torchlibrosa
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation

In [None]:
from torchvision import models

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import log_loss

In [None]:
from catalyst.contrib.nn.criterion import FocalLossMultiClass

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from keras.utils import to_categorical

In [None]:
import IPython.display as ipd
from matplotlib import pyplot as plt

# Environment

In [None]:
path = 'data/'

In [None]:
seed = 1999

In [None]:
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

if torch.cuda.is_available(): 
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
os.makedirs('MODELS/', exist_ok=True)

In [None]:
# # Placeholder folders for the training and test spectrogram images.
# # They will store the spectrograms that we will generate shortly.
# os.makedirs('Imgs/Train/', exist_ok=True)
# os.makedirs('Imgs/Test/', exist_ok=True)

# Utilities

## Basic functions

In [None]:
from joblib import Parallel, delayed
import multiprocessing
import time

In [None]:
def pad_or_truncate(x, audio_length):
  """Return `x` with exactly `audio_length` samples.

  Inputs longer than `audio_length` are cut to their first `audio_length`
  samples. Shorter or equal-length inputs get zeros appended at the end.
  """
  missing = audio_length - len(x)
  if missing < 0:
    return x[:audio_length]

  # np.zeros defaults to float64; the result dtype follows NumPy promotion.
  tail = np.zeros(missing)
  return np.concatenate((x, tail), axis=0)

def load_hdf5(hdf5_path):
  """Open an HDF5 pack read-only and return its contents.

  Returns:
    (audio_name, waveform, target, hf): `audio_name` and `target` are read
    fully and converted to Python lists; `waveform` is whatever
    `hf['waveform']` returns (it is not sliced here); `hf` is the still-open
    file object, which the caller is responsible for closing.
  """
  hf = h5py.File(hdf5_path, 'r')
  names = hf['audio_name'][:].tolist()
  targets = hf['target'][:].tolist()
  return names, hf['waveform'], targets, hf

def load_npy(npy_path):
  """Load and return the array stored at `npy_path` with `np.load`."""
  array = np.load(npy_path)
  return array

def pack_waveforms_to_npy(npy_path, df, sr=44100, secs=3):
  """Load every clip listed in `df`, fix its length and save them as one .npy file.

  Packing all waveforms into a single array speeds up loading and training.

  Args:
    npy_path: destination handed to `np.save`.
    df: DataFrame with an `fn` column holding one audio file path per row.
    sr: sampling rate passed to `librosa.core.load`.
    secs: clip length in seconds; every clip is padded or truncated to
      `sr*secs` samples.

  Returns:
    float32 array of shape (len(df), sr*secs), rows in the order of `df`.
    A row whose file does not exist is reported on stdout and left as zeros.
  """
  # Arguments & parameters
  clip_samples = sr*secs
  audios_num = len(df)

  total_time = time.time()

  # Zero-initialised (not np.empty) so a missing file yields silence instead
  # of whatever bytes happened to be in the freshly allocated buffer.
  wavs = np.zeros((audios_num, clip_samples), dtype=np.float32)

  # Iterate by position so the row number always matches the row in `wavs`,
  # whatever index `df` carries. A plain loop is used because every clip is
  # written into `wavs` in place, which needs the work to run in this process.
  audio_paths = df['fn'].tolist()
  for n, audio_path in enumerate(tqdm(audio_paths)):
    if os.path.isfile(audio_path):
      (audio, _) = librosa.core.load(audio_path, sr=sr, mono=True)
      wavs[n] = pad_or_truncate(audio, clip_samples)
    else:
      print('{} File does not exist! {}'.format(n, audio_path))

  np.save(npy_path, wavs)

  print('Write to {}'.format(npy_path))
  print('Pack npy time: {:.3f}'.format(time.time() - total_time))

  return wavs

## Blocks functions

In [None]:
def init_layer(layer):
  """Xavier-uniform initialise `layer.weight` and zero its bias when it has one."""
  nn.init.xavier_uniform_(layer.weight)

  # Covers both "no bias attribute" and "bias attribute set to None".
  bias = getattr(layer, 'bias', None)
  if bias is not None:
    bias.data.fill_(0.)
            
def init_bn(bn):
  """Reset a batch-norm layer's affine parameters: weight to 1, bias to 0."""
  bn.weight.data.fill_(1.)
  bn.bias.data.fill_(0.)

class AttBlock(nn.Module):
  """Attention pooling over the last (time) axis.

  Two 1x1 convolutions map `in_features` channels to `out_features`: `att`
  produces attention weights (tanh followed by a softmax over the last axis)
  and `cla` produces per-step outputs. The pooled output is the
  attention-weighted sum of the per-step outputs.

  Args:
    in_features: number of input channels.
    out_features: number of output channels.
    activation: 'linear' (identity) or 'sigmoid', applied to the `cla` output.
    temperature: stored on the module; not used by `forward`.
  """
  def __init__(self, in_features: int, out_features: int, activation="linear", temperature=1.0):
    super().__init__()

    self.activation = activation
    self.temperature = temperature
    self.att = nn.Conv1d(
        in_channels=in_features,
        out_channels=out_features,
        kernel_size=1,
        stride=1,
        padding=0,
        bias=True)
    self.cla = nn.Conv1d(
        in_channels=in_features,
        out_channels=out_features,
        kernel_size=1,
        stride=1,
        padding=0,
        bias=True)

    # Not used in forward(); kept so the module's state_dict keys do not change.
    self.bn_att = nn.BatchNorm1d(out_features)
    self.init_weights()

  def init_weights(self):
    """Initialise both convolutions and the batch-norm layer."""
    init_layer(self.att)
    init_layer(self.cla)
    init_bn(self.bn_att)

  def forward(self, x):
    """Pool `x` of shape (n_samples, n_in, n_time).

    Returns:
      (pooled, norm_att, cla): `pooled` is (n_samples, out_features);
      `norm_att` and `cla` are (n_samples, out_features, n_time).
    """
    # x: (n_samples, n_in, n_time)
    norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
    cla = self.nonlinear_transform(self.cla(x))
    x = torch.sum(norm_att * cla, dim=2)
    return x, norm_att, cla

  def nonlinear_transform(self, x):
    """Apply the configured activation to `x`.

    Raises:
      ValueError: if `self.activation` is neither 'linear' nor 'sigmoid'.
        Returning None here would only surface later as an obscure error
        inside forward().
    """
    if self.activation == 'linear':
      return x
    if self.activation == 'sigmoid':
      return torch.sigmoid(x)
    raise ValueError(
        "Unsupported activation {!r}; expected 'linear' or 'sigmoid'".format(self.activation))

In [None]:
def get_model(config):
  """Build a PANNsResnetAtt, passing the entries of `config` as keyword arguments."""
  model = PANNsResnetAtt(**config)
  return model

## ResNet

In [None]:
class PANNsResnetAtt(nn.Module):
  """Waveform classifier: log-mel front end, torchvision ResNet trunk, attention pooling.

  Args:
    sample_rate: sampling rate given to the log-mel filter bank.
    window_size: STFT `n_fft` and `win_length`.
    hop_size: STFT hop length.
    mel_bins: number of mel bands (also the size of the input batch norm).
    fmin, fmax: frequency range of the mel filter bank.
    classes_num: number of output classes.
    arch: name of a constructor in `torchvision.models` (e.g. 'resnet34').
    fc: input size of `fc1`; it has to equal the channel count produced by
      the trunk's last stage (512 in the notebook's config for resnet34 -
      verify when changing `arch`).
    apply_aug: enable SpecAugment (and mixup, when requested) in training mode.
    top_db: forwarded to the log-mel filter bank.
    **args: extra keyword arguments are accepted and ignored, so a config
      dict may carry unrelated keys.
  """
  def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, 
               classes_num, arch='resnet34', fc=512, apply_aug=True, top_db=None, **args):
    super(PANNsResnetAtt, self).__init__()
    
    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10

    self.interpolate_ratio = 32  # Downsampled ratio
    self.apply_aug = apply_aug

    # Spectrogram extractor
    self.spectrogram_extractor = Spectrogram(
        n_fft=window_size,
        hop_length=hop_size,
        win_length=window_size,
        window=window,
        center=center,
        pad_mode=pad_mode,
        freeze_parameters=True)

    # Logmel feature extractor
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate,
        n_fft=window_size,
        n_mels=mel_bins,
        fmin=fmin,
        fmax=fmax,
        ref=ref,
        amin=amin,
        top_db=top_db,
        freeze_parameters=True)

    # Spec augmenter
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2)

    self.bn0 = nn.BatchNorm2d(mel_bins)

    att_size = 1024

    self.fc1 = nn.Linear(fc, att_size, bias=True)
    self.att_block = AttBlock(att_size, classes_num, activation='linear')

    # Only the convolutional stages of the ResNet are used; its own average
    # pooling and classification layer are left out.
    resnet = getattr(models, arch)(pretrained=True, progress=False)
    self.resnet_features = nn.Sequential(
        resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool,
        resnet.layer1, resnet.layer2, resnet.layer3, resnet.layer4
    )

    self.init_weight()

  def init_weight(self):
    """Initialise the layers added on top of the pretrained trunk."""
    init_bn(self.bn0)
    init_layer(self.fc1)
      
  def cnn_feature_extractor(self, x):
    """Run `x` through the ResNet convolutional stages."""
    x = self.resnet_features(x)
    return x
  
  def preprocess(self, input, mixup_lambda=None):
    """Turn a batch of waveforms into a normalised (and optionally augmented) log-mel batch.

    Returns:
      (x, frames_num): `x` is (batch_size, 1, time_steps, mel_bins) and
      `frames_num` is its number of time steps.
    """
    x = self.spectrogram_extractor(input)  # (batch_size, 1, time_steps, freq_bins)
    x = self.logmel_extractor(x)  # (batch_size, 1, time_steps, mel_bins)

    frames_num = x.shape[2]

    # Move the mel axis to the channel position so bn0 normalises per mel bin.
    x = x.transpose(1, 3)
    x = self.bn0(x)
    x = x.transpose(1, 3)

    if self.training and self.apply_aug:
        x = self.spec_augmenter(x)

    # Mixup on spectrogram
    # NOTE(review): do_mixup is not defined in the visible part of this
    # notebook; it must be in scope before mixup_lambda is used in training.
    if self.training  and self.apply_aug and mixup_lambda is not None:
        x = do_mixup(x, mixup_lambda)

    return x, frames_num
      
  def forward(self, input, mixup_lambda=None):
    """Classify a batch of waveforms.

    Args:
      input: (batch_size, data_length) waveforms.
      mixup_lambda: optional mixup coefficients forwarded to `preprocess`.

    Returns:
      Clip-level class scores of shape (batch_size, classes_num). These are
      raw scores: the attention block uses a linear activation.
    """
    # The former `b`/`c` bookkeeping for mixup read variables that were never
    # assigned (they only fed a now-removed framewise output) and crashed as
    # soon as mixup_lambda was passed, so it is gone.
    x, _ = self.preprocess(input, mixup_lambda=mixup_lambda)

    # Repeat the single spectrogram channel three times for the ResNet stem.
    # Output shape (batch size, channels, time, frequency)
    x = x.expand(x.shape[0], 3, x.shape[2], x.shape[3])
    x = self.cnn_feature_extractor(x)
    
    # Aggregate in frequency axis
    x = torch.mean(x, dim=3)

    # Smooth along time with the sum of a max and an average pooling.
    x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
    x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
    x = x1 + x2

    # fc1 works on the channel axis, hence the transposes around it.
    x = F.dropout(x, p=0.5, training=self.training)
    x = x.transpose(1, 2)
    x = F.relu_(self.fc1(x))
    x = x.transpose(1, 2)
    x = F.dropout(x, p=0.5, training=self.training)

    (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)

    return clipwise_output

## Dataset

In [None]:
class AudioDataset(torch.utils.data.Dataset):
  """Dataset serving pre-packed waveforms for the rows of `df`.

  `df` must have an `index` column giving each row's position in the packed
  waveform array, plus a `label` column when `task == 'train'`. Waveforms
  come from the notebook globals `waveforms` (train) and `waveforms_`
  (any other task); the class list comes from the global `words`.
  """
  def __init__(self, df, task='train', **kwargs):
    super().__init__()
    self.df = df
    self.task = task
    self.classes = words
    self.c = len(words)

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    """Return {'wav': float tensor, 'target': long tensor} for row `idx`.

    For non-train tasks the target is always 0.
    """
    row = self.df.loc[idx]
    iidx = row["index"]

    if self.task == 'train':
      label = row.label
      waveform = waveforms[iidx]
    else:
      label = 0
      waveform = waveforms_[iidx]

    wav = torch.tensor( waveform, dtype=torch.float )
    target = torch.tensor( label, dtype=torch.long )
    return {'wav': wav, 'target': target}

## Training functions

In [None]:
def training_fn(dataloader, model, opt, criterion, scheduler=None):
  """Train `model` for one pass over `dataloader`.

  Each batch is a dict with 'wav' and 'target' tensors, moved to the global
  `device`. When a scheduler is given it is stepped after every batch. The
  running mean loss and accuracy are printed on a single refreshed line.
  """
  n_batches = len(dataloader)
  loss_sum, acc_sum = 0, 0

  model.train()

  for step, batch in enumerate(dataloader, start=1):
    x = batch['wav'].to(device)
    y = batch['target'].to(device)

    opt.zero_grad()

    pred = model(x)
    loss = criterion(pred, y)
    loss_sum += loss.item()

    # Batch accuracy, computed on CPU copies detached from the graph.
    hits = y.detach().cpu() == pred.detach().cpu().argmax(1)
    acc_sum += hits.float().mean().item()

    loss.backward()
    opt.step()

    if scheduler:
      scheduler.step()

    print('\r[Training][{}/{}] Loss: {:.5f} - Acc : {:.5f}'.format(
        step, n_batches, loss_sum/step, acc_sum/step ), end='')
  print()
  

In [None]:
def evaluate(dataloader, model, criterion):
  """Evaluate `model` on `dataloader` without gradients.

  Prints the running mean loss and accuracy and returns the mean of the
  per-batch losses (each batch counts equally, whatever its size).
  """
  n_batches = len(dataloader)
  loss_sum, acc_sum = 0, 0

  model.eval()

  with torch.no_grad():
    for step, batch in enumerate(dataloader, start=1):
      x = batch['wav'].to(device)
      y = batch['target'].to(device)

      pred = model(x)
      loss_sum += criterion(pred, y).item()

      hits = y.detach().cpu() == pred.detach().cpu().argmax(1)
      acc_sum += hits.float().mean().item()

      print('\r[Evaluation][{}/{}] Loss: {:.5f} - Acc : {:.5f}'.format(
          step, n_batches, loss_sum/step, acc_sum/step ), end='')
    print()

  return loss_sum / n_batches

In [None]:
def predict(df, bs=2):
  """Predict the rows of `df` with the fold ensemble.

  For every batch, the softmax outputs of the first `n_folds` models in the
  global `MODELS` list are summed and divided by `n_folds`.

  Returns:
    (predictions_labels, predictions_proba): the arg-max class per row and
    the averaged probability vector per row, both as Python lists.
  """
  loader = torch.utils.data.DataLoader(AudioDataset(df, task='test'), bs, shuffle=False)

  predictions_labels, predictions_proba = [], []

  with torch.no_grad():
    for batch in tqdm(loader):
      x = batch['wav'].to(device)

      # Accumulate the per-fold probabilities in place.
      out = None
      for k in range(n_folds):
        fold_proba = F.softmax( MODELS[k](x), 1 )
        if out is None:
          out = fold_proba
        else:
          out += fold_proba

      out /= n_folds
      predictions_labels.extend(out.argmax(1).cpu().detach().numpy().tolist())
      predictions_proba.extend(out.cpu().detach().numpy().tolist())

  return predictions_labels, predictions_proba

In [None]:
def run_fold(fold, config, bs=16, eval_bs=8, lr=1e-4, path='MODELS/'):
  """Train one cross-validation fold and keep its best checkpoint.

  Rows of the global `train` frame whose `fold` equals `fold` are used for
  validation and all the other rows for training. The number of epochs
  comes from the global `epochs`. The state dict is saved to
  `{path}model_state_dict_{fold}.bin` each time the validation loss improves.

  Returns:
    The best (lowest) validation loss seen for this fold.
  """
  with torch.cuda.device(device):
    torch.cuda.empty_cache()

  # reset_index(drop=False) keeps the original row number in an `index`
  # column, which AudioDataset reads to find the waveform.
  train_rows = train[train.fold != fold].reset_index(drop=False)
  valid_rows = train[train.fold == fold].reset_index(drop=False)

  trainloader = torch.utils.data.DataLoader(
      AudioDataset(train_rows), batch_size=bs, shuffle=True)
  validloader = torch.utils.data.DataLoader(
      AudioDataset(valid_rows), batch_size=eval_bs, shuffle=False)

  model = get_model(config)
  criterion = torch.nn.CrossEntropyLoss()
  opt = torch.optim.AdamW(model.parameters(), lr=lr)

  if config["schedule"]:
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        opt, max_lr=1e-3, div_factor=4, steps_per_epoch=len(trainloader), epochs=epochs
    )
  else:
    scheduler = None

  model.to(device)

  checkpoint = f'{path}model_state_dict_{fold}.bin'
  best_logloss = np.inf

  for epoch in tqdm(range(epochs), desc=f'Fold {fold}'):
    print(f"[Epoch {epoch}]")

    training_fn(trainloader, model, opt, criterion, scheduler)
    epoch_logloss = evaluate(validloader, model, criterion)

    if epoch_logloss < best_logloss:
      best_logloss = epoch_logloss
      torch.save(model.state_dict(), checkpoint)

  return best_logloss

# Loading the CSV files

In [None]:
train = pd.read_csv(path+'here/Train.csv')
train.head()

Unnamed: 0,fn,target,bname,type
0,data/audio_files/IV38R7F.wav,akawuka,audio_files_iv38r7f,base
1,data/audio_files/KM4SKWT.wav,banana,audio_files_km4skwt,base
2,data/audio_files/F5POSU9.wav,obulwadde,audio_files_f5posu9,base
3,data/audio_files/MMVDXG2.wav,nnyaanya,audio_files_mmvdxg2,base
4,data/audio_files/9TVM96F.wav,pampu,audio_files_9tvm96f,base


In [None]:
sub = pd.read_csv(path+'here/SampleSubmission.csv')
sub.head(1)

Unnamed: 0,fn,bname,maize streak virus,disease,okukkoola,muwogo,mpeke,mucungwa,greens,garden,mango,bulimi,obuwuka,ebikoola,obulimi,ebisoolisooli,kaamulali,eddagala,beans,omuyembe,leaf,kisaanyi,leaves,butterfly,okuzifuuyira,micungwa,ppaapaali,emboga,kikolo,harvest,olusuku,coffee,super grow,rice,ensujju,okulima,worm,ebbugga,onion,ensigo,...,ejjobyo,omulimi,okusimba,sweet potatoes,okufuuyira,farming instructor,nnasale beedi,passion fruit,ekitooke,ebisaanyi,ekyeya,enva endiirwa,emisiri,emiyembe,amatooke,ebiwuka,farm,ebinyebwa,amappapaali,ebimera,kassooli,harvesting,emmwanyi,akamonde,obumonde,cabbages,akasaanyi,spread,ebirime,drought,kasaanyi,suckers,insects,fertilizer,nakavundira,ekiwojjolo,akawuka,ddagala,ebiwojjolo,obutungulu
0,data/audio_files/00118N3.wav,audio_files_00118n3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
words = sub.columns[2:]

In [None]:
# Class index of each keyword = position of its column in `words`.
label = np.arange(len(words), dtype=np.int16)
mapper = {word: idx for word, idx in zip(words, label)}

In [None]:
train['label'] = train['target'].map(mapper).astype(int)

In [None]:
train.head()

Unnamed: 0,fn,target,bname,type,label
0,data/audio_files/IV38R7F.wav,akawuka,audio_files_iv38r7f,base,189
1,data/audio_files/KM4SKWT.wav,banana,audio_files_km4skwt,base,114
2,data/audio_files/F5POSU9.wav,obulwadde,audio_files_f5posu9,base,130
3,data/audio_files/MMVDXG2.wav,nnyaanya,audio_files_mmvdxg2,base,136
4,data/audio_files/9TVM96F.wav,pampu,audio_files_9tvm96f,base,83


# Save wavs as npy

In [None]:
num_cores = multiprocessing.cpu_count()

In [None]:
sr = 44100
sec = 3

In [None]:
npy_path = f'drive/My Drive/Zindi/GIZ/train_sr={sr}_sec={sec}.npy'
test_npy_path = f'drive/My Drive/Zindi/GIZ/test_sr={sr}_sec={sec}.npy'

In [None]:
if not os.path.exists(npy_path):
  waveforms = pack_waveforms_to_npy(npy_path, train, sr=sr, secs=sec)

In [None]:
# Build the packed test waveforms only when the cache file does not exist yet.
if not os.path.exists(test_npy_path):
  # assign() returns a new frame, so the dummy label is not written into a
  # slice of `sub`.
  test = sub[['fn', 'bname']].assign(label=0)

  waveforms_ = pack_waveforms_to_npy(test_npy_path, test, sr=sr, secs=sec)

# Load wavs

In [None]:
%%time
waveforms = load_npy(npy_path)
waveforms_ = load_npy(test_npy_path)

CPU times: user 14.8 ms, sys: 2.66 s, total: 2.68 s
Wall time: 46.9 s


In [None]:
gc.collect()

269

# Training

In [None]:
n_folds = 10

In [None]:
def make_fold(n_folds, shuffle=True):
  """Assign a stratified validation fold to every 'base' row of the global `train`.

  Adds (or resets) a `fold` column on `train`: rows whose `type` is 'base'
  get a fold number in [0, n_folds), stratified on `label`; every other row
  keeps -1 and therefore never lands in a validation split.

  Args:
    n_folds: number of folds.
    shuffle: whether to shuffle before splitting; the global `seed` is only
      used when shuffling.
  """
  train['fold'] = -1
  indexes = train[train['type']=='base'].index

  # scikit-learn rejects a random_state when shuffle is False, so the seed is
  # passed only when it has an effect.
  fold = StratifiedKFold(
      n_splits=n_folds, shuffle=shuffle, random_state=seed if shuffle else None)
  for i, (_, val_pos) in enumerate(fold.split(indexes, train.loc[indexes, 'label'])):
    # val_pos holds positions inside `indexes`; map them back to row labels.
    train.loc[indexes[val_pos], 'fold'] = i

In [None]:
make_fold(n_folds, shuffle=True)

In [None]:
train.fold.nunique()

11

In [None]:
epochs = 50
device = 'cuda:0'
lr = 1e-4

classes_num = 193
batch_size = 16

config = {
    "sample_rate": sr,
    "window_size": 1024,
    "hop_size": 320,
    "mel_bins": 64,
    "fmin": 50,
    "fmax": 14000,
    "classes_num": classes_num,
    'arch': 'resnet34',
    'fc': 512,
    "schedule": True,
}

In [None]:
gc.collect()

0

In [None]:
%%time
avg_logloss = 0

for fold in range(n_folds):
  
  _fold_logloss = run_fold(fold, config, bs=batch_size, eval_bs=batch_size, lr=lr)
  avg_logloss += _fold_logloss

print()
print("Avg LogLoss: ", avg_logloss/n_folds)
print()

# Loading models

In [None]:
%%time
# One trained model per fold, restored from the checkpoints saved by run_fold().
MODELS = []

for i in range(n_folds):
  model = get_model(config)
  # map_location loads the tensors straight onto `device`, so a checkpoint
  # saved on one device can be restored on another.
  state = torch.load(f'MODELS/model_state_dict_{i}.bin', map_location=device)
  model.load_state_dict(state)
  model.to(device)
  model.eval()
  MODELS.append(model)

CPU times: user 9.92 s, sys: 2.13 s, total: 12.1 s
Wall time: 33.3 s


# Prediction

In [None]:
predictions_labels, predictions_proba = predict(sub.reset_index(), bs=2)

HBox(children=(FloatProgress(value=0.0, max=509.0), HTML(value='')))




# Making a submission

In [None]:
# Convert the list of per-clip probability vectors once, not once per class.
proba = np.array(predictions_proba)

# One column per keyword, in the order of `mapper`, each holding the
# probability column of that keyword's class index. Building the frame in a
# single call avoids inserting the columns one at a time.
submission = pd.DataFrame(
    proba[:, list(mapper.values())], columns=list(mapper.keys()), index=sub.index)

# The submission uses the file names without the leading folder component.
submission.insert(0, 'fn', sub['fn'].apply(lambda x: '/'.join( x.split('/')[1:] )))

In [None]:
submission.head()

In [None]:
csv_file = 'resnet34_with_scheduler.csv'
submission.to_csv(csv_file, index=False)