In [1]:
import numpy as np
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchaudio
import torchvision
from torchaudio import transforms
from efficientnet_pytorch import EfficientNet
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score
from tqdm import tqdm


import random
import numpy as np

In [2]:
# Root directory that holds the audio folders and the metadata CSV files.
DATA_PATH = 'data/'

# Audio sub-folders and metadata file names, all joined onto DATA_PATH later.
train_data_folder, test_data_folder = 'train', 'test'
train_meta_fname, test_meta_fname = 'train.csv', 'sample_submission.csv'

# --- Log-mel front end -------------------------------------------------
SAMPLE_RATE = 16000                  # passed as `sample_rate` to MelSpectrogram
STFT_WINDOW_LENGTH_SECONDS = 0.025
STFT_HOP_LENGTH_SECONDS = 0.010
NUM_BANDS = 64                       # Frequency bands in input mel-spectrogram patch.
NUM_MEL_BINS = NUM_BANDS
LOG_OFFSET = 0.01                    # Offset used for stabilized log of input mel-spectrogram.

# --- Example framing ---------------------------------------------------
EXAMPLE_WINDOW_SECONDS = 0.96        # Each example contains 96 10ms frames
EXAMPLE_HOP_SECONDS = 0.96           # with zero overlap.

# --- Batching ----------------------------------------------------------
BATCH_SIZE = 64
segment_size = 10                    # clip length in seconds; the training loop rebinds this name

In [3]:
# Seed every RNG the notebook touches (Python, NumPy, PyTorch CPU and GPU)
# so that splits, random crops and weight initialisation repeat across runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # every visible GPU, not only the current one

# cuDNN: request deterministic kernels AND switch off the autotuner, which
# may otherwise pick different convolution algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
# Read the two metadata tables that drive the rest of the notebook:
# the labelled training list and the submission template (used as the test list).
train_meta_path = os.path.join(DATA_PATH, train_meta_fname)
test_meta_path = os.path.join(DATA_PATH, test_meta_fname)
df_train = pd.read_csv(train_meta_path)
df_test = pd.read_csv(test_meta_path)

In [5]:
# Count the classes and map each label string to an integer id, numbered in
# order of first appearance in the training table.
n_classes = df_train.label.nunique()
print(n_classes)
label_names = df_train.label.unique()
classes_dict = dict(zip(label_names, range(len(label_names))))
df_train['label_encoded'] = df_train.label.map(classes_dict)
df_train.head()

41


Unnamed: 0,fname,label,label_encoded
0,8bcbcc394ba64fe85ed4.wav,Finger_snapping,0
1,00d77b917e241afa06f1.wav,Squeak,1
2,17bb93b73b8e79234cb3.wav,Electric_piano,2
3,7d5c7a40a936136da55e.wav,Harmonica,3
4,17e0ee7565a33d6c2326.wav,Snare_drum,4


In [6]:
def _with_folder(folder):
    """Return a function that joins `folder` in front of a file name."""
    def _join(fname):
        return os.path.join(folder, fname)
    return _join


# Make every file name relative to DATA_PATH by prepending its sub-folder.
# NOTE: this rewrites the column in place, so running the cell a second time
# prepends the folder a second time.
df_train['fname'] = df_train['fname'].apply(_with_folder(train_data_folder))
df_test['fname'] = df_test['fname'].apply(_with_folder(test_data_folder))

In [12]:
class VGGishModel(nn.Module):
    """Waveform classifier: log-mel front end -> VGGish conv stack -> MLP head.

    Args:
        sample_rate: sampling rate of the input waveforms; sizes the STFT
            window and hop together with the STFT_*_SECONDS constants.
        n_classes: number of output logits.
    """

    def __init__(self, sample_rate=16000, n_classes=41):
        super().__init__()
        # Honour the `sample_rate` argument (it used to be silently ignored
        # in favour of the global SAMPLE_RATE; the default is the same value).
        self.ms = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=int(STFT_WINDOW_LENGTH_SECONDS * sample_rate),
            win_length=int(STFT_WINDOW_LENGTH_SECONDS * sample_rate),
            hop_length=int(STFT_HOP_LENGTH_SECONDS * sample_rate),
            n_mels=NUM_BANDS)
        # Only the `features` sub-module of the hub model is kept.
        self.features = torch.hub.load('harritaylor/torchvggish', 'vggish').features
        self.pool = nn.AdaptiveMaxPool2d((4, 1))
        # 512 * 4 matches the pooled map flattened per example
        # (presumably 512 channels out of `features` — TODO confirm).
        self.lin1 = nn.Linear(512 * 4, 256)
        self.dropout1 = nn.Dropout(p=0.6)
        self.lin2 = nn.Linear(256, 128)
        self.lin3 = nn.Linear(128, n_classes)

    def forward(self, x):
        """Return raw class logits for a batch of waveforms.

        Assumes `x` is (batch, 1, samples) — TODO confirm against the dataset.
        """
        x = self.ms(x)
        x = torch.log(x + LOG_OFFSET)  # stabilised log of the mel spectrogram
        x = self.features(x)
        x = self.pool(x)
        x = x.view(x.shape[0], -1)     # flatten everything but the batch axis
        x = self.dropout1(F.relu(x))   # dropout sits before the first linear layer
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        x = self.lin3(x)
        return x

    def inference(self, x):
        """Return class probabilities (softmax over the class axis of `forward`)."""
        x = self.forward(x)
        # Explicit dim: the implicit-dimension form of softmax is deprecated.
        x = F.softmax(x, dim=1)
        return x
    
    
class EfficientNetModel(nn.Module):
    """Waveform classifier: log-mel front end -> 2 convs -> EfficientNet-b3 -> MLP head.

    Args:
        sample_rate: sampling rate of the input waveforms; sizes the STFT
            window and hop together with the STFT_*_SECONDS constants.
        n_classes: number of output logits.
    """

    def __init__(self, sample_rate=16000, n_classes=41):
        super().__init__()
        # Honour the `sample_rate` argument (it used to be silently ignored
        # in favour of the global SAMPLE_RATE; the default is the same value).
        self.ms = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=int(STFT_WINDOW_LENGTH_SECONDS * sample_rate),
            win_length=int(STFT_WINDOW_LENGTH_SECONDS * sample_rate),
            hop_length=int(STFT_HOP_LENGTH_SECONDS * sample_rate),
            n_mels=NUM_BANDS)
        # Two small convolutions lift the 1-channel spectrogram to the
        # 3 channels that are fed to the backbone.
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=10, kernel_size=3, padding=1)
        self.cnn3 = nn.Conv2d(in_channels=10, out_channels=3, kernel_size=3, padding=1)
        self.features = EfficientNet.from_pretrained('efficientnet-b3')
        # 1000 = width of the backbone output consumed by the head
        # (presumably its ImageNet logits — TODO confirm).
        self.lin1 = nn.Linear(1000, 256)
        self.dropout1 = nn.Dropout(p=0.5)
        self.lin2 = nn.Linear(256, 128)
        self.lin3 = nn.Linear(128, n_classes)

    def forward(self, x):
        """Return raw class logits for a batch of waveforms.

        Assumes `x` is (batch, 1, samples) — TODO confirm against the dataset.
        """
        x = self.ms(x)
        x = torch.log(x + LOG_OFFSET)  # stabilised log of the mel spectrogram
        x = F.relu(self.cnn1(x))
        x = F.relu(self.cnn3(x))
        x = self.features(x)
        x = x.view(x.shape[0], -1)     # flatten everything but the batch axis
        x = self.dropout1(F.relu(x))   # dropout sits before the first linear layer
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        x = self.lin3(x)
        return x

    def inference(self, x):
        """Return class probabilities (softmax over the class axis of `forward`)."""
        x = self.forward(x)
        # Explicit dim: the implicit-dimension form of softmax is deprecated.
        x = F.softmax(x, dim=1)
        return x

In [16]:
def eval_model(model, eval_dataset):
    """Compute the macro-averaged F1 of `model` over labelled batches.

    Args:
        model: module exposing `inference(batch)` that returns per-class scores
            with classes along axis 1.
        eval_dataset: iterable yielding `(waveforms, labels)` batches (the
            training loop passes a DataLoader here despite the name).

    Returns:
        Macro F1 between the true labels and the arg-max predictions.
    """
    model.eval()
    # Run on whatever device the model already lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    forecast, true_labs = [], []
    with torch.no_grad():
        for wavs, labs in tqdm(eval_dataset):
            true_labs.extend(labs.detach().cpu().numpy())
            outputs = model.inference(wavs.to(device))
            forecast.extend(outputs.detach().cpu().numpy().argmax(axis=1))
    # scikit-learn's signature is f1_score(y_true, y_pred, ...).
    return f1_score(true_labs, forecast, average='macro')

def sample_or_pad(waveform, wav_len=10, sample_rate=16000):
    """Force a waveform to exactly `wav_len` seconds by zero-padding or random cropping.

    Args:
        waveform: 2-D tensor laid out as (channels, samples).
        wav_len: target duration in seconds.
        sample_rate: samples per second used to turn `wav_len` into a sample count.

    Returns:
        A tensor with `wav_len * sample_rate` samples per channel:
        * shorter input -> new tensor, right-padded with zeros;
        * longer input  -> a random contiguous window (drawn with `np.random`);
        * exact length  -> the input object itself.
    """
    target_len = int(sample_rate * wav_len)
    n_channels, n_samples = waveform.shape
    if n_samples < target_len:
        # Size the buffer from the input so multi-channel audio pads correctly,
        # and inherit its dtype/device.
        padded_wav = waveform.new_zeros((n_channels, target_len))
        padded_wav[:, :n_samples] = waveform
        return padded_wav
    if n_samples > target_len:
        # `randint` excludes its upper bound; +1 makes the last window reachable.
        offset = np.random.randint(0, n_samples - target_len + 1)
        return waveform[:, offset:offset + target_len]
    return waveform
        
        
class EventDetectionDataset(Dataset):
    """Map-style dataset that loads audio files and fixes their length.

    Args:
        data_path: root directory that the entries of `x` are relative to.
        x: indexable collection of audio file paths.
        y: optional indexable collection of labels aligned with `x`;
            when None, items are returned without a label.
        wav_len: target clip length in seconds, forwarded to `sample_or_pad`.
    """

    def __init__(self, data_path, x, y=None, wav_len=10):
        self.x = x
        self.y = y
        self.data_path = data_path
        self.wav_len = wav_len

    def __len__(self):
        """Number of audio files in the dataset."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return `(waveform, label)` when labels were given, else just `waveform`."""
        path2wav = os.path.join(self.data_path, self.x[idx])
        # The `normalization=` keyword is not accepted by later torchaudio
        # releases (renamed `normalize`); normalisation is on by default in
        # both APIs, so the default call behaves the same and stays portable.
        # The file's own sample rate is returned but not checked here.
        waveform, _sample_rate = torchaudio.load(path2wav)
        waveform = sample_or_pad(waveform, self.wav_len)
        if self.y is not None:
            return waveform, self.y[idx]
        return waveform


def get_data_loaders(df, splitter, batch_size=64, segment_size=10):
    """Split `df` once with `splitter` and wrap both parts in DataLoaders.

    Args:
        df: frame with `fname` (path relative to DATA_PATH) and `label_encoded` columns.
        splitter: object whose `split(X, y)` yields `(train_idx, test_idx)` pairs;
            only the first pair is used.
        batch_size: batch size for both loaders.
        segment_size: clip length in seconds passed to the datasets.

    Returns:
        `(train_loader, val_loader)`.
    """
    fnames = df.fname.values
    targets = df.label_encoded.values
    train_idx, val_idx = next(splitter.split(fnames, targets))

    train_set = EventDetectionDataset(DATA_PATH, fnames[train_idx], targets[train_idx], segment_size)
    val_set = EventDetectionDataset(DATA_PATH, fnames[val_idx], targets[val_idx], segment_size)

    # Reshuffle the training examples every epoch; without it each epoch
    # replays identical batches in identical order.
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # Validation order does not affect the metric, so it stays fixed.
    val_loader = DataLoader(val_set, batch_size=batch_size)
    return train_loader, val_loader


def compute_and_save_eval(model, test_loader, postfix, best_f1):
    """Predict class probabilities for the test set and dump them to `eval/`.

    Args:
        model: module exposing `inference(batch)`.
        test_loader: iterable yielding unlabelled waveform batches, in the same
            order as the rows of the test metadata file.
        postfix: run identifier used in the output file name.
        best_f1: validation score, also embedded in the output file name.

    Returns:
        Array with one row of probabilities per test example.

    Side effects:
        Writes `eval/{postfix}_{best_f1}.csv`; the `probs` column holds the
        text form of each probability vector.
    """
    model.eval()
    # Run on whatever device the model already lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    forecast = []
    df_eval = pd.read_csv(os.path.join(DATA_PATH, test_meta_fname))
    with torch.no_grad():
        for wavs in tqdm(test_loader):
            outputs = model.inference(wavs.to(device))
            forecast.extend(outputs.detach().cpu().numpy())
    df_eval['probs'] = forecast

    # to_csv does not create missing directories.
    os.makedirs('eval', exist_ok=True)
    df_eval.to_csv(f'eval/{postfix}_{best_f1}.csv', index=None)

    return np.array(forecast)


def upgrade_df_train(forecast, threashhold=0.9):
    """Extend the training table with confidently pseudo-labelled test clips.

    Args:
        forecast: per-example class scores for the test set, one row per row
            of the test metadata file, columns indexed by encoded class id.
        threashhold: a test example is kept only when its top score is
            strictly greater than this value.

    Returns:
        A fresh training frame (re-read from disk) followed by the selected
        test rows. Test rows carry an extra `probs` column, which is NaN for
        the original training rows. Original row indices are preserved.
    """
    forecast = np.asarray(forecast)
    labels = forecast.argmax(axis=1)
    probs = forecast.max(axis=1)
    decoder = {idx: name for name, idx in classes_dict.items()}
    labels = pd.Series(labels).map(decoder)

    # Start from the files on disk so earlier pseudo-labels never accumulate.
    df_train = pd.read_csv(os.path.join(DATA_PATH, train_meta_fname))
    df_test = pd.read_csv(os.path.join(DATA_PATH, test_meta_fname))
    df_train['label_encoded'] = df_train.label.map(classes_dict)
    df_train['fname'] = df_train['fname'].apply(lambda x: os.path.join(train_data_folder, x))
    df_test['fname'] = df_test['fname'].apply(lambda x: os.path.join(test_data_folder, x))

    df_test['label'] = labels
    df_test['probs'] = probs
    df_test['label_encoded'] = df_test.label.map(classes_dict)
    test_semi_subset = df_test[df_test['probs'] > threashhold]
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    merged = pd.concat([df_train, test_semi_subset])
    return merged


def train(model, train_loader, val_loader, postfix, n_epoch=80, lr_decay=0.95):
    """Train `model` with Adam and cross-entropy, checkpointing on validation F1.

    Args:
        model: module returning class logits from `model(batch)`.
        train_loader: iterable of `(waveforms, labels)` training batches.
        val_loader: iterable of `(waveforms, labels)` batches for `eval_model`.
        postfix: run identifier used in checkpoint file names.
        n_epoch: number of passes over `train_loader`.
        lr_decay: factor the learning rate is multiplied by after every epoch.

    Returns:
        Best validation macro F1 seen (0 if no epoch scored above 0, in which
        case no checkpoint is written).

    Side effects:
        Moves `model` to the training device and writes
        `best_models/{postfix}_{best_f1}.pt` each time the score improves.
    """
    best_f1 = 0
    criterion = nn.CrossEntropyLoss()
    # Fall back to CPU when no GPU is present instead of failing in `.cuda()`.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    lr = 1e-3

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # torch.save does not create missing directories.
    os.makedirs('best_models', exist_ok=True)

    for epoch in range(n_epoch):
        model.train()
        for wavs, labs in tqdm(train_loader):
            optimizer.zero_grad()
            wavs, labs = wavs.to(device), labs.to(device)
            outputs = model(wavs)
            loss = criterion(outputs, labs)
            loss.backward()
            optimizer.step()
        f1 = eval_model(model, val_loader)
        print(f'epoch: {epoch}, f1_test: {f1}')
        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), f'best_models/{postfix}_{best_f1}.pt')
        # Exponential decay, applied by hand to every parameter group.
        lr = lr * lr_decay
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
    return best_f1

In [10]:
# Semi-supervised training driver.
# Each outer loop trains every (model type, clip length) combination, averages
# their test-set probabilities, and adds the confidently predicted test clips
# to the training table for the next loop.
n_epoch = 60
lr_decay = 0.95
current_threashhold = 0.95   # confidence needed to accept a pseudo-label
threashhold_step = 0.05      # the bar is lowered by this much after each loop
semi_loops = 4

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
models_dict = {"efficient": EfficientNetModel, "vggish": VGGishModel}
results = {}

current_train_df = df_train

for i in range(semi_loops):
    print(f"current semi loop {i}")
    loop_forecast = np.zeros((len(df_test), n_classes))
    k = 0
    for segment_size in [10,5]:
        test_loader = DataLoader(
                        EventDetectionDataset(DATA_PATH, df_test.fname.values, None, segment_size),
                        batch_size=BATCH_SIZE, shuffle=False
                )
        for model_type in ["efficient", "vggish"]:
            postfix = f"{model_type}_loop_{i}_{segment_size}_sec"
            # Size the head from the data rather than relying on the default.
            model = models_dict[model_type](n_classes=n_classes)
            train_loader, val_loader = get_data_loaders(current_train_df,
                                                        splitter,
                                                        batch_size=BATCH_SIZE,
                                                        segment_size=segment_size)
            best_f1 = train(model, train_loader, val_loader, postfix, n_epoch , lr_decay)
            results[postfix] = best_f1
            print(f"{postfix} best f1 {best_f1}")

            # `train` leaves the model at its last-epoch weights; restore the
            # best-scoring checkpoint before predicting on the test set. No
            # file exists when the score never rose above 0.
            best_ckpt = f'best_models/{postfix}_{best_f1}.pt'
            if os.path.exists(best_ckpt):
                model.load_state_dict(torch.load(best_ckpt))

            forecast = compute_and_save_eval(model, test_loader, postfix, best_f1)
            loop_forecast += forecast
            k += 1

            # Drop the finished model and hand cached GPU blocks back before
            # the next one is built (mitigation for memory pressure; it does
            # not shrink the footprint of a single model).
            del model
            torch.cuda.empty_cache()
    
    loop_forecast /= k
    len_before = len(current_train_df)
    current_train_df = upgrade_df_train(loop_forecast, current_threashhold)
    len_after = len(current_train_df)
    print(f"loop df upgrade for {len_after - len_before}")
    current_threashhold -= threashhold_step
    n_epoch+=10

current semi loop 0
Loaded pretrained weights for efficientnet-b3


  0%|          | 0/76 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 72.00 MiB (GPU 0; 10.91 GiB total capacity; 9.46 GiB already allocated; 18.50 MiB free; 9.62 GiB reserved in total by PyTorch)

In [None]:
def extract_probs(str_probs):
    """Parse the text form of a 1-D array ("[p0 p1 ...]") into a float array.

    The first and last characters are dropped and the remainder is split on
    any run of whitespace (spaces or line breaks).
    """
    tokens = str_probs[1:-1].split()
    return np.array(list(map(float, tokens)))

In [None]:
# Ensemble every saved forecast: sum the per-model probability tables found
# in eval/ and take the arg-max class for each test clip.
df_eval = pd.read_csv(os.path.join(DATA_PATH, test_meta_fname))
# Keep CSV files only, so stray entries such as .ipynb_checkpoints do not
# break read_csv; sorting makes the processing order repeatable.
forecast_list = sorted(name for name in os.listdir("eval/") if name.endswith(".csv"))
forecast = np.zeros((len(df_test), n_classes))
for forecast_df_name in forecast_list:
    print(f"read df {forecast_df_name}")
    df = pd.read_csv(os.path.join("eval", forecast_df_name))
    df_probs = np.array([extract_probs(str_probs) for str_probs in df['probs'].values])
    forecast += df_probs
labels = [x.argmax() for x in forecast]
# Invert the label encoding to recover the class names.
decoder = {classes_dict[cl]:cl for cl in classes_dict}
labels = pd.Series(labels).map(decoder)
df_eval['label'] = labels
df_eval.to_csv('all_models.csv', index=None)