# Imports

In [1]:
import json
import os
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import random
from collections import Counter
from torch.utils.tensorboard import SummaryWriter
import numpy as np

# Seed every RNG the notebook has imported so that shuffling and weight
# initialisation are repeatable from a fresh kernel.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)  # numpy is imported above but was previously left unseeded
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # silently ignored when CUDA is unavailable

In [2]:
# Dataset roots, relative to the notebook's working directory.
train_data_path = "./data/train_data/train_opus"
test_data_path = "./data/test_data/test_opus"

# Each split keeps its clips in an `audio` subfolder.
train_audio_path, test_audio_path = (
    root + "/audio" for root in (train_data_path, test_data_path)
)

# JSON file consulted later to decide which training clips are positives.
train_bounds_path = train_data_path + "/word_bounds.json"

# Data preparation

In [3]:
# Build two parallel lists: the path of every training clip and its binary label.
# A clip is labelled 1 when its id (file name up to the first dot) is found in the
# object loaded from word_bounds.json, and 0 otherwise.
train_audio_paths = []
train_audio_labels = []

with open(train_bounds_path, "r") as f:
    train_bounds = json.load(f)

# Sort the listing: os.listdir returns entries in arbitrary order, which would make
# the seeded train/validation split differ between machines and runs. The submission
# cell sorts its listing as well.
for audio_path in tqdm(sorted(os.listdir(train_audio_path))):
    if audio_path.startswith('.'):
        # Skip hidden entries (names that begin with a dot).
        continue
    audio_id = audio_path.split('.')[0]
    label = 1 if audio_id in train_bounds else 0
    train_audio_labels.append(label)
    train_audio_paths.append(os.path.join(train_audio_path, audio_path))

print("Label distribution: ", Counter(train_audio_labels))

100%|██████████| 180000/180000 [00:00<00:00, 1832934.49it/s]

Label distribution:  Counter({1: 45000, 0: 45000})





In [4]:
# Hold out 20% of the clips for validation, stratified so both parts keep the same
# class ratio.
# NOTE: the input names are rebound to the training portion, so re-running this cell
# without re-running the previous one splits the already-reduced lists again.
_split = train_test_split(
    train_audio_paths, train_audio_labels,
    test_size=0.2, stratify=train_audio_labels, random_state=42,
)
train_audio_paths, val_audio_paths, train_audio_labels, val_audio_labels = _split

In [5]:
class KeywordDataset(Dataset):
    """In-memory dataset of ``(features, label)`` pairs.

    Every clip is loaded, resampled to ``sr`` when its native rate differs, and
    passed through ``transform`` once, at construction time. ``__getitem__`` is
    then a plain list lookup.

    Args:
        audio_paths: paths readable by ``torchaudio.load``.
        audio_labels: one label per path, in the same order.
        transform: optional callable applied to each (resampled) waveform.
        sr: target sample rate in Hz.

    Raises:
        ValueError: if ``audio_paths`` is sized and its length differs from
            ``audio_labels``.
    """

    def __init__(self, audio_paths, audio_labels, transform=None, sr=16000):
        # Fail before the (slow) loading loop rather than with an IndexError
        # somewhere in the middle of training.
        if hasattr(audio_paths, "__len__") and len(audio_paths) != len(audio_labels):
            raise ValueError(
                f"Got {len(audio_paths)} audio paths but {len(audio_labels)} labels"
            )

        self.audio_labels = audio_labels
        self.transform = transform
        self.sr = sr
        self.data = []

        # One Resample module per distinct source rate instead of one per file.
        # Kept local so it is not stored on (or pickled with) the dataset.
        resamplers = {}

        for audio_path in tqdm(audio_paths):
            waveform, sample_rate = torchaudio.load(audio_path)
            if sample_rate != self.sr:
                if sample_rate not in resamplers:
                    resamplers[sample_rate] = torchaudio.transforms.Resample(
                        orig_freq=sample_rate, new_freq=self.sr
                    )
                waveform = resamplers[sample_rate](waveform)
            # `is not None` rather than truthiness: nn.Sequential defines __len__,
            # so an empty pipeline would evaluate as False.
            if transform is not None:
                waveform = transform(waveform)
            self.data.append(waveform)

    def __len__(self):
        """Number of clips held in memory."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for clip ``idx``; features were transformed in ``__init__``."""
        return self.data[idx], self.audio_labels[idx]
    
# Feature pipeline: 64-band mel spectrogram of 16 kHz audio, converted to decibels.
_mel_stages = [
    MelSpectrogram(sample_rate=16000, n_mels=64),
    AmplitudeToDB(),
]
mel_spectrogram_transform = nn.Sequential(*_mel_stages)

# Model

In [6]:
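# TC-ResNet, "Temporal Convolution for Real-time Keyword Spotting on Mobile Devices" (Choi et al., 2019): https://arxiv.org/pdf/1904.03814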

class TemporalResBlock(nn.Module):
    """Residual block built from 1-D convolutions along the time axis.

    Main branch: conv -> BN -> ReLU -> conv -> BN.
    Shortcut:    1x1 conv -> BN -> ReLU (projects ``in_ch`` to ``out_ch``).
    The two branches are summed and passed through a final ReLU.

    Args:
        in_ch: number of input channels.
        out_ch: number of output channels.
        kernel_size: kernel width of the two main-branch convolutions.
    """

    def __init__(self, in_ch, out_ch, kernel_size=9):
        super().__init__()
        # With stride 1 this padding keeps the time length unchanged for odd kernels.
        same_pad = kernel_size // 2

        # Main branch, first stage.
        self.conv1 = nn.Conv1d(in_ch, out_ch, kernel_size, padding=same_pad, bias=False)
        self.bn1 = nn.BatchNorm1d(out_ch)

        # Main branch, second stage.
        self.conv2 = nn.Conv1d(out_ch, out_ch, kernel_size, padding=same_pad, bias=False)
        self.bn2 = nn.BatchNorm1d(out_ch)

        # One activation module shared by every stage; it works in place.
        self.relu = nn.ReLU(inplace=True)

        # Shortcut projection so the channel counts match at the sum.
        self.conv3 = nn.Conv1d(in_ch, out_ch, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm1d(out_ch)

    def forward(self, x):
        """Apply the block to ``x`` (channels on dim 1, time on the last dim)."""
        shortcut = self.relu(self.bn3(self.conv3(x)))

        main = self.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))

        main += shortcut
        return self.relu(main)

class TCResNet8(nn.Module):
    """Small temporal-convolution ResNet that treats feature bins as Conv1d channels.

    Layout: input conv -> one ``TemporalResBlock`` per consecutive pair in
    ``channels`` -> global average pooling over time -> linear head.

    Args:
        num_classes: size of the output layer (1 gives a single logit).
        channels: channel widths; ``channels[0]`` is the width of the input conv
            and each following entry is the output width of one residual block.
        in_channels: number of feature bins per frame in the input. Must match
            the feature extractor (64 for the 64-band mel transform used in this
            notebook).
    """

    def __init__(self, num_classes=1, channels=(16, 24, 32, 48), in_channels=64):
        super().__init__()

        self.input_conv = nn.Sequential(
            nn.Conv1d(in_channels, channels[0], kernel_size=9, padding=4, bias=False),
            nn.BatchNorm1d(channels[0]),
            nn.ReLU(inplace=True)
        )

        # One residual block per consecutive (in, out) pair of widths.
        self.layers = nn.ModuleList(
            TemporalResBlock(width_in, width_out)
            for width_in, width_out in zip(channels[:-1], channels[1:])
        )

        self.pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(channels[-1], num_classes)

    def forward(self, x):
        """Map ``(batch, 1, in_channels, time)`` features to ``(batch, num_classes)`` logits."""
        # Drop the singleton dim at position 1 so the feature bins become the
        # Conv1d channel axis. squeeze(1) is a no-op when that dim is not size 1.
        x = x.squeeze(1)
        x = self.input_conv(x)

        for layer in self.layers:
            x = layer(x)

        # Average over time, then remove the length-1 time dim.
        x = self.pool(x).squeeze(-1)
        return self.fc(x)


In [7]:
# Instantiate the network with its default settings and report its size.
model = TCResNet8()

total_params = 0
for param in model.parameters():
    total_params += param.numel()
print("Total params: ", total_params)

Total params:  71937


# Util functions

In [8]:
@torch.no_grad()
def get_metrics(preds, targets, threshold=0.5):
    """Compute binary-classification metrics from raw logits.

    Args:
        preds: tensor of logits; a sigmoid is applied here, so do not pass
            probabilities or hard decisions.
        targets: tensor of 0/1 labels with the same number of elements as ``preds``.
        threshold: probability above which a sample is predicted positive.

    Returns:
        dict with
        ``'Accuracy'`` - fraction of correct decisions,
        ``'f1'``       - F1 of the positive class,
        ``'score'``    - harmonic mean of (1 - false-reject rate) and
                         (1 - false-accept rate).
        All three are 0.0 for empty input.

    Raises:
        ValueError: if ``preds`` and ``targets`` hold a different number of elements.
    """
    # Flatten both sides so that (N, 1) logits against (N,) targets are compared
    # element-wise instead of being broadcast into an (N, N) grid.
    targets = targets.reshape(-1).float()
    decisions = (torch.sigmoid(preds.reshape(-1).float()) > threshold).float()
    if decisions.numel() != targets.numel():
        raise ValueError(
            f"preds has {decisions.numel()} elements but targets has {targets.numel()}"
        )

    total = targets.numel()
    if total == 0:
        # The mean of an empty tensor is NaN; report zeros instead.
        return {'Accuracy': 0.0, 'f1': 0.0, 'score': 0.0}

    acc = (decisions == targets).float().mean().item()
    fn = (decisions < targets).int().sum().item()  # predicted 0, label 1
    fp = (decisions > targets).int().sum().item()  # predicted 1, label 0
    pos = targets.int().sum().item()
    neg = total - pos

    # With tp = pos - fn:  F1 = 2*tp / (2*tp + fp + fn) = 2*(pos - fn) / (2*pos - fn + fp)
    f1_denom = 2 * pos - fn + fp
    f1 = 2 * ((pos - fn) / f1_denom) if f1_denom > 0 else 0.0

    far = fp / neg if neg > 0 else 0.0  # false-accept rate
    frr = fn / pos if pos > 0 else 0.0  # false-reject rate
    score_denom = (1 - frr) + (1 - far)
    score = 2 * ((1 - frr) * (1 - far)) / score_denom if score_denom > 0 else 0.0

    return {
        'Accuracy': acc,
        'f1': f1,
        'score': score
    }

def eval_step(model, test_dataloader, criterion, device='cuda', max_steps=None):
    """Evaluate ``model`` on ``test_dataloader`` without gradient tracking.

    Args:
        model: network returning one logit per sample.
        test_dataloader: iterable of ``(features, labels)`` batches.
        criterion: loss called as ``criterion(outputs, labels)``, with labels
            cast to float and given a trailing dim of size 1.
        device: device the batches are moved to.
        max_steps: if given, stop after this many batches.

    Returns:
        The dict produced by ``get_metrics`` over all evaluated samples, plus
        ``'loss'`` (mean per-batch loss). Every entry is 0.0 when no batch was
        evaluated.

    Note:
        The model is left in eval mode.
    """
    model.eval()
    valid_loss = 0.0
    steps = 0
    all_logits, all_targets = [], []
    with torch.no_grad():
        for i, batch in enumerate(tqdm(test_dataloader, desc="Validation", leave=False)):
            if max_steps is not None and i >= max_steps:
                break
            mels, labels = batch
            mels = mels.to(device)
            labels = labels.float().to(device).unsqueeze(1)

            outputs = model(mels)
            loss = criterion(outputs, labels)

            # Keep the raw logits: get_metrics applies the sigmoid and threshold
            # itself, so passing already-thresholded values would apply them twice.
            all_logits.append(outputs.cpu())
            all_targets.append(labels.cpu())

            valid_loss += loss.item()
            steps += 1

    if steps == 0:
        # Nothing was evaluated (empty loader or max_steps == 0).
        return {'Accuracy': 0.0, 'f1': 0.0, 'score': 0.0, 'loss': 0.0}

    valid_loss /= steps

    # A single concatenation avoids building a tensor from a Python list of
    # numpy arrays, which is slow and triggers a PyTorch warning.
    metrics = get_metrics(torch.cat(all_logits), torch.cat(all_targets))
    metrics['loss'] = valid_loss
    return metrics


def train_loop(model, train_dataloader, val_dataloader, epochs, optimizer, criterion, val_every=100, device='cuda', log_dir='./logs', best_path=None, val_steps=None):
    """Train ``model``, log to TensorBoard and optionally checkpoint the best epoch.

    Args:
        model: network returning one logit per sample.
        train_dataloader: iterable of ``(features, labels)`` training batches.
        val_dataloader: iterable of validation batches, passed to ``eval_step``.
        epochs: number of passes over ``train_dataloader``.
        optimizer: optimizer already bound to ``model``'s parameters.
        criterion: loss called as ``criterion(outputs, labels)``.
        val_every: run an extra validation every ``val_every`` batches within
            an epoch; a falsy value disables it.
        device: device the model and batches are moved to.
        log_dir: directory for the TensorBoard event files.
        best_path: if given, the model's ``state_dict`` is saved here whenever
            the end-of-epoch validation ``'score'`` beats the best seen so far.
        val_steps: batch limit for the mid-epoch validations only; the
            end-of-epoch validation always uses the whole loader.

    Returns:
        None. Results are written to TensorBoard, stdout and ``best_path``.
    """
    writer = SummaryWriter(log_dir)
    global_step = 0
    model.to(device)

    best_val_score = 0.0

    # try/finally so the writer is closed even when training raises or the cell
    # is interrupted.
    try:
        for epoch in range(epochs):
            train_loss = 0.0
            batches_seen = 0
            pbar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}")
            for i, batch in enumerate(pbar):
                # Re-enter train mode on every step: a mid-epoch validation
                # leaves the model in eval mode.
                model.train()
                mels, labels = batch
                mels = mels.to(device)
                labels = labels.float().to(device).unsqueeze(1)

                optimizer.zero_grad()
                outputs = model(mels)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                batches_seen += 1

                metrics = get_metrics(outputs, labels)
                for key, value in metrics.items():
                    writer.add_scalar(f'Train/{key}', value, global_step)

                writer.add_scalar('Train/Batch_Loss', loss.item(), global_step)
                global_step += 1

                pbar.set_postfix({'loss': loss.item(), 'acc': metrics['Accuracy'], 'score': metrics['score']})

                if val_every and i % val_every == 0 and i > 0:
                    metrics = eval_step(model, val_dataloader, criterion, device=device, max_steps=val_steps)
                    writer.add_scalar('Valid/Loss', metrics['loss'], global_step)
                    writer.add_scalar('Valid/Accuracy', metrics['Accuracy'], global_step)
                    writer.add_scalar('Valid/F1', metrics['f1'], global_step)
                    writer.add_scalar('Valid/Score', metrics['score'], global_step)

            # Average over the batches actually processed. This avoids a division
            # by zero on an empty loader and works for loaders without __len__.
            train_loss /= max(batches_seen, 1)
            metrics = eval_step(model, val_dataloader, criterion, device=device)

            print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss:.4f}, Valid Loss: {metrics['loss']:.4f}", f"Valid Acc: {metrics['Accuracy']:.4f}", f"Valid F1: {metrics['f1']:.4f}", f"Valid Score: {metrics['score']:.4f}")
            # Epoch-level curves use the epoch index as their x-axis.
            writer.add_scalar('Train/Epoch_Loss', train_loss, epoch)
            writer.add_scalar('Valid/Epoch_Loss', metrics['loss'], epoch)
            writer.add_scalar('Valid/Epoch_Accuracy', metrics['Accuracy'], epoch)
            writer.add_scalar('Valid/Epoch_F1', metrics['f1'], epoch)
            writer.add_scalar('Valid/Epoch_Score', metrics['score'], epoch)

            if best_path is not None and metrics['score'] > best_val_score:
                best_val_score = metrics['score']
                torch.save(model.state_dict(), best_path)
                print("New best model saved")
    finally:
        writer.close()

In [9]:
import pickle

train_pkl_path = "train_dataset.pkl"
val_pkl_path = "val_dataset.pkl"


def _load_or_build_dataset(cache_path, audio_paths, audio_labels):
    """Return the dataset cached at ``cache_path``, building and caching it on a miss.

    Building decodes and transforms every clip, so the result is pickled once
    and reused on later runs. Delete the ``.pkl`` file to force a rebuild, for
    example after the train/validation split or the feature transform changes.
    """
    if os.path.exists(cache_path):
        # pickle.load can execute arbitrary code: only point this at cache files
        # written by this notebook, never at files from an untrusted source.
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    dataset = KeywordDataset(audio_paths, audio_labels, transform=mel_spectrogram_transform)
    with open(cache_path, "wb") as cache_file:
        pickle.dump(dataset, cache_file)
    return dataset


train_dataset = _load_or_build_dataset(train_pkl_path, train_audio_paths, train_audio_labels)
val_dataset = _load_or_build_dataset(val_pkl_path, val_audio_paths, val_audio_labels)

# Train

In [10]:
# Use the GPU when one is present; otherwise fall back to the CPU so the notebook
# still runs end to end on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 64
epochs = 25
lr = 1e-3

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Fresh model for this run (rebinds the `model` created earlier for the parameter count).
model = TCResNet8().to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
# The network emits one raw logit per clip, so the sigmoid lives inside the loss.
criterion = nn.BCEWithLogitsLoss()

train_loop(
    model=model,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    epochs=epochs,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    best_path="best_model.pth",
    val_every=None,  # no mid-epoch validation; evaluate once per epoch only
    val_steps=None
)

Epoch 1/25: 100%|██████████| 1125/1125 [00:07<00:00, 141.59it/s, loss=0.457, acc=0.781, score=0.78] 
Epoch 1/25: 100%|██████████| 1125/1125 [00:07<00:00, 141.59it/s, loss=0.457, acc=0.781, score=0.78]
  metrics = get_metrics(torch.tensor(all_preds), torch.tensor(all_targets))
  metrics = get_metrics(torch.tensor(all_preds), torch.tensor(all_targets))


Epoch [1/25], Train Loss: 0.5751, Valid Loss: 0.4852 Valid Acc: 0.7761 Valid F1: 0.7587 Valid Score: 0.7694
New best model saved


Epoch 2/25: 100%|██████████| 1125/1125 [00:07<00:00, 158.04it/s, loss=0.254, acc=0.875, score=0.874]
Epoch 2/25: 100%|██████████| 1125/1125 [00:07<00:00, 158.04it/s, loss=0.254, acc=0.875, score=0.874]
                                                              

Epoch [2/25], Train Loss: 0.4322, Valid Loss: 0.3871 Valid Acc: 0.8313 Valid F1: 0.8224 Valid Score: 0.8283
New best model saved


Epoch 3/25: 100%|██████████| 1125/1125 [00:07<00:00, 159.44it/s, loss=0.316, acc=0.906, score=0.906]
Epoch 3/25: 100%|██████████| 1125/1125 [00:07<00:00, 159.44it/s, loss=0.316, acc=0.906, score=0.906]
                                                              

Epoch [3/25], Train Loss: 0.3714, Valid Loss: 0.4483 Valid Acc: 0.7867 Valid F1: 0.7443 Valid Score: 0.7518


Epoch 4/25: 100%|██████████| 1125/1125 [00:07<00:00, 160.04it/s, loss=0.375, acc=0.828, score=0.827]
Epoch 4/25: 100%|██████████| 1125/1125 [00:07<00:00, 160.04it/s, loss=0.375, acc=0.828, score=0.827]
                                                              

Epoch [4/25], Train Loss: 0.3427, Valid Loss: 0.3277 Valid Acc: 0.8666 Valid F1: 0.8657 Valid Score: 0.8666
New best model saved


Epoch 5/25: 100%|██████████| 1125/1125 [00:07<00:00, 160.46it/s, loss=0.304, acc=0.859, score=0.852]
Epoch 5/25: 100%|██████████| 1125/1125 [00:07<00:00, 160.46it/s, loss=0.304, acc=0.859, score=0.852]
                                                              

Epoch [5/25], Train Loss: 0.3203, Valid Loss: 0.3315 Valid Acc: 0.8652 Valid F1: 0.8617 Valid Score: 0.8645


Epoch 6/25: 100%|██████████| 1125/1125 [00:07<00:00, 156.26it/s, loss=0.297, acc=0.828, score=0.836]
Epoch 6/25: 100%|██████████| 1125/1125 [00:07<00:00, 156.26it/s, loss=0.297, acc=0.828, score=0.836]
                                                              

Epoch [6/25], Train Loss: 0.3033, Valid Loss: 0.3791 Valid Acc: 0.8260 Valid F1: 0.7994 Valid Score: 0.8047


Epoch 7/25: 100%|██████████| 1125/1125 [00:07<00:00, 150.26it/s, loss=0.371, acc=0.859, score=0.856]
Epoch 7/25: 100%|██████████| 1125/1125 [00:07<00:00, 150.26it/s, loss=0.371, acc=0.859, score=0.856]
                                                              

Epoch [7/25], Train Loss: 0.2890, Valid Loss: 0.3002 Valid Acc: 0.8770 Valid F1: 0.8729 Valid Score: 0.8758
New best model saved


Epoch 8/25: 100%|██████████| 1125/1125 [00:07<00:00, 150.29it/s, loss=0.358, acc=0.859, score=0.858]
Epoch 8/25: 100%|██████████| 1125/1125 [00:07<00:00, 150.29it/s, loss=0.358, acc=0.859, score=0.858]
                                                              

Epoch [8/25], Train Loss: 0.2789, Valid Loss: 0.3191 Valid Acc: 0.8681 Valid F1: 0.8591 Valid Score: 0.8634


Epoch 9/25: 100%|██████████| 1125/1125 [00:07<00:00, 150.43it/s, loss=0.372, acc=0.891, score=0.89] 
Epoch 9/25: 100%|██████████| 1125/1125 [00:07<00:00, 150.43it/s, loss=0.372, acc=0.891, score=0.89] 
                                                              

Epoch [9/25], Train Loss: 0.2700, Valid Loss: 0.2895 Valid Acc: 0.8886 Valid F1: 0.8889 Valid Score: 0.8886
New best model saved


Epoch 10/25: 100%|██████████| 1125/1125 [00:07<00:00, 148.49it/s, loss=0.187, acc=0.953, score=0.959]
Epoch 10/25: 100%|██████████| 1125/1125 [00:07<00:00, 148.49it/s, loss=0.187, acc=0.953, score=0.959]
                                                              

Epoch [10/25], Train Loss: 0.2584, Valid Loss: 0.2974 Valid Acc: 0.8894 Valid F1: 0.8920 Valid Score: 0.8887
New best model saved


Epoch 11/25: 100%|██████████| 1125/1125 [00:07<00:00, 151.11it/s, loss=0.325, acc=0.797, score=0.803]
Epoch 11/25: 100%|██████████| 1125/1125 [00:07<00:00, 151.11it/s, loss=0.325, acc=0.797, score=0.803]
                                                              

Epoch [11/25], Train Loss: 0.2529, Valid Loss: 0.2911 Valid Acc: 0.8908 Valid F1: 0.8931 Valid Score: 0.8903
New best model saved


Epoch 12/25: 100%|██████████| 1125/1125 [00:07<00:00, 151.31it/s, loss=0.2, acc=0.906, score=0.908]  
Epoch 12/25: 100%|██████████| 1125/1125 [00:07<00:00, 151.31it/s, loss=0.2, acc=0.906, score=0.908]  
                                                              

Epoch [12/25], Train Loss: 0.2457, Valid Loss: 0.2727 Valid Acc: 0.8927 Valid F1: 0.8922 Valid Score: 0.8927
New best model saved


Epoch 13/25: 100%|██████████| 1125/1125 [00:07<00:00, 150.04it/s, loss=0.171, acc=0.938, score=0.931]

                                                              

Epoch [13/25], Train Loss: 0.2401, Valid Loss: 0.2672 Valid Acc: 0.8955 Valid F1: 0.8940 Valid Score: 0.8953
New best model saved


Epoch 14/25: 100%|██████████| 1125/1125 [00:07<00:00, 149.39it/s, loss=0.173, acc=0.969, score=0.973] 
Epoch 14/25: 100%|██████████| 1125/1125 [00:07<00:00, 149.39it/s, loss=0.173, acc=0.969, score=0.973]
                                                              

Epoch [14/25], Train Loss: 0.2339, Valid Loss: 0.2894 Valid Acc: 0.8814 Valid F1: 0.8756 Valid Score: 0.8789


Epoch 15/25: 100%|██████████| 1125/1125 [00:07<00:00, 149.29it/s, loss=0.365, acc=0.766, score=0.766] 
Epoch 15/25: 100%|██████████| 1125/1125 [00:07<00:00, 149.29it/s, loss=0.365, acc=0.766, score=0.766]
                                                              

Epoch [15/25], Train Loss: 0.2260, Valid Loss: 0.2721 Valid Acc: 0.8982 Valid F1: 0.8994 Valid Score: 0.8981
New best model saved


Epoch 16/25: 100%|██████████| 1125/1125 [00:07<00:00, 151.15it/s, loss=0.204, acc=0.938, score=0.947]
Epoch 16/25: 100%|██████████| 1125/1125 [00:07<00:00, 151.15it/s, loss=0.204, acc=0.938, score=0.947]
                                                              

Epoch [16/25], Train Loss: 0.2214, Valid Loss: 0.2677 Valid Acc: 0.8949 Valid F1: 0.8941 Valid Score: 0.8949


Epoch 17/25: 100%|██████████| 1125/1125 [00:07<00:00, 148.89it/s, loss=0.241, acc=0.891, score=0.891] 
Epoch 17/25: 100%|██████████| 1125/1125 [00:07<00:00, 148.89it/s, loss=0.241, acc=0.891, score=0.891]
                                                              

Epoch [17/25], Train Loss: 0.2149, Valid Loss: 0.3641 Valid Acc: 0.8815 Valid F1: 0.8893 Valid Score: 0.8758


Epoch 18/25: 100%|██████████| 1125/1125 [00:07<00:00, 150.98it/s, loss=0.26, acc=0.906, score=0.901] 
Epoch 18/25: 100%|██████████| 1125/1125 [00:07<00:00, 150.98it/s, loss=0.26, acc=0.906, score=0.901] 
                                                              

Epoch [18/25], Train Loss: 0.2103, Valid Loss: 0.2633 Valid Acc: 0.8997 Valid F1: 0.8986 Valid Score: 0.8995
New best model saved


Epoch 19/25: 100%|██████████| 1125/1125 [00:07<00:00, 151.57it/s, loss=0.294, acc=0.859, score=0.86]  
Epoch 19/25: 100%|██████████| 1125/1125 [00:07<00:00, 151.57it/s, loss=0.294, acc=0.859, score=0.86] 
                                                              

Epoch [19/25], Train Loss: 0.2029, Valid Loss: 0.2789 Valid Acc: 0.8922 Valid F1: 0.8926 Valid Score: 0.8922


Epoch 20/25: 100%|██████████| 1125/1125 [00:07<00:00, 149.91it/s, loss=0.163, acc=0.938, score=0.935] 
Epoch 20/25: 100%|██████████| 1125/1125 [00:07<00:00, 149.91it/s, loss=0.163, acc=0.938, score=0.935]
                                                              

Epoch [20/25], Train Loss: 0.1990, Valid Loss: 0.2601 Valid Acc: 0.8982 Valid F1: 0.8959 Valid Score: 0.8976


Epoch 21/25: 100%|██████████| 1125/1125 [00:07<00:00, 148.04it/s, loss=0.37, acc=0.906, score=0.915]  
Epoch 21/25: 100%|██████████| 1125/1125 [00:07<00:00, 148.04it/s, loss=0.37, acc=0.906, score=0.915] 
                                                              

Epoch [21/25], Train Loss: 0.1957, Valid Loss: 0.2584 Valid Acc: 0.9018 Valid F1: 0.9010 Valid Score: 0.9018
New best model saved


Epoch 22/25: 100%|██████████| 1125/1125 [00:07<00:00, 149.48it/s, loss=0.22, acc=0.938, score=0.935]  
Epoch 22/25: 100%|██████████| 1125/1125 [00:07<00:00, 149.48it/s, loss=0.22, acc=0.938, score=0.935] 
                                                              

Epoch [22/25], Train Loss: 0.1909, Valid Loss: 0.2732 Valid Acc: 0.8951 Valid F1: 0.8921 Valid Score: 0.8942


Epoch 23/25: 100%|██████████| 1125/1125 [00:07<00:00, 145.70it/s, loss=0.182, acc=0.922, score=0.921] 
Epoch 23/25: 100%|██████████| 1125/1125 [00:07<00:00, 145.70it/s, loss=0.182, acc=0.922, score=0.921]
                                                              

Epoch [23/25], Train Loss: 0.1853, Valid Loss: 0.2757 Valid Acc: 0.8902 Valid F1: 0.8863 Valid Score: 0.8889


Epoch 24/25: 100%|██████████| 1125/1125 [00:07<00:00, 150.38it/s, loss=0.164, acc=0.953, score=0.96]  
Epoch 24/25: 100%|██████████| 1125/1125 [00:07<00:00, 150.38it/s, loss=0.164, acc=0.953, score=0.96] 
                                                              

Epoch [24/25], Train Loss: 0.1838, Valid Loss: 0.3178 Valid Acc: 0.8922 Valid F1: 0.8961 Valid Score: 0.8905


Epoch 25/25: 100%|██████████| 1125/1125 [00:07<00:00, 149.08it/s, loss=0.13, acc=0.953, score=0.953]  
Epoch 25/25: 100%|██████████| 1125/1125 [00:07<00:00, 149.08it/s, loss=0.13, acc=0.953, score=0.953]
                                                              

Epoch [25/25], Train Loss: 0.1792, Valid Loss: 0.2788 Valid Acc: 0.9029 Valid F1: 0.9043 Valid Score: 0.9026
New best model saved




# Submission

In [11]:
import pandas as pd

# Restore the checkpoint written by the training loop for the best validation score.
best_weights = torch.load("best_model.pth", map_location=device)
model.load_state_dict(best_weights)

# Walk the test clips in sorted order so ids and predictions line up row by row.
sorted_files = sorted(os.listdir(test_audio_path))
visible_files = [name for name in tqdm(sorted_files) if not name.startswith('.')]

test_audio_ids = [name.split('.')[0] for name in visible_files]
test_audio_paths = [os.path.join(test_audio_path, name) for name in visible_files]

# The dataset class needs one label per clip; the test labels are unknown, so use zeros.
test_audio_labels = [0] * len(test_audio_paths)

test_dataset = KeywordDataset(test_audio_paths, test_audio_labels, transform=mel_spectrogram_transform)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.eval()
predictions = []

with torch.no_grad():
    for mels, _ in tqdm(test_loader):
        logits = model(mels.to(device))
        # Logit -> probability -> hard 0/1 decision at 0.5.
        batch_decisions = (torch.sigmoid(logits) > 0.5).int().cpu().numpy().flatten()
        predictions.extend(batch_decisions)

df = pd.DataFrame({'id': test_audio_ids, 'label': predictions})
df.to_csv('submission.csv', index=False)

100%|██████████| 54000/54000 [00:00<00:00, 2029429.20it/s]
100%|██████████| 54000/54000 [00:00<00:00, 2029429.20it/s]
100%|██████████| 27000/27000 [03:39<00:00, 123.03it/s]
100%|██████████| 27000/27000 [03:39<00:00, 123.03it/s]
100%|██████████| 422/422 [00:01<00:00, 344.09it/s]

