# The Last Frequency: SOTA K-Fold Ensemble Solution

Targeting 0.95+ with a 5-Fold Ensemble of ResNet-18 models trained from scratch.

### Why K-Fold?
In audio tasks, individual samples can be hard to classify, and a single model may miss patterns that a model trained on a different split picks up. By training **5 models**, each on a different subset of the training data (one fold held out per model), and averaging their predicted class probabilities, we significantly reduce variance and improve generalization on the private leaderboard.

In [None]:
import os
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (through both
    ``torch.manual_seed`` and ``torch.cuda.manual_seed``), exports
    ``PYTHONHASHSEED``, and puts cuDNN into deterministic mode with
    benchmarking turned off.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The seeders are independent of each other, so one loop covers them all.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Fix all seeds before anything random happens, then pick the compute device:
# the GPU when PyTorch can see one, otherwise the CPU.
seed_everything(42)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')

### Configuration

In [None]:
class CFG:
    """Static hyper-parameters and paths shared by every cell of the notebook."""

    # --- Data location ---
    data_dir = '/kaggle/input/the-last-frequency'

    # --- Mel-spectrogram front end ---
    sample_rate = 16000
    n_fft = 1024
    hop_length = 256
    n_mels = 128
    target_frames = 64  # spectrograms are cropped / right-padded to this many frames

    # --- K-Fold training ---
    n_splits = 5
    batch_size = 64
    epochs = 35  # 35 epochs per fold is usually enough for scratch ResNet
    lr = 0.001  # base AdamW learning rate
    weight_decay = 0.01

    # --- Regularisation ---
    label_smoothing = 0.1
    mixup_alpha = 0.2  # Beta(alpha, alpha) parameter used by mixup

    # --- Task ---
    num_classes = 35

### Load Data

In [None]:
# Load the training arrays and the class-index -> command-name lookup table.
print("Loading data...")
train_waveforms = np.load(f'{CFG.data_dir}/train_waveforms.npy')
train_labels = np.load(f'{CFG.data_dir}/train_labels.npy')

with open(f'{CFG.data_dir}/label_map.json') as f:
    # JSON object keys are always strings, so convert them back to the
    # integer class ids that the label array and model outputs use.
    label_map = {
        int(class_id): command
        for class_id, command in json.load(f).items()
    }

print(f'Train shape: {train_waveforms.shape}')

### Augmentations & Model Logic

In [None]:
class AudioAugmentor:
    """Waveform-level augmentations applied before the spectrogram is computed."""

    @staticmethod
    def time_shift(waveform, shift_limit=0.1):
        """Circularly shift ``waveform`` by a random number of samples.

        Args:
            waveform: NumPy array; the shift size is derived from the length
                of its first axis.
            shift_limit: Maximum shift as a fraction of that length. The
                actual fraction is drawn uniformly from
                ``[-shift_limit, shift_limit]``.

        Returns:
            A rolled copy of ``waveform``; samples pushed past one end
            re-enter at the other (``np.roll`` semantics).
        """
        n_samples = waveform.shape[0]
        fraction = random.uniform(-shift_limit, shift_limit)
        offset = int(fraction * n_samples)  # int() truncates toward zero
        return np.roll(waveform, offset)

class SpecTransform(nn.Module):
    """Turn raw waveforms into fixed-width log-mel spectrograms.

    The pipeline is MelSpectrogram -> AmplitudeToDB -> crop / right-pad the
    last axis to ``CFG.target_frames`` -> optional SpecAugment-style
    frequency and time masking.
    """

    def __init__(self):
        super().__init__()
        # Attribute names are part of the checkpoint layout (state_dict keys),
        # so they must stay stable across notebook versions.
        self.mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=CFG.sample_rate,
            n_fft=CFG.n_fft,
            hop_length=CFG.hop_length,
            n_mels=CFG.n_mels,
        )
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
        self.freq_mask = torchaudio.transforms.FrequencyMasking(20)
        self.time_mask = torchaudio.transforms.TimeMasking(15)

    def forward(self, x, augment=False):
        """Compute the (optionally masked) log-mel spectrogram of ``x``.

        Args:
            x: Waveform tensor; presumably ``(batch, samples)`` — TODO confirm.
            augment: When true, apply frequency masking followed by time
                masking to the spectrogram.

        Returns:
            Spectrogram tensor whose last axis has exactly
            ``CFG.target_frames`` entries.
        """
        spec = self.amplitude_to_db(self.mel_spec(x))

        # Force a fixed number of frames: crop the tail, or right-pad with zeros.
        # NOTE(review): the padding value 0 is written into dB-scaled data —
        # confirm that is the intended "silence" level.
        n_frames = spec.shape[-1]
        if n_frames > CFG.target_frames:
            spec = spec[..., :CFG.target_frames]
        elif n_frames < CFG.target_frames:
            spec = F.pad(spec, (0, CFG.target_frames - n_frames))

        if not augment:
            return spec

        # NOTE(review): with a 3-D batched input the torchaudio masking
        # transforms may draw one mask for the whole batch rather than one per
        # example — verify against the installed torchaudio version.
        return self.time_mask(self.freq_mask(spec))

class AudioResNet(nn.Module):
    """ResNet-18 classifier that consumes raw waveforms.

    The waveform is first converted to a log-mel spectrogram by
    ``SpecTransform`` and then fed, as a single-channel image, to a
    randomly initialised torchvision ResNet-18.
    """

    def __init__(self, num_classes=35):
        super().__init__()
        net = models.resnet18(weights=None)

        # Spectrograms have one channel, so swap the 3-channel RGB stem for a
        # 1-channel convolution with otherwise identical geometry.
        net.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)

        # Replace the classification head with dropout + linear.
        feature_dim = net.fc.in_features
        net.fc = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(feature_dim, num_classes),
        )

        self.backbone = net
        self.spec_layer = SpecTransform()

    def forward(self, x, augment=False):
        """Return class logits for a batch of waveforms.

        Args:
            x: Waveform batch passed straight to ``SpecTransform``.
            augment: Forwarded to ``SpecTransform`` to toggle spectrogram
                masking.
        """
        spec = self.spec_layer(x, augment=augment)
        # Insert the channel axis the 2-D convolutional backbone expects.
        return self.backbone(spec.unsqueeze(1))

class SpeechDataset(Dataset):
    """Map-style dataset over an in-memory array of waveforms.

    Args:
        waveforms: Indexable collection of NumPy waveforms.
        labels: Optional labels aligned with ``waveforms``; when omitted the
            dataset yields waveforms only (inference mode).
        augment: When true, each waveform is randomly time-shifted on access.
    """

    def __init__(self, waveforms, labels=None, augment=False):
        self.waveforms = waveforms
        self.labels = labels
        self.augment = augment
        self.augmentor = AudioAugmentor()

    def __len__(self):
        return len(self.waveforms)

    def __getitem__(self, idx):
        """Return ``(waveform, label)``, or just ``waveform`` without labels."""
        # Work on a copy so the backing array is never modified.
        samples = self.waveforms[idx].copy()
        if self.augment:
            samples = self.augmentor.time_shift(samples)
        tensor = torch.from_numpy(samples).float()

        if self.labels is None:
            return tensor
        return tensor, self.labels[idx]

def mixup_data(x, y, alpha=0.2):
    """Blend each example in a batch with a randomly chosen partner (mixup).

    Args:
        x: Input batch tensor; the first axis indexes examples.
        y: Target tensor aligned with ``x`` along the first axis.
        alpha: Parameter of the ``Beta(alpha, alpha)`` distribution the
            mixing weight is drawn from. With ``alpha <= 0`` the weight is
            fixed to 1, i.e. the inputs are returned unmixed.

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where
        ``mixed_x = lam * x + (1 - lam) * x[perm]``, ``y_a`` is ``y``,
        ``y_b`` is ``y[perm]`` and ``lam`` is the mixing weight.
    """
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    # Draw the permutation on the CPU, then move it to wherever the batch
    # actually lives. Following the batch's own device (rather than a
    # module-level global) avoids device-mismatch errors and keeps this
    # function self-contained.
    index = torch.randperm(x.size(0)).to(x.device)

    # Index only the first axis so batches of any rank are accepted.
    mixed_x = lam * x + (1 - lam) * x[index]
    return mixed_x, y, y[index], lam

### K-Fold Training Loop

In [None]:
# Stratified K-fold training: one freshly initialised model per fold. The
# checkpoint from the best validation epoch of each fold is written to
# 'best_model_fold_{fold}.pth' for the ensemble inference cell.
skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=42)

# Out-of-fold class predictions: every training sample is predicted by the one
# model that never saw it, taken at that fold's best validation epoch.
oof_preds = np.zeros(len(train_labels))
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(train_waveforms, train_labels)):
    print(f"\n{'='*20} Fold {fold+1}/{CFG.n_splits} {'='*20}")

    train_ds = SpeechDataset(train_waveforms[train_idx], train_labels[train_idx], augment=True)
    val_ds = SpeechDataset(train_waveforms[val_idx], train_labels[val_idx], augment=False)

    train_loader = DataLoader(train_ds, batch_size=CFG.batch_size, shuffle=True, num_workers=2)
    # shuffle=False keeps validation predictions aligned with val_idx order.
    val_loader = DataLoader(val_ds, batch_size=CFG.batch_size, shuffle=False)

    model = AudioResNet(CFG.num_classes).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    # OneCycleLR is stepped once per batch, hence steps_per_epoch * epochs total steps.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=CFG.lr*2, steps_per_epoch=len(train_loader), epochs=CFG.epochs
    )
    criterion = nn.CrossEntropyLoss(label_smoothing=CFG.label_smoothing)

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint; otherwise a fold stuck at 0.0 would leave inference without
    # a file to load.
    best_fold_acc = -1.0

    for epoch in range(1, CFG.epochs + 1):
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            # Apply mixup to roughly half of the batches.
            if random.random() < 0.5:
                x, y_a, y_b, lam = mixup_data(x, y, CFG.mixup_alpha)
                preds = model(x, augment=True)
                loss = lam * criterion(preds, y_a) + (1 - lam) * criterion(preds, y_b)
            else:
                preds = model(x, augment=True)
                loss = criterion(preds, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

        # Validate
        model.eval()
        fold_preds = []
        fold_targets = []
        with torch.no_grad():
            for x, y in val_loader:
                out = model(x.to(device))
                fold_preds.extend(out.argmax(1).cpu().numpy())
                fold_targets.extend(y.numpy())

        acc = accuracy_score(fold_targets, fold_preds)
        if acc > best_fold_acc:
            best_fold_acc = acc
            torch.save(model.state_dict(), f'best_model_fold_{fold}.pth')
            # Keep the out-of-fold predictions in sync with the saved checkpoint.
            oof_preds[val_idx] = fold_preds

        if epoch % 10 == 0 or epoch == CFG.epochs:
            print(f"Epoch {epoch} | Val Acc: {acc:.4f} | Best: {best_fold_acc:.4f}")

    fold_scores.append(best_fold_acc)

# Cross-validation summary over all folds.
print(f"\nMean best fold accuracy: {np.mean(fold_scores):.4f}")
print(f"Out-of-fold accuracy: {accuracy_score(train_labels, oof_preds.astype(int)):.4f}")

### Multi-Fold Inference (Ensemble)

In [None]:
def get_fold_probs(waveforms):
    """Average the softmax outputs of every fold model over ``waveforms``.

    For each of the ``CFG.n_splits`` folds, the checkpoint
    ``best_model_fold_{fold}.pth`` is loaded into a fresh ``AudioResNet`` and
    run in eval mode, without augmentation, over the whole input.

    Args:
        waveforms: Array of waveforms accepted by ``SpeechDataset``.

    Returns:
        NumPy array of class probabilities averaged across folds, one row per
        waveform.
    """
    all_probs = []
    ds = SpeechDataset(waveforms, augment=False)
    loader = DataLoader(ds, batch_size=CFG.batch_size, shuffle=False)

    for fold in range(CFG.n_splits):
        print(f"Predicting with Fold {fold+1}...")
        model = AudioResNet(CFG.num_classes).to(device)
        # map_location lets checkpoints saved on a GPU be restored on whatever
        # device this session runs on (e.g. a CPU-only inference runtime).
        state_dict = torch.load(f'best_model_fold_{fold}.pth', map_location=device)
        model.load_state_dict(state_dict)
        model.eval()

        fold_prob = []
        with torch.no_grad():
            for x in tqdm(loader):
                out = model(x.to(device))
                probs = F.softmax(out, dim=1)
                fold_prob.append(probs.cpu().numpy())
        all_probs.append(np.concatenate(fold_prob))

    # Soft voting: mean over the fold axis.
    return np.mean(all_probs, axis=0)

# Predict both test splits with the fold ensemble and write the submission.
print("Loading test sets...")
public_test = np.load(f'{CFG.data_dir}/public_test_waveforms.npy')
private_test = np.load(f'{CFG.data_dir}/private_test_waveforms.npy')

print("Ensembling 5 models for Public and Private test sets...")
public_probs = get_fold_probs(public_test)
private_probs = get_fold_probs(private_test)

# Most likely class per sample, public rows first and private rows after them.
final_indices = np.concatenate(
    [probs.argmax(1) for probs in (public_probs, private_probs)]
)

# Translate class indices back to command names.
all_commands = [label_map[class_idx] for class_idx in final_indices]

submission = pd.DataFrame(
    {
        'Id': range(len(all_commands)),
        'Command': all_commands,
    }
)
submission.to_csv('submission.csv', index=False)
print("Submission with K-Fold Ensemble saved!")