# The Last Frequency: Audio SOTA K-Fold Solution

This notebook implements a high-performance audio classification pipeline built on **Delta & Delta-Delta features**: the single-channel (grayscale) log-mel spectrogram is extended into a 3-channel (RGB-like) representation that a ResNet can consume directly.

### SOTA Features:
1. **Triple-Channel Input**: [Log-Mel, Delta, Delta-Delta] capture static and dynamic sound patterns.
2. **5-Fold Stratified CV**: Ensemble of 5 models trained from scratch.
3. **Heavy Augmentations**: TimeShift, Mixup, and SpecAugment (Frequency/Time masking).
4. **Optimization**: AdamW + OneCycleLR + Label Smoothing.

In [None]:
import os, json, random, numpy as np, pandas as pd, torch, torch.nn as nn, torch.nn.functional as F, torchaudio, torchvision.models as models
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and the current CUDA
    device), exports ``PYTHONHASHSEED``, and switches cuDNN to its
    deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python, NumPy and PyTorch each keep their own generator state.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Trade cuDNN autotuning for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Fix all RNG seeds first, then pick the GPU when one is available.
seed_everything(42)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')

In [None]:
class CFG:
    """Static configuration shared by the cells of this notebook."""

    # Directory holding the .npy arrays and label_map.json.
    data_dir = '/kaggle/input/the-last-frequency'

    # Mel-spectrogram front end.
    sample_rate = 16000
    n_fft = 1024
    hop_length = 256
    n_mels = 128
    target_frames = 64  # spectrograms are cropped/padded to this many frames

    # Cross-validation and optimisation.
    n_splits = 5
    batch_size = 64
    epochs = 40
    lr = 1e-3
    weight_decay = 1e-2
    label_smoothing = 0.1
    mixup_alpha = 0.2

    # Size of the classifier output layer.
    num_classes = 35

In [None]:
# Read the training arrays and the label map from CFG.data_dir.
print("Loading data...")
train_waveforms = np.load(f'{CFG.data_dir}/train_waveforms.npy')
train_labels = np.load(f'{CFG.data_dir}/train_labels.npy')
with open(f'{CFG.data_dir}/label_map.json') as f:
    raw_label_map = json.load(f)
# JSON object keys are always strings; convert them to int keys.
label_map = {int(key): name for key, name in raw_label_map.items()}
print(f'Data loaded: {train_waveforms.shape}')

In [None]:
class SpecTransform(nn.Module):
    """Turn raw waveforms into a 3-channel [log-mel, delta, delta-delta] tensor.

    The pipeline is: mel spectrogram -> dB scale -> crop/zero-pad the time
    axis to ``CFG.target_frames`` -> optional SpecAugment masking -> delta
    and delta-delta computation -> channel stacking.
    """

    def __init__(self):
        super().__init__()
        self.mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=CFG.sample_rate, n_fft=CFG.n_fft,
            hop_length=CFG.hop_length, n_mels=CFG.n_mels
        )
        self.amp_to_db = torchaudio.transforms.AmplitudeToDB()
        # iid_masks=True draws an independent mask per example. With the
        # default (False) a single mask is shared by the whole batch, which
        # weakens the augmentation considerably.
        self.freq_mask = torchaudio.transforms.FrequencyMasking(25, iid_masks=True)
        self.time_mask = torchaudio.transforms.TimeMasking(20, iid_masks=True)

    def forward(self, x, augment=False):
        """Compute the stacked feature tensor for a batch of waveforms.

        Args:
            x: Batched waveforms; assumed to be (batch, samples), since the
                result is stacked along dim 1 as the channel axis.
            augment: When True, apply frequency and time masking before the
                deltas are computed.

        Returns:
            Tensor of shape (batch, 3, n_mels, CFG.target_frames).
        """
        # 1. Base mel spectrogram in dB: (batch, n_mels, time)
        spec = self.amp_to_db(self.mel_spec(x))

        # 2. Force a fixed number of frames: crop the tail or right-pad with zeros.
        n_frames = spec.shape[-1]
        if n_frames > CFG.target_frames:
            spec = spec[..., :CFG.target_frames]
        elif n_frames < CFG.target_frames:
            spec = F.pad(spec, (0, CFG.target_frames - n_frames))

        if augment:
            # Per-example masking expects a 4-D (batch, channel, freq, time)
            # input, so add a singleton channel axis and drop it afterwards.
            spec = spec.unsqueeze(1)
            spec = self.freq_mask(spec)
            spec = self.time_mask(spec)
            spec = spec.squeeze(1)

        # 3. Dynamic features, computed along the time axis of the (possibly masked) spectrogram.
        delta = torchaudio.functional.compute_deltas(spec)
        delta2 = torchaudio.functional.compute_deltas(delta)

        # 4. Stack into (batch, 3, n_mels, time)
        return torch.stack([spec, delta, delta2], dim=1)

class AudioResNet(nn.Module):
    """ResNet-18 classifier that takes raw waveforms as input.

    Waveforms are first converted by ``SpecTransform`` into a 3-channel
    feature tensor, which matches the 3-channel input of the stock ResNet
    stem, and are then classified by a randomly initialised ResNet-18 whose
    final layer is replaced by dropout plus a linear layer.
    """

    def __init__(self, num_classes=35):
        super().__init__()
        resnet = models.resnet18(weights=None)
        # Replace the stock classifier head with Dropout(0.3) + Linear.
        in_features = resnet.fc.in_features
        resnet.fc = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(in_features, num_classes),
        )
        self.backbone = resnet
        self.spec_layer = SpecTransform()

    def forward(self, x, augment=False):
        """Return class logits for ``x``; ``augment`` is forwarded to the feature extractor."""
        features = self.spec_layer(x, augment=augment)
        logits = self.backbone(features)
        return logits

In [None]:
class SpeechDataset(Dataset):
    """Dataset over an indexable collection of waveforms with optional labels.

    Args:
        waveforms: Indexable collection of NumPy waveforms.
        labels: Optional labels aligned with ``waveforms``; when omitted,
            items are returned without a label.
        augment: When True, each waveform is circularly shifted by a random
            offset of up to 10% of its length in either direction.
    """

    def __init__(self, waveforms, labels=None, augment=False):
        self.waveforms = waveforms
        self.labels = labels
        self.augment = augment

    def __len__(self):
        return len(self.waveforms)

    def __getitem__(self, idx):
        # Work on a copy so the backing array is never modified.
        samples = self.waveforms[idx].copy()
        if self.augment:
            shift = int(random.uniform(-0.1, 0.1) * samples.shape[0])
            samples = np.roll(samples, shift)
        tensor = torch.from_numpy(samples).float()

        if self.labels is None:
            return tensor
        return (tensor, self.labels[idx])

def mixup_data(x, y, alpha=0.2):
    """Blend each example in a batch with another, randomly chosen example.

    Args:
        x: Batch tensor whose first dimension indexes examples.
        y: Targets aligned with ``x`` along the first dimension.
        alpha: Beta-distribution parameter; the mixing weight is drawn from
            ``Beta(alpha, alpha)``. When ``alpha <= 0`` the weight is fixed at
            1, so the batch is returned unmixed.

    Returns:
        Tuple ``(mixed_x, y_a, y_b, lam)`` where
        ``mixed_x = lam * x + (1 - lam) * x[perm]``, ``y_a`` is ``y``,
        ``y_b`` is ``y[perm]`` and ``lam`` is the mixing weight.
    """
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1
    # Put the permutation on the input's own device rather than relying on a
    # module-level `device` global.
    index = torch.randperm(x.size(0)).to(x.device)
    # Index only the batch dimension so inputs of any rank are accepted.
    mixed_x = lam * x + (1 - lam) * x[index]
    return mixed_x, y, y[index], lam

In [None]:
# Stratified K-fold training: one freshly initialised model per fold, keeping
# the checkpoint with the best validation accuracy for each.
skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(skf.split(train_waveforms, train_labels)):
    print(f"\nFold {fold+1}/{CFG.n_splits}")

    # Waveform-level augmentation is enabled for the training split only.
    train_ds = SpeechDataset(train_waveforms[train_idx], train_labels[train_idx], augment=True)
    val_ds = SpeechDataset(train_waveforms[val_idx], train_labels[val_idx], augment=False)
    train_loader = DataLoader(train_ds, batch_size=CFG.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=CFG.batch_size, shuffle=False)

    model = AudioResNet(CFG.num_classes).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    # The scheduler is stepped once per batch, hence steps_per_epoch.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=CFG.lr*2,
        steps_per_epoch=len(train_loader),
        epochs=CFG.epochs,
    )
    criterion = nn.CrossEntropyLoss(label_smoothing=CFG.label_smoothing)

    best_acc = 0
    for epoch in range(1, CFG.epochs + 1):
        # ---- training pass ----
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            if random.random() < 0.5:
                # Mixup on roughly half of the batches; the loss is the
                # lam-weighted sum over both label sets.
                x, y_a, y_b, lam = mixup_data(x, y, CFG.mixup_alpha)
                preds = model(x, augment=True)
                loss = lam * criterion(preds, y_a) + (1 - lam) * criterion(preds, y_b)
            else:
                preds = model(x, augment=True)
                loss = criterion(preds, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

        # ---- validation pass (no augmentation, no gradients) ----
        model.eval()
        val_preds, val_targets = [], []
        with torch.no_grad():
            for x, y in val_loader:
                out = model(x.to(device))
                val_preds.extend(out.argmax(1).cpu().numpy())
                val_targets.extend(y.numpy())

        acc = accuracy_score(val_targets, val_preds)
        if acc > best_acc:
            # Checkpoint only when validation accuracy improves.
            best_acc = acc
            torch.save(model.state_dict(), f'best_model_fold_{fold}.pth')
        if epoch % 10 == 0:
            print(f"Epoch {epoch} Val Acc: {acc:.4f} (Best: {best_acc:.4f})")

In [None]:
def get_fold_probs(waveforms):
    """Average the softmax predictions of all fold checkpoints.

    For each fold in ``range(CFG.n_splits)`` the checkpoint
    ``best_model_fold_{fold}.pth`` is loaded into a fresh ``AudioResNet``
    and run over ``waveforms`` in evaluation mode without augmentation.

    Args:
        waveforms: Indexable collection of NumPy waveforms to score.

    Returns:
        NumPy array holding, per input example, the class probabilities
        averaged over the folds.
    """
    loader = DataLoader(SpeechDataset(waveforms, augment=False), batch_size=CFG.batch_size, shuffle=False)
    all_probs = []
    for fold in range(CFG.n_splits):
        m = AudioResNet(CFG.num_classes).to(device)
        # map_location lets checkpoints saved on GPU load on a CPU-only
        # runtime (and vice versa) instead of raising at deserialisation.
        state = torch.load(f'best_model_fold_{fold}.pth', map_location=device)
        m.load_state_dict(state)
        m.eval()
        fold_prob = []
        with torch.no_grad():
            for x in tqdm(loader):
                probs = F.softmax(m(x.to(device)), dim=1)
                fold_prob.append(probs.cpu().numpy())
        all_probs.append(np.concatenate(fold_prob))
    # Mean over the fold axis gives the ensemble prediction.
    return np.mean(all_probs, axis=0)

# Score both test splits with the fold ensemble and write the submission file.
pub = np.load(f'{CFG.data_dir}/public_test_waveforms.npy')
priv = np.load(f'{CFG.data_dir}/private_test_waveforms.npy')

# Public predictions come first, followed by the private ones.
final_probs = np.concatenate([get_fold_probs(pub), get_fold_probs(priv)])

# Map each arg-max class index to its command name.
final_cmds = [label_map[class_idx] for class_idx in final_probs.argmax(1)]

submission = pd.DataFrame({'Id': range(len(final_cmds)), 'Command': final_cmds})
submission.to_csv('submission.csv', index=False)
print("Final submission saved with Delta features!")