# The Last Frequency: ResNet-34 5-Fold Solution

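This notebook trains a deeper ResNet-34 architecture on log-mel spectrograms to capture more complex audio features, using 5-fold stratified cross-validation and averaging the fold models' predictions for the final submission.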

In [None]:
import os, json, random, numpy as np, pandas as pd, torch, torch.nn as nn, torch.nn.functional as F, torchaudio, torchvision.models as models
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus the current CUDA device), exports ``PYTHONHASHSEED`` and puts
    cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python-level sources of randomness.
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    # PyTorch generators.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Prefer reproducible cuDNN kernels over auto-tuned ones.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fix the RNG seeds before any model or loader is created, then pick the
# compute device: CUDA when available, otherwise CPU.
seed_everything(42)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class CFG:
    """Static configuration namespace: paths, spectrogram and training settings."""

    # Directory the .npy arrays and label_map.json are read from.
    data_dir = '/kaggle/input/the-last-frequency'

    # Mel-spectrogram front end.
    sample_rate = 16000
    n_fft = 1024
    hop_length = 256
    n_mels = 128
    target_frames = 64  # spectrograms are cropped/padded to this many time frames

    # Cross-validation and optimisation.
    n_splits = 5
    batch_size = 64
    epochs = 35
    lr = 1e-3
    weight_decay = 1e-2
    label_smoothing = 0.1
    mixup_alpha = 0.2  # NOTE(review): not referenced by the code visible here

    # Size of the classifier output.
    num_classes = 35

# Training arrays stored as .npy files under CFG.data_dir.
train_waveforms = np.load(f'{CFG.data_dir}/train_waveforms.npy')
train_labels = np.load(f'{CFG.data_dir}/train_labels.npy')

# JSON object keys are always strings, so convert them back to int so the
# mapping can be indexed by predicted class id.
with open(f'{CFG.data_dir}/label_map.json') as f:
    label_map = {int(key): name for key, name in json.load(f).items()}

class SpecTransform(nn.Module):
    """Waveform -> fixed-width, decibel-scaled mel spectrogram, with optional masking.

    The spectrogram parameters and the target number of time frames are read
    from ``CFG``.
    """

    def __init__(self):
        super().__init__()
        self.mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=CFG.sample_rate,
            n_fft=CFG.n_fft,
            hop_length=CFG.hop_length,
            n_mels=CFG.n_mels,
        )
        self.amp_to_db = torchaudio.transforms.AmplitudeToDB()
        # SpecAugment-style masks: up to 20 mel bins and up to 15 time frames.
        self.f_mask = torchaudio.transforms.FrequencyMasking(20)
        self.t_mask = torchaudio.transforms.TimeMasking(15)

    def forward(self, x, augment=False):
        """Compute the spectrogram of ``x`` and force its last axis to ``CFG.target_frames``.

        Args:
            x: Waveform tensor (presumably ``(batch, samples)`` — confirm against caller).
            augment: When true, apply frequency masking followed by time masking.

        Returns:
            The spectrogram tensor, cropped or zero-padded on the right along
            its last (time) axis.
        """
        spec = self.amp_to_db(self.mel_spec(x))

        # Positive -> too short (pad), negative -> too long (crop).
        missing = CFG.target_frames - spec.shape[-1]
        if missing < 0:
            spec = spec[..., :CFG.target_frames]
        elif missing > 0:
            spec = F.pad(spec, (0, missing))

        if augment:
            spec = self.f_mask(spec)
            spec = self.t_mask(spec)
        return spec

class AudioResNet34(nn.Module):
    """ResNet-34 classifier that consumes raw waveforms.

    A ``SpecTransform`` front end turns the waveform into a spectrogram, which
    is fed as a single-channel image to a randomly initialised torchvision
    ResNet-34 whose first convolution and classification head are replaced.
    """

    def __init__(self, num_classes=35):
        """Build the network.

        Args:
            num_classes: Number of output logits.
        """
        super().__init__()
        net = models.resnet34(weights=None)
        # Accept one input channel instead of the three RGB channels.
        net.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Swap the stock head for dropout + a linear layer sized to our classes.
        feature_dim = net.fc.in_features
        net.fc = nn.Sequential(nn.Dropout(0.3), nn.Linear(feature_dim, num_classes))
        self.backbone = net
        self.spec_layer = SpecTransform()

    def forward(self, x, augment=False):
        """Return class logits for waveform batch ``x``.

        Args:
            x: Waveform tensor passed straight to the spectrogram layer.
            augment: Forwarded to the spectrogram layer to toggle masking.
        """
        spec = self.spec_layer(x, augment=augment)
        # Insert the channel axis the 2-D convolutional backbone expects.
        return self.backbone(spec.unsqueeze(1))

class SpeechDataset(Dataset):
    """Dataset over an indexable collection of waveforms, optionally labelled.

    Args:
        waveforms: Indexable collection of NumPy waveform arrays.
        labels: Optional indexable collection of targets; when omitted,
            items are returned without a label.
        augment: When true, each item is circularly shifted by a random
            offset of at most 10% of its length.
    """

    def __init__(self, waveforms, labels=None, augment=False):
        self.waveforms = waveforms
        self.labels = labels
        self.augment = augment

    def __len__(self):
        return len(self.waveforms)

    def __getitem__(self, idx):
        """Return ``(waveform, label)`` if labels were given, else just the waveform."""
        # Work on a copy so the backing array is never modified.
        samples = self.waveforms[idx].copy()
        if self.augment:
            shift = int(random.uniform(-0.1, 0.1) * samples.shape[0])
            samples = np.roll(samples, shift)
        tensor = torch.from_numpy(samples).float()
        if self.labels is None:
            return tensor
        return tensor, self.labels[idx]

# Stratified k-fold cross-validation: a fresh model is trained on each split and
# the weights from the epoch with the highest validation accuracy are saved.
skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(skf.split(train_waveforms, train_labels)):
    print(f"Fold {fold+1}")

    train_loader = DataLoader(
        SpeechDataset(train_waveforms[train_idx], train_labels[train_idx], augment=True),
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=2,
    )
    val_loader = DataLoader(
        SpeechDataset(train_waveforms[val_idx], train_labels[val_idx], augment=False),
        batch_size=CFG.batch_size,
        shuffle=False,
    )

    model = AudioResNet34(CFG.num_classes).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    # The one-cycle schedule is stepped once per batch, so it needs the total step count.
    sched = torch.optim.lr_scheduler.OneCycleLR(
        opt,
        max_lr=CFG.lr*2,
        steps_per_epoch=len(train_loader),
        epochs=CFG.epochs,
    )
    crit = nn.CrossEntropyLoss(label_smoothing=CFG.label_smoothing)

    best_acc = 0
    for epoch in range(1, CFG.epochs + 1):
        # ---- training pass ----
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            logits = model(x, augment=True)
            loss = crit(logits, y)
            opt.zero_grad()
            loss.backward()
            opt.step()
            sched.step()

        # ---- validation pass ----
        model.eval()
        val_preds, val_targets = [], []
        with torch.no_grad():
            for x, y in val_loader:
                batch_pred = model(x.to(device)).argmax(1)
                val_preds.extend(batch_pred.cpu().numpy())
                val_targets.extend(y.numpy())
        acc = accuracy_score(val_targets, val_preds)

        # Keep only the best-scoring weights for this fold.
        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), f'resnet34_fold_{fold}.pth')
        if epoch % 10 == 0:
            print(f"Epoch {epoch} Val Acc: {acc:.4f}")

def get_probs(wavs):
    """Return class probabilities for ``wavs`` averaged over all fold models.

    For each of the ``CFG.n_splits`` folds, the checkpoint
    ``resnet34_fold_{fold}.pth`` is loaded from the working directory into a
    fresh ``AudioResNet34``, run in eval mode over ``wavs`` without
    augmentation, and its softmax outputs are collected. The per-fold results
    are then averaged element-wise.

    Args:
        wavs: Indexable collection of NumPy waveforms accepted by ``SpeechDataset``.

    Returns:
        A NumPy array with one row per waveform (in input order) holding the
        fold-averaged softmax probabilities.

    Raises:
        FileNotFoundError: If a fold checkpoint file is missing.
    """
    loader = DataLoader(
        SpeechDataset(wavs, augment=False),
        batch_size=CFG.batch_size,
        shuffle=False,  # keep predictions aligned with the input order
    )
    fold_probs = []
    for fold_idx in range(CFG.n_splits):
        net = AudioResNet34(CFG.num_classes).to(device)
        # map_location remaps the stored tensors onto the active device, so
        # checkpoints written from a CUDA model still load on a CPU-only machine.
        state = torch.load(f'resnet34_fold_{fold_idx}.pth', map_location=device)
        net.load_state_dict(state)
        net.eval()

        batch_probs = []
        with torch.no_grad():
            for x in tqdm(loader):
                logits = net(x.to(device))
                batch_probs.append(F.softmax(logits, dim=1).cpu().numpy())
        fold_probs.append(np.concatenate(batch_probs))
    return np.mean(fold_probs, axis=0)

# Load both test splits, predict each with the fold ensemble, and write the
# submission with the public predictions first, followed by the private ones.
pub = np.load(f'{CFG.data_dir}/public_test_waveforms.npy')
priv = np.load(f'{CFG.data_dir}/private_test_waveforms.npy')

pred_ids = np.concatenate([get_probs(pub).argmax(1), get_probs(priv).argmax(1)])
final_cmds = [label_map[i] for i in pred_ids]

submission = pd.DataFrame({'Id': range(len(final_cmds)), 'Command': final_cmds})
submission.to_csv('resnet34_submission.csv', index=False)