# Implementation: Environmental Sound Classification

**Goal**: Classify environmental sounds from the ESC-50 dataset (50 classes) by feeding spectrograms to a small 2D CNN.

In [None]:
import torch
import torch.nn as nn

# 1. 2D CNN (Standard Image Model)
class AudioCNN(nn.Module):
    """Minimal 2D CNN that classifies a spectrogram treated as an image.

    The network is a single unpadded 3x3 convolution, global average
    pooling over the frequency and time axes, and a linear layer that
    emits one score per class. Because the pooling is adaptive, the model
    accepts any spectrogram whose height and width are at least 3.

    NOTE: there is no activation between the layers, so the model is
    affine end to end; it is a structural baseline, not a tuned network.

    Args:
        num_classes: Number of output classes (defaults to 50).
        in_channels: Number of input channels (defaults to 1, i.e. a
            single spectrogram per example).
    """

    def __init__(self, num_classes: int = 50, in_channels: int = 1) -> None:
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=3)
        # Collapse whatever spatial size remains to 1x1 so the classifier
        # input width is always the 16 convolution channels.
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(16, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw class scores (logits, not probabilities).

        Args:
            x: Input of shape [Batch, Channel, Freq, Time].

        Returns:
            Tensor of shape [Batch, num_classes].
        """
        x = self.conv1(x)
        x = self.pool(x)
        # flatten(start_dim=1) instead of view(batch, -1): with an empty
        # batch the -1 dimension is ambiguous and view() raises, whereas
        # flatten yields the expected [0, 16] tensor.
        x = x.flatten(start_dim=1)
        return self.fc(x)

# 2. Input (Spectrogram)
# Random stand-in for a real spectrogram, laid out as [Batch, Channel, Freq, Time].
spectrogram = torch.randn(1, 1, 64, 100)  # Batch 1, 64 Mel-bands, 100 time frames

# 3. Predict
model = AudioCNN()
model.eval()  # put the module in inference mode before predicting
with torch.no_grad():  # a forward-only pass does not need autograd bookkeeping
    out = model(spectrogram)  # raw class scores (logits)

# The model returns logits; softmax over the class axis converts them into
# the probabilities that the message below refers to.
probs = torch.softmax(out, dim=1)
print(f"Output Probabilities: {probs.shape}")

## Conclusion
Treating a spectrogram as a single-channel image and classifying it with a 2D CNN is the standard baseline approach.