In [1]:
import os
import torch
import torchaudio
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split

In [2]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

In [3]:
import kagglehub
path = kagglehub.dataset_download("chrisfilo/urbansound8k")
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'urbansound8k' dataset.
Path to dataset files: /kaggle/input/urbansound8k


In [4]:
csv_path = '/kaggle/input/urbansound8k/UrbanSound8K.csv'
data_path = '/kaggle/input/urbansound8k'
df = pd.read_csv(csv_path)

In [5]:
SAMPLE_RATE = 22050
N_MELS = 64
MAX_LEN = 500
BATCH_SIZE = 32
LR = 0.0005

In [6]:
df = pd.read_csv(csv_path)
df.head(3)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing


In [7]:
labels = df['class'].unique()
print(labels)

['dog_bark' 'children_playing' 'car_horn' 'air_conditioner' 'street_music'
 'gun_shot' 'siren' 'engine_idling' 'jackhammer' 'drilling']


In [8]:
df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
dog_bark,1000
children_playing,1000
air_conditioner,1000
street_music,1000
jackhammer,1000
engine_idling,1000
drilling,1000
siren,929
car_horn,429
gun_shot,374


In [9]:
len(labels)

10

In [10]:
transform = torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE, n_mels=N_MELS)
labels = df['class'].unique()
label_to_index = {lab: ind for ind, lab in enumerate(labels)}
print(label_to_index)

{'dog_bark': 0, 'children_playing': 1, 'car_horn': 2, 'air_conditioner': 3, 'street_music': 4, 'gun_shot': 5, 'siren': 6, 'engine_idling': 7, 'jackhammer': 8, 'drilling': 9}


In [11]:
class UrbanSound(Dataset):
    def __init__(self, csv_file, audio_dir, transform, max_len):
        self.df = pd.read_csv(csv_file)
        self.audio_dir = audio_dir
        self.transform = transform
        self.max_len = max_len
        self.resampler = torchaudio.transforms.Resample(orig_freq=44100, new_freq=SAMPLE_RATE)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        file_path = os.path.join(self.audio_dir, f"fold{row['fold']}", row['slice_file_name'])

        waveform, sr = torchaudio.load(file_path)

        # Моно + ресемплинг
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        if sr != SAMPLE_RATE:
            waveform = self.resampler(waveform)

        # Спектрограмма + паддинг
        spec = self.transform(waveform)
        spec = spec[..., :self.max_len] if spec.shape[-1] > self.max_len else F.pad(spec, (0, self.max_len - spec.shape[-1]))

        return spec, int(row['classID'])

In [12]:
dataset = UrbanSound(csv_path, data_path, transform, MAX_LEN)
train_data, test_data = random_split(dataset, [int(len(dataset) * 0.8), len(dataset) - int(len(dataset) * 0.8)],
                                     generator=torch.Generator().manual_seed(42))
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)

In [13]:
class UrbanAudio(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.AdaptiveAvgPool2d((8, 8))
        )
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 8 * 8, 128), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        # Убрал unsqueeze - спектрограмма уже [batch, 1, H, W]
        return self.fc(self.conv(x))

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = UrbanAudio().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [16]:
for epoch in range(30):
    model.train()
    total_loss = 0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Эпоха {epoch + 1}, Loss: {total_loss:.2f}")

Эпоха 1, Loss: 343.16
Эпоха 2, Loss: 281.94
Эпоха 3, Loss: 250.68
Эпоха 4, Loss: 226.62
Эпоха 5, Loss: 208.45
Эпоха 6, Loss: 188.50
Эпоха 7, Loss: 202.15
Эпоха 8, Loss: 172.18
Эпоха 9, Loss: 150.71
Эпоха 10, Loss: 149.32
Эпоха 11, Loss: 136.80
Эпоха 12, Loss: 135.82
Эпоха 13, Loss: 121.51
Эпоха 14, Loss: 105.30
Эпоха 15, Loss: 111.86
Эпоха 16, Loss: 110.70
Эпоха 17, Loss: 102.21
Эпоха 18, Loss: 82.42
Эпоха 19, Loss: 84.92
Эпоха 20, Loss: 82.24
Эпоха 21, Loss: 70.13
Эпоха 22, Loss: 72.66
Эпоха 23, Loss: 70.71
Эпоха 24, Loss: 61.29
Эпоха 25, Loss: 59.83
Эпоха 26, Loss: 58.46
Эпоха 27, Loss: 63.23
Эпоха 28, Loss: 70.70
Эпоха 29, Loss: 65.82
Эпоха 30, Loss: 48.84


In [17]:
model.eval()
correct = total = 0
with torch.no_grad():
    for x, y in test_loader:
        x, y = x.to(device), y.to(device)
        correct += (model(x).argmax(1) == y).sum().item()
        total += y.size(0)

print(f'Точность: {100 * correct / total:.2f}%')

Точность: 85.75%


In [18]:
torch.save(model.state_dict(), 'urban_model.pth')
torch.save(labels,'urban_labels.pth')