In [235]:
import torch
import torch.nn as nn
from torchaudio import datasets, transforms, info, load
from torch.utils.data import DataLoader, Dataset, random_split
import torch.optim as optim
import torch.nn.functional as F
import os
from torch.utils.data import Dataset
import torchaudio
import pandas as pd

In [236]:
import kagglehub

path = kagglehub.dataset_download("mmoreaux/environmental-sound-classification-50")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'environmental-sound-classification-50' dataset.
Path to dataset files: /kaggle/input/environmental-sound-classification-50


In [237]:
audio_dir = '/kaggle/input/environmental-sound-classification-50/audio/audio'

In [238]:
csv_path = '/kaggle/input/environmental-sound-classification-50/esc50.csv'

In [239]:
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [240]:
classes = sorted(df['category'].unique())
classes

['airplane',
 'breathing',
 'brushing_teeth',
 'can_opening',
 'car_horn',
 'cat',
 'chainsaw',
 'chirping_birds',
 'church_bells',
 'clapping',
 'clock_alarm',
 'clock_tick',
 'coughing',
 'cow',
 'crackling_fire',
 'crickets',
 'crow',
 'crying_baby',
 'dog',
 'door_wood_creaks',
 'door_wood_knock',
 'drinking_sipping',
 'engine',
 'fireworks',
 'footsteps',
 'frog',
 'glass_breaking',
 'hand_saw',
 'helicopter',
 'hen',
 'insects',
 'keyboard_typing',
 'laughing',
 'mouse_click',
 'pig',
 'pouring_water',
 'rain',
 'rooster',
 'sea_waves',
 'sheep',
 'siren',
 'sneezing',
 'snoring',
 'thunderstorm',
 'toilet_flush',
 'train',
 'vacuum_cleaner',
 'washing_machine',
 'water_drops',
 'wind']

In [241]:
len(classes)

50

In [242]:
label_to_index = {lab: ind for ind, lab in enumerate(classes)}
label_to_index

{'airplane': 0,
 'breathing': 1,
 'brushing_teeth': 2,
 'can_opening': 3,
 'car_horn': 4,
 'cat': 5,
 'chainsaw': 6,
 'chirping_birds': 7,
 'church_bells': 8,
 'clapping': 9,
 'clock_alarm': 10,
 'clock_tick': 11,
 'coughing': 12,
 'cow': 13,
 'crackling_fire': 14,
 'crickets': 15,
 'crow': 16,
 'crying_baby': 17,
 'dog': 18,
 'door_wood_creaks': 19,
 'door_wood_knock': 20,
 'drinking_sipping': 21,
 'engine': 22,
 'fireworks': 23,
 'footsteps': 24,
 'frog': 25,
 'glass_breaking': 26,
 'hand_saw': 27,
 'helicopter': 28,
 'hen': 29,
 'insects': 30,
 'keyboard_typing': 31,
 'laughing': 32,
 'mouse_click': 33,
 'pig': 34,
 'pouring_water': 35,
 'rain': 36,
 'rooster': 37,
 'sea_waves': 38,
 'sheep': 39,
 'siren': 40,
 'sneezing': 41,
 'snoring': 42,
 'thunderstorm': 43,
 'toilet_flush': 44,
 'train': 45,
 'vacuum_cleaner': 46,
 'washing_machine': 47,
 'water_drops': 48,
 'wind': 49}

In [243]:
transform = transforms.MelSpectrogram(
    sample_rate = 16000,
    n_mels = 64
)

In [244]:
max_len = 500

In [245]:
class ESC(Dataset):
  def __init__(self, df, root_path, transform, max_len, label_to_index):
    self.df = df
    self.root_path = root_path
    self.transform = transform
    self.max_len = max_len
    self.label_to_index = label_to_index
    self.audios = []

    for index, row in self.df.iterrows():
        file_path = os.path.join(self.root_path, row['filename'])
        genre = row['category']
        try:
            info(file_path)
            self.audios.append((file_path, genre))
        except Exception as e:
            print(f'Error loading {file_path}: {e}')

  def __len__(self):
      return len(self.audios)

  def __getitem__(self, ind):
      file_path, genre = self.audios[ind]
      waveform, sr = load(file_path)

      if sr != 16000:
          resample = transforms.Resample(orig_freq=sr, new_freq=16000)
          waveform = resample(waveform)

      spec = self.transform(waveform).squeeze(0)

      if spec.shape[1] > self.max_len:
        spec = spec[:, :self.max_len]

      if spec.shape[1] < self.max_len:
        count_len = self.max_len - spec.shape[1]
        spec = F.pad(spec, (0, count_len))

      return spec, self.label_to_index[genre]


In [246]:
dataset = ESC(df, audio_dir, transform, max_len, label_to_index)
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size

train_data, test_data = random_split(dataset, [train_size, test_size], generator=torch.Generator().manual_seed(42))


  info(file_path)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  return AudioMetaData(


In [247]:
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, )

In [248]:
device =  torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [249]:
class CheckAudio(nn.Module):
    def __init__(self, num_classes=10):
        super(CheckAudio, self).__init__()
        self.first = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.AdaptiveAvgPool2d((8, 8))
        )

        self.flatten = nn.Flatten()

        self.second = nn.Sequential(
            nn.Linear(64 * 8 * 8, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.first(x)
        x = self.flatten(x)
        x = self.second(x)
        return x

In [250]:
model = CheckAudio(num_classes=50).to(device)

In [251]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [254]:
for epoch in range(20):
  model.train()
  total_loss = 0

  for x_batch, y_batch in train_loader:
    x_batch, y_batch = x_batch.to(device), y_batch.to(device)

    y_pred = model(x_batch)
    loss = loss_fn(y_pred, y_batch)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
  print(f'Эпоха {epoch+1}, Потери: {total_loss:.4f}')

Эпоха 1, Потери: 68.7413
Эпоха 2, Потери: 58.8688
Эпоха 3, Потери: 61.5692
Эпоха 4, Потери: 73.7808
Эпоха 5, Потери: 52.3212
Эпоха 6, Потери: 55.2192
Эпоха 7, Потери: 42.7394
Эпоха 8, Потери: 32.9628
Эпоха 9, Потери: 29.8568
Эпоха 10, Потери: 27.1869
Эпоха 11, Потери: 35.4731
Эпоха 12, Потери: 39.5087
Эпоха 13, Потери: 36.7754
Эпоха 14, Потери: 28.1819
Эпоха 15, Потери: 23.4266
Эпоха 16, Потери: 33.0924
Эпоха 17, Потери: 38.6911
Эпоха 18, Потери: 38.1446
Эпоха 19, Потери: 22.4719
Эпоха 20, Потери: 16.6690


In [255]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        y_pred = model(x_batch)
        pred = torch.argmax(y_pred, dim=1)

        total += y_batch.size(0)
        correct += (pred == y_batch).sum().item()

accuracy = correct * 100 / total
print(f'точность модели : {accuracy :.2f}%')

точность модели : 42.50%


In [256]:
torch.save(model.state_dict(), 'model_esc.pth')
torch.save(classes,'labels_esc.pth')