In [None]:
import torch
import torchaudio
import os
import pandas as pd
import librosa
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchaudio.transforms import MelSpectrogram
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile

In [None]:
from google.colab import files
# Upload the Kaggle API token through the Colab file picker. The call returns
# a dict that maps each uploaded file name to its raw bytes.
uploaded = files.upload()
if len(uploaded) != 1:
    raise ValueError(f"Expected exactly one uploaded file (kaggle.json), got {len(uploaded)}")

# Install the token from the uploaded bytes rather than copying ./kaggle.json:
# Colab stores a re-uploaded file under a new name such as "kaggle (1).json",
# so a fixed-name copy would pick up a stale token or fail outright.
kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json_path = os.path.join(kaggle_dir, 'kaggle.json')
with open(kaggle_json_path, 'wb') as token_file:
    token_file.write(next(iter(uploaded.values())))
# Restrict the token to its owner (equivalent to `chmod 600`).
os.chmod(kaggle_json_path, 0o600)

Saving kaggle (1).json to kaggle (1).json


In [None]:
# Fetch the slenser/data-audio-mnist dataset with the Kaggle CLI; the resulting
# data-audio-mnist.zip is unpacked later in the notebook as the training set.
!kaggle datasets download -d slenser/data-audio-mnist

Dataset URL: https://www.kaggle.com/datasets/slenser/data-audio-mnist
License(s): unknown
Downloading data-audio-mnist.zip to /content
 99% 700M/707M [00:07<00:00, 133MB/s]
100% 707M/707M [00:07<00:00, 96.0MB/s]


In [None]:
# Fetch the slenser/audio-mnist-test dataset with the Kaggle CLI; the resulting
# audio-mnist-test.zip is unpacked later in the notebook as the test set.
!kaggle datasets download -d slenser/audio-mnist-test

Dataset URL: https://www.kaggle.com/datasets/slenser/audio-mnist-test
License(s): unknown
Downloading audio-mnist-test.zip to /content
100% 240M/241M [00:02<00:00, 109MB/s] 
100% 241M/241M [00:02<00:00, 121MB/s]


In [None]:
import os
import zipfile

# Downloaded archives, each paired with the directory it is unpacked into.
train_zip_path, train_extract_path = './data-audio-mnist.zip', './data/train'
test_zip_path, test_extract_path = './audio-mnist-test.zip', './data/test'

def extract_zip(zip_path, extract_to):
    """Unpack every member of the zip archive at ``zip_path`` into ``extract_to``.

    Args:
        zip_path: Path of the ``.zip`` file to read.
        extract_to: Directory that receives the archive contents.
    """
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=extract_to)

# (archive, destination) pairs processed below, training set first.
unpack_jobs = (
    (train_zip_path, train_extract_path),
    (test_zip_path, test_extract_path),
)

# Create every destination directory up front (no-op when it already exists).
for _, destination in unpack_jobs:
    os.makedirs(destination, exist_ok=True)

# Unpack each archive into its destination.
for archive_path, destination in unpack_jobs:
    extract_zip(archive_path, destination)

print("Файлы успешно распакованы!")


Файлы успешно распакованы!


In [None]:
import os
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class AudioMNISTDataset(Dataset):
    """Spoken-digit ``.wav`` files served as fixed-size mel-spectrograms.

    The label is parsed from the file name, which must start with the integer
    label followed by an underscore (for example ``7_<anything>.wav``).

    Args:
        root_dir: Directory that directly contains the ``.wav`` files.
        transform: Optional callable applied to the resized spectrogram
            (a ``(1, 1, height, width)`` float tensor) before it is returned.
        target_size: ``(height, width)`` of the returned spectrogram.
        sample_rate: Sample rate the mel filterbank is built for. Recordings
            stored at a different rate are resampled to it.
    """

    def __init__(self, root_dir, transform=None, target_size=(128, 128), sample_rate=16000):
        self.root_dir = root_dir
        self.transform = transform
        self.target_size = target_size
        self.sample_rate = sample_rate
        # os.listdir returns entries in arbitrary order; sort so that index ->
        # sample is stable between runs and machines.
        self.file_paths = sorted(
            os.path.join(root_dir, fname)
            for fname in os.listdir(root_dir)
            if fname.endswith('.wav')
        )
        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=1024,
            hop_length=512,
            n_mels=128
        )

    def __len__(self):
        """Return the number of ``.wav`` files found in ``root_dir``."""
        return len(self.file_paths)

    @staticmethod
    def _label_from_path(file_path):
        """Parse the integer label from the file-name prefix before the first ``_``."""
        return int(os.path.basename(file_path).split('_')[0])

    @staticmethod
    def _to_mono(waveform):
        """Average a ``(channels, samples)`` waveform down to ``(1, samples)``."""
        if waveform.size(0) > 1:
            return waveform.mean(dim=0, keepdim=True)
        return waveform

    def _fit_to_target(self, mel_spec):
        """Zero-pad short spectrograms in time, then resize to ``target_size``.

        Args:
            mel_spec: Tensor shaped ``(1, n_mels, frames)``.

        Returns:
            Float tensor shaped ``(1, 1, height, width)``.
        """
        height, width = self.target_size
        # interpolate() in bilinear mode needs a 4-D (N, C, H, W) input.
        mel_spec = mel_spec.unsqueeze(0)
        missing_frames = width - mel_spec.size(3)
        if missing_frames > 0:
            # Pad only on the right of the time axis.
            mel_spec = torch.nn.functional.pad(mel_spec, (0, missing_frames))
        resized = torch.nn.functional.interpolate(
            mel_spec, size=(height, width), mode='bilinear', align_corners=False
        )
        return resized.float()

    def __getitem__(self, idx):
        """Load one recording and return ``(spectrogram, label)``.

        Returns:
            A ``(1, height, width)`` float tensor (when no transform changes the
            shape) and the integer label parsed from the file name.
        """
        file_path = self.file_paths[idx]
        label = self._label_from_path(file_path)
        waveform, sample_rate = torchaudio.load(file_path)

        # Mix multi-channel audio down to one channel so the spectrogram always
        # has a single leading channel.
        waveform = self._to_mono(waveform)

        # The mel filterbank is built for self.sample_rate; resample recordings
        # stored at any other rate so the mel bins map to the right frequencies.
        if sample_rate != self.sample_rate:
            waveform = torchaudio.functional.resample(waveform, sample_rate, self.sample_rate)

        # Convert to a mel-spectrogram: (1, n_mels, frames).
        mel_spec = self.mel_spectrogram(waveform)
        mel_spec_resized = self._fit_to_target(mel_spec)

        if self.transform:
            mel_spec_resized = self.transform(mel_spec_resized)

        return mel_spec_resized.squeeze(0), label


In [None]:
class CNNModel(nn.Module):
    """Convolutional classifier for single-channel spectrogram images.

    Three stages of 3x3 convolution + ReLU + 2x2 max-pooling (1 -> 32 -> 64 ->
    128 channels) are followed by a 256-unit fully connected layer, dropout
    and a final linear layer that produces one logit per class.

    Args:
        num_classes: Number of output logits.
        input_shape: ``(height, width)`` of the spectrograms the model will
            receive; it determines the input width of ``fc1``.
    """

    def __init__(self, num_classes=10, input_shape=(128, 128)):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        # Number of features per sample that reach fc1 for this input size.
        self._to_linear = self.calculate_fc_input(input_shape)

        self.fc1 = nn.Linear(self._to_linear, 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.5)

    def _features(self, x):
        """Run the three conv -> ReLU -> max-pool stages."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(F.relu(conv(x)))
        return x

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            x: ``(batch, 1, height, width)`` tensor; a single unbatched
                ``(1, height, width)`` sample is also accepted.

        Raises:
            RuntimeError: If the spatial size of ``x`` does not match the
                ``input_shape`` the model was built for.
        """
        if x.dim() == 3:
            # Treat an unbatched sample as a batch of one.
            x = x.unsqueeze(0)
        x = self._features(x)
        # Flatten per sample. Reshaping with view(-1, n) would silently change
        # the batch size for inputs of the wrong spatial size; keeping the
        # batch dimension makes fc1 reject such inputs instead.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

    def calculate_fc_input(self, input_shape):
        """Return the flattened feature count for an ``input_shape`` image.

        A dummy all-zero sample is pushed through the convolutional stages;
        gradients are disabled because only the resulting shape is needed.
        """
        with torch.no_grad():
            probe = torch.zeros((1, 1, input_shape[0], input_shape[1]))
            return torch.flatten(self._features(probe), 1).shape[1]

In [None]:
def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs=10, device='cpu'):
    """Train ``model`` and report the loss and validation accuracy every epoch.

    Args:
        model: Module to optimise; it is moved to ``device`` in place.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        test_loader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the parameters of ``model``.
        num_epochs: Number of passes over ``train_loader``.
        device: Device on which the batches and the model are placed.

    Returns:
        None. Progress is printed to standard output.
    """
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        # Count batches while iterating so loaders without __len__ also work.
        num_batches = 0
        for mel_specs, labels in train_loader:
            mel_specs, labels = mel_specs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(mel_specs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        if num_batches:
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/num_batches}')
        else:
            # Nothing was trained on; avoid dividing by zero.
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: n/a (no training batches)')

        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for mel_specs, labels in test_loader:
                mel_specs, labels = mel_specs.to(device), labels.to(device)
                outputs = model(mel_specs)
                # Index of the largest logit is the predicted class.
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        if total:
            print(f'Accuracy: {100 * correct / total}%')
        else:
            # An empty validation loader must not abort training.
            print('Accuracy: n/a (no validation samples)')

In [None]:
# Hyperparameters
batch_size = 32
learning_rate = 0.001
num_epochs = 10
input_shape = (128, 128)  # (height, width) of the mel-spectrogram after resizing
seed = 42

# Seed PyTorch so weight initialisation and shuffling are repeatable.
torch.manual_seed(seed)

# Train on the GPU when the runtime has one; otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Data locations
train_data_path = './data/train'
test_data_path = './data/test'

# Datasets and DataLoaders
train_dataset = AudioMNISTDataset(root_dir=train_data_path, target_size=input_shape)
test_dataset = AudioMNISTDataset(root_dir=test_data_path, target_size=input_shape)
# Fail early with a clear message if the archives did not unpack .wav files
# directly into the expected directories.
assert len(train_dataset) > 0, f"no .wav files found in {train_data_path}"
assert len(test_dataset) > 0, f"no .wav files found in {test_data_path}"

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

# Model
model = CNNModel(num_classes=10, input_shape=input_shape)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training
train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs=num_epochs, device=device)

# Bring the weights back to the CPU so later cells (and any saved checkpoint)
# do not depend on a GPU being present.
model = model.cpu()


Epoch 1/10, Loss: 0.5867307566887493
Accuracy: 93.8%
Epoch 2/10, Loss: 0.1937920109849074
Accuracy: 95.56%
Epoch 3/10, Loss: 0.11075922677859622
Accuracy: 97.01333333333334%
Epoch 4/10, Loss: 0.06390544843998214
Accuracy: 97.52%
Epoch 5/10, Loss: 0.047495365409195696
Accuracy: 97.69333333333333%
Epoch 6/10, Loss: 0.04176096152448074
Accuracy: 98.36%
Epoch 7/10, Loss: 0.055776249452006516
Accuracy: 97.92%
Epoch 8/10, Loss: 0.06732086436068388
Accuracy: 98.14666666666666%
Epoch 9/10, Loss: 0.024293910572569386
Accuracy: 97.77333333333333%
Epoch 10/10, Loss: 0.024355050495763274
Accuracy: 98.42666666666666%


In [None]:
# Persist only the learned parameters (the state dict), not the module object.
model_save_path = 'cnn_model.pth'
torch.save(obj=model.state_dict(), f=model_save_path)