In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchaudio import datasets, transforms, info, load
from torch.utils.data import DataLoader, Dataset, random_split
import os
import zipfile
import torchaudio

In [5]:
# Unpack the dataset archive into the current working directory.
# The handle is called `archive` (not `zip`) so that the built-in zip()
# stays usable in every later cell of the notebook.
with zipfile.ZipFile('/content/Regions.zip', 'r') as archive:
    archive.extractall('.')

In [50]:
# Train / test splits live side by side under one dataset root.
data_root = '/content/Regions'
data_path_train = f'{data_root}/train'
data_path_test = f'{data_root}/test'

In [7]:
# Class names are the sub-directory names of the training folder, sorted so
# that the name -> index mapping is stable across runs.
# Plain files in the root (e.g. stray archive metadata) are skipped: they are
# not classes and would otherwise shift every class index.
labels = sorted(
    entry for entry in os.listdir(data_path_train)
    if os.path.isdir(os.path.join(data_path_train, entry))
)
labels

['alay',
 'batken',
 'bishkek',
 'chui',
 'jalal-abad',
 'manas',
 'naryn',
 'osh',
 'talas',
 'ysyk-kol']

In [8]:
# Map every class name to its position in the sorted `labels` list.
label_to_index = {}
for position, name in enumerate(labels):
    label_to_index[name] = position
label_to_index

{'alay': 0,
 'batken': 1,
 'bishkek': 2,
 'chui': 3,
 'jalal-abad': 4,
 'manas': 5,
 'naryn': 6,
 'osh': 7,
 'talas': 8,
 'ysyk-kol': 9}

In [9]:
# Mel-spectrogram front end shared by the training and test pipelines.
mel_config = {
    'sample_rate': 16000,
    'n_fft': 1024,        # larger window -> finer frequency resolution
    'win_length': 1024,
    'hop_length': 256,    # smaller hop -> finer time resolution
    'n_mels': 64,
    'f_min': 0,           # can be raised to cut low-frequency noise
    'f_max': 8000,        # cap at the Nyquist frequency (sample_rate / 2)
    'power': 2.0,         # power spectrum; use 1.0 for magnitude
}
transform = transforms.MelSpectrogram(**mel_config)

In [10]:
# Fixed number of spectrogram time frames per example: collate_fn crops longer
# clips to this length and zero-pads shorter ones.
# With hop_length=256 at 16 kHz, 500 frames cover about 8 seconds of audio.
max_len = 500

In [42]:
def collate_fn(batch):
    """Turn a batch of ``(file_path, label)`` pairs into model-ready tensors.

    Every file is loaded with ``torchaudio.load``, down-mixed to mono,
    resampled to 16 kHz when its native rate differs, converted with the
    module-level ``transform`` and finally cropped or right-padded with zeros
    to exactly ``max_len`` time frames.

    Args:
        batch: iterable of ``(file_path, label)`` tuples. Each ``label`` must
            be a key of the module-level ``label_to_index``.

    Returns:
        tuple: ``(spectrograms, targets)`` where ``spectrograms`` stacks the
        per-file spectrograms along a new leading batch axis and ``targets``
        is an int64 tensor holding the class index of every item.

    Raises:
        KeyError: if a label is missing from ``label_to_index``.
    """
    spectrograms, targets = [], []
    for file_path, label in batch:
        waveform, sr = torchaudio.load(file_path)

        # Down-mix multi-channel audio to a single channel.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Bring everything to the 16 kHz rate the spectrogram is configured for.
        if sr != 16000:
            waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)

        # Remove only the leading channel axis. A bare squeeze() would also
        # drop the time axis whenever a very short clip yields a single frame,
        # and the length handling below would then fail with an IndexError.
        # NOTE(review): assumes transform returns (channel, n_mels, frames).
        spec = transform(waveform).squeeze(0)

        # Crop or zero-pad along the last (time) axis to a fixed length.
        n_frames = spec.shape[-1]
        if n_frames > max_len:
            spec = spec[..., :max_len]
        elif n_frames < max_len:
            spec = F.pad(spec, (0, max_len - n_frames))

        spectrograms.append(spec)
        targets.append(label_to_index[label])

    spectrograms = torch.stack(spectrograms)
    # Explicit int64: the dtype nn.CrossEntropyLoss expects for class indices.
    targets = torch.tensor(targets, dtype=torch.long)
    return spectrograms, targets

In [51]:
class AudioDataset(Dataset):
    """Index of ``(file_path, label)`` pairs for a folder-per-class audio set.

    The expected layout is ``data_path/<label>/<clip>.wav``. Items are returned
    as raw ``(file_path, label)`` tuples; no audio is decoded here.

    Args:
        data_path: root directory that contains one sub-directory per class.
        transform: stored on the instance as ``self.transform``; this class
            itself never applies it.

    Attributes:
        labels: sorted list of class (sub-directory) names.
        label_to_index: mapping from class name to its position in ``labels``.
        data: list of ``(file_path, label)`` tuples, ordered by label and then
            by file name.
    """

    def __init__(self, data_path, transform=None):
        self.transform = transform
        self.data = []
        # Only directories are classes. Stray files in the root (for example
        # archive metadata) used to crash the scan with NotADirectoryError.
        self.labels = sorted(
            entry for entry in os.listdir(data_path)
            if os.path.isdir(os.path.join(data_path, entry))
        )
        self.label_to_index = {lab: ind for ind, lab in enumerate(self.labels)}

        for label in self.labels:
            label_folder = os.path.join(data_path, label)
            # os.listdir order is arbitrary; sort so indices are reproducible.
            for file in sorted(os.listdir(label_folder)):
                # Case-insensitive match so that '.WAV' files are kept too.
                if file.lower().endswith('.wav'):
                    self.data.append((os.path.join(label_folder, file), label))

    def __len__(self):
        """Return the number of indexed audio files."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(file_path, label)`` tuple stored at position ``idx``."""
        return self.data[idx]

In [43]:
# Number of classes found in the training folder.
len(labels)

10

In [54]:
# Build both splits the same way; only the root folder differs.
train_dataset, test_dataset = (
    AudioDataset(split_path, transform=transform)
    for split_path in (data_path_train, data_path_test)
)

In [55]:
# Options shared by both loaders; only the training loader is shuffled.
loader_options = dict(batch_size=64, collate_fn=collate_fn)
train = DataLoader(train_dataset, shuffle=True, **loader_options)
test = DataLoader(test_dataset, **loader_options)

In [56]:
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [57]:
class PlaceAudio(nn.Module):
    """Small CNN classifier over single-channel 2-D spectrogram inputs.

    ``first`` is three conv -> ReLU -> 2x2 max-pool stages (1 -> 16 -> 32 -> 64
    channels) followed by adaptive average pooling to a fixed 8x8 map, so the
    linear head ``second`` does not depend on the input's spatial size.

    Args:
        num_classes: size of the final linear layer's output.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Same layer order as a hand-written Sequential, so parameter names in
        # the state dict (first.0, first.3, first.6, ...) are unchanged.
        stages = []
        for in_channels, out_channels in ((1, 16), (16, 32), (32, 64)):
            stages.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
            stages.append(nn.ReLU())
            stages.append(nn.MaxPool2d(2))
        stages.append(nn.AdaptiveAvgPool2d((8, 8)))
        self.first = nn.Sequential(*stages)

        self.flatten = nn.Flatten()

        hidden_in = 64 * 8 * 8
        self.second = nn.Sequential(
            nn.Linear(hidden_in, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        )

    def forward(self, x):
        """Insert a channel axis at dim 1, then run ``first``, ``flatten`` and ``second``."""
        features = self.first(x.unsqueeze(1))
        return self.second(self.flatten(features))

In [58]:
# Size the output layer from the discovered classes instead of relying on the
# constructor default, so the head always matches the targets from collate_fn.
model = PlaceAudio(num_classes=len(labels)).to(device)

In [59]:
# Cross-entropy objective, optimised with Adam.
learning_rate = 1e-3
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [66]:
for epoch in range(2):
    # Switch to training mode at the start of every epoch.
    model.train()
    total_loss = 0

    for x_batch, y_batch in train:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        # Clear stale gradients, then forward -> loss -> backward -> update.
        optimizer.zero_grad()
        y_pred = model(x_batch)
        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        # Accumulate the per-batch loss (a sum over batches, not a mean).
        total_loss += loss.item()

    print(f'Эпоха {epoch+1}, Потери: {total_loss}')

Эпоха 1, Потери: 0.35440122449290357
Эпоха 2, Потери: 0.34267239795372006


In [67]:
# Score the model on the test loader without tracking gradients.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for x_batch, y_batch in test:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        # The predicted class is the index of the largest logit.
        y_pred = model(x_batch)
        pred = y_pred.argmax(dim=1)

        total += y_batch.shape[0]
        correct += pred.eq(y_batch).sum().item()

# Share of correctly classified clips, in percent.
accuracy = correct * 100 / total
print(f'точность модели : {accuracy :.2f}%')

точность модели : 93.47%


In [68]:
# Persist only the learned parameters (the state dict), not the module object.
weights = model.state_dict()
torch.save(weights, 'region_model.pth')


In [71]:
# Store the ordered class-name list alongside the weights; presumably so that
# predicted indices can be mapped back to region names later (verify against
# the code that loads this file).
torch.save(labels,'region_labels.pth')