In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchaudio import datasets, transforms, info, load
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn.functional as F
import os
from torch.utils.data import Dataset
import torchaudio
import pandas as pd

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'ravdess-emotional-speech-audio' dataset.
Path to dataset files: /kaggle/input/ravdess-emotional-speech-audio


In [3]:
audio_dir = '/kaggle/input/ravdess-emotional-speech-audio'

In [4]:
import os
emotions = sorted(os.listdir(audio_dir))
emotions

['Actor_01',
 'Actor_02',
 'Actor_03',
 'Actor_04',
 'Actor_05',
 'Actor_06',
 'Actor_07',
 'Actor_08',
 'Actor_09',
 'Actor_10',
 'Actor_11',
 'Actor_12',
 'Actor_13',
 'Actor_14',
 'Actor_15',
 'Actor_16',
 'Actor_17',
 'Actor_18',
 'Actor_19',
 'Actor_20',
 'Actor_21',
 'Actor_22',
 'Actor_23',
 'Actor_24',
 'audio_speech_actors_01-24']

In [5]:
len(emotions)

25

In [6]:
label_to_index = {lab: ind for ind, lab in enumerate(emotions)}
label_to_index

{'Actor_01': 0,
 'Actor_02': 1,
 'Actor_03': 2,
 'Actor_04': 3,
 'Actor_05': 4,
 'Actor_06': 5,
 'Actor_07': 6,
 'Actor_08': 7,
 'Actor_09': 8,
 'Actor_10': 9,
 'Actor_11': 10,
 'Actor_12': 11,
 'Actor_13': 12,
 'Actor_14': 13,
 'Actor_15': 14,
 'Actor_16': 15,
 'Actor_17': 16,
 'Actor_18': 17,
 'Actor_19': 18,
 'Actor_20': 19,
 'Actor_21': 20,
 'Actor_22': 21,
 'Actor_23': 22,
 'Actor_24': 23,
 'audio_speech_actors_01-24': 24}

In [7]:
transform = transforms.MelSpectrogram(
    sample_rate = 22050,
    n_mels = 64
)

In [8]:
max_len = 500

In [9]:
class RAVDESSDataset(Dataset):
    def __init__(self, audio_dir, transform=None, sample_rate=22050, max_len=500):
        self.audio_dir = audio_dir
        self.transform = transform
        self.sample_rate = sample_rate
        self.max_len = max_len

        self.files = []
        for root, dirs, files in os.walk(audio_dir):
            for f in files:
                if f.lower().endswith(".wav"):
                    self.files.append(os.path.join(root, f))

        if len(self.files) == 0:
            raise ValueError(f"В папке {audio_dir} нет файлов .wav!")

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        file_path = self.files[idx]
        file_name = os.path.basename(file_path)

        waveform, sr = torchaudio.load(file_path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        if sr != self.sample_rate:
            resample = torchaudio.transforms.Resample(sr, self.sample_rate)
            waveform = resample(waveform)

        spec = self.transform(waveform) if self.transform else waveform

        if spec.shape[-1] > self.max_len:
            spec = spec[:, :, :self.max_len]
        elif spec.shape[-1] < self.max_len:
            spec = torch.nn.functional.pad(spec, (0, self.max_len - spec.shape[-1]))

        emotion_id = int(file_name.split('-')[2]) - 1
        return spec, emotion_id


In [10]:
dataset = RAVDESSDataset(audio_dir=audio_dir, transform=transform, max_len=max_len)


train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size

train_data, test_data = random_split(dataset, [train_size, test_size], generator=torch.Generator().manual_seed(42))


In [11]:
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, )

In [12]:
device =  torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [13]:
class CheckAudio(nn.Module):
    def __init__(self, num_classes=8):
        super(CheckAudio, self).__init__()
        self.first = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.AdaptiveAvgPool2d((8, 8))
        )

        self.flatten = nn.Flatten()

        self.second = nn.Sequential(
            nn.Linear(64 * 8 * 8, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        x = self.first(x)
        x = self.flatten(x)
        x = self.second(x)
        return x

In [14]:
model = CheckAudio(num_classes=8).to(device)

In [15]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [16]:
for epoch in range(20):
  model.train()
  total_loss = 0

  for x_batch, y_batch in train_loader:
    x_batch, y_batch = x_batch.to(device), y_batch.to(device)

    y_pred = model(x_batch)
    loss = loss_fn(y_pred, y_batch)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
  print(f'Эпоха {epoch+1}, Потери: {total_loss:.4f}')

  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Эпоха 1, Потери: 136.8055
Эпоха 2, Потери: 123.9639
Эпоха 3, Потери: 117.7584
Эпоха 4, Потери: 111.2374
Эпоха 5, Потери: 108.0119
Эпоха 6, Потери: 98.4688
Эпоха 7, Потери: 98.8507
Эпоха 8, Потери: 86.9484
Эпоха 9, Потери: 78.2485
Эпоха 10, Потери: 68.9729
Эпоха 11, Потери: 80.6332
Эпоха 12, Потери: 72.0293
Эпоха 13, Потери: 59.5840
Эпоха 14, Потери: 54.4586
Эпоха 15, Потери: 50.3794
Эпоха 16, Потери: 44.0706
Эпоха 17, Потери: 39.1126
Эпоха 18, Потери: 52.4071
Эпоха 19, Потери: 37.5381
Эпоха 20, Потери: 29.7961


In [17]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        y_pred = model(x_batch)
        pred = torch.argmax(y_pred, dim=1)

        total += y_batch.size(0)
        correct += (pred == y_batch).sum().item()

accuracy = correct * 100 / total
print(f'точность модели : {accuracy :.2f}%')

точность модели : 76.74%


In [18]:
emotion_labels = {
    1: "Neutral",
    2: "Calm",
    3: "Happy",
    4: "Sad",
    5: "Angry",
    6: "Fearful",
    7: "Disgust",
    8: "Surprised"
}

In [19]:
emotions = sorted(emotion_labels.values())
print("Эмоции:", emotions)
torch.save(emotions, "labels_emotion.pth")

Эмоции: ['Angry', 'Calm', 'Disgust', 'Fearful', 'Happy', 'Neutral', 'Sad', 'Surprised']


In [20]:
torch.save(model.state_dict(), 'model_emotion.pth')