## Подключение всех необходимых библиотек

In [1]:
import pandas as pd
import numpy as np
import librosa
import csv
import pathlib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torchvision
from torch.utils.data import random_split, DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [2]:
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): presumably done to keep cell output short; it also hides
# deprecation notices from librosa/pandas/torch -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
import gtts 
from playsound import playsound

# Обработка аудиофайлов и создание CSV-файла (датасета) с признаками

In [4]:
# Column layout of dataset.csv: the file name, six averaged spectral
# descriptors, twenty averaged MFCC coefficients and, last, the emotion label.
header = [
    'filename',
    'chroma_stft',
    'rms',
    'spectral_centroid',
    'spectral_bandwidth',
    'rolloff',
    'zero_crossing_rate',
]
for i in range(1, 21):
    header.append(f'mfcc{i}')
header.append('label')

# Mode 'w' truncates the file, so running this cell resets dataset.csv to a
# single header row.
with open('dataset.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)


# Emotion classes; the TESS cell below reads one ./TESS/<name> folder per entry.
genres = ["angry", "sad", "happy", "fear", "disgust", "neutral", "pleasant_surprised"]

## 1. Заполнение csv файла данными из датасета TESS

In [7]:
# Build the dataset (already created): append one row of clip-averaged librosa
# features to dataset.csv for every recording under ./TESS/<emotion>/.
# NOTE: rows are appended, so running this cell twice duplicates every
# recording unless the header cell is re-run first to reset dataset.csv.

# The CSV is opened once for the whole pass instead of once per recording.
with open('dataset.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    for genr in genres:
        for filename in os.listdir(f"./TESS/{genr}"):
            if filename == ".DS_Store":
                # macOS Finder metadata, not an audio file.
                continue
            songname = f"./TESS/{genr}/{filename}"
            # `signal` rather than `y`, so the audio buffer does not share a
            # name with the label frame created later in the notebook.
            signal, sr = librosa.load(songname, mono=True)

            rms = librosa.feature.rms(y=signal)
            chroma_stft = librosa.feature.chroma_stft(y=signal, sr=sr)
            spec_cent = librosa.feature.spectral_centroid(y=signal, sr=sr)
            spec_bw = librosa.feature.spectral_bandwidth(y=signal, sr=sr)
            rolloff = librosa.feature.spectral_rolloff(y=signal, sr=sr)
            zcr = librosa.feature.zero_crossing_rate(signal)
            # 20 coefficients, matching the mfcc1..mfcc20 columns of the header.
            mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=20)

            # Each feature is reduced to its mean over the clip. The row is
            # built as a list (not a space-joined string that is split again),
            # so a file name containing whitespace cannot shift the columns.
            feature_row = [filename]
            feature_row += [
                f'{np.mean(values)}'
                for values in (chroma_stft, rms, spec_cent, spec_bw, rolloff, zcr)
            ]
            feature_row += [f'{np.mean(e)}' for e in mfcc]
            feature_row.append(genr)
            writer.writerow(feature_row)

In [32]:
# Read the extracted feature table back from disk and preview its first rows.
df = pd.read_csv("dataset.csv", sep=",")
df.head(5)

Unnamed: 0,filename,chroma_stft,rms,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,OAF_king_angry.wav,0.275622,0.030611,2188.212323,2216.685828,3984.838867,0.101251,-409.611389,49.869926,-4.398498,...,2.592276,-1.143469,6.888781,-2.693643,-0.001758,3.17478,-2.620514,-2.477835,-10.139343,angry
1,OAF_rot_angry.wav,0.255442,0.029819,2210.799896,2256.036375,3958.520508,0.094523,-388.562958,62.383816,-10.005765,...,7.899292,-2.92913,-0.960948,5.303476,2.484355,-8.288725,-10.059808,-6.154644,-11.985009,angry
2,OAF_hire_angry.wav,0.26037,0.027054,2125.701685,2094.000747,3457.238582,0.099099,-405.522919,63.094051,-23.975689,...,15.111687,-0.581255,4.787343,0.885953,3.654622,-7.576045,-7.957401,-2.184226,-12.305842,angry
3,YAF_numb_angry.wav,0.294158,0.065479,3015.999826,2162.34809,5117.997675,0.175675,-306.055756,40.761471,-11.389371,...,12.775775,-8.295956,7.579611,2.391619,-3.962322,0.059314,1.836014,-8.889676,1.407364,angry
4,YAF_seize_angry.wav,0.294656,0.034536,4283.769099,2234.925543,6427.347542,0.288214,-372.893768,4.074799,14.011549,...,8.256427,-11.015532,5.854211,-2.778954,-2.978256,4.731396,-0.114971,-6.256421,6.497686,angry


## 2. Заполнение csv файла обработанными данными из датасета RAVDESS

In [None]:
# Append clip-averaged librosa features for the RAVDESS recordings to dataset.csv.
# NOTE(review): the previous version of this cell was a copy of the TESS loop
# with broken indentation (it could not run), read ./TESS/{genr} again and
# relied on `genr` left over from the TESS cell.
# Assumes the processed RAVDESS clips live in ./RAVDESS/<emotion>/ with the
# same folder names as `genres` -- TODO confirm the path and folder names.
RAVDESS_DIR = "./RAVDESS"

with open('dataset.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    for genr in genres:
        genre_dir = f"{RAVDESS_DIR}/{genr}"
        if not os.path.isdir(genre_dir):
            # Report the missing folder instead of crashing in os.listdir.
            print(f"Skipping {genre_dir}: directory not found")
            continue
        for filename in os.listdir(genre_dir):
            if filename == ".DS_Store":
                # macOS Finder metadata, not an audio file.
                continue
            songname = f"{genre_dir}/{filename}"
            signal, sr = librosa.load(songname, mono=True)

            rms = librosa.feature.rms(y=signal)
            chroma_stft = librosa.feature.chroma_stft(y=signal, sr=sr)
            spec_cent = librosa.feature.spectral_centroid(y=signal, sr=sr)
            spec_bw = librosa.feature.spectral_bandwidth(y=signal, sr=sr)
            rolloff = librosa.feature.spectral_rolloff(y=signal, sr=sr)
            zcr = librosa.feature.zero_crossing_rate(signal)
            # 20 coefficients, matching the mfcc1..mfcc20 columns of the header.
            mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=20)

            # Same column order as the header row: file name, six averaged
            # descriptors, twenty averaged MFCCs, label.
            feature_row = [filename]
            feature_row += [
                f'{np.mean(values)}'
                for values in (chroma_stft, rms, spec_cent, spec_bw, rolloff, zcr)
            ]
            feature_row += [f'{np.mean(e)}' for e in mfcc]
            feature_row.append(genr)
            writer.writerow(feature_row)

## Обработка датасета и разделение на тренировочную и тестовую выборки

In [9]:
# Model inputs: every column except the file name and the target.
X = df.drop(["filename", "label"], axis=1)
# Targets: one indicator column per distinct value of "label".
y = pd.get_dummies(df["label"])
# 80/20 split with a fixed random_state so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
class MyDataset(Dataset):
    """Tensor-backed dataset built from a feature table and a label table.

    Both arguments must expose a ``.values`` array (as pandas objects do);
    each is converted once, up front, to a ``float32`` tensor.
    """

    def __init__(self, features, labels):
        feature_array = features.values
        label_array = labels.values
        self.features = torch.tensor(feature_array, dtype=torch.float32)
        self.labels = torch.tensor(label_array, dtype=torch.float32)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        n_samples = len(self.labels)
        return n_samples

    def __getitem__(self, ind):
        """Return the ``(features, labels)`` pair stored at position ``ind``."""
        sample = self.features[ind]
        target = self.labels[ind]
        return sample, target

In [11]:
# Wrap both splits as tensor datasets.
train_dataset = MyDataset(features=x_train, labels=y_train)
test_dataset = MyDataset(features=x_test, labels=y_test)

# Training batches are reshuffled every epoch; drop_last=True discards the
# final incomplete batch, so every training step sees exactly 1024 samples.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=1024,
    shuffle=True,
    drop_last=True,
)
# Evaluation keeps the original order and every sample.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=256,
    shuffle=False,
)

## Архитектура нашей нейронной сети и ее обучение

In [12]:
class RecognizeNet(nn.Module):
    """Fully connected classifier: 26 inputs -> 100 -> 200 -> 100 -> 50 -> 7 outputs.

    Each hidden layer is followed by ReLU. The output layer has no activation,
    so ``forward`` returns raw scores (softmax is not applied inside the model).
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(26, 100)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(100, 200)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(200, 100)
        self.act3 = nn.ReLU()
        self.layer4 = nn.Linear(100, 50)
        self.act4 = nn.ReLU()
        self.layer5 = nn.Linear(50, 7)

    def forward(self, x):
        """Map a ``(..., 26)`` tensor to ``(..., 7)`` unnormalised class scores."""
        hidden_stages = (
            (self.layer1, self.act1),
            (self.layer2, self.act2),
            (self.layer3, self.act3),
            (self.layer4, self.act4),
        )
        for linear, activation in hidden_stages:
            x = activation(linear(x))
        return self.layer5(x)

In [16]:
# Fix torch's RNG before the weights are created so that initialisation (and
# the batch shuffling that follows) is repeatable on a fresh kernel.
torch.manual_seed(42)
model = RecognizeNet()

In [17]:
# Train `model` on `train_loader` with Adam and cross-entropy.
# Per-batch loss and accuracy are recorded in `loss_list` / `acc_list`.
total_step = len(train_loader)
epochs = 700
lr = 0.0007
optimizer = torch.optim.Adam(model.parameters(), lr)
# The targets are one-hot rows, which CrossEntropyLoss accepts as class
# probabilities; the model therefore outputs raw scores.
loss_fn = nn.CrossEntropyLoss()

loss_list = []
acc_list = []

if total_step == 0:
    # With drop_last=True a training set smaller than one batch yields no
    # batches at all; fail with a clear message instead of a NameError below.
    raise ValueError("train_loader produced no batches; reduce batch_size or disable drop_last")

model.train()
for epoch in tqdm(range(epochs)):
    # batch_x/batch_y (not x/y) so the notebook-level label frame `y` survives.
    for batch_x, batch_y in train_loader:
        preds = model(batch_x)

        loss = loss_fn(preds, batch_y)
        # Clear gradients right before backward so a previously interrupted
        # run cannot leak stale gradients into this step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())

        total = batch_y.size(0)
        # Softmax is monotonic, so the argmax of the raw scores is the
        # predicted class.
        predicted = preds.detach().argmax(dim=1)
        true = batch_y.argmax(dim=1)
        correct = (predicted == true).sum().item()
        acc_list.append(correct / total)

    if (epoch + 1) % 20 == 0:
        # Reports the loss/accuracy of the LAST batch of the epoch only.
        print('Epoch [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}%'
              .format(epoch + 1, epochs, loss.item(),
                      (correct / total) * 100))

  4%|█▌                                        | 26/700 [00:00<00:21, 31.25it/s]

Epoch [20/700], Loss: 1.8769, Accuracy: 18.55%


  6%|██▋                                       | 44/700 [00:01<00:24, 27.27it/s]

Epoch [40/700], Loss: 1.7502, Accuracy: 28.03%


 10%|████                                      | 67/700 [00:02<00:19, 33.31it/s]

Epoch [60/700], Loss: 1.6289, Accuracy: 36.82%


 12%|█████▏                                    | 87/700 [00:02<00:18, 33.90it/s]

Epoch [80/700], Loss: 1.5042, Accuracy: 44.14%


 14%|█████▉                                   | 101/700 [00:03<00:31, 19.11it/s]

Epoch [100/700], Loss: 1.3271, Accuracy: 48.73%


 17%|███████                                  | 120/700 [00:04<00:35, 16.20it/s]

Epoch [120/700], Loss: 1.1762, Accuracy: 57.23%


 20%|████████▎                                | 141/700 [00:06<00:32, 17.03it/s]

Epoch [140/700], Loss: 1.0129, Accuracy: 63.67%


 23%|█████████▍                               | 161/700 [00:07<00:41, 12.85it/s]

Epoch [160/700], Loss: 0.9155, Accuracy: 66.89%


 26%|██████████▌                              | 180/700 [00:09<00:28, 18.19it/s]

Epoch [180/700], Loss: 0.7805, Accuracy: 75.59%


 29%|███████████▉                             | 204/700 [00:11<00:34, 14.25it/s]

Epoch [200/700], Loss: 0.6747, Accuracy: 78.12%


 32%|█████████████                            | 224/700 [00:12<00:25, 18.91it/s]

Epoch [220/700], Loss: 0.6053, Accuracy: 80.18%


 35%|██████████████▍                          | 247/700 [00:13<00:14, 30.66it/s]

Epoch [240/700], Loss: 0.5120, Accuracy: 82.71%


 38%|███████████████▋                         | 267/700 [00:13<00:15, 28.46it/s]

Epoch [260/700], Loss: 0.4230, Accuracy: 87.70%


 41%|████████████████▊                        | 287/700 [00:14<00:12, 32.95it/s]

Epoch [280/700], Loss: 0.4018, Accuracy: 86.91%


 44%|█████████████████▉                       | 307/700 [00:15<00:13, 28.71it/s]

Epoch [300/700], Loss: 0.3162, Accuracy: 90.04%


 47%|███████████████████▏                     | 327/700 [00:15<00:11, 32.73it/s]

Epoch [320/700], Loss: 0.3501, Accuracy: 88.28%


 50%|████████████████████▎                    | 347/700 [00:16<00:13, 26.60it/s]

Epoch [340/700], Loss: 0.2542, Accuracy: 92.77%


 52%|█████████████████████▍                   | 367/700 [00:17<00:10, 32.12it/s]

Epoch [360/700], Loss: 0.2234, Accuracy: 93.16%


 54%|██████████████████████▏                  | 379/700 [00:17<00:10, 31.70it/s]

Epoch [380/700], Loss: 0.1933, Accuracy: 93.95%


 58%|███████████████████████▊                 | 406/700 [00:18<00:09, 31.66it/s]

Epoch [400/700], Loss: 0.2040, Accuracy: 94.04%


 60%|████████████████████████▋                | 422/700 [00:19<00:08, 31.72it/s]

Epoch [420/700], Loss: 0.1702, Accuracy: 94.43%


 64%|██████████████████████████               | 445/700 [00:19<00:08, 30.53it/s]

Epoch [440/700], Loss: 0.1431, Accuracy: 95.90%


 66%|███████████████████████████▏             | 465/700 [00:20<00:07, 31.89it/s]

Epoch [460/700], Loss: 0.1207, Accuracy: 96.29%


 69%|████████████████████████████▎            | 484/700 [00:21<00:07, 29.47it/s]

Epoch [480/700], Loss: 0.1284, Accuracy: 96.97%


 72%|█████████████████████████████▌           | 504/700 [00:21<00:06, 32.34it/s]

Epoch [500/700], Loss: 0.1111, Accuracy: 96.19%


 75%|██████████████████████████████▊          | 526/700 [00:22<00:05, 30.41it/s]

Epoch [520/700], Loss: 0.1207, Accuracy: 96.48%


 78%|███████████████████████████████▉         | 546/700 [00:23<00:04, 32.47it/s]

Epoch [540/700], Loss: 0.0962, Accuracy: 96.97%


 81%|█████████████████████████████████▏       | 566/700 [00:24<00:04, 30.24it/s]

Epoch [560/700], Loss: 0.1119, Accuracy: 96.39%


 84%|██████████████████████████████████▎      | 586/700 [00:24<00:03, 31.54it/s]

Epoch [580/700], Loss: 0.1104, Accuracy: 96.97%


 86%|███████████████████████████████████▍     | 605/700 [00:25<00:03, 28.49it/s]

Epoch [600/700], Loss: 0.0852, Accuracy: 97.66%


 89%|████████████████████████████████████▌    | 625/700 [00:26<00:02, 31.91it/s]

Epoch [620/700], Loss: 0.0823, Accuracy: 97.46%


 92%|█████████████████████████████████████▊   | 645/700 [00:26<00:01, 27.51it/s]

Epoch [640/700], Loss: 0.0773, Accuracy: 97.75%


 95%|███████████████████████████████████████  | 666/700 [00:27<00:01, 31.60it/s]

Epoch [660/700], Loss: 0.0644, Accuracy: 97.95%


 98%|████████████████████████████████████████▏| 686/700 [00:28<00:00, 27.51it/s]

Epoch [680/700], Loss: 0.0694, Accuracy: 97.85%


100%|█████████████████████████████████████████| 700/700 [00:28<00:00, 24.30it/s]

Epoch [700/700], Loss: 0.0598, Accuracy: 98.44%





## Тестирование нашей нейронной сети

In [29]:
# Put the model in evaluation mode and score it on the held-out loader
# without tracking gradients.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for inputs, targets in test_loader:
        scores = model(inputs)
        # Index of the largest score / of the one-hot entry in each row.
        predicted = scores.argmax(dim=1)
        y_true = targets.argmax(dim=1)
        total += y_true.size(0)
        correct += (predicted == y_true).sum().item()

    print('Test Accuracy of the model on the test data: {} %'.format((correct / total) * 100))

Test Accuracy of the model on the test data: 97.32142857142857 %


In [25]:
# Turn a Russian motivational message into speech with gTTS and store it as MP3.
motivation_text = "Ты сильный, и я знаю, что ты сейчас в не самом лучшем расположении духа, но отбрось все свои мысли и сосредоточься на самом главном, на себе и своём теле, всё остальное пройдёт, а твоё тело и дух останется навсегда"
t1 = gtts.gTTS(motivation_text, tld="ru", lang="ru")
t1.save("motivate_speech.mp3")

In [27]:
# Play the MP3 saved by the text-to-speech cell above.
playsound("motivate_speech.mp3")