## Подключение всех необходимых библиотек

In [1]:
import pandas as pd
import numpy as np
import librosa
import csv
import pathlib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torchvision
from torch.utils.data import random_split, DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import gtts
from playsound import playsound

# Обработка аудио файлов и создание csv файла (датасета) с признаками

In [4]:
# Column layout of dataset.csv: the file name, six spectral descriptors,
# twenty MFCC columns (mfcc1..mfcc20) and finally the emotion label.
base_columns = 'filename chroma_stft rms spectral_centroid spectral_bandwidth rolloff zero_crossing_rate'.split()
mfcc_columns = [f'mfcc{idx}' for idx in range(1, 21)]
header = base_columns + mfcc_columns + ['label']

# Mode 'w' truncates an existing dataset.csv, so after this cell the file
# contains nothing but the header row.
with open('dataset.csv', 'w', newline='') as csv_file:
    csv.writer(csv_file).writerow(header)


# Emotion classes; the TESS cell uses each one as a sub-folder name.
genres = ["angry", "sad", "happy", "fear", "disgust", "neutral", "pleasant_surprised"]

## 1. Заполнение csv файла данными из датасета TESS

In [5]:
# Build the dataset (already built once). This cell APPENDS one row per TESS
# audio file to dataset.csv, so re-running it duplicates those rows.

# The CSV is opened once for the whole scan instead of once per row.
with open('dataset.csv', 'a', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for genr in genres:
        for filename in os.listdir(f"./TESS/{genr}"):
            if filename == ".DS_Store":  # macOS folder metadata, not audio
                continue
            songname = f"./TESS/{genr}/{filename}"
            audio, sr = librosa.load(songname, mono=True)

            # Frame-wise descriptors; each is collapsed to its mean below.
            rms = librosa.feature.rms(y=audio)
            chroma_stft = librosa.feature.chroma_stft(y=audio, sr=sr)
            spec_cent = librosa.feature.spectral_centroid(y=audio, sr=sr)
            spec_bw = librosa.feature.spectral_bandwidth(y=audio, sr=sr)
            rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)
            zcr = librosa.feature.zero_crossing_rate(audio)
            mfcc = librosa.feature.mfcc(y=audio, sr=sr)

            # The row is assembled as a list. Joining with spaces and calling
            # .split() afterwards would break a file name that contains
            # whitespace into several cells and shift every column.
            feature_row = [filename]
            feature_row += [
                str(np.mean(descriptor))
                for descriptor in (chroma_stft, rms, spec_cent, spec_bw, rolloff, zcr)
            ]
            # Iterating the MFCC matrix yields one row per coefficient.
            feature_row += [str(np.mean(coefficient)) for coefficient in mfcc]
            feature_row.append(genr)  # the folder name is the label
            writer.writerow(feature_row)

In [15]:
# Load the feature table written to dataset.csv and preview its first rows.
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,filename,chroma_stft,rms,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,OAF_king_angry.wav,0.275622,0.030611,2188.212323,2216.685828,3984.838867,0.101251,-409.611389,49.869926,-4.398498,...,2.592276,-1.143469,6.888781,-2.693643,-0.001758,3.17478,-2.620514,-2.477835,-10.139343,angry
1,OAF_rot_angry.wav,0.255442,0.029819,2210.799896,2256.036375,3958.520508,0.094523,-388.562958,62.383816,-10.005765,...,7.899292,-2.92913,-0.960948,5.303476,2.484355,-8.288725,-10.059808,-6.154644,-11.985009,angry
2,OAF_hire_angry.wav,0.26037,0.027054,2125.701685,2094.000747,3457.238582,0.099099,-405.522919,63.094051,-23.975689,...,15.111687,-0.581255,4.787343,0.885953,3.654622,-7.576045,-7.957401,-2.184226,-12.305842,angry
3,YAF_numb_angry.wav,0.294158,0.065479,3015.999826,2162.34809,5117.997675,0.175675,-306.055756,40.761471,-11.389371,...,12.775775,-8.295956,7.579611,2.391619,-3.962322,0.059314,1.836014,-8.889676,1.407364,angry
4,YAF_seize_angry.wav,0.294656,0.034536,4283.769099,2234.925543,6427.347542,0.288214,-372.893768,4.074799,14.011549,...,8.256427,-11.015532,5.854211,-2.778954,-2.978256,4.731396,-0.114971,-6.256421,6.497686,angry


## 2. Заполнение csv файла обработанными данными из датасета RAVDESS

In [7]:
# Maps the character at index 7 of a RAVDESS file name to the label used in
# dataset.csv. Presumably that character is the second digit of the emotion
# field in names such as "03-01-05-..." - verify against the dataset docs.
# Code "2" has no entry on purpose: those files are skipped below.
repl_dict = {"1": "neutral",
             "3": "happy",
             "4": "sad",
             "5": "angry",
             "6": "fear",
             "7": "disgust",
             "8": "pleasant_surprised"}

# The CSV is opened once for the whole scan instead of once per row.
with open('dataset.csv', 'a', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for filename in os.listdir("./RAVDESS"):
        if filename == ".DS_Store" or filename[7] == "2":
            continue
        # Resolve the label first so that an unexpected code raises KeyError
        # before the expensive feature extraction instead of after it.
        label = repl_dict[filename[7]]

        songname = f"./RAVDESS/{filename}"
        audio, sr = librosa.load(songname, mono=True)

        # Frame-wise descriptors; each is collapsed to its mean below.
        rms = librosa.feature.rms(y=audio)
        chroma_stft = librosa.feature.chroma_stft(y=audio, sr=sr)
        spec_cent = librosa.feature.spectral_centroid(y=audio, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=audio, sr=sr)
        rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(audio)
        mfcc = librosa.feature.mfcc(y=audio, sr=sr)

        # The row is assembled as a list. Joining with spaces and calling
        # .split() afterwards would break a file name that contains
        # whitespace into several cells and shift every column.
        feature_row = [filename]
        feature_row += [
            str(np.mean(descriptor))
            for descriptor in (chroma_stft, rms, spec_cent, spec_bw, rolloff, zcr)
        ]
        # Iterating the MFCC matrix yields one row per coefficient.
        feature_row += [str(np.mean(coefficient)) for coefficient in mfcc]
        feature_row.append(label)
        writer.writerow(feature_row)

In [8]:
# Reload the CSV now that the RAVDESS rows were appended and check its size.
df = pd.read_csv("dataset.csv")
df.shape

(4048, 28)

In [188]:
# Check whether the classes are imbalanced (they are nearly balanced: per the
# counts shown below, only "neutral" has somewhat fewer samples than the rest).
df["label"].value_counts()

angry                 592
sad                   592
happy                 592
fear                  592
disgust               592
pleasant_surprised    592
neutral               496
Name: label, dtype: int64

## Обработка датасета и разделение на тренировочную и тестовую выборки

In [16]:
# Feature matrix: every column except the file name and the target label.
X = df.drop(["filename", "label"], axis=1)

# Targets: the label column expanded into one indicator column per emotion.
y = pd.get_dummies(df.loc[:, "label"])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [17]:
class MyDataset(Dataset):
    """In-memory dataset pairing each feature row with its label row.

    Both inputs must expose a ``.values`` array (e.g. pandas DataFrames);
    each is converted once, at construction time, to a float32 tensor.
    """

    def __init__(self, features, labels):
        # Convert features first, then labels - same order as they are stored.
        self.features, self.labels = (
            torch.tensor(table.values, dtype=torch.float32)
            for table in (features, labels)
        )

    def __len__(self):
        # The number of samples is the number of label rows.
        return self.labels.size(0)

    def __getitem__(self, ind):
        sample = self.features[ind]
        target = self.labels[ind]
        return sample, target

In [18]:
# Wrap both splits as tensor datasets.
train_dataset, test_dataset = MyDataset(x_train, y_train), MyDataset(x_test, y_test)

# Training batches are reshuffled every epoch and a trailing incomplete batch
# is dropped; test batches keep the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=512, shuffle=True, drop_last=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=256, shuffle=False)

## Архитектура нашей нейронной сети и ее обучение

In [19]:
class RecognizeNet(nn.Module):
    """Fully connected classifier mapping a feature vector to emotion scores.

    Layer widths: in_features -> 100 -> 200 -> 100 -> 50 -> num_classes, with
    a ReLU after every layer except the last.

    Args:
        in_features: length of the input feature vector. Defaults to 26, the
            width that used to be hard-coded.
        num_classes: number of output scores. Defaults to 7, the width that
            used to be hard-coded.

    ``forward`` returns raw, un-normalised scores: no softmax is applied
    inside the network.
    """

    def __init__(self, in_features=26, num_classes=7):
        super().__init__()
        # Attribute names are unchanged so state_dict keys stay the same.
        self.layer1 = nn.Linear(in_features, 100)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(100, 200)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(200, 100)
        self.act3 = nn.ReLU()
        self.layer4 = nn.Linear(100, 50)
        self.act4 = nn.ReLU()
        self.layer5 = nn.Linear(50, num_classes)

    def forward(self, x):
        """Return the class scores for ``x`` (last dimension: in_features)."""
        hidden_stages = (
            (self.layer1, self.act1),
            (self.layer2, self.act2),
            (self.layer3, self.act3),
            (self.layer4, self.act4),
        )
        for linear, activation in hidden_stages:
            x = activation(linear(x))
        return self.layer5(x)

In [20]:
# Create the classifier instance that the following cells train and use.
model = RecognizeNet()

In [21]:
# ---- Training configuration ----
total_step = len(train_loader)  # number of batches per epoch
epochs = 1600
lr = 0.003
optimizer = torch.optim.Adam(model.parameters(), lr)
# The targets are one-hot float rows, so the loss receives class-probability
# targets (NOTE(review): this target form needs a reasonably recent PyTorch).
loss_fn = nn.CrossEntropyLoss()

loss_list = []  # training loss of every batch (was declared but never filled)
acc_list = []   # training accuracy of every batch

# Make sure the model is in training mode even if an evaluation cell that
# calls model.eval() was executed before this one.
model.train()

for epoch in tqdm(range(epochs)):
    # Dedicated batch names keep the global label frame `y` from being clobbered.
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        preds = model(batch_x)

        loss = loss_fn(preds, batch_y)
        loss.backward()
        optimizer.step()

        # Bookkeeping only - no gradients needed. Softmax is monotonic, so the
        # arg-max of the raw scores already gives the predicted class.
        with torch.no_grad():
            predicted = preds.argmax(dim=1)
            true = batch_y.argmax(dim=1)
        total = batch_y.size(0)
        correct = (predicted == true).sum().item()

        loss_list.append(loss.item())
        acc_list.append(correct / total)

    # Progress report every 100 epochs; the figures are those of the LAST
    # batch of the epoch, not an epoch average.
    if (epoch + 1) % 100 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}%'
              .format(epoch + 1, epochs, loss.item(),
                      (correct / total) * 100))

  6%|██▌                                     | 101/1600 [00:08<01:51, 13.49it/s]

Epoch [100/1600], Loss: 1.2955, Accuracy: 44.14%


 13%|█████                                   | 201/1600 [00:21<03:21,  6.94it/s]

Epoch [200/1600], Loss: 0.7160, Accuracy: 72.46%


 19%|███████▌                                | 302/1600 [00:32<02:03, 10.47it/s]

Epoch [300/1600], Loss: 0.5327, Accuracy: 79.10%


 25%|██████████                              | 401/1600 [00:43<02:27,  8.11it/s]

Epoch [400/1600], Loss: 0.4249, Accuracy: 84.18%


 31%|████████████▌                           | 502/1600 [01:01<02:00,  9.11it/s]

Epoch [500/1600], Loss: 0.4581, Accuracy: 83.40%


 38%|███████████████                         | 601/1600 [01:12<01:59,  8.33it/s]

Epoch [600/1600], Loss: 0.1923, Accuracy: 92.97%


 44%|█████████████████▌                      | 700/1600 [01:28<04:18,  3.48it/s]

Epoch [700/1600], Loss: 0.1751, Accuracy: 93.95%


 50%|████████████████████                    | 802/1600 [01:41<01:11, 11.09it/s]

Epoch [800/1600], Loss: 0.1062, Accuracy: 96.09%


 56%|██████████████████████▌                 | 902/1600 [01:50<01:00, 11.50it/s]

Epoch [900/1600], Loss: 0.0659, Accuracy: 97.07%


 63%|████████████████████████▍              | 1002/1600 [01:59<00:53, 11.27it/s]

Epoch [1000/1600], Loss: 0.0144, Accuracy: 99.80%


 69%|██████████████████████████▊            | 1102/1600 [02:09<00:44, 11.07it/s]

Epoch [1100/1600], Loss: 0.0013, Accuracy: 100.00%


 75%|█████████████████████████████▎         | 1201/1600 [02:19<00:39, 10.02it/s]

Epoch [1200/1600], Loss: 0.0006, Accuracy: 100.00%


 81%|███████████████████████████████▋       | 1300/1600 [02:28<00:28, 10.68it/s]

Epoch [1300/1600], Loss: 0.0004, Accuracy: 100.00%


 88%|██████████████████████████████████▏    | 1402/1600 [02:41<00:32,  6.03it/s]

Epoch [1400/1600], Loss: 0.0003, Accuracy: 100.00%


 94%|████████████████████████████████████▌  | 1501/1600 [02:56<00:11,  8.84it/s]

Epoch [1500/1600], Loss: 0.0001, Accuracy: 100.00%


100%|███████████████████████████████████████| 1600/1600 [03:09<00:00,  8.47it/s]

Epoch [1600/1600], Loss: 0.0001, Accuracy: 100.00%





## Тестирование нашей нейронки

In [22]:
# Measure accuracy on the held-out split with the model in evaluation mode
# and gradient tracking switched off.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for x, y in test_loader:
        probabilities = F.softmax(model(x), dim=1)
        predicted = probabilities.argmax(dim=1)  # most probable class per sample
        y_true = y.argmax(dim=1)                 # position of the 1 in each one-hot row
        total += y_true.size(0)
        correct += (predicted == y_true).sum().item()

    print('Accuracy of the model on the Test Data: {} %'.format((correct / total) * 100))

Accuracy of the model on the Test Data: 84.07407407407408 %


# Внедряем нашу нейронку (создание мини Голосового Ассистента)

In [145]:
# Install the libraries in case they are not installed yet
!pip3 install sounddevice
!pip3 install scipy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [23]:
import sounddevice
from scipy.io.wavfile import write


def voice_recorder(seconds, file, samplerate=44100):
    """Record mono audio with sounddevice and save it as a WAV file.

    Args:
        seconds: recording length in seconds; must be positive.
        file: destination handed to ``scipy.io.wavfile.write``.
        samplerate: sampling rate in Hz, used both for recording and for the
            saved file. Defaults to 44100, the value that was hard-coded.

    Raises:
        ValueError: if ``seconds`` is not positive.
    """
    if seconds <= 0:
        raise ValueError("seconds must be positive")
    # The frame count must be an integer; a float duration such as 2.5 would
    # otherwise produce a float here.
    frames = int(seconds * samplerate)
    print("Recording Started…")
    recording = sounddevice.rec(frames, samplerate=samplerate, channels=1)
    sounddevice.wait()  # block until the recording has finished
    write(file, samplerate, recording)
    print("Recording Finished")

In [24]:
# Output index of the network (as a string) -> emotion name. The order is
# alphabetical, i.e. the column order pd.get_dummies gives the label column.
emo_dict = {"0": "angry",
            "1": "disgust",
            "2": "fear",
            "3": "happy",
            "4": "neutral",
            "5": "pleasant_surprised",
            "6": "sad"}


def _extract_features(path):
    """Return the averaged audio descriptors of ``path`` in dataset column order.

    The order is chroma_stft, rms, spectral centroid, spectral bandwidth,
    roll-off and zero-crossing rate, followed by one mean per MFCC row.
    """
    signal, sr = librosa.load(path, mono=True)
    descriptors = (
        librosa.feature.chroma_stft(y=signal, sr=sr),
        librosa.feature.rms(y=signal),
        librosa.feature.spectral_centroid(y=signal, sr=sr),
        librosa.feature.spectral_bandwidth(y=signal, sr=sr),
        librosa.feature.spectral_rolloff(y=signal, sr=sr),
        librosa.feature.zero_crossing_rate(signal),
    )
    features = [np.mean(descriptor) for descriptor in descriptors]
    features.extend(np.mean(row) for row in librosa.feature.mfcc(y=signal, sr=sr))
    return features


def recognize_emotion(model=model, emo_dict=emo_dict, seconds=5):
    """Record the user's voice and return the emotion predicted by ``model``.

    A base file name is read from standard input, ``seconds`` of audio are
    recorded into that file, the audio features are extracted and classified.

    Args:
        model: network returning one score per emotion class.
        emo_dict: maps the winning class index (as a string) to its name.
        seconds: recording length in seconds.

    Returns:
        The predicted emotion name; it is also printed in upper case.
    """
    # voice_recorder writes WAV data, so the file gets a matching ".wav"
    # extension (it used to be saved under a misleading ".m4a" name).
    songname = f"{input()}.wav"
    voice_recorder(seconds, songname)

    features = torch.tensor(_extract_features(songname), dtype=torch.float32)

    model.eval()  # inference mode
    with torch.no_grad():
        prediction = model(features)

    # The input is a single unbatched vector, so the scores are 1-D.
    pred_emo_index = prediction.argmax(dim=0).item()
    emotion = emo_dict[str(pred_emo_index)]

    print(f"You are {emotion.upper()}")
    return emotion

In [25]:
# Try the recogniser once: record five seconds and classify the emotion.
recognize_emotion(seconds=5)

record
Recording Started…
Recording Finished
You are PLEASANT_SURPRISED


'pleasant_surprised'

In [26]:
# Emotion labels (same list as the one defined for the dataset-building step).
genres = ["angry", "sad", "happy", "fear", "disgust", "neutral", "pleasant_surprised"]

# Canned replies. reply() indexes this table by emotion name, so it presumably
# has one column per emotion - verify against answers.csv.
answers = pd.read_csv("answers.csv")

In [29]:
def reply(seconds=5):
    """Record the user, detect the emotion and speak a matching canned answer.

    Args:
        seconds: recording length in seconds, passed on to recognize_emotion.

    Side effects: writes ``answer.mp3`` to the working directory and plays it.
    """
    emotion = recognize_emotion(seconds=seconds)
    # read_csv fills empty cells with NaN, which would happen if the emotion
    # columns hold different numbers of answers. Drop those so a missing value
    # is never handed to the speech synthesiser.
    candidates = answers[emotion].dropna()
    answer = candidates.sample().iloc[0]  # one random reply for this emotion
    t1 = gtts.gTTS(answer, tld="ru", lang="ru")
    t1.save("answer.mp3")
    print("Отвечает...")
    playsound("answer.mp3")
    print("Ответ ")

In [30]:
# Run the assistant once: five seconds of recording, then a spoken reply.
reply(seconds=5)

record
Recording Started…
Recording Finished
You are ANGRY
Отвечает...
Ответ 
