# **File structure**


In [2]:
# Create the dataset directory tree (no spaces inside the braces, otherwise the shell does not expand them).
!mkdir -p datasets/{datapaths,EMO-DB,RAVDESS,TESS,CREMA-D,SAVEE,EMOVO,MELD/{train,test,dev}}


# **Imports**


In [None]:
import pandas as pd
import numpy as np

import sys
import os

import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import Audio


# **Data processing**


## Emotion segregation


##### RAVDESS


In [None]:
ravdess_dir = "datasets/RAVDESS/speech/"

# File names are dash-separated numeric fields; the third field is the
# emotion id that this table translates into a label.
ravdess_emotions = {
    1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
    5: 'angry', 6: 'fear', 7: 'disgusted', 8: 'surprised'}

# One sub-folder per actor; stray files are skipped so that the nested
# os.listdir() call below cannot fail on a non-directory.
dir_list = sorted(
    ravdess_dir + folder + '/'
    for folder in os.listdir(ravdess_dir)
    if os.path.isdir(ravdess_dir + folder))

file_emotion = []
file_path = []
for cur_dir in dir_list:
    for cur_file in os.listdir(cur_dir):
        emotion_id = int(cur_file.split('.')[0].split('-')[2])
        # Translate while collecting instead of calling
        # `ravdess_df.Emotion.replace(..., inplace=True)` afterwards: that
        # chained in-place call is not applied to the frame under pandas
        # Copy-on-Write. Ids missing from the table are kept unchanged.
        file_emotion.append(ravdess_emotions.get(emotion_id, emotion_id))
        file_path.append(cur_dir + cur_file)

emotion_df = pd.DataFrame(file_emotion, columns=['Emotion'])
path_df = pd.DataFrame(file_path, columns=['Path'])

ravdess_df = pd.concat([emotion_df, path_df], axis=1)
ravdess_df.head()


##### CREMA


In [None]:
crema_dir = 'datasets/CREMA-D/AudioWAV'
crema_files = os.listdir(crema_dir)

# Third underscore-separated field of a file name -> emotion label.
crema_codes = {
    'SAD': 'sad',
    'ANG': 'angry',
    'DIS': 'disgusted',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral'
}

file_path = [crema_dir + '/' + wav_name for wav_name in crema_files]
# Codes missing from the table are labelled 'Unknown'.
file_emotion = [
    crema_codes.get(wav_name.split('_')[2], 'Unknown')
    for wav_name in crema_files
]

emotion_df = pd.DataFrame(file_emotion, columns=['Emotion'])
path_df = pd.DataFrame(file_path, columns=['Path'])
crema_df = pd.concat([emotion_df, path_df], axis=1)
crema_df.head()


##### TESS


In [None]:
tess_dir = 'datasets/TESS'
tess_files = os.listdir(tess_dir)

# Only these two file-name labels are renamed; every other label is used
# exactly as it appears in the file name.
tess_emo_dict = {
    'ps': 'surprised',
    'disgust': 'disgusted'
}

# Keep only entries whose last dot-separated part is 'wav'.
wav_names = [name for name in tess_files if name.split('.')[-1] == 'wav']

file_path = [tess_dir + '/' + name for name in wav_names]
file_emotion = []
for name in wav_names:
    # The label is the third underscore-separated field of the stem.
    label = name.split('.')[0].split('_')[2]
    file_emotion.append(tess_emo_dict.get(label, label))

emotion_df = pd.DataFrame(file_emotion, columns=['Emotion'])
path_df = pd.DataFrame(file_path, columns=['Path'])
tess_df = pd.concat([emotion_df, path_df], axis=1)
tess_df.head()


##### SAVEE


In [None]:
savee_dir = 'datasets/SAVEE/AudioData'
savee_folders = os.listdir(savee_dir)

# Letter prefix of a file name (the part before the trailing digits)
# -> emotion label.
savee_codes = {
    'a': 'angry',
    'd': 'disgusted',
    'f': 'fear',
    'h': 'happy',
    'n': 'neutral',
    'sa': 'sad',
    'su': 'surprised'}

file_emotion = []
file_path = []
for folder in savee_folders:
    folder_path = savee_dir + '/' + folder
    # Plain files sitting next to the sub-folders would make os.listdir()
    # raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        continue
    for file_name in os.listdir(folder_path):
        # Same filter as the TESS cell: only audio files are labelled.
        if file_name.split('.')[-1] != 'wav':
            continue
        file_path.append(folder_path + '/' + file_name)
        # Strip the trailing digits from the stem instead of relying on the
        # total file-name length to tell one- and two-letter codes apart.
        part = file_name.split('.')[0].rstrip('0123456789')
        file_emotion.append(savee_codes.get(part, 'Unknown'))

emotion_df = pd.DataFrame(file_emotion, columns=['Emotion'])
path_df = pd.DataFrame(file_path, columns=['Path'])
savee_df = pd.concat([emotion_df, path_df], axis=1)
savee_df.head()


##### EMO-DB


In [None]:
emo_dir = 'datasets/EMO-DB/wav'
emo_files = os.listdir(emo_dir)

# Second-to-last character of the file stem -> emotion label.
emo_codes = {
    'W': 'angry',
    'L': 'neutral',
    'E': 'disgusted',
    'A': 'fear',
    'F': 'happy',
    'T': 'sad',
    'N': 'neutral'
}

file_path = [emo_dir + '/' + wav_name for wav_name in emo_files]
# Codes missing from the table are labelled 'Unknown'.
file_emotion = [
    emo_codes.get(wav_name.split('.')[0][-2], 'Unknown')
    for wav_name in emo_files
]

emotion_df = pd.DataFrame(file_emotion, columns=['Emotion'])
path_df = pd.DataFrame(file_path, columns=['Path'])
emo_df = pd.concat([emotion_df, path_df], axis=1)
emo_df.head()


## Join & save datasets


In [None]:
# Export one path/label table per dataset.
ravdess_df.to_csv('datasets/ravdess.csv', index=False)
tess_df.to_csv('datasets/tess.csv', index=False)
savee_df.to_csv('datasets/savee.csv', index=False)
crema_df.to_csv('datasets/crema.csv', index=False)
emo_df.to_csv('datasets/emo.csv', index=False)

# ignore_index gives the combined table one continuous index instead of
# repeating each dataset's own 0..n-1 labels.
data_paths = pd.concat(
    [ravdess_df, tess_df, savee_df, crema_df, emo_df],
    axis=0, ignore_index=True)

# The combined table goes into the `datasets/datapaths` folder set up by the
# first cell; a bare `datapaths/` directory is never created, so writing
# there failed. makedirs keeps this cell usable on its own.
os.makedirs('datasets/datapaths', exist_ok=True)
data_paths.to_csv("datasets/datapaths/all_data_paths.csv", index=False)


## Data loading


In [None]:
# Reload the per-dataset path/label tables from their CSV exports.
ravdess_df, tess_df, savee_df, crema_df, emo_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'drive/MyDrive/RAVDESS/ravdess.csv',
        'drive/MyDrive/TESS/tess.csv',
        'drive/MyDrive/SAVEE/savee.csv',
        'drive/MyDrive/CREMA-D/crema.csv',
        'drive/MyDrive/EMO/emo.csv',
    )
)

# Stack all datasets into the single table used by the rest of the notebook.
all_frames = [ravdess_df, tess_df, savee_df, crema_df, emo_df]
data_paths = pd.concat(all_frames, axis=0)
data_paths.head()


## Visualization and statistics


##### Functions


In [None]:
def create_waveplot(data, sr, e):
    """Plot the waveform of an audio signal in a new 10x3 figure and show it.

    Args:
        data: audio samples, passed straight to librosa's waveform plotter.
        sr: sampling rate of ``data``.
        e: emotion label inserted into the figure title.
    """
    # librosa 0.10 removed ``waveplot``; ``waveshow`` is its replacement.
    # The old name is only looked up when the new one is missing, so the
    # function works on both older and current librosa releases.
    draw_wave = (getattr(librosa.display, 'waveshow', None)
                 or librosa.display.waveplot)
    plt.figure(figsize=(10, 3))
    plt.title(f'Waveplot for audio with {e} emotion', size=15)
    draw_wave(data, sr=sr)
    plt.show()


def create_spectrogram(data, sr, e):
    """Draw the dB-scaled STFT magnitude of a signal in a new 12x3 figure.

    Args:
        data: audio samples handed to ``librosa.stft``.
        sr: sampling rate used to scale the plot axes.
        e: emotion label inserted into the figure title.
    """
    magnitude = abs(librosa.stft(data))
    decibels = librosa.amplitude_to_db(magnitude)
    plt.figure(figsize=(12, 3))
    plt.title(f'Spectrogram for audio with {e} emotion', size=15)
    # Time on the x axis, frequency in Hz on the y axis.
    librosa.display.specshow(decibels, sr=sr, x_axis='time', y_axis='hz')
    plt.colorbar()


def show_emotion(emotion):
    """Plot waveform and spectrogram for one recording of ``emotion``.

    The recording is the second entry of the module-level ``data_paths``
    table whose ``Emotion`` column equals ``emotion``.

    Returns:
        The path of the recording that was plotted.
    """
    matching_paths = data_paths.Path[data_paths.Emotion == emotion]
    path = np.array(matching_paths)[1]
    signal, rate = librosa.load(path)
    for draw in (create_waveplot, create_spectrogram):
        draw(signal, rate, emotion)
    return path


### Distribution of emotions in the whole dataset


In [None]:
plt.title('Count of emotions', size=16)
# Pass the labels as the named `x` variable: current seaborn releases no
# longer read a positional vector as `x`. The index is reset because
# data_paths is a concatenation whose index labels repeat.
sns.countplot(x=data_paths.Emotion.reset_index(drop=True))
plt.ylabel('Count', size=12)
plt.xlabel('Emotions', size=12)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()


### Particular emotions visualization


##### Fear


In [None]:
# Plot waveform and spectrogram of one 'fear' recording, then embed a player for that file.
Audio(show_emotion('fear'))


##### Sad


In [None]:
# Plot waveform and spectrogram of one 'sad' recording, then embed a player for that file.
Audio(show_emotion('sad'))


##### Happy


In [None]:
# Plot waveform and spectrogram of one 'happy' recording, then embed a player for that file.
Audio(show_emotion('happy'))


##### Angry


In [None]:
# Plot waveform and spectrogram of one 'angry' recording, then embed a player for that file.
Audio(show_emotion('angry'))


##### Calm


In [None]:
# Plot waveform and spectrogram of one 'calm' recording, then embed a player for that file.
Audio(show_emotion('calm'))


##### Surprised


In [None]:
# Plot waveform and spectrogram of one 'surprised' recording, then embed a player for that file.
Audio(show_emotion('surprised'))


##### Disgusted


In [None]:
# Plot waveform and spectrogram of one 'disgusted' recording, then embed a player for that file.
Audio(show_emotion('disgusted'))


##### Neutral


In [None]:
# Plot waveform and spectrogram of one 'neutral' recording, then embed a player for that file.
Audio(show_emotion('neutral'))


### Data augmentation


In [None]:
def noise(data):
    """Return ``data`` with Gaussian noise of a random amplitude added.

    The noise amplitude is a uniformly drawn fraction (below 3.5 %) of the
    largest sample value in ``data``; the input array is left untouched.
    """
    amplitude = 0.035 * np.random.uniform() * np.amax(data)
    gaussian = np.random.normal(size=data.shape[0])
    return data + amplitude * gaussian


def stretch(data, rate=0.8):
    """Time-stretch ``data`` by ``rate`` with librosa.

    Args:
        data: audio samples.
        rate: stretch factor forwarded to ``librosa.effects.time_stretch``.

    Returns:
        The stretched signal.
    """
    # `rate` is keyword-only since librosa 0.10; naming it also works on
    # older releases that accepted it positionally.
    return librosa.effects.time_stretch(data, rate=rate)


def shift(data):
    """Return ``data`` circularly rotated by a random number of samples.

    The offset is drawn uniformly from roughly -5000 to 5000 samples;
    samples pushed past one end re-enter at the other (``np.roll``).
    """
    offset = int(1000 * np.random.uniform(low=-5, high=5))
    rolled = np.roll(data, offset)
    return rolled


def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift ``data`` with librosa.

    Args:
        data: audio samples.
        sampling_rate: sampling rate of ``data``.
        pitch_factor: forwarded as librosa's ``n_steps`` argument.

    Returns:
        The pitch-shifted signal.
    """
    # `sr` and `n_steps` are keyword-only since librosa 0.10; the named form
    # maps onto the same parameters on older releases as well.
    return librosa.effects.pitch_shift(
        data, sr=sampling_rate, n_steps=pitch_factor)


# Load the second recording listed in data_paths as the augmentation demo sample.
path = data_paths.Path.to_numpy()[1]
data, sample_rate = librosa.load(path)


#### Clear data (without augmentation)


In [None]:
plt.figure(figsize=(14, 4))
plt.title("Clear recording", size=15)
# librosa 0.10 removed `waveplot`; use its replacement `waveshow` and fall
# back to the old name only on releases that do not have it yet.
draw_wave = (getattr(librosa.display, 'waveshow', None)
             or librosa.display.waveplot)
draw_wave(y=data, sr=sample_rate)
# Player for the original, unmodified file.
Audio(path)


#### Augmented data


In [None]:
# Waveform and player for the sample with random noise added.
x = noise(data)
plt.figure(figsize=(14, 4))
plt.title("With random noise", size=20)
# librosa 0.10 removed `waveplot`; use its replacement `waveshow` and fall
# back to the old name only on releases that do not have it yet.
draw_wave = (getattr(librosa.display, 'waveshow', None)
             or librosa.display.waveplot)
draw_wave(y=x, sr=sample_rate)
Audio(x, rate=sample_rate)


In [None]:
# Waveform and player for the time-stretched sample.
x = stretch(data)
plt.figure(figsize=(14, 4))
plt.title("Streched", size=20)
# librosa 0.10 removed `waveplot`; use its replacement `waveshow` and fall
# back to the old name only on releases that do not have it yet.
draw_wave = (getattr(librosa.display, 'waveshow', None)
             or librosa.display.waveplot)
draw_wave(y=x, sr=sample_rate)
Audio(x, rate=sample_rate)


In [None]:
# Waveform and player for the circularly shifted sample.
x = shift(data)
plt.figure(figsize=(14, 4))
plt.title("Shifted", size=20)
# librosa 0.10 removed `waveplot`; use its replacement `waveshow` and fall
# back to the old name only on releases that do not have it yet.
draw_wave = (getattr(librosa.display, 'waveshow', None)
             or librosa.display.waveplot)
draw_wave(y=x, sr=sample_rate)
Audio(x, rate=sample_rate)


In [None]:
# Waveform and player for the pitch-shifted sample.
x = pitch(data, sample_rate)
plt.figure(figsize=(14, 4))
plt.title("Pitch change", size=20)
# librosa 0.10 removed `waveplot`; use its replacement `waveshow` and fall
# back to the old name only on releases that do not have it yet.
draw_wave = (getattr(librosa.display, 'waveshow', None)
             or librosa.display.waveplot)
draw_wave(y=x, sr=sample_rate)
Audio(x, rate=sample_rate)


## Data generation


In [None]:
# Load the twelfth recording listed in data_paths and trim it with
# librosa.effects.trim before plotting.
path = np.array(data_paths.Path)[11]
y, sr = librosa.load(path)
some_voice, _ = librosa.effects.trim(y)
# librosa 0.10 removed `waveplot`; use its replacement `waveshow` and fall
# back to the old name only on releases that do not have it yet.
draw_wave = (getattr(librosa.display, 'waveshow', None)
             or librosa.display.waveplot)
draw_wave(some_voice, sr=sr)


#### Amplitude to dB


In [None]:
hop_length = 512
# Magnitude of the 2048-point STFT, converted to dB relative to its peak.
stft_frames = librosa.stft(some_voice, n_fft=2048, hop_length=hop_length)
D = np.abs(stft_frames)
DB = librosa.amplitude_to_db(D, ref=np.max)
# Hide the axes so that only the spectrogram image is drawn.
plt.axis('off')
librosa.display.specshow(
    DB,
    sr=sr,
    hop_length=hop_length,
    x_axis='time',
    y_axis='log',
)


#### Power to dB


In [None]:
# Mel spectrogram computed with a 512-sample Hann window and a 512-sample hop.
mel_settings = dict(
    y=some_voice,
    sr=sr,
    hop_length=512,
    win_length=512,
    window=np.hanning(512),
)
img_array = librosa.feature.melspectrogram(**mel_settings)

# Power values converted to dB relative to the peak.
M_db = librosa.power_to_db(img_array, ref=np.max)
plt.axis('off')
img = librosa.display.specshow(M_db, y_axis='mel', x_axis='time')


#### Mel-spectrogram


In [None]:
# 128-band mel spectrogram of the trimmed sample, shown in dB relative to its
# peak. The signal is passed as `y=`: librosa >= 0.10 rejects it positionally,
# and the keyword form is accepted by older releases too.
S = librosa.feature.melspectrogram(
    y=some_voice, sr=sr, n_fft=2048, hop_length=hop_length, n_mels=128)
S_DB = librosa.power_to_db(S, ref=np.max)
plt.axis('off')
librosa.display.specshow(
    S_DB, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel')


### Spectrogram dataset generation


In [None]:
def create_mel(sound, path_to_save, mode='stft'):
    """Render a spectrogram of ``sound`` and save it as ``<path_to_save>.png``.

    Reads the module-level ``N_FFT``, ``WIN_HOP_LENGTH`` and
    ``sampling_rate`` values, which must be set before the call.

    Args:
        sound: audio samples to render.
        path_to_save: output path without the ``.png`` extension.
        mode: ``'mel'`` for a mel spectrogram (256 bands, power in dB) or
            ``'stft'`` for a log-frequency STFT magnitude spectrogram in dB.

    Returns:
        None. An unsupported ``mode`` prints "Wrong mode" and saves nothing.
    """
    # Validate before creating the figure so a wrong mode leaves no figure open.
    if mode not in ('mel', 'stft'):
        print("Wrong mode")
        return

    fig, ax = plt.subplots()
    ax.axis('off')
    try:
        if mode == 'mel':
            # Use the `sound` argument: the previous code read the global
            # `some_sound`, so augmented signals passed in were ignored.
            D = librosa.feature.melspectrogram(
                y=sound, sr=sampling_rate, n_fft=N_FFT,
                hop_length=WIN_HOP_LENGTH, n_mels=256)
            DB = librosa.power_to_db(D, ref=np.max)
            librosa.display.specshow(
                DB, sr=sampling_rate, hop_length=WIN_HOP_LENGTH,
                x_axis='time', y_axis='mel')
        else:
            D = np.abs(librosa.stft(sound, n_fft=N_FFT,
                       hop_length=WIN_HOP_LENGTH))
            DB = librosa.amplitude_to_db(D, ref=np.max)
            librosa.display.specshow(
                DB, sr=sampling_rate, hop_length=WIN_HOP_LENGTH,
                x_axis='time', y_axis='log')
        fig.savefig(f'{path_to_save}.png', bbox_inches='tight', pad_inches=0)
    finally:
        # Close this exact figure. `plt.figure().clear()` opened a new figure
        # and closed that one instead, so one figure leaked per saved image.
        plt.close(fig)


In [None]:
# STFT settings read by create_mel: FFT size and hop between frames.
N_FFT = 2048
WIN_HOP_LENGTH = 512

# Per-emotion counters, all starting at zero; the generation loop uses the
# current count as the image file name and then increments it.
emotions = dict.fromkeys(
    ('fear', 'happy', 'sad', 'surprised',
     'angry', 'disgusted', 'neutral', 'calm'),
    0,
)
# One output folder per emotion, under `train/` where the generation loop saves its images.
!mkdir -p train/{fear,happy,sad,surprised,angry,disgusted,neutral,calm}


In [None]:
# When True, four augmented variants (noise, stretch, pitch, shift) are
# rendered next to every clean spectrogram.
GEN_WITH_AUG = False

for _, image in data_paths.iterrows():
    print(f"{image.Emotion}: {image.Path}")

    # Labels without a counter (e.g. the 'Unknown' fallback of the dataset
    # cells) used to abort the whole run with a KeyError; skip them instead.
    if image.Emotion not in emotions:
        print(f"Skipping unsupported emotion label: {image.Emotion}")
        continue

    # These two names stay module-level: create_mel reads `sampling_rate`
    # (and, in its original form, `some_sound`) from the global scope.
    some_sound, sampling_rate = librosa.load(image.Path)

    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs(f'train/{image.Emotion}', exist_ok=True)
    base_name = f'train/{image.Emotion}/{emotions[image.Emotion]}'

    # Clear data
    create_mel(some_sound, base_name, 'mel')

    # Augmented data
    if GEN_WITH_AUG:
        create_mel(noise(some_sound), f'{base_name}_n', 'mel')
        create_mel(stretch(some_sound), f'{base_name}_s', 'mel')
        create_mel(pitch(some_sound, sampling_rate), f'{base_name}_p', 'mel')
        create_mel(shift(some_sound), f'{base_name}_sh', 'mel')

    emotions[image.Emotion] += 1
