In [94]:
import pandas as pd
import opensmile
from tqdm import tqdm
import keras
import numpy as np
from sklearn.model_selection import train_test_split
import librosa

In [83]:
def return_features(file,pad):
    X, sample_rate = librosa.load(file)
    max_ = X.shape[0] / sample_rate
    if max_ < pad:
        length = (pad * sample_rate) -  (X.shape[0] / sample_rate)
        X = librosa.util.pad_center(X, size = length, mode = 'constant')
    smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.emobase,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
    )
    y = smile.process_signal(X,sample_rate)
    #  smile.process_file(file, end=3)
    return y
def get_max_min(files):
    min_, max_ = 100, 0
    for file in files:
        sound_file, samplerate = librosa.load(file)
        t = sound_file.shape[0] / samplerate
        if t < min_:
            min_ = t
        if t >= max_:
            max_ = t

    return np.round(max_,2) + 0.01, min_

In [103]:
data_df = pd.read_csv("EMOVO_dataset/data.csv")

In [104]:
train_data = pd.DataFrame(columns=['filename', 'features', 'label'])
max, min = get_max_min('EMOVO_dataset/'+data_df.file_name)
features = []
for index, file in tqdm(zip(data_df.index, data_df.file_name)):
    train_data.loc[index] = [file, return_features('EMOVO_dataset/'+file,max), data_df.label[index]]

588it [01:02,  9.38it/s]


In [105]:
data_classes = (list((train_data["label"].unique())))
Y = keras.utils.to_categorical(list((train_data["label"].apply(data_classes.index))))
# X = np.concatenate([X1,X2,X3,X4], axis=2)
X = np.expand_dims(np.stack(train_data["features"]),3)


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=22)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=22)

In [89]:
def get_cnn(input_shape):
    model = keras.Sequential()

    model.add(keras.layers.Input(shape=input_shape))

    model.add(keras.layers.Conv2D(256, 3, activation='relu' ))
    model.add(keras.layers.MaxPooling2D(padding='same'))
    model.add(keras.layers.Dropout(rate=0.3))

    model.add(keras.layers.Conv2D(128, 3, activation='relu'))
    model.add(keras.layers.MaxPooling2D(padding='same'))
    model.add(keras.layers.Dropout(rate=0.3))


    model.add(keras.layers.Conv2D(64, 3, activation='relu'))
    model.add(keras.layers.MaxPooling2D(padding='same'))
    model.add(keras.layers.Dropout(rate=0.3))

    model.add(keras.layers.GlobalAveragePooling2D())
    model.add(keras.layers.Dense(1024, activation='relu'))
    
    model.add(keras.layers.Dense(256, activation='relu'))
    model.add(keras.layers.Dense(64, activation='relu'))

    model.add(keras.layers.Dense(7, activation='softmax'))

    optimzer = keras.optimizers.Adam()
    model.compile(loss='categorical_crossentropy', optimizer=optimzer, metrics=['accuracy'])

    return model

In [100]:
from datetime import datetime  
name = datetime.now().strftime("model/emobase_feat/SER_EMOVO_Clean%d_%m_%Y_%H_%M_%S.keras")  

callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath = name,
        save_best_only=True,
        verbose=1,
        monitor="val_loss"),

    keras.callbacks.EarlyStopping(  
        monitor="val_loss",
        min_delta=0.001,
        patience=10,
        verbose=1,
        mode="auto",
        restore_best_weights=True
    )
]

model = get_cnn((X_train.shape[1:]))
# model.summary()

In [209]:
history = model.fit(X_train, y_train, 
                       validation_data=(X_val,y_val), 
                       batch_size=32,
                       epochs=1000,
                       callbacks=callbacks)


print(f"Loss : {model.evaluate(X_test,y_test)[0]}, Accuracy : {model.evaluate(X_test,y_test)[1]}")

Epoch 1/1000
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.1581 - loss: 2.0543
Epoch 1: val_loss improved from inf to 1.93774, saving model to model/emobase_feat/SER_EMOVO_Clean05_10_2024_11_16_59.keras
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 1s/step - accuracy: 0.1557 - loss: 2.0524 - val_accuracy: 0.1792 - val_loss: 1.9377
Epoch 2/1000
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.1472 - loss: 1.9449
Epoch 2: val_loss did not improve from 1.93774
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 1s/step - accuracy: 0.1467 - loss: 1.9452 - val_accuracy: 0.1792 - val_loss: 1.9451
Epoch 3/1000
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.1339 - loss: 1.9431
Epoch 3: val_loss did not improve from 1.93774
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2s/step - accuracy: 0.1338 - loss: 1.9431 - val_accuracy: 0.09

In [92]:
data_df = pd.read_csv("RAVDESS/data.csv")
data_df.drop(data_df[data_df['label'] == "calm"].index, inplace = True)
data_df = data_df.reset_index()

In [93]:
train_data = pd.DataFrame(columns=['filename', 'features', 'label'])
max, min = get_max_min('RAVDESS/'+data_df.file_name)
features = []
for index, file in tqdm(zip(data_df.index, data_df.file_name)):
    train_data.loc[index] = [file, return_features('RAVDESS/'+file,max), data_df.label[index]]

1248it [01:08, 18.09it/s]


In [94]:
X, sample_rate = librosa.load('RAVDESS/'+data_df.file_name[10])
max_ = X.shape[0] / sample_rate
print(max_)

if max_ <= max:
    length = (max * sample_rate) -  (X.shape[0] / sample_rate)
    print(length)
    X = librosa.util.pad_center(X, size = length, mode = 'constant')
print(np.round(X.shape[0] / sample_rate,2))

smile = opensmile.Smile(
feature_set=opensmile.FeatureSet.emobase,
feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)

y = smile.process_signal(X,sample_rate)
X.shape, y.shape

3.937278911564626
116420.06272108843
5.28


((116420,), (524, 26))

In [95]:
X, sample_rate = librosa.load('RAVDESS/'+data_df.file_name[1129])
max_ = X.shape[0] / sample_rate
print(max_)
print(max)
if max_ <= max:
    length = (max * sample_rate) -  (X.shape[0] / sample_rate)
    print(length)
    X = librosa.util.pad_center(X, size = length, mode = 'constant')
print(np.round(X.shape[0] / sample_rate,2))

smile = opensmile.Smile(
feature_set=opensmile.FeatureSet.emobase,
feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)

y = smile.process_signal(X,sample_rate)
X.shape, y.shape

3.9039002267573695
5.279999999999999
116420.09609977323
5.28


((116420,), (524, 26))

In [96]:
for i,feat in enumerate(train_data["features"]):
    if feat.shape != (524,26):
        print(feat.shape,i)

In [97]:
data_classes = (list((train_data["label"].unique())))
Y = keras.utils.to_categorical(list((train_data["label"].apply(data_classes.index))))
# X = np.concatenate([X1,X2,X3,X4], axis=2)
X = np.expand_dims(np.stack(train_data["features"]),3)


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=22)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=22)

In [101]:
from datetime import datetime  
name = datetime.now().strftime("model/emobase_feat/SER_RAVDESS_Clean%d_%m_%Y_%H_%M_%S.keras")  

callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath = name,
        save_best_only=True,
        verbose=1,
        monitor="val_loss"),

    keras.callbacks.EarlyStopping(  
        monitor="val_loss",
        min_delta=0.001,
        patience=10,
        verbose=1,
        mode="auto",
        restore_best_weights=True
    )
]

model = get_cnn((X_train.shape[1:]))
# model.summary()

In [99]:
history = model.fit(X_train, y_train, 
                       validation_data=(X_val,y_val), 
                       batch_size=32,
                       epochs=1000,
                       callbacks=callbacks)


print(f"Loss : {model.evaluate(X_test,y_test)[0]}, Accuracy : {model.evaluate(X_test,y_test)[1]}")

Epoch 1/1000
[1m28/29[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 420ms/step - accuracy: 0.1534 - loss: 2.0413
Epoch 1: val_loss improved from inf to 1.91715, saving model to model/emobase_feat/SER_RAVDESS_Clean05_10_2024_12_41_13.keras
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 446ms/step - accuracy: 0.1540 - loss: 2.0371 - val_accuracy: 0.2089 - val_loss: 1.9171
Epoch 2/1000
[1m28/29[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 450ms/step - accuracy: 0.1527 - loss: 1.9223
Epoch 2: val_loss improved from 1.91715 to 1.89592, saving model to model/emobase_feat/SER_RAVDESS_Clean05_10_2024_12_41_13.keras
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 477ms/step - accuracy: 0.1524 - loss: 1.9218 - val_accuracy: 0.1867 - val_loss: 1.8959
Epoch 3/1000
[1m28/29[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 523ms/step - accuracy: 0.2055 - loss: 1.8901
Epoch 3: val_loss did not improve from 1.89592
[1m29/29[0m [32m━━━━━━━━

In [106]:
model.evaluate(X,Y)

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 252ms/step - accuracy: 0.1483 - loss: 1.9782


[1.9752070903778076, 0.1428571492433548]