<a href="https://colab.research.google.com/github/MSiswanto/NLP/blob/main/AIM0407D2203_Lab_1B_Speech_Emotion_Recognition_(SER).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AIM0407D2203 Lab 1B Speech Emotion Recognition (SER)
---

1. Ucapan dan emosi adalah dua aspek komunikasi terpenting di antara manusia yang menjadikan Speech Emotion Recognition (SER) sebagai salah satu komponen kunci pada sistem Human-Computer Interaction (HCI). 
2. SER adalah salah satu tugas NLP untuk mengenali aspek emosional dari ucapan manusia terlepas dari isi semantiknya.
3. SER dapat dikembangkan untuk banyak aplikasi, misalnya untuk aplikasi bagi individu dengan Gangguan Spektrum Autisme.

Tujuan utama dari eksperimen ini adalah untuk membuat sistem SER sederhana dan mengeksplorasi algoritma machine learning yang berbeda untuk tugas SER. Secara khusus, kita akan membandingkan pendekatan statistik tradisional dengan metode deep learning yang lebih modern berdasarkan metrik evaluasi untuk mempelajari lebih lanjut tentang struktur data dan kompleksitas tugas SER.

Secara umum, skema yang akan kita lakukan untuk eksperimen ini akan mengikuti bagan berikut:




# 01 Install & Import Library

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import librosa
import librosa.display

import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Import the audio playback widget
import IPython.display as ipd

# 02 Data Acquisition
Kita akan menggunakan dataset RAVDESS (Ryerson Audio-Visual Database of Emotional Speech and Song) untuk melakukan eksperimen ini. Dataset terdiri dari 1440 file dari 24 aktor profesional (12 pria dan 12 wanita) yang dibagi dalam 8 ekspresi emosi: calm, neutral, surprised, happy, angry, sad, fearful, dan disgust. Setiap aktor menyuarakan dua pernyataan: 01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door".

Baca selanjutnya tentang RAVDESS: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0196391

In [None]:
# https://drive.google.com/u/1/uc?id=1sX82joy1Rcw9p-YHZzc0bEfqfzOzckzY&export=download
!gdown 1sX82joy1Rcw9p-YHZzc0bEfqfzOzckzY

In [None]:
!unzip -q 'AIM0407D2203 Lab 1B RAVDESS Speech dataset.zip'

In [None]:
!ls

In [None]:
# Root folder of the extracted dataset; it contains one sub-folder per actor.
RAVD_PATH = 'ravdess_speech_dataset/'

# Names of the entries directly under the root folder, in alphabetical order.
dir_list = sorted(os.listdir(RAVD_PATH))

dir_list

In [None]:
# Parallel lists with one entry per audio file found under RAVD_PATH.
emotion = []     # numeric emotion code (3rd dash-separated field of the file name)
gender = []      # 'female' for even actor ids, 'male' for odd ones
actor = []       # numeric actor id (7th dash-separated field of the file name)
file_path = []   # path of the audio file, starting at RAVD_PATH

for i in dir_list:                                # Iterate over actor folders
  # os.listdir() returns entries in arbitrary order. Sorting keeps the row
  # order (and therefore any later positional lookup) reproducible.
  for f in sorted(os.listdir(RAVD_PATH + i)):     # Go through files in Actor folder
    part = f.split('.')[0].split('-')
    actor_id = int(part[6])
    emotion.append(int(part[2]))
    actor.append(actor_id)
    gender.append('female' if actor_id % 2 == 0 else 'male')
    file_path.append(RAVD_PATH + i + '/' + f)

In [None]:
# Text label for every numeric emotion code; codes 1 and 2 share the label 'neutral'.
emotion_names = {1:'neutral', 2:'neutral', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 8:'surprise'}

# One row per audio file, columns in the order: gender, emotion, actor, path.
df = pd.DataFrame({
    'gender': gender,
    'emotion': pd.Series(emotion).replace(emotion_names),
    'actor': actor,
    'path': file_path,
})
df

In [None]:
df.emotion.value_counts().plot(kind='bar')

In [None]:
df.to_csv('speech_path_df.csv')

# 03 Exploratory Data Analysis (EDA)

Fitur utama dari data audio adalah MFCC (Mel Frequency Cepstral Coefficients) dan Mel Spectrogram

1. MFCC (Mel Frequency Cepstral Coefficients) - MFCC adalah ekstraksi fitur penting saat menggunakan data ucapan. Skala Mel adalah skala yang menghubungkan frekuensi nada yang dirasakan dengan frekuensi nyata yang diukur.
2. Mel Spectrogram - Fast Fourier Transform (FFT) dihitung pada segmen-segmen sinyal berjendela yang saling tumpang tindih. Spektrogram adalah representasi visual dari kekuatan sinyal dan juga digunakan untuk menampilkan gelombang frekuensi suara.

## Male Angry

In [None]:
# Reference clip: actor 03 (odd id -> male), emotion code 05 (angry).
male_angry = f'{RAVD_PATH}Actor_03/03-01-05-01-01-01-03.wav'
data, sr = librosa.load(male_angry)   # decoded samples and their sampling rate
ipd.Audio(male_angry)

In [None]:
# Display the waveform (amplitude over time) of the clip loaded above.
# librosa.display.waveplot is no longer available in current librosa releases;
# waveshow is its replacement and is what the augmentation cells below use.
plt.figure(figsize=(10, 5))
librosa.display.waveshow(y=data, sr=sr)
plt.title('Waveplot - Male Angry')
plt.show()

In [None]:
# Log-scaled Mel spectrogram (128 Mel bands, up to 8 kHz) of the clip loaded above
mel_power = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128, fmax=8000)
spectrogram = librosa.power_to_db(mel_power)

fig, ax = plt.subplots(figsize=(10, 5))
image = librosa.display.specshow(spectrogram, y_axis='mel', fmax=8000, x_axis='time', ax=ax)
ax.set_title('Mel Spectrogram - Male Angry')
fig.colorbar(image, ax=ax, format='%+2.0f dB')
plt.show()

In [None]:
# 13 MFCCs per frame of the clip loaded above
mfcc = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=13)

fig = plt.figure(figsize=(16, 10))
ax = fig.add_subplot(3, 1, 1)   # top row of a three-row grid
image = librosa.display.specshow(mfcc, x_axis='time', ax=ax)
ax.set_title('MFCC - Male Angry')
fig.colorbar(image, ax=ax)
plt.show()

## Female Angry

In [None]:
# Reference clip: actor 18 (even id -> female), emotion code 05 (angry).
female_angry = f'{RAVD_PATH}Actor_18/03-01-05-01-01-01-18.wav'
data, sr = librosa.load(female_angry)   # decoded samples and their sampling rate
ipd.Audio(female_angry)

In [None]:
# Display the waveform (amplitude over time) of the clip loaded above.
# librosa.display.waveplot is no longer available in current librosa releases;
# waveshow is its replacement and is what the augmentation cells below use.
plt.figure(figsize=(10, 5))
librosa.display.waveshow(y=data, sr=sr)
plt.title('Waveplot - Female Angry')
plt.show()

In [None]:
# Log-scaled Mel spectrogram (128 Mel bands, up to 8 kHz) of the clip loaded above
mel_power = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128, fmax=8000)
spectrogram = librosa.power_to_db(mel_power)

fig, ax = plt.subplots(figsize=(10, 5))
image = librosa.display.specshow(spectrogram, y_axis='mel', fmax=8000, x_axis='time', ax=ax)
ax.set_title('Mel Spectrogram - Female Angry')
fig.colorbar(image, ax=ax, format='%+2.0f dB')
plt.show()

In [None]:
# 13 MFCCs per frame of the clip loaded above
mfcc = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=13)

fig = plt.figure(figsize=(16, 10))
ax = fig.add_subplot(3, 1, 1)   # top row of a three-row grid
image = librosa.display.specshow(mfcc, x_axis='time', ax=ax)
ax.set_title('MFCC - Female Angry')
fig.colorbar(image, ax=ax)
plt.show()

## Male Angry vs Female Angry

In [None]:
# Gender - Male; Emotion - Angry
male_angry = RAVD_PATH + 'Actor_03/03-01-05-01-01-01-03.wav'
X, sample_rate = librosa.load(male_angry)
# Average the 13 MFCCs of each frame -> one value per time frame.
male = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)

# Gender - Female; Emotion - Angry
female_angry = RAVD_PATH + 'Actor_18/03-01-05-01-01-01-18.wav'
X, sample_rate = librosa.load(female_angry)
female = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)

# Plot the two curves together. Each curve now carries the label of the clip
# it was actually computed from (the variables used to be swapped).
plt.figure(figsize=(16,10))
plt.subplot(3,1,1)
plt.plot(female, label='female')
plt.plot(male, label='male')
# The curves are per-frame MFCC means, not raw waveforms.
plt.title('Mean MFCC per Frame - Male vs Female Angry')
plt.legend()
plt.show()

# 04 Data Augmentation
- Augmentasi data adalah proses di mana kita membuat sampel data sintetis baru dengan menambahkan sedikit gangguan pada set pelatihan awal.
- Tujuannya adalah untuk membuat model kita invarian terhadap gangguan tersebut dan meningkatkan kemampuannya untuk menggeneralisasi. Agar ini berfungsi, menambahkan gangguan harus mempertahankan label yang sama dengan sampel pelatihan asli.

In [None]:
# Pick one recording from the dataset (row position 471) for the augmentation demos
sample_data = df['path'].to_numpy()[471]
data, sample_rate = librosa.load(sample_data)

## Normal Audio

In [None]:
# Waveform of the untouched sample, followed by a playback widget.
fig, ax = plt.subplots(figsize=(10, 5))
librosa.display.waveshow(y=data, sr=sample_rate, ax=ax)
ax.set_title('Normal Audio Sample')

ipd.Audio(sample_data)

## Audio with Noise

In [None]:
def get_noise(data):
  """Return ``data`` plus random Gaussian noise; the input is not modified.

  The noise amplitude is ``0.035 * u * max(data)`` with ``u`` drawn from
  ``np.random.uniform()``. One standard-normal value is drawn per element
  along the first axis of ``data``.
  """
  peak = np.amax(data)
  amplitude = 0.035 * np.random.uniform() * peak
  jitter = np.random.normal(size=data.shape[0])
  return data + amplitude * jitter

# Waveform of the noisy copy, followed by a playback widget.
x = get_noise(data)
fig, ax = plt.subplots(figsize=(10,5))
librosa.display.waveshow(y=x, sr=sample_rate, ax=ax)
ax.set_title('Noise Audio Sample')

ipd.Audio(x, rate=sample_rate)

## Audio with Pitch

In [None]:
def get_pitch(data, sampling_rate, pitch_factor=0.7):
  """Return a pitch-shifted copy of ``data``.

  Args:
    data: audio samples.
    sampling_rate: sampling rate of ``data``.
    pitch_factor: forwarded to librosa as ``n_steps`` (the size of the shift).

  The arguments are passed by keyword because current librosa releases accept
  ``sr`` and ``n_steps`` as keyword-only parameters.
  """
  return librosa.effects.pitch_shift(y=data, sr=sampling_rate, n_steps=pitch_factor)

# Waveform of the pitch-shifted copy, followed by a playback widget.
x = get_pitch(data, sample_rate)
plt.figure(figsize=(10,5))
librosa.display.waveshow(y=x, sr=sample_rate)
# The previous title ('Noise Audio Pitch') was left over from the noise cell.
plt.title('Pitched Audio Sample')

ipd.Audio(x, rate=sample_rate)

## Shifted Audio

In [None]:
def get_shift(data):
  """Return ``data`` circularly shifted by a random number of samples.

  The shift is ``int(u * 1000)`` with ``u`` drawn uniformly from [-5, 5).
  Samples pushed past one end re-enter at the other (``np.roll``).
  """
  offset_k = np.random.uniform(low=-5, high=5)
  n_samples = int(offset_k * 1000)
  return np.roll(data, n_samples)

# Waveform of the time-shifted copy, followed by a playback widget.
x = get_shift(data)
fig, ax = plt.subplots(figsize=(10,5))
librosa.display.waveshow(y=x, sr=sample_rate, ax=ax)
ax.set_title('Shifted Audio Sample')

ipd.Audio(x, rate=sample_rate)

## Stretched Audio

In [None]:
def get_stretch(data, rate=0.8):
  """Return a time-stretched copy of ``data``.

  Args:
    data: audio samples.
    rate: stretch factor forwarded to ``librosa.effects.time_stretch``.

  ``rate`` is passed by keyword because current librosa releases accept it as
  a keyword-only parameter.
  """
  return librosa.effects.time_stretch(y=data, rate=rate)

# Waveform of the time-stretched copy, followed by a playback widget.
x = get_stretch(data)
fig, ax = plt.subplots(figsize=(10,5))
librosa.display.waveshow(y=x, sr=sample_rate, ax=ax)
ax.set_title('Stretched Audio Sample')

ipd.Audio(x, rate=sample_rate)

## Increase Audio Speed with Pitch

In [None]:
def get_speedpitch(data):
  """Return a sped-up copy of ``data`` with the same length as the input.

  The signal is linearly interpolated at positions ``0, f, 2f, ...`` where
  ``f = 1.4 / u`` and ``u`` is drawn uniformly from [0.8, 1). The resampled
  (shorter) signal is written to the start of a zero-filled array that has
  the shape and dtype of ``data``; the remaining tail stays zero.

  The input array is not modified: the result is built in a fresh array
  instead of zeroing and overwriting ``data`` in place.
  """
  if data.shape[0] == 0:
    # Nothing to resample; np.interp cannot handle empty sample points.
    return data.copy()

  length_change = np.random.uniform(low=0.8, high=1)
  speed_fac = 1.4 / length_change
  positions = np.arange(0, len(data), speed_fac)
  tmp = np.interp(positions, np.arange(0, len(data)), data)
  minlen = min(data.shape[0], tmp.shape[0])

  out = np.zeros_like(data)
  out[0:minlen] = tmp[0:minlen]
  return out

# Waveform of the sped-up copy, followed by a playback widget.
x = get_speedpitch(data)
plt.figure(figsize=(10,5))
librosa.display.waveshow(y=x, sr=sample_rate)
# The previous title ('Stretched Audio Sample') was copied from the stretch cell.
plt.title('Speed & Pitch Audio Sample')

ipd.Audio(x, rate=sample_rate)

# 05 Feature Extraction

Representasi domain suara sangat kompleks dan dalam bentuk aslinya, tidak memberikan wawasan yang sangat baik tentang karakteristik utama sinyal suara. Karena karakteristik sinyal suara ini, kami memetakan representasi domain waktu ini menjadi fitur yang lebih jelas. Teknik yang paling mudah melibatkan penentuan energi rata-rata sinyal. Metrik ini, bersama dengan energi total dalam sinyal, menunjukkan "volume" speaker. Durasi juga menawarkan wawasan tentang emosi, seperti halnya statistik seperti maksimum, minimum, jangkauan, rata-rata, dan standar deviasi dari sinyal dan spektrum. Ini mungkin menunjukkan fluktuasi dalam volume atau nada yang dapat berguna dalam menentukan emosi. Untuk sinyal dan spektrum, kami juga memperoleh kemiringan, ukuran penyimpangan simetri horizontal dalam sinyal, dan kurtosis, ukuran tinggi dan ketajaman puncak pusat, relatif terhadap kurva lonceng standar.

In [None]:
def _first_n_padded(values, n):
  """Return the first ``n`` entries of ``values``, zero-padded on the right to length ``n``."""
  head = np.asarray(values)[:n]
  return np.pad(head, (0, n - head.shape[0]))


def get_audio_features(path, target_sr=40000, duration=2.5, offset=0.5, n_mfcc=13, n_bins=20):
  """Extract four fixed-length feature vectors from one audio file.

  Args:
    path: audio file to load.
    target_sr: sampling rate the audio is resampled to when loading.
    duration: number of seconds to load.
    offset: number of seconds skipped at the start of the file.
    n_mfcc: number of MFCCs to compute.
    n_bins: length of the pitch and magnitude vectors.

  Returns:
    ``[mfcc, pitches, magnitudes, C]`` where
    * ``mfcc`` is the mean of every MFCC (length ``n_mfcc``),
    * ``pitches`` / ``magnitudes`` are the row means of the ``piptrack``
      outputs with leading and trailing zeros removed, cut or zero-padded to
      exactly ``n_bins`` values,
    * ``C`` is the mean of every chroma row of the harmonic component.
  """
  data, sample_rate = librosa.load(path, res_type='kaiser_fast', sr=target_sr, duration=duration, offset=offset)

  # Only the harmonic part is needed (for the chroma features below).
  y_harmonic, _ = librosa.effects.hpss(data)
  pitches, magnitudes = librosa.core.pitch.piptrack(y=data, sr=sample_rate)

  mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=n_mfcc), axis=1)
  # Padding keeps the vector length constant even when fewer than n_bins
  # non-zero values survive trimming; otherwise rows of different length
  # would end up in the feature table and in the model input.
  pitches = _first_n_padded(np.trim_zeros(np.mean(pitches, axis=1)), n_bins)
  magnitudes = _first_n_padded(np.trim_zeros(np.mean(magnitudes, axis=1)), n_bins)
  # Use the rate the audio was actually loaded with (it used to be a
  # hard-coded 20000 while the signal was loaded at 40000).
  C = np.mean(librosa.feature.chroma_cqt(y=y_harmonic, sr=sample_rate), axis=1)

  return [mfcc, pitches, magnitudes, C]
  
def get_features_dataframe(dataframe):
  """Build the feature table for every audio file listed in ``dataframe``.

  Args:
    dataframe: frame with a ``'path'`` column holding audio file paths.

  Returns:
    A DataFrame with one row per input row (sharing ``dataframe``'s index) and
    consecutively numbered columns holding, left to right, the mfcc, pitch,
    magnitude and chroma values returned by ``get_audio_features``. Vectors
    that are shorter than others of the same group are NaN-padded.
  """
  # One list of per-file vectors for each of the four feature groups.
  groups = [[], [], [], []]
  # Iterate over the frame passed in (the global ``df`` used to be read here).
  for audio_path in dataframe['path']:
    for group, vector in zip(groups, get_audio_features(audio_path)):
      group.append(pd.Series(vector))

  # One frame per group (rows = files), placed side by side with the columns
  # renumbered 0..k-1.
  blocks = [pd.DataFrame(group) for group in groups]
  features = pd.concat(blocks, axis=1, ignore_index=True)
  # Share the caller's index so a column-wise concat with ``dataframe`` lines up.
  features.index = dataframe.index

  return features

In [None]:
%%time
features_df = pd.concat([df, get_features_dataframe(df)], axis=1)

features_df

In [None]:
# Save features dataframe
features_df.to_csv('speech_feature_df.csv', index=False)

# 06 Prepping Data for Modeling

In [None]:
# Separate the feature columns from the target column.
# The four leading metadata columns are dropped by name; the rest are features.
META_COLUMNS = ['gender', 'emotion', 'actor', 'path']
X = features_df.drop(columns=META_COLUMNS).values
y = features_df['emotion'].values

## One-hot Encoding

In [None]:
# Encode categorical features as a one-hot numeric array.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

from sklearn.preprocessing import OneHotEncoder

label_encoder = OneHotEncoder()
# Always encode straight from the label column instead of from ``y`` itself,
# so re-running this cell does not one-hot encode the one-hot matrix again.
emotion_labels = features_df['emotion'].to_numpy().reshape(-1, 1)
y = label_encoder.fit_transform(emotion_labels).toarray()

In [None]:
y

In [None]:
label_encoder.categories_

## Train & Test Data Splitting

In [None]:
# Split arrays or matrices into random train and test subsets.
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

from sklearn.model_selection import train_test_split

# Stratify on the emotion / gender / actor combination of each row.
strata = features_df[['emotion','gender','actor']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=strata,
    random_state=0,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Reshape Data to 3D Tensor

In [None]:
# Append a trailing axis of size 1: (samples, features) -> (samples, features, 1)
X_train_tensor = X_train[:, :, np.newaxis]
X_test_tensor = X_test[:, :, np.newaxis]

X_train_tensor.shape, X_test_tensor.shape

# 07 Modelling
Kita akan membandingkan pendekatan statistik tradisional (baseline) dengan metode deep learning (CNN) untuk tugas SER.

## Baseline Model (Machine Learning)

### Decision Tree

In [None]:
# A decision tree classifier.
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

from sklearn.tree import DecisionTreeClassifier

# Seeded (same seed as the train/test split) so the scores are reproducible.
clf_1 = DecisionTreeClassifier(random_state=0)

clf_1.fit(X_train, y_train)
# The stand-alone predict() call whose result was thrown away has been removed;
# score() runs the predictions it needs itself.

print(f"Training set score: {clf_1.score(X_train, y_train):.3f}")
print(f"Test set score: {clf_1.score(X_test, y_test):.3f}")

### k Nearest Neighbor



In [None]:
# Classifier implementing the k-nearest neighbors vote.
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

from sklearn.neighbors import KNeighborsClassifier

clf_2 = KNeighborsClassifier(n_neighbors=4)

clf_2.fit(X_train, y_train)
# The stand-alone predict() call whose result was thrown away has been removed;
# score() runs the predictions it needs itself.

print(f"Training set score: {clf_2.score(X_train, y_train):.3f}")
print(f"Test set score: {clf_2.score(X_test, y_test):.3f}")

## Deep Learning Model

### Define Model (Convolutional Neural Network)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dropout, Dense 

In [None]:
# Input is one feature vector per sample with a single channel.
n_steps, n_channels = X_train_tensor.shape[1], X_train_tensor.shape[2]
# One output unit per one-hot column produced by the label encoder.
n_classes = len(label_encoder.get_feature_names_out())

model = Sequential([
    Conv1D(filters=256, kernel_size=10, activation='relu', input_shape=(n_steps, n_channels)),
    Conv1D(filters=128, kernel_size=10, activation='relu'),
    Conv1D(filters=64, kernel_size=10, activation='relu'),
    MaxPooling1D(pool_size=8),
    Dropout(0.5),
    Flatten(),
    Dense(32, activation='relu'),
    # Each sample carries exactly one emotion and the model is trained with
    # categorical cross-entropy, so the outputs must form a probability
    # distribution: softmax, not independent sigmoids.
    Dense(n_classes, activation='softmax'),
])

In [None]:
model.summary()

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

### Training

In [None]:
%%time

EPOCH = 25
BATCH_SIZE = 32

model_history = model.fit(X_train_tensor, y_train, epochs=EPOCH, batch_size=BATCH_SIZE, validation_data=(X_test_tensor, y_test))

In [None]:
# Helper for plotting the training results
def plot_graphs(history, string):
  """Plot a training metric and its validation counterpart against the epoch.

  Args:
    history: object whose ``history`` mapping holds the per-epoch values.
    string: metric key; the validation curve is read from ``'val_' + string``.
  """
  val_key = 'val_' + string
  for key in (string, val_key):
    plt.plot(history.history[key])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, val_key])
  plt.show()

In [None]:
plot_graphs(model_history, 'accuracy')
plot_graphs(model_history, 'loss')

In [None]:
# Per-epoch table of the recorded metrics. The rows are taken from the history
# itself, so the table always matches the epochs that were actually recorded,
# even if the EPOCH constant was changed after training.
recorded = model_history.history
print('\nEpoch No.  Train Accuracy  Train Loss      Val Accuracy    Val Loss')
rows = zip(recorded['accuracy'], recorded['loss'], recorded['val_accuracy'], recorded['val_loss'])
for epoch_no, (train_acc, train_loss, val_acc, val_loss) in enumerate(rows, start=1):
  print('{:8d} {:10f} \t {:10f} \t {:10f} \t {:10f}'.format(epoch_no, train_acc, train_loss, val_acc, val_loss))

### Evaluate

In [None]:
# Predict on the test data: the class index is the position of the largest score
test_scores = model.predict(X_test_tensor)
y_pred = test_scores.argmax(axis=1)
y_true = y_test.argmax(axis=1)

# Loss and accuracy of the trained model on the test data
loss, accuracy = model.evaluate(X_test_tensor, y_test)

In [None]:
from sklearn.metrics import classification_report

# Class names in one-hot column order, taken directly from the encoder
# (no artificial prefix in front of the names).
class_names = [str(name) for name in label_encoder.categories_[0]]

# Show the classification report of the model on the test data.
# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns trade places. ``labels`` pins the class
# list so the names stay aligned even if a class is missing from the split.
print(classification_report(y_true, y_pred,
                            labels=np.arange(len(class_names)),
                            target_names=class_names))

# 08 Prediction

In [None]:
sample_data

In [None]:
demo_audio_path = sample_data

ipd.Audio(sample_data)

In [None]:
# Extract the four feature vectors of the demo clip ...
demo_mfcc, demo_pitch, demo_mag, demo_chrom = get_audio_features(demo_audio_path)

# ... and chain them into one long Series with a fresh 0..n-1 index.
mfcc, pit, mag, C = map(pd.Series, (demo_mfcc, demo_pitch, demo_mag, demo_chrom))
demo_audio_features = pd.concat([mfcc, pit, mag, C], ignore_index=True)

In [None]:
demo_audio_features

In [None]:
# Reshape to 3D tensor (1 sample, n features, 1 channel).
# A single reshape gives the same result on a first run and, unlike repeated
# expand_dims calls, does not add extra dimensions when the cell is re-run.
demo_audio_features = np.asarray(demo_audio_features).reshape(1, -1, 1)

demo_audio_features.shape

In [None]:
demo_preds = model.predict(demo_audio_features)
demo_preds

In [None]:
# Class names in the order of the one-hot columns the model was trained on.
# Reading them from the encoder keeps names and prediction indices in sync
# (the hand-written list also spelled the first class differently from the data).
emotions = [str(name) for name in label_encoder.categories_[0]]

In [None]:
index = demo_preds.argmax(axis=1).item()
index

In [None]:
emotions[index]

# 09 Simple Deployment Using Gradio

In [None]:
'''
Gradio is the fastest way to demo your machine learning model with a friendly web interface so that anyone can use it, anywhere!
https://gradio.app/
'''

!pip -q install gradio

In [None]:
import gradio as gr

def ser(file):
  """Predict the emotion of one recording and return its class name.

  Args:
    file: path of the audio file to classify. May be empty/None (e.g. when
      the interface fires before anything was recorded).

  Returns:
    The predicted class name, or an empty string when no file was given.
  """
  if not file:
    return ''

  # Chain the four feature vectors and shape them as (1 sample, n features, 1 channel).
  vectors = get_audio_features(file)
  audio_features = np.concatenate([np.asarray(v, dtype=float) for v in vectors])
  audio_features = audio_features.reshape(1, -1, 1)

  model_predict = model.predict(audio_features)
  model_output = model_predict.argmax(axis=1).item()

  # Class names come from the encoder used for training, so the index always
  # refers to the right name.
  return str(label_encoder.categories_[0][model_output])

# The gr.inputs namespace no longer exists in current Gradio releases (which is
# what the unpinned install above provides). Components are created directly
# and the microphone is selected through the ``sources`` list.
iface = gr.Interface(fn=ser,
                     inputs=gr.Audio(sources=["microphone"], type="filepath"),
                     outputs="text",
                     live=True)

iface.launch()

# 10 Student Activity