In [None]:
# Fetch the project repository; a later cell changes into this directory.
!git clone https://github.com/Mr-Patty/bimodal-emotion-recognition
# soundfile is imported by the imports cell further down (as `sf`).
!pip install soundfile

In [None]:
# Load the Colab Drive helper.
from google.colab import drive

# Mount Google Drive at /content/drive. This will prompt for authorization.
drive.mount('/content/drive')

In [None]:
# Copy the dataset archive from the mounted Drive into the cloned repository.
# NOTE(review): the relative `drive/...` path assumes the working directory is
# the one that contains the Drive mount (/content) — confirm.
!cp drive/'My Drive'/EmotionRecognition/bimodal-emotion-recognition.tar.gz bimodal-emotion-recognition/
# Make the repository the working directory; the relative paths used by the
# following commands and by later cells are resolved from here.
%cd bimodal-emotion-recognition
# Unpack the archive in place, then delete it.
!tar -C . -xzf bimodal-emotion-recognition.tar.gz
!rm bimodal-emotion-recognition.tar.gz
# Rename the audio folder to 'Audio_preprocess', the directory the feature
# extraction cell reads from.
# NOTE(review): presumably Audio_ogg_10 is produced by the archive — confirm.
!mv Audio_ogg_10 Audio_preprocess

In [1]:
import wave
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import math
import librosa
import json
import wave
import sys
import pickle
import sklearn

import urllib.request

import librosa.display
import scipy, matplotlib.pyplot as plt, IPython.display as ipd

from scipy.io import wavfile

import librosa.display
import soundfile as sf

import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

import os
import matplotlib.style as ms
from tqdm import tqdm
import random

from utils import *
ms.use('seaborn-muted')
%matplotlib inline

In [10]:
# Emotion tag -> integer class id (ids follow the order of the tuple below).
emotion_dict = {
    tag: class_id
    for class_id, tag in enumerate(('ang', 'dis', 'hap', 'sad', 'sca', 'sur', 'neu'))
}

# Rows of df_prep.csv as a NumPy array; the extraction loop unpacks each row
# into a (file, emotion) pair.
labels_df = pd.read_csv('df_prep.csv').to_numpy()

# Column layout of the feature table: file name, label, then the acoustic
# features in the order they are computed.
columns = [
    'wav_file', 'label',
    'sig_mean', 'sig_std',
    'rmse_mean', 'rmse_std',
    'silence', 'harmonic',
    'auto_corr_max', 'auto_corr_std',
]

# Empty table with the layout above.
df_features = pd.DataFrame(columns=columns)

In [None]:
# Compute one row of hand-crafted acoustic features per audio clip.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, copied the whole table on
# every iteration, and left every column with object dtype. Rebuilding the
# table from scratch also makes this cell safe to re-run.
feature_rows = []
for file, emo in tqdm(labels_df):
    try:
        wav_file_name = 'Audio_preprocess/' + file + '.ogg'
        label = emotion_dict[emo]
        y, _sr = librosa.load(wav_file_name, sr=48000)

        sig_mean = np.mean(abs(y))

        # `y` has to be passed by keyword: newer librosa releases reject
        # positional audio arguments.
        rmse = librosa.feature.rms(y=y + 0.0001)[0]
        rmse_mean = np.mean(rmse)

        # Share of frames whose energy is at most 40% of the mean frame energy.
        silence = float(np.mean(rmse <= 0.4 * rmse_mean))

        y_harmonic = librosa.effects.hpss(y)[0]

        # Centre clipping, based on the pitch detection algorithm mentioned here:
        # http://access.feld.cvut.cz/view.php?cisloclanku=2009060001
        # Samples within +/-cl become 0, everything else is pulled towards 0
        # by cl. Done in float64 and vectorised instead of a per-sample loop.
        cl = 0.45 * float(sig_mean)
        samples = y.astype(np.float64)
        center_clipped = np.where(
            samples >= cl,
            samples - cl,
            np.where(samples <= -cl, samples + cl, 0.0),
        )
        auto_corrs = librosa.core.autocorrelate(center_clipped)

        feature_rows.append({
            'wav_file': wav_file_name,
            'label': label,
            'sig_mean': sig_mean,
            'sig_std': np.std(y),
            'rmse_mean': rmse_mean,
            'rmse_std': np.std(rmse),
            'silence': silence,
            'harmonic': np.mean(y_harmonic) * 1000,  # scaled by 1000
            'auto_corr_max': 1000 * np.max(auto_corrs) / len(auto_corrs),  # scaled by 1000
            'auto_corr_std': np.std(auto_corrs),
        })
    except Exception:
        # Report which clip failed, then let the error propagate. Catching
        # Exception (not a bare except) keeps Ctrl-C from being reported as a
        # processing failure.
        print('Some exception occured:', file)
        raise

df_features = pd.DataFrame(feature_rows, columns=columns)
df_features.to_csv('audio_features.csv', index=False)

In [None]:
# MinMaxScaler is otherwise only imported in a later cell; importing it here
# lets this cell run on a fresh kernel (Restart & Run All).
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with MinMaxScaler; the first two columns
# (wav_file, label) are left untouched.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# in the next cell, so test-set statistics influence the scaling. Consider
# fitting on the training rows only.
feature_columns = df_features.columns[2:]
scalar = MinMaxScaler()
# Cast explicitly: rows assembled from mixed Python objects can leave these
# columns with object dtype.
df_features[feature_columns] = scalar.fit_transform(df_features[feature_columns].astype(float))
df_features.head()

In [None]:
# train_test_split is otherwise only imported in a later cell; importing it
# here lets this cell run on a fresh kernel (Restart & Run All).
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed so the partition (and every metric computed
# from it) is reproducible across runs.
x_train, x_test = train_test_split(df_features, test_size=0.20, random_state=42)

x_train.to_csv('audio_train.csv', index=False)
x_test.to_csv('audio_test.csv', index=False)

# NOTE: x_train / x_test are full rows and still contain the 'wav_file' and
# 'label' columns; only the label column is pulled out here.
# Cast to int so scikit-learn sees a proper integer class target even if the
# column was stored with object dtype.
y_train = x_train['label'].astype(int)
y_test = x_test['label'].astype(int)

print(x_train.shape, x_test.shape)

In [13]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
import matplotlib
import matplotlib.pyplot as plt
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.utils.class_weight import compute_class_weight
import itertools

In [14]:
# Emotion tag -> integer class id.
emotion_dict = {'ang': 0,
                'dis': 1,
                'hap': 2,
                'sad': 3,
                'sca': 4,
                'sur': 5,
                'neu': 6
                }

# Class names ordered by class id, so emo_keys[i] is the name of class i.
# Derived from emotion_dict instead of a hand-written list: the previous list
# was in a different order, contained 'fea' (not a class) and lacked 'dis',
# which mislabeled the confusion-matrix axes.
emo_keys = sorted(emotion_dict, key=emotion_dict.get)

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are true labels, columns are predictions.
    classes : sequence of str
        Tick labels, one per class, in matrix order.
    normalize : bool
        When True every row is divided by its sum before printing and
        plotting. Rows that sum to zero stay all-zero instead of becoming NaN.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap used for the image.
    """
    cm = np.asarray(cm)

    # Normalize BEFORE drawing so the image colours, the colour bar and the
    # cell annotations all describe the same numbers.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # plt.figure(figsize=(8,8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is, fractions with two decimals.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()
    
def one_hot_encoder(true_labels, num_records, num_classes):
    temp = np.array(true_labels[:num_records])
    true_labels = np.zeros((num_records, num_classes))
    true_labels[np.arange(num_records), temp] = 1
    return true_labels

def display_results(y_test, pred_probs, cm=True):
    """
    Print accuracy and macro-averaged F-score / precision / recall, and
    optionally plot the confusion matrix.

    Parameters
    ----------
    y_test : array-like of int
        True class ids.
    pred_probs : array-like, one row of class scores per record
        The predicted class is the argmax over the last axis.
        NOTE(review): this assumes column k holds the score of class id k —
        confirm against the classifier's `classes_`.
    cm : bool
        When True, also print and plot the confusion matrix.
    """
    pred = np.argmax(pred_probs, axis=-1)
    print('Test Set Accuracy =  {0:.3f}'.format(accuracy_score(y_test, pred)))
    print('Test Set F-score =  {0:.3f}'.format(f1_score(y_test, pred, average='macro')))
    print('Test Set Precision =  {0:.3f}'.format(precision_score(y_test, pred, average='macro')))
    print('Test Set Recall =  {0:.3f}'.format(recall_score(y_test, pred, average='macro')))
    if cm:
        # Pin the label set so the matrix always has one row/column per known
        # class, even when a class is absent from both y_test and pred.
        # Otherwise the matrix can be smaller than the list of tick labels.
        class_ids = sorted(emotion_dict.values())
        plot_confusion_matrix(confusion_matrix(y_test, pred, labels=class_ids), classes=emo_keys)

In [None]:
# x_train / x_test are full rows from the split cell and may still contain the
# 'wav_file' string column and the 'label' target. Neither is a feature:
# the file name cannot be converted to a number and the label would leak the
# answer. errors='ignore' keeps this working if they were already removed.
non_feature_columns = ['wav_file', 'label']
train_features = x_train.drop(columns=non_feature_columns, errors='ignore')
test_features = x_test.drop(columns=non_feature_columns, errors='ignore')

# Integer targets, in case the label column was stored with object dtype.
train_labels = y_train.astype(int)
test_labels = y_test.astype(int)

# Fixed seed so the forest (and the reported metrics) are reproducible.
rf_classifier = RandomForestClassifier(n_estimators=1200, min_samples_split=25, random_state=42)
rf_classifier.fit(train_features, train_labels)

# Predict
pred_probs = rf_classifier.predict_proba(test_features)

# Results
display_results(test_labels, pred_probs)

# NOTE: despite the file name, this stores the predicted probabilities, not
# the fitted classifier.
with open('rf_classifier.pkl', 'wb') as f:
    pickle.dump(pred_probs, f)