In [1]:
import librosa
import noisereduce as nr
import numpy as np
import scipy
import scipy.fftpack
from tensorflow.keras.models import load_model

In [2]:
# Location of the saved Keras model, relative to the notebook's working directory.
MODEL_PATH = '../assets/cnn_model.keras'

# Load the model once; the prediction cell further down calls model.predict().
model = load_model(MODEL_PATH)

In [3]:
def extract_plp(y, sr, numcep=13, nfft=None):
    """Compute PLP-style cepstral coefficients for an audio signal.

    Processing chain: pre-emphasis (coefficient 0.97) -> 25 ms Hamming-windowed
    frames taken every 10 ms -> power spectrum -> 40-band mel filterbank ->
    natural log -> orthonormal DCT-II, keeping the first ``numcep`` columns.

    Note: despite the name, the steps implemented here form a log-mel
    cepstrum; no Bark warping, equal-loudness weighting or linear-prediction
    stage is applied in this function.

    Parameters
    ----------
    y : array-like
        Audio samples (the caller in this notebook passes a 1-D mono window).
    sr : int or float
        Sampling rate of ``y`` in Hz.
    numcep : int, optional
        Number of leading DCT coefficients to keep per frame (default 13).
    nfft : int or None, optional
        FFT size. ``None`` (default) selects 512, or the next power of two
        that is >= the frame length when a frame is longer than 512 samples.
        Previously the size was fixed at 512, which silently truncated every
        frame for sampling rates above 20480 Hz. Results for lower rates are
        unchanged. Pass ``nfft=512`` explicitly to reproduce the old
        behaviour at high sampling rates.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_frames, min(numcep, 40))``.
    """
    # Pre-emphasis: the first sample is kept as-is, every later sample has
    # 0.97x its predecessor subtracted.
    emphasized = np.append(y[0], y[1:] - 0.97 * y[:-1])

    frame_length = int(0.025 * sr)
    frame_step = int(0.01 * sr)
    # librosa returns (frame_length, n_frames); transpose to one frame per row
    # and copy so the in-place windowing below does not write through a view.
    frames = librosa.util.frame(
        emphasized, frame_length=frame_length, hop_length=frame_step
    ).T.copy()
    frames *= np.hamming(frame_length)

    if nfft is None:
        # np.fft.rfft(x, n) crops x to n samples when n < len(x), so the FFT
        # must be at least as long as a frame to use the whole frame.
        nfft = max(512, 1 << max(frame_length - 1, 0).bit_length())

    mag_frames = np.absolute(np.fft.rfft(frames, nfft))
    pow_frames = (1.0 / nfft) * (mag_frames ** 2)

    nfilt = 40
    mel_filters = librosa.filters.mel(sr=sr, n_fft=nfft, n_mels=nfilt)
    mel_energy = np.dot(pow_frames, mel_filters.T)
    # Replace exact zeros so the logarithm below never yields -inf.
    mel_energy = np.where(mel_energy == 0, np.finfo(float).eps, mel_energy)

    log_mel_energy = np.log(mel_energy)
    plp = scipy.fftpack.dct(log_mel_energy, type=2, axis=1, norm='ortho')[:, :numcep]

    return plp

In [4]:
def preprocess_audio(audio_path, n_mfcc, n_plp, window_size):
    """Turn an audio file into a stack of per-window MFCC + PLP feature maps.

    The file is loaded at its native sampling rate, trimmed with
    ``librosa.effects.trim``, passed through ``noisereduce`` and normalised
    with ``librosa.util.normalize``. It is then cut into consecutive,
    non-overlapping windows; a trailing partial window is dropped.

    Parameters
    ----------
    audio_path : str or path-like
        Audio file to load.
    n_mfcc : int
        Number of MFCC coefficients per frame.
    n_plp : int
        Number of coefficients requested from ``extract_plp`` per frame.
    window_size : int or float
        Window duration in seconds. Fractional values are accepted and
        rounded to the nearest whole sample.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, n_frames, n_features, 1)`` where the
        feature axis holds the MFCC columns followed by the PLP columns.
        If the processed audio is shorter than one window the result is an
        empty array of shape ``(0,)``.

    Raises
    ------
    ValueError
        If ``window_size`` does not correspond to at least one sample.
    """
    y, sr = librosa.load(audio_path, sr=None)
    y, _ = librosa.effects.trim(y)
    y = nr.reduce_noise(y=y, sr=sr)
    y = librosa.util.normalize(y)

    # range() needs an integer step; a float window_size used to raise TypeError.
    window_length = int(round(window_size * sr))
    if window_length <= 0:
        raise ValueError(
            f"window_size must cover at least one sample, got {window_size!r} "
            f"at sr={sr}"
        )
    total_length = len(y)

    feature_list = []

    # The upper bound stops before any window that would run past the signal,
    # so only complete windows are visited.
    for start in range(0, total_length - window_length + 1, window_length):
        window = y[start:start + window_length]

        mfcc = librosa.feature.mfcc(y=window, sr=sr, n_mfcc=n_mfcc).T
        plp = extract_plp(window, sr, numcep=n_plp)

        # NOTE(review): the two feature sets are paired row-by-row after
        # cropping to the shorter one. extract_plp steps 10 ms per frame while
        # the MFCC call uses librosa's default framing, so equal row indices
        # likely do not refer to the same instant. Presumably the saved model
        # was trained on features built this way - confirm before changing.
        min_frames = min(mfcc.shape[0], plp.shape[0])
        mfcc = mfcc[:min_frames, :]
        plp = plp[:min_frames, :]

        combined = np.concatenate((mfcc, plp), axis=1)
        # Trailing singleton axis added per window (channel-like dimension).
        combined = combined[..., np.newaxis]

        feature_list.append(combined)

    return np.array(feature_list)


In [5]:
# Recording to analyse, relative to the notebook's working directory.
audio_path = '../test_audio/440_AUDIO.wav'

# Feature settings: 13 MFCC + 13 PLP coefficients per frame, 5-second windows.
feature_settings = {'n_mfcc': 13, 'n_plp': 13, 'window_size': 5}
mfcc_plp_windows = preprocess_audio(audio_path, **feature_settings)

In [6]:
# Refuse to report a result when no complete window could be extracted
# (e.g. the recording is shorter than one window after trimming); otherwise
# the comparison below would fall through to the "not depressed" branch.
if np.size(mfcc_plp_windows) == 0:
    raise ValueError(
        "No complete analysis window was extracted from the audio; "
        "cannot make a prediction."
    )

predictions = model.predict(mfcc_plp_windows)

# Aggregate the per-window model outputs into a single score.
# NOTE(review): this assumes the model emits one probability-like value per
# window - confirm against the model's output layer.
mean_score = np.mean(predictions)
if not np.isfinite(mean_score):
    # NaN/inf would compare unequal to 1 and be reported as "not depressed".
    raise ValueError(f"Model produced a non-finite mean score: {mean_score!r}")

final_prediction = np.round(mean_score)

if final_prediction == 1:
    print("The person is likely to be depressed.")
else:
    print("The person is likely to be not depressed.")

9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step
The person is likely to be not depressed.
