In [1]:
import joblib
import librosa
import noisereduce as nr
import numpy as np
import scipy
import scipy.fftpack
from tensorflow.keras.models import load_model

Load assets

In [2]:
# Pre-built artefacts read from disk: a joblib-serialised scaler and a Keras model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load artefacts that come from a trusted source.
scaler_path = '../assets/scaler.pkl'
model_path = '../assets/ann_model.keras'

scaler = joblib.load(scaler_path)
model = load_model(model_path)

Convert to MFCC & PLP features

In [3]:
def extract_plp(y, sr, numcep=13):
    """Return the time-averaged cepstral feature vector of an audio signal.

    Processing chain: pre-emphasis (coefficient 0.97) -> 25 ms Hamming-windowed
    frames with a 10 ms hop -> power spectrum -> 40-band mel filterbank ->
    natural log -> orthonormal DCT-II, keeping the first ``numcep``
    coefficients, which are finally averaged over all frames.

    NOTE(review): despite the name, the steps below form a mel-filterbank
    cepstrum; no PLP-specific stage (equal-loudness weighting, cube-root
    compression, linear prediction) is present. The computation is left as is
    because the downstream scaler/model were presumably fitted on exactly
    these features -- confirm before changing it.

    Parameters
    ----------
    y : array-like, 1-D
        Audio samples.
    sr : int or float
        Sampling rate of ``y`` in Hz.
    numcep : int, default 13
        Number of leading DCT coefficients to keep (at most 40 are available,
        one per mel band).

    Returns
    -------
    numpy.ndarray
        1-D array with ``min(numcep, 40)`` values.

    Raises
    ------
    ValueError
        If ``y`` contains no samples.
    """
    y = np.asarray(y)
    if y.size == 0:
        # Previously this surfaced as an opaque IndexError from y[0].
        raise ValueError("extract_plp() received an empty signal")

    # First-order pre-emphasis; the first sample is passed through unchanged.
    emphasized = np.append(y[0], y[1:] - 0.97 * y[:-1])

    frame_length = int(0.025 * sr)
    frame_step = int(0.01 * sr)
    # librosa returns (frame_length, n_frames); transpose to one frame per row
    # and copy so the in-place windowing below writes to owned memory.
    frames = librosa.util.frame(emphasized, frame_length=frame_length, hop_length=frame_step).T.copy()
    frames *= np.hamming(frame_length)

    # rfft(x, n) crops x when n is smaller than the frame, which would throw
    # away the tail of every window for sr > 20480 Hz. Keep 512 whenever the
    # frame fits (identical output to before) and otherwise grow to the next
    # power of two that holds a whole frame.
    NFFT = 512
    if frame_length > NFFT:
        NFFT = 1 << (frame_length - 1).bit_length()
    mag_frames = np.absolute(np.fft.rfft(frames, NFFT))
    pow_frames = (1.0 / NFFT) * (mag_frames ** 2)

    nfilt = 40
    mel_filters = librosa.filters.mel(sr=sr, n_fft=NFFT, n_mels=nfilt)
    mel_energy = np.dot(pow_frames, mel_filters.T)
    # Replace exact zeros so the logarithm below never yields -inf.
    mel_energy = np.where(mel_energy == 0, np.finfo(float).eps, mel_energy)

    log_mel_energy = np.log(mel_energy)
    plp = scipy.fftpack.dct(log_mel_energy, type=2, axis=1, norm='ortho')[:, :numcep]
    return np.mean(plp, axis=0)

In [4]:
# Recording to classify.
file_path = '../test_audio/440_AUDIO.wav'

# Load at the file's native sampling rate (sr=None disables resampling), then
# clean the waveform: trim leading/trailing silence, reduce noise, and
# normalise the amplitude.
y, sr = librosa.load(file_path, sr=None)
y, _ = librosa.effects.trim(y)
y = librosa.util.normalize(nr.reduce_noise(y=y, sr=sr))

# 13 MFCCs per frame, collapsed to one mean value per coefficient.
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
mfcc_mean = mfcc.mean(axis=1)

# 13 time-averaged coefficients from the notebook's own extractor.
plp_mean = extract_plp(y, sr, numcep=13)

# One row holding the MFCC means followed by the extract_plp means,
# passed through the loaded scaler.
combined = np.concatenate([mfcc_mean, plp_mean])[np.newaxis, :]
combined_scaled = scaler.transform(combined)

Predict depression label

In [5]:
prediction = model.predict(combined_scaled)

# First output of the first (and only) input row, compared against 0.5.
# NOTE(review): presumably a sigmoid probability for the "depressed" class --
# confirm against the model definition.
score = prediction[0, 0]
message = (
    "The person is likely to be depressed."
    if score > 0.5
    else "The person is likely to be not depressed."
)
print(message)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
The person is likely to be not depressed.
