In [1]:
import librosa
import librosa.display
import IPython.display as ipd
import os
import numpy as np
import itertools

import math
from scipy.signal import get_window
import scipy
from sklearn import preprocessing
from scipy.io.wavfile import read

from sklearn.mixture import GaussianMixture 


## Feartures Extraction

##### MFCC Feature

In [2]:
# Normalization in order to get rid of amplification levels and differences between mics
def normalize_audio(audio):
    audio = audio / np.max(np.abs(audio))
    return audio

In [3]:
# we devide the signal into short frames. Each audio frame will be the same size as the FFT
def frame_audio(audio, FFT_size=2048, hop_size=10, sample_rate=44100):
    audio = np.pad(audio, int(FFT_size / 2), mode='reflect')
    frame_len = np.round(sample_rate * hop_size / 1000).astype(int)
    frame_num = int((len(audio) - FFT_size) / frame_len) + 1
    frames = np.zeros((frame_num,FFT_size))
    
    for n in range(frame_num):
        frames[n] = audio[n*frame_len:n*frame_len+FFT_size]
    
    return frames

In [4]:
def freq_to_mel(freq):
    return 2595.0 * np.log10(1.0 + freq / 700.0)

def met_to_freq(mels):
    return 700.0 * (10.0**(mels / 2595.0) - 1.0)

In [5]:
def get_filter_points(fmin, fmax, mel_filter_num, FFT_size, sample_rate=44100):
    fmin_mel = freq_to_mel(fmin)
    fmax_mel = freq_to_mel(fmax)
    
    mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num+2)
    freqs = met_to_freq(mels)
    
    return np.floor((FFT_size + 1) / sample_rate * freqs).astype(int), freqs

In [6]:
def get_filters(filter_points, FFT_size):
    filters = np.zeros((len(filter_points)-2,int(FFT_size/2+1)))
    
    for n in range(len(filter_points)-2):
        filters[n, filter_points[n] : filter_points[n + 1]] = np.linspace(0, 1, filter_points[n + 1] - filter_points[n])
        filters[n, filter_points[n + 1] : filter_points[n + 2]] = np.linspace(1, 0, filter_points[n + 2] - filter_points[n + 1])
    
    return filters

In [7]:
def dct(dct_filter_num, filter_len):
    basis = np.empty((dct_filter_num,filter_len))
    basis[0, :] = 1.0 / np.sqrt(filter_len)
    
    samples = np.arange(1, 2 * filter_len, 2) * np.pi / (2.0 * filter_len)

    for i in range(1, dct_filter_num):
        basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_len)
        
    return basis

In [8]:
def extract_MFCC(file_path):
    hop_size = 15 #ms
    FFT_size = 2048
    audio , sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    audio = normalize_audio(audio)
    audio_framed = frame_audio(audio, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sample_rate)
    window = get_window("hann", FFT_size, fftbins=True)
    audio_win = audio_framed * window
    audio_winT = np.transpose(audio_win)
    
    audio_fft = np.empty((int(1 + FFT_size // 2), audio_winT.shape[1]), dtype=np.complex64, order='F')

    for n in range(audio_fft.shape[1]):
        audio_fft[:, n] = scipy.fft.fft(audio_winT[:, n], axis=0)[:audio_fft.shape[0]]

    audio_fft = np.transpose(audio_fft)

    audio_power = np.square(np.abs(audio_fft))

    freq_min = 0
    freq_high = sample_rate / 2
    mel_filter_num = 10
    filter_points, mel_freqs = get_filter_points(freq_min, freq_high, mel_filter_num, FFT_size, sample_rate=44100)
    filters = get_filters(filter_points, FFT_size)

    enorm = 2.0 / (mel_freqs[2:mel_filter_num+2] - mel_freqs[:mel_filter_num])
    filters *= enorm[:, np.newaxis]

    audio_filtered = np.dot(filters, np.transpose(audio_power))
    audio_log = 10.0 * np.log10(audio_filtered)

    dct_filter_num = 40

    dct_filters = dct(dct_filter_num, mel_filter_num)

    cepstral_coefficents = np.dot(dct_filters, audio_log)
    
    # cepstral_coefficents = list(itertools.chain.from_iterable(cepstral_coefficents))

    return cepstral_coefficents

In [9]:
def extract_features(file_path):
    audio , sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    mfcc_feature = mfcc.mfcc(audio,sample_rate, 0.025, 0.01,20,nfft = 1200, appendEnergy = True)    
    mfcc_feature = preprocessing.scale(mfcc_feature)
    return mfcc_feature

In [10]:
featuresYahia = []
directory = './files/3sec/'
for audio in os.listdir('./files/3sec/'):
    audio_path = directory + audio
    featuresYahia1=np.vstack(extract_MFCC(audio_path))

featuresAhmed = []
directory = './files/Ahmed/'
for audio in os.listdir('./files/Ahmed/'):
    audio_path = directory + audio
    featuresAhmed=np.vstack(extract_MFCC(audio_path))

featuresMustafa = []
directory = './files/Moustafa/'
for audio in os.listdir('./files/Moustafa/'):
    audio_path = directory + audio
    featuresMustafa=np.vstack(extract_MFCC(audio_path))

In [11]:

# len(featuresYahia1[0][1])

In [18]:
# a = extract_MFCC('./files/Yahia_8.wav')

# b = extract_MFCC('./files/nada.wav')
# b = extract_MFCC('./files/test/ahmed_other_word.wav')
b = extract_MFCC('./files/Ahmed/ahmed-2.wav')

In [19]:
yahia_gmm = GaussianMixture(n_components = 1, max_iter = 90000, covariance_type='diag',n_init = 2)
yahia_gmm.fit(featuresYahia1)

ahmed_gmm = GaussianMixture(n_components = 1, max_iter = 90000, covariance_type='diag',n_init = 2)
ahmed_gmm.fit(featuresAhmed)

mostafa_gmm = GaussianMixture(n_components = 1, max_iter = 90000, covariance_type='diag',n_init = 2)
mostafa_gmm.fit(featuresMustafa)

scores_1 = np.array(ahmed_gmm.score(b))
scores_2 = np.array(yahia_gmm.score(b))
scores_3 = np.array(mostafa_gmm.score(b))
print(scores_1.sum())
print(scores_2.sum())
print(scores_3.sum())

-955.6925376405252
-940.8208452901026
-978.7532945484439
